# Mentions

### Load the data and count.

In [1]:
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import tweet_load_iter, tweet_type

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Simplify the tweet on load
def mention_transform(tweet):
    """Flatten one tweet dict into a list of mention records.

    Tweets that carry a 'retweeted_status' or 'quoted_status' key produce an
    empty list. For any other tweet, one dict is produced per entry in
    tweet['entities']['user_mentions'] (a missing 'entities' or
    'user_mentions' key is treated as no mentions). Each record holds the
    tweet id, the author's user id and screen name, the mentioned account's
    user id and screen name, and the tweet's parsed 'created_at' timestamp.
    """
    # Guard clause: skip retweets and quote tweets entirely.
    if 'retweeted_status' in tweet or 'quoted_status' in tweet:
        return []

    author = tweet['user'] if tweet.get('entities', {}).get('user_mentions', []) else None
    return [
        {
            'tweet_id': tweet['id_str'],
            'user_id': author['id_str'],
            'screen_name': author['screen_name'],
            'mention_user_id': mentioned['id_str'],
            'mention_screen_name': mentioned['screen_name'],
            'tweet_created_at': date_parse(tweet['created_at'])
        }
        for mentioned in tweet.get('entities', {}).get('user_mentions', [])
    ]

mention_df = pd.DataFrame(tweet_load_iter(tweet_transform_func=mention_transform))


INFO:root:Loading from tweets/8fd6e3d7bf9e41ad8898c175c314d78e_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
INFO:root:Loading from tweets/8fd6e3d7bf9e41ad8898c175c314d78e_002.json.gz
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
INFO:root:Loading from tweets/8fd6e3d7bf9e41ad8898c175c314d78e_003.json.gz
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
INFO:root:Loading from tweets/8fd6e3d7bf9e41ad8898c175c314d78e_004.json.gz
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
INFO:root:Loading from tweets/8fd6e3d7bf9e41ad8898c175c314d78e_005.json.gz
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 125

### Number of mentions found in the dataset

In [2]:
mention_df.count()

mention_screen_name    1737111
mention_user_id        1737111
screen_name            1737111
tweet_created_at       1737111
tweet_id               1737111
user_id                1737111
dtype: int64

### The mention data
Each mention consists of the tweet id, the screen name and user id that is mentioned,
and the screen_name and user_id that is mentioning.

In [3]:
mention_df.head()

Unnamed: 0,mention_screen_name,mention_user_id,screen_name,tweet_created_at,tweet_id,user_id
0,nielslesniewski,140286364,loren_duggan,2017-03-30 12:41:33+00:00,847428582821449730,780221130
1,BrianToddCNN,104851609,akesslerdc,2017-03-29 14:02:14+00:00,847086500944777220,285772181
2,JamesVGrimaldi,17178161,akesslerdc,2017-03-25 02:45:16+00:00,845466584625885184,285772181
3,realDonaldTrump,25073877,akesslerdc,2017-03-24 20:51:43+00:00,845377611165552640,285772181
4,POTUS,822215679726100480,akesslerdc,2017-03-24 20:46:38+00:00,845376332011913217,285772181


### Remove duplicates

In [4]:
dedupe_mention_df = mention_df.drop_duplicates()
dedupe_mention_df.count()

mention_screen_name    1719637
mention_user_id        1719637
screen_name            1719637
tweet_created_at       1719637
tweet_id               1719637
user_id                1719637
dtype: int64

In [5]:
# From the mentions, extract a map of mentioned user ids to screen names.
# For each mention_user_id, take the row with the greatest tweet_created_at and
# use that row's mention_screen_name.
# Rows and columns are selected in one .loc call; the former `.ix` indexer was
# deprecated in pandas 0.20 and removed in pandas 1.0.
latest_mention_idx = mention_df.groupby('mention_user_id')['tweet_created_at'].idxmax()
user_id_lookup_df = (
    mention_df
    .loc[latest_mention_idx, ['mention_user_id', 'mention_screen_name']]
    .set_index('mention_user_id')
)
user_id_lookup_df.count()

mention_screen_name    173187
dtype: int64

In [6]:
user_id_lookup_df.head()

Unnamed: 0_level_0,mention_screen_name
mention_user_id,Unnamed: 1_level_1
1000010898,RoyScranton
100002112,whyyradiotimes
1000030188,jessieb747
100003141,NCCDtweets
100004577,Orange_France


In [7]:
# Number of de-duplicated mention rows for each mentioned user id.
# The number of rows here should equal the number of rows in the user id -> screen name map.
mentions_per_user_id = dedupe_mention_df.groupby('mention_user_id').size()
mention_summary_user_id_df = pd.DataFrame({'mention_count': mentions_per_user_id})
mention_summary_user_id_df.count()

mention_count    173187
dtype: int64

In [8]:
mention_summary_user_id_df.head()

Unnamed: 0_level_0,mention_count
mention_user_id,Unnamed: 1_level_1
1000010898,20
100002112,15
1000030188,10
100003141,2
100004577,1


In [9]:
# Join with user id map
mention_summary_df = mention_summary_user_id_df.join(user_id_lookup_df)
mention_summary_df.count()

mention_count          173187
mention_screen_name    173187
dtype: int64

In [10]:
mention_summary_df.head()

Unnamed: 0_level_0,mention_count,mention_screen_name
mention_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000010898,20,RoyScranton
100002112,15,whyyradiotimes
1000030188,10,jessieb747
100003141,2,NCCDtweets
100004577,1,Orange_France


### Load known Twitter accounts

In [11]:
from utils import load_screen_name_lookup_df

screen_name_lookup_df = load_screen_name_lookup_df()
screen_name_lookup_df['type'].value_counts()

media          4167
government     2958
reporters      1867
politicians     601
Name: type, dtype: int64

### Join the mentions and known Twitter accounts

In [12]:
# Left join on the index so every mentioned account is kept; accounts with no
# row in screen_name_lookup_df get NaN in the lookup columns.
mention_join_df = mention_summary_df.join(screen_name_lookup_df, how='left')
# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form may act on a temporary object
# (FutureWarning in pandas 2.x, no effect under copy-on-write).
mention_join_df['type'] = mention_join_df['type'].fillna('unknown')
mention_join_df.index.name = 'user_id'
mention_join_df.head()

Unnamed: 0_level_0,mention_count,mention_screen_name,screen_name,type,screen_name_lower
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000010898,20,RoyScranton,,unknown,
100002112,15,whyyradiotimes,,unknown,
1000030188,10,jessieb747,,unknown,
100003141,2,NCCDtweets,,unknown,
100004577,1,Orange_France,,unknown,


## Top (by mention count) accounts that are matched against known Twitter accounts <----------

In [13]:
top_known_mentions_df = mention_join_df[pd.notnull(mention_join_df.screen_name)].sort_values('mention_count', ascending=False)
top_known_mentions_df[['mention_screen_name', 'mention_count', 'type']].head(20)

Unnamed: 0_level_0,mention_screen_name,mention_count,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25073877,realDonaldTrump,24448,politicians
51241574,AP,17134,media
822215679726100480,POTUS,14363,politicians
3108351,WSJ,13659,media
15754281,USATODAY,12766,media
1652541,Reuters,9953,media
34713362,business,8193,media
818927131883356161,PressSec,8101,politicians
807095,nytimes,8055,media
9300262,politico,7948,media


### Number of matched accounts <----------
mention_screen_name is the number of unique mentioned accounts. screen_name is the
number of matched unique accounts.

In [14]:
mention_join_df.count()

mention_count          173187
mention_screen_name    173187
screen_name              4194
type                   173187
screen_name_lower        4194
dtype: int64

## Top accounts by mentions <----------
A type of unknown indicates that the account is not matched with a known Twitter account.

In [15]:
top_mentions_df = mention_join_df.sort_values('mention_count', ascending=False)
top_mentions_df[['mention_screen_name', 'mention_count', 'type']].head(50)

Unnamed: 0_level_0,mention_screen_name,mention_count,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25073877,realDonaldTrump,24448,politicians
51241574,AP,17134,media
822215679726100480,POTUS,14363,politicians
3108351,WSJ,13659,media
1339835893,HillaryClinton,12788,unknown
15754281,USATODAY,12766,media
2312829909,CQnow,11420,unknown
1652541,Reuters,9953,media
34713362,business,8193,media
818927131883356161,PressSec,8101,politicians


## Mentions by account type <----------

In [16]:
# Total mentions per account type, plus each type's share of all mentions.
# Only mention_count is summed: the other columns of mention_join_df hold
# strings/NaN, and pandas 2.x no longer drops such columns silently in
# groupby(...).sum().
mention_by_type_df = mention_join_df.groupby('type')[['mention_count']].sum()
# type_percentage is a fraction of the total (0-1), not a value scaled to 100.
mention_by_type_df['type_percentage'] = mention_by_type_df['mention_count'] / mention_by_type_df['mention_count'].sum()
mention_by_type_df

Unnamed: 0_level_0,mention_count,type_percentage
type,Unnamed: 1_level_1,Unnamed: 2_level_1
government,38078,0.022143
media,255587,0.148628
politicians,120399,0.070014
reporters,213905,0.12439
unknown,1091668,0.634825


## Top (by mentions) accounts that are not known. <----------
These are the accounts that we will want to categorize.

In [17]:
# Accounts whose type is 'unknown', ordered from most to least mentioned.
unknown_rows = mention_join_df[mention_join_df['type'] == 'unknown']
ranked_unknown_rows = unknown_rows.sort_values('mention_count', ascending=False)
top_not_known_mention_df = ranked_unknown_rows[['mention_screen_name', 'mention_count']]
# Running total of mentions down the ranking, and that total as a fraction of
# all mentions of unknown accounts.
running_mention_total = top_not_known_mention_df['mention_count'].cumsum()
top_not_known_mention_df['cumulative_mention_count'] = running_mention_total
top_not_known_mention_df['cumulative_mention_percent'] = (
    top_not_known_mention_df['cumulative_mention_count']
    / top_not_known_mention_df['mention_count'].sum()
)
top_not_known_mention_df.head(100)

Unnamed: 0_level_0,mention_screen_name,mention_count,cumulative_mention_count,cumulative_mention_percent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1339835893,HillaryClinton,12788,12788,0.011714
2312829909,CQnow,11420,24208,0.022175
216776631,BernieSanders,5068,29276,0.026818
564111558,bpolitics,3899,33175,0.030389
813286,BarackObama,3435,36610,0.033536
23022687,tedcruz,3357,39967,0.036611
236891946,AJStream,2840,42807,0.039212
34613951,BloombergLaw,2693,45500,0.041679
15745368,marcorubio,2650,48150,0.044107
17197344,Nextgov,2417,50567,0.046321


### Number of unmatched accounts

In [18]:
top_not_known_mention_df.count()

mention_screen_name           168993
mention_count                 168993
cumulative_mention_count      168993
cumulative_mention_percent    168993
dtype: int64

### Number of unknowns with at least 15 mentions

In [19]:
top_not_known_mention_df[top_not_known_mention_df.mention_count >= 15].count()

mention_screen_name           11308
mention_count                 11308
cumulative_mention_count      11308
cumulative_mention_percent    11308
dtype: int64

### Percentage of unknown mentions covered by accounts with at least 15 mentions

In [20]:
top_not_known_mention_df[top_not_known_mention_df.mention_count >= 15].mention_count.sum() / top_not_known_mention_df.mention_count.sum()

0.65702026623478937

### Write unknown accounts with at least 15 mentions to a file

In [21]:
top_not_known_mention_df[['mention_screen_name', 'mention_count']][top_not_known_mention_df.mention_count >= 15].to_csv('unknown_mentions.csv')

## Mentions per user
For users that made any mentions. It is also possible to figure this out for all users.

In [22]:
dedupe_mention_df['user_id'].value_counts().describe()

count     1817.000000
mean       946.415520
std        852.119505
min          1.000000
25%        252.000000
50%        804.000000
75%       1428.000000
max      10790.000000
Name: user_id, dtype: float64

## Mentions by type per user

### Add type by merging screen name lookup

In [23]:
# Attach the account type of the mentioned user to every de-duplicated mention
# by matching mention_user_id against the index of screen_name_lookup_df.
# Left merge: mentions of accounts missing from the lookup are kept with NaN type.
mention_all_join_df = pd.merge(dedupe_mention_df, screen_name_lookup_df[['type']], how='left', left_on='mention_user_id', right_index=True)
# Assign the filled column back rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form may act on a temporary object
# (FutureWarning in pandas 2.x, no effect under copy-on-write).
mention_all_join_df['type'] = mention_all_join_df['type'].fillna('unknown')
mention_all_join_df.head()

Unnamed: 0,mention_screen_name,mention_user_id,screen_name,tweet_created_at,tweet_id,user_id,type
0,nielslesniewski,140286364,loren_duggan,2017-03-30 12:41:33+00:00,847428582821449730,780221130,reporters
1,BrianToddCNN,104851609,akesslerdc,2017-03-29 14:02:14+00:00,847086500944777220,285772181,unknown
2,JamesVGrimaldi,17178161,akesslerdc,2017-03-25 02:45:16+00:00,845466584625885184,285772181,reporters
3,realDonaldTrump,25073877,akesslerdc,2017-03-24 20:51:43+00:00,845377611165552640,285772181,politicians
4,POTUS,822215679726100480,akesslerdc,2017-03-24 20:46:38+00:00,845376332011913217,285772181,politicians


In [24]:
# Count mentions for each (mentioning user, mentioned account type) pair, then
# pivot the types into columns; pairs that never occur become 0.
per_user_type_counts = mention_all_join_df.groupby(['user_id', 'type']).size()
mention_summary_by_user_df = per_user_type_counts.unstack().fillna(0)
# Capture the type columns before the total column is appended.
type_columns = list(mention_summary_by_user_df.columns)
# Add a total column
mention_summary_by_user_df['total'] = mention_summary_by_user_df.sum(axis=1)
# For every type, the fraction of the user's mentions that went to that type.
for type_name in type_columns:
    percent_column = type_name + '_percent'
    mention_summary_by_user_df[percent_column] = mention_summary_by_user_df[type_name] / mention_summary_by_user_df['total']
mention_summary_by_user_df.head(10)

type,government,media,politicians,reporters,unknown,total,government_percent,media_percent,politicians_percent,reporters_percent,unknown_percent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100165378,31.0,105.0,96.0,7.0,1023.0,1262.0,0.024564,0.083201,0.07607,0.005547,0.810618
1001991865,2.0,70.0,1.0,8.0,62.0,143.0,0.013986,0.48951,0.006993,0.055944,0.433566
1002229862,29.0,183.0,35.0,339.0,468.0,1054.0,0.027514,0.173624,0.033207,0.321632,0.444023
100802089,9.0,58.0,17.0,32.0,109.0,225.0,0.04,0.257778,0.075556,0.142222,0.484444
100860790,23.0,131.0,46.0,91.0,709.0,1000.0,0.023,0.131,0.046,0.091,0.709
1009749229,6.0,321.0,1.0,189.0,814.0,1331.0,0.004508,0.241172,0.000751,0.141998,0.61157
1013785220,7.0,9.0,13.0,214.0,403.0,646.0,0.010836,0.013932,0.020124,0.331269,0.623839
102171691,1.0,699.0,132.0,92.0,376.0,1300.0,0.000769,0.537692,0.101538,0.070769,0.289231
102238997,9.0,43.0,25.0,290.0,2052.0,2419.0,0.003721,0.017776,0.010335,0.119884,0.848284
102789488,3.0,47.0,69.0,280.0,1405.0,1804.0,0.001663,0.026053,0.038248,0.155211,0.778825


### Average of percent of mentions by type for each user
That is, for each user determine the percent of mentions by type. Then take the average of each type.

Thus, this mention analysis is on a per-user basis, accounting for how prolific a tweeter a user is. (That is, users who tweet more aren't weighted more heavily.)


In [25]:
mention_summary_by_user_df.filter(axis=1, regex="_percent$").mean()

type
government_percent     0.019666
media_percent          0.159989
politicians_percent    0.056697
reporters_percent      0.130236
unknown_percent        0.633412
dtype: float64