# Mentions

### Load the data and count.

In [1]:
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import tweet_iter, tweet_type

# Lower the root logger's threshold to DEBUG so that DEBUG-level progress
# messages emitted during the load are shown.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Tweet files to load (paths are relative to the working directory).
filepaths = [
    'd59d27e2f2ed4778881573df2ecf2fad_001.json.gz',
    '25319652321b4bb498b250ffc53aa0f0_001.json.gz',
]

# Simplify the tweet on load
def mention_transform(tweet):
    """Extract one flat record per user mention from an original tweet.

    Tweets that carry a 'retweeted_status' or 'quoted_status' key are skipped
    and produce no records.

    Args:
        tweet: Tweet as a dict. Reads 'id_str', 'created_at',
            'user' -> 'id_str' / 'screen_name' and
            'entities' -> 'user_mentions' (each with 'id_str' and
            'screen_name').

    Returns:
        A list of dicts, one per mention, with the keys 'tweet_id',
        'user_id', 'screen_name', 'mention_user_id', 'mention_screen_name'
        and 'tweet_created_at'. The list is empty when the tweet is skipped
        or has no mentions.
    """
    if 'retweeted_status' in tweet or 'quoted_status' in tweet:
        return []

    user_mentions = tweet.get('entities', {}).get('user_mentions', [])
    if not user_mentions:
        return []

    # Every mention in a tweet shares the same tweet-level fields, so look
    # them up (and parse the timestamp) once per tweet instead of once per
    # mention. This only happens after we know there is at least one mention.
    tweet_id = tweet['id_str']
    user = tweet['user']
    tweet_created_at = date_parse(tweet['created_at'])

    return [
        {
            'tweet_id': tweet_id,
            'user_id': user['id_str'],
            'screen_name': user['screen_name'],
            'mention_user_id': mention['id_str'],
            'mention_screen_name': mention['screen_name'],
            'tweet_created_at': tweet_created_at,
        }
        for mention in user_mentions
    ]

# tweet_iter comes from the project's utils module; presumably it applies
# mention_transform to each tweet in the files and yields the resulting
# mention dicts — confirm against utils.
mention_records = tweet_iter(filepaths, tweet_transform_func=mention_transform)
# One DataFrame row per record.
mention_df = pd.DataFrame(mention_records)


DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
D

### Number of mentions found in the dataset

In [2]:
# Non-null value count for each column of the raw mention records.
mention_df.count()

mention_screen_name    1363129
mention_user_id        1363129
screen_name            1363129
tweet_created_at       1363129
tweet_id               1363129
user_id                1363129
dtype: int64

### The mention data
Each mention consists of the tweet id and creation time, the screen name and user id of the account that is mentioned,
and the screen name and user id of the account doing the mentioning.

In [3]:
# Preview the first rows of the mention records.
mention_df.head()

Unnamed: 0,mention_screen_name,mention_user_id,screen_name,tweet_created_at,tweet_id,user_id
0,davidbschultz,53739928,A_Childers_,2017-03-31 01:52:09+00:00,847627543142219776,1638925448
1,davidbschultz,53739928,A_Childers_,2017-03-30 23:52:23+00:00,847597404719267841,1638925448
2,AriPeskoe,499013898,A_Childers_,2017-03-30 23:37:48+00:00,847593734896324608,1638925448
3,deantscott,134918286,A_Childers_,2017-03-30 23:37:48+00:00,847593734896324608,1638925448
4,Pat_Ambrosio,2497185313,A_Childers_,2017-03-30 19:41:27+00:00,847534254355599364,1638925448


### Remove duplicates

In [4]:
# Drop rows that are identical in every column, keeping the first occurrence.
dedupe_mention_df = mention_df.drop_duplicates(keep='first')
# Non-null value count per column after de-duplication.
dedupe_mention_df.count()

mention_screen_name    1348153
mention_user_id        1348153
screen_name            1348153
tweet_created_at       1348153
tweet_id               1348153
user_id                1348153
dtype: int64

In [5]:
# From the mentions, extract a map of user ids to screen names.
# For each mentioned user id, take the row with the latest tweet_created_at
# and use the screen name recorded on that row.
latest_mention_idx = mention_df.groupby('mention_user_id')['tweet_created_at'].idxmax()
# `.ix` was removed in pandas 1.0; select rows and columns with a single
# label-based `.loc` instead.
user_id_lookup_df = (
    mention_df
    .loc[latest_mention_idx, ['mention_user_id', 'mention_screen_name']]
    .set_index('mention_user_id')
)
user_id_lookup_df.count()

mention_screen_name    137344
dtype: int64

In [6]:
# Preview the user id -> screen name lookup.
user_id_lookup_df.head()

Unnamed: 0_level_0,mention_screen_name
mention_user_id,Unnamed: 1_level_1
1000010898,RoyScranton
100002112,whyyradiotimes
1000030188,jessieb747
100003141,NCCDtweets
100004577,Orange_France


In [7]:
# Count the de-duplicated mention rows for each mentioned user id.
# The number of rows here should match the user_id map count.
mentions_per_user_id = dedupe_mention_df.groupby('mention_user_id').size()
mention_summary_user_id_df = pd.DataFrame(mentions_per_user_id, columns=['mention_count'])
mention_summary_user_id_df.count()

mention_count    137344
dtype: int64

In [8]:
# Preview the per-account mention counts.
mention_summary_user_id_df.head()

Unnamed: 0_level_0,mention_count
mention_user_id,Unnamed: 1_level_1
1000010898,20
100002112,15
1000030188,10
100003141,2
100004577,1


In [9]:
# Attach each mentioned account's screen name to its mention count.
# Both frames are indexed by mention_user_id, so this is an index-on-index join.
mention_summary_df = mention_summary_user_id_df.join(user_id_lookup_df, how='left')
mention_summary_df.count()

mention_count          137344
mention_screen_name    137344
dtype: int64

In [10]:
# Preview the mention counts joined with screen names.
mention_summary_df.head()

Unnamed: 0_level_0,mention_count,mention_screen_name
mention_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000010898,20,RoyScranton
100002112,15,whyyradiotimes
1000030188,10,jessieb747
100003141,2,NCCDtweets
100004577,1,Orange_France


### Load known Twitter accounts

In [11]:
from utils import load_screen_name_lookup_df

# load_screen_name_lookup_df comes from the project's utils module. The joins
# against mention_user_id elsewhere in this notebook suggest the returned
# frame is indexed by user id — TODO confirm.
screen_name_lookup_df = load_screen_name_lookup_df()
# Number of known accounts in each 'type' category.
screen_name_lookup_df['type'].value_counts()

media          5915
government     2959
reporters      1457
politicians     601
Name: type, dtype: int64

### Join the mentions and known Twitter accounts

In [12]:
# Left join keeps every mentioned account; accounts that are not in the
# known-account lookup get NaN in the lookup's columns.
mention_join_df = mention_summary_df.join(screen_name_lookup_df, how='left')
# Label unmatched accounts. Assign the filled column back rather than calling
# fillna(inplace=True) on a column selection: that chained form acts on an
# intermediate object and has no effect under pandas Copy-on-Write.
mention_join_df['type'] = mention_join_df['type'].fillna('unknown')
# rename_axis returns a new frame, so the index of the frames this was built
# from is not renamed as a side effect (setting index.name in place would
# rename it wherever the Index object is shared).
mention_join_df = mention_join_df.rename_axis('user_id')
mention_join_df.head()

Unnamed: 0_level_0,mention_count,mention_screen_name,screen_name,type,screen_name_lower
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000010898,20,RoyScranton,,unknown,
100002112,15,whyyradiotimes,,unknown,
1000030188,10,jessieb747,,unknown,
100003141,2,NCCDtweets,,unknown,
100004577,1,Orange_France,,unknown,


## Top (by mention count) accounts that are matched against known Twitter accounts <----------

In [13]:
# Accounts that matched the known-account lookup have a non-null screen_name.
is_known_account = mention_join_df['screen_name'].notnull()
# Most-mentioned known accounts first.
top_known_mentions_df = mention_join_df[is_known_account].sort_values('mention_count', ascending=False)
top_known_mentions_df[['mention_screen_name', 'mention_count', 'type']].head(20)

Unnamed: 0_level_0,mention_screen_name,mention_count,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25073877,realDonaldTrump,19057,politicians
51241574,AP,15077,media
3108351,WSJ,12550,media
15754281,USATODAY,11999,media
822215679726100480,POTUS,9872,politicians
1652541,Reuters,9158,media
15922214,rollcall,7175,media
9300262,politico,7113,media
807095,nytimes,6335,media
818927131883356161,PressSec,5849,politicians


### Number of matched accounts <----------
mention_screen_name is the number of unique mentioned accounts. screen_name is the
number of matched unique accounts.

In [14]:
# Non-null count per column. Columns that come from the known-account lookup
# are only populated for accounts that matched in the left join.
mention_join_df.count()

mention_count          137344
mention_screen_name    137344
screen_name              3721
type                   137344
screen_name_lower        3721
dtype: int64

## Top accounts by mentions <----------
A type of `unknown` indicates that the account is not matched with a known Twitter account.

In [15]:
# All mentioned accounts, known or not, ordered from most to least mentioned.
top_mentions_df = mention_join_df.sort_values(by='mention_count', ascending=False)
top_mention_columns = ['mention_screen_name', 'mention_count', 'type']
top_mentions_df[top_mention_columns].head(50)

Unnamed: 0_level_0,mention_screen_name,mention_count,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25073877,realDonaldTrump,19057,politicians
51241574,AP,15077,media
3108351,WSJ,12550,media
15754281,USATODAY,11999,media
2312829909,CQnow,11286,unknown
1339835893,HillaryClinton,10526,unknown
822215679726100480,POTUS,9872,politicians
1652541,Reuters,9158,media
34713362,business,7976,unknown
15147042,educationweek,7514,unknown


## Mentions by account type <----------

In [16]:
# Total mentions per account type. Select mention_count explicitly: the frame
# also holds string columns (screen names), and since pandas 2.0 a bare
# groupby(...).sum() no longer silently drops non-numeric columns.
mention_join_df.groupby('type')[['mention_count']].sum()

Unnamed: 0_level_0,mention_count
type,Unnamed: 1_level_1
government,33127
media,121848
politicians,94800
reporters,174725
unknown,923653


## Top (by mentions) accounts that are not known. <----------
These are the accounts that we will want to categorize.

In [17]:
# Accounts that did not match the known-account lookup carry type 'unknown'.
is_unknown_account = mention_join_df['type'] == 'unknown'
# Most-mentioned unknown accounts first.
top_not_known_mention_df = mention_join_df[is_unknown_account].sort_values('mention_count', ascending=False)
top_not_known_mention_df[['mention_screen_name', 'mention_count']].head(100)

Unnamed: 0_level_0,mention_screen_name,mention_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2312829909,CQnow,11286
1339835893,HillaryClinton,10526
34713362,business,7976
15147042,educationweek,7514
459277523,BloombergBNA,6710
18956073,dcexaminer,6253
564111558,bpolitics,3614
216776631,BernieSanders,3313
185817496,FERNnews,3197
23022687,tedcruz,2687


## Mentions per user
For users that made any mentions. It is also possible to figure this out for all users.

In [18]:
# Summary statistics of the number of de-duplicated mention rows per
# mentioning user (user_id is the author of the tweet).
dedupe_mention_df['user_id'].value_counts().describe()

count    1429.000000
mean      943.424073
std       797.906480
min         1.000000
25%       319.000000
50%       806.000000
75%      1387.000000
max      6922.000000
Name: user_id, dtype: float64

## Mentions by type per user

### Add type by merging screen name lookup

In [19]:
# Tag every mention row with the type of the mentioned account by matching
# mention_user_id against the lookup's index. The left merge keeps mentions
# of accounts that are not in the lookup.
mention_all_join_df = pd.merge(
    dedupe_mention_df,
    screen_name_lookup_df[['type']],
    how='left',
    left_on='mention_user_id',
    right_index=True,
)
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection: that chained form acts on an intermediate object and has
# no effect under pandas Copy-on-Write.
mention_all_join_df['type'] = mention_all_join_df['type'].fillna('unknown')
mention_all_join_df.head()

Unnamed: 0,mention_screen_name,mention_user_id,screen_name,tweet_created_at,tweet_id,user_id,type
0,davidbschultz,53739928,A_Childers_,2017-03-31 01:52:09+00:00,847627543142219776,1638925448,reporters
1,davidbschultz,53739928,A_Childers_,2017-03-30 23:52:23+00:00,847597404719267841,1638925448,reporters
2,AriPeskoe,499013898,A_Childers_,2017-03-30 23:37:48+00:00,847593734896324608,1638925448,unknown
3,deantscott,134918286,A_Childers_,2017-03-30 23:37:48+00:00,847593734896324608,1638925448,reporters
4,Pat_Ambrosio,2497185313,A_Childers_,2017-03-30 19:41:27+00:00,847534254355599364,1638925448,reporters


In [20]:
# Count mentions for every (mentioning user, account type) pair, then pivot
# the types into columns; pairs that never occur become 0.
mention_counts_by_user_type = mention_all_join_df.groupby(['user_id', 'type']).size()
mention_summary_by_user_df = mention_counts_by_user_type.unstack().fillna(0)

# Remember the per-type columns before any derived columns are added.
type_columns = list(mention_summary_by_user_df.columns)

# Add a total column
mention_summary_by_user_df['total'] = mention_summary_by_user_df[type_columns].sum(axis=1)

# Share of each user's mentions that went to each type. Despite the
# "_percent" suffix these are fractions between 0 and 1.
for type_name in type_columns:
    mention_summary_by_user_df['{}_percent'.format(type_name)] = (
        mention_summary_by_user_df[type_name] / mention_summary_by_user_df['total']
    )
mention_summary_by_user_df.head(10)

type,government,media,politicians,reporters,unknown,total,government_percent,media_percent,politicians_percent,reporters_percent,unknown_percent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100165378,31.0,50.0,94.0,7.0,1040.0,1222.0,0.025368,0.040917,0.076923,0.005728,0.851064
1001991865,2.0,11.0,1.0,8.0,117.0,139.0,0.014388,0.079137,0.007194,0.057554,0.841727
1002229862,27.0,3.0,33.0,328.0,634.0,1025.0,0.026341,0.002927,0.032195,0.32,0.618537
100802089,9.0,55.0,16.0,31.0,111.0,222.0,0.040541,0.247748,0.072072,0.13964,0.5
100860790,23.0,92.0,45.0,84.0,721.0,965.0,0.023834,0.095337,0.046632,0.087047,0.74715
1009749229,6.0,19.0,1.0,183.0,1042.0,1251.0,0.004796,0.015188,0.000799,0.146283,0.832934
102171691,1.0,513.0,124.0,71.0,364.0,1073.0,0.000932,0.478099,0.115564,0.06617,0.339236
102789488,3.0,38.0,69.0,271.0,1421.0,1802.0,0.001665,0.021088,0.038291,0.150388,0.788568
102994740,0.0,17.0,23.0,0.0,302.0,342.0,0.0,0.049708,0.067251,0.0,0.883041
103016675,1.0,2.0,6.0,288.0,1059.0,1356.0,0.000737,0.001475,0.004425,0.212389,0.780973


### Average of percent of mentions by type for each user
That is, for each user determine the percent of mentions by type. Then take the average of each type.

Thus, this mention analysis is on a per-user basis, normalizing for how prolific a tweeter a user is. (That is, users who tweet more aren't weighted more heavily.)


In [21]:
# Mean of each *_percent column across users; every user contributes one row,
# so each user is weighted equally.
mention_summary_by_user_df.filter(axis=1, regex="_percent$").mean()

type
government_percent     0.021666
media_percent          0.092097
politicians_percent    0.058533
reporters_percent      0.139507
unknown_percent        0.688196
dtype: float64