# Mentions

### Load the data and count.

In [1]:
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import tweet_load_iter, tweet_type

# Take the root logger and lower its threshold to DEBUG so that both INFO
# and DEBUG records are emitted.
logger = logging.root
logger.setLevel("DEBUG")

# Simplify the tweet on load
def mention_transform(tweet):
    """Flatten one tweet dict into a list of mention records.

    Tweets that carry a 'retweeted_status' or 'quoted_status' key yield an
    empty list. For any other tweet, one record is produced per entry in
    tweet['entities']['user_mentions'] (a missing 'entities' or
    'user_mentions' key is treated as "no mentions").

    Returns:
        A list of dicts with the keys tweet_id, user_id, screen_name
        (the mentioning account), mention_user_id, mention_screen_name
        (the mentioned account) and tweet_created_at (the tweet's
        'created_at' value passed through date_parse).
    """
    # Guard clause: retweets and quote tweets contribute no mentions.
    if 'retweeted_status' in tweet or 'quoted_status' in tweet:
        return []

    user_mentions = tweet.get('entities', {}).get('user_mentions', [])
    return [
        {
            'tweet_id': tweet['id_str'],
            'user_id': tweet['user']['id_str'],
            'screen_name': tweet['user']['screen_name'],
            'mention_user_id': mentioned['id_str'],
            'mention_screen_name': mentioned['screen_name'],
            'tweet_created_at': date_parse(tweet['created_at'])
        }
        for mentioned in user_mentions
    ]

# Hand mention_transform to the loader from utils and build one frame from
# whatever records it yields.
# NOTE(review): presumably tweet_load_iter flattens the per-tweet lists into
# individual mention dicts -- confirm in utils.
mention_records = tweet_load_iter(tweet_transform_func=mention_transform)
mention_df = pd.DataFrame(mention_records)


INFO:root:Loading from tweets/6eea2088e010437da4b6031c2abffdc9_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
INFO:root:Loading from tweets/a7bcdbde7a104285b92fe26e286f2543_001.json.gz
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
INFO:root:Loading from tweets/e1c824ff2b3c4c5a9a93a16e5036d09a_001.json.gz
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000


### Number of mentions found in the dataset

In [2]:
# Non-null count for each column of the mention records.
mention_df.count()

mention_screen_name    277457
mention_user_id        277457
screen_name            277457
tweet_created_at       277457
tweet_id               277457
user_id                277457
dtype: int64

### The mention data
Each mention consists of the tweet id, the screen name and user id that is mentioned,
and the screen_name and user_id that is mentioning.

In [3]:
# Preview the first rows of the mention records.
mention_df.head()

Unnamed: 0,mention_screen_name,mention_user_id,screen_name,tweet_created_at,tweet_id,user_id
0,nielslesniewski,140286364,loren_duggan,2017-03-30 12:41:33+00:00,847428582821449730,780221130
1,BrianToddCNN,104851609,akesslerdc,2017-03-29 14:02:14+00:00,847086500944777220,285772181
2,JamesVGrimaldi,17178161,akesslerdc,2017-03-25 02:45:16+00:00,845466584625885184,285772181
3,realDonaldTrump,25073877,akesslerdc,2017-03-24 20:51:43+00:00,845377611165552640,285772181
4,POTUS,822215679726100480,akesslerdc,2017-03-24 20:46:38+00:00,845376332011913217,285772181


### Remove duplicates

In [4]:
# Drop rows that are identical across every column, then show the
# remaining non-null count per column.
dedupe_mention_df = mention_df.drop_duplicates()
dedupe_mention_df.count()

mention_screen_name    274825
mention_user_id        274825
screen_name            274825
tweet_created_at       274825
tweet_id               274825
user_id                274825
dtype: int64

In [5]:
# From the mentions, extract map of user ids to screen names.
# For each mentioned user id, keep the screen name from the row with the
# latest tweet_created_at.
latest_mention_rows = mention_df.groupby('mention_user_id')['tweet_created_at'].idxmax()
# A single .loc[rows, columns] replaces the former `.loc[rows].ix[:, columns]`:
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0.
mention_user_id_lookup_df = (
    mention_df
    .loc[latest_mention_rows, ['mention_user_id', 'mention_screen_name']]
    .set_index(['mention_user_id'])
)
mention_user_id_lookup_df.count()

mention_screen_name    45206
dtype: int64

In [6]:
# Preview the mentioned-user-id -> screen-name map.
mention_user_id_lookup_df.head()

Unnamed: 0_level_0,mention_screen_name
mention_user_id,Unnamed: 1_level_1
100002112,whyyradiotimes
100005598,hotelkeys
10000772,JMoLawre
1000228238,adwooldridge
100025240,itsbull


In [7]:
# From the users (not the mentions), extract map of user ids to screen names.
# For each mentioning user id, keep the screen name from the row with the
# latest tweet_created_at.
latest_user_rows = mention_df.groupby('user_id')['tweet_created_at'].idxmax()
# A single .loc[rows, columns] replaces the former `.loc[rows].ix[:, columns]`:
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0.
user_id_lookup_df = (
    mention_df
    .loc[latest_user_rows, ['user_id', 'screen_name']]
    .set_index(['user_id'])
)
user_id_lookup_df.count()

screen_name    1817
dtype: int64

In [8]:
# Count the de-duplicated mentions received by each mentioned user id.
# The resulting row count should match the size of the
# mention user id -> screen name map.
mentions_per_account = dedupe_mention_df.groupby('mention_user_id').size()
mention_summary_user_id_df = pd.DataFrame(mentions_per_account, columns=['mention_count'])
mention_summary_user_id_df.count()

mention_count    45206
dtype: int64

In [9]:
# Preview the per-account mention counts.
mention_summary_user_id_df.head()

Unnamed: 0_level_0,mention_count
mention_user_id,Unnamed: 1_level_1
100002112,1
100005598,5
10000772,1
1000228238,2
100025240,1


In [10]:
# Join with user id map
# (both frames are indexed by mention_user_id; DataFrame.join defaults to a
# left join on the index, so this adds mention_screen_name to each count row)
mention_summary_df = mention_summary_user_id_df.join(mention_user_id_lookup_df)
mention_summary_df.count()

mention_count          45206
mention_screen_name    45206
dtype: int64

In [11]:
# Preview the mention counts together with the joined screen names.
mention_summary_df.head()

Unnamed: 0_level_0,mention_count,mention_screen_name
mention_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100002112,1,whyyradiotimes
100005598,5,hotelkeys
10000772,1,JMoLawre
1000228238,2,adwooldridge
100025240,1,itsbull


### Load known Twitter accounts

In [12]:
from utils import load_user_type_lookup_df

# Load the lookup of known accounts and tally how many fall under each type.
# NOTE(review): later joins in this notebook rely on this frame being indexed
# by user id -- confirm in utils.load_user_type_lookup_df.
user_type_lookup_df = load_user_type_lookup_df()
user_type_lookup_df['type'].value_counts()

media                4525
journalists          3360
government           3046
politicians           789
ngo                   222
pundit                169
other                 138
other_political       136
business              117
cultural              115
academic              114
foreign_political      24
Name: type, dtype: int64

### Join the mentions and known Twitter accounts

In [13]:
# Left-join the known-account lookup onto the mention summary; mentioned
# accounts with no match in the lookup get NaN in the joined columns.
mention_join_df = mention_summary_df.join(user_type_lookup_df, how='left')
# Assign the filled column back rather than calling fillna(inplace=True) on
# the column selection: that chained in-place call warns on pandas 2.2 and,
# under Copy-on-Write, leaves the frame unchanged.
mention_join_df['type'] = mention_join_df['type'].fillna('unknown')
mention_join_df.index.name = 'user_id'
mention_join_df.head()

Unnamed: 0_level_0,mention_count,mention_screen_name,screen_name,type,screen_name_lower
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100002112,1,whyyradiotimes,,unknown,
100005598,5,hotelkeys,,unknown,
10000772,1,JMoLawre,,unknown,
1000228238,2,adwooldridge,,unknown,
100025240,1,itsbull,,unknown,


## Mention summary

### Mentions per user
For users that made any mentions. It is also possible to figure this out for all users.

In [14]:
# Summary statistics of the number of de-duplicated mentions made by each
# mentioning user.
dedupe_mention_df['user_id'].value_counts().describe()

count     1817.000000
mean       151.252064
std        445.439562
min          1.000000
25%         14.000000
50%         48.000000
75%        155.000000
max      14997.000000
Name: user_id, dtype: float64

## Approach 1: By mention count

### Top accounts (by mention count)
A type of "unknown" indicates that the account is not matched with a known Twitter account.

In [15]:
# Rank the mentioned accounts from most to least mentioned and show the
# leading 50 with their screen name, count and type.
top_mentions_df = mention_join_df.sort_values(by='mention_count', ascending=False)
top_mentions_columns = ['mention_screen_name', 'mention_count', 'type']
top_mentions_df[top_mentions_columns].head(50)

Unnamed: 0_level_0,mention_screen_name,mention_count,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25073877,realDonaldTrump,3229,politicians
822215679726100480,POTUS,3028,politicians
818927131883356161,PressSec,2507,politicians
51241574,AP,2435,media
3108351,WSJ,1963,media
15754281,USATODAY,1515,media
18916432,SpeakerRyan,1474,politicians
459277523,BloombergBNA,1218,media
18956073,dcexaminer,1065,media
807095,nytimes,1022,media


### Account types (by mention count)

In [16]:
# Total mention_count per account type, plus each type's share of all mentions.
# Only mention_count is aggregated: since pandas 2.0, groupby().sum() no longer
# drops non-numeric columns by default, so summing the whole frame would also
# try to "sum" the screen-name string columns.
mention_by_type_df = mention_join_df.groupby('type')[['mention_count']].sum()
mention_by_type_df['type_percentage'] = mention_by_type_df['mention_count'] / mention_by_type_df['mention_count'].sum()
mention_by_type_df

Unnamed: 0_level_0,mention_count,type_percentage
type,Unnamed: 1_level_1,Unnamed: 2_level_1
academic,2625,0.009552
business,2597,0.00945
cultural,1518,0.005524
foreign_political,257,0.000935
government,5981,0.021763
journalists,76475,0.278268
media,39739,0.144597
ngo,3708,0.013492
other,4221,0.015359
other_political,3908,0.01422


## Approach 2: Per user
Mentions by type per user.

### Add type by merging screen name lookup

In [17]:
# Attach the account type of the *mentioned* user to every de-duplicated
# mention row (left merge: mention_user_id against the lookup's index).
mention_all_join_df = pd.merge(dedupe_mention_df, user_type_lookup_df[['type']], how='left', left_on='mention_user_id', right_index=True)
# Assign the filled column back rather than calling fillna(inplace=True) on
# the column selection: that chained in-place call warns on pandas 2.2 and,
# under Copy-on-Write, leaves the frame unchanged.
mention_all_join_df['type'] = mention_all_join_df['type'].fillna('unknown')
mention_all_join_df.head()

Unnamed: 0,mention_screen_name,mention_user_id,screen_name,tweet_created_at,tweet_id,user_id,type
0,nielslesniewski,140286364,loren_duggan,2017-03-30 12:41:33+00:00,847428582821449730,780221130,journalists
1,BrianToddCNN,104851609,akesslerdc,2017-03-29 14:02:14+00:00,847086500944777220,285772181,unknown
2,JamesVGrimaldi,17178161,akesslerdc,2017-03-25 02:45:16+00:00,845466584625885184,285772181,journalists
3,realDonaldTrump,25073877,akesslerdc,2017-03-24 20:51:43+00:00,845377611165552640,285772181,politicians
4,POTUS,822215679726100480,akesslerdc,2017-03-24 20:46:38+00:00,845376332011913217,285772181,politicians


In [18]:
# Count mentions per (mentioning user, mentioned-account type) and pivot the
# types into columns; user/type combinations that never occur become 0.
mentions_by_user_and_type = mention_all_join_df.groupby(['user_id', 'type']).size()
mention_summary_by_user_df = mentions_by_user_and_type.unstack().fillna(0)

# Remember the type columns before any derived columns are appended.
type_columns = list(mention_summary_by_user_df.columns)

# Add a total column
mention_summary_by_user_df['total'] = mention_summary_by_user_df.sum(axis=1)

# Each type's share of the user's total mentions.
for type_name in type_columns:
    percent_column = '{}_percent'.format(type_name)
    mention_summary_by_user_df[percent_column] = mention_summary_by_user_df[type_name] / mention_summary_by_user_df.total
mention_summary_by_user_df.head(10)

type,academic,business,cultural,foreign_political,government,journalists,media,ngo,other,other_political,...,foreign_political_percent,government_percent,journalists_percent,media_percent,ngo_percent,other_percent,other_political_percent,politicians_percent,pundit_percent,unknown_percent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100165378,1.0,2.0,14.0,0.0,0.0,3.0,2.0,0.0,0.0,1.0,...,0.0,0.0,0.032258,0.021505,0.0,0.0,0.010753,0.11828,0.0,0.634409
1001991865,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.875,0.0,0.0,0.0,0.0,0.0,0.125
1002229862,0.0,0.0,1.0,0.0,7.0,22.0,4.0,1.0,0.0,0.0,...,0.0,0.12069,0.37931,0.068966,0.017241,0.0,0.0,0.051724,0.0,0.344828
100802089,0.0,0.0,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.263158,0.157895,0.0,0.0,0.0,0.210526,0.0,0.368421
100860790,0.0,0.0,0.0,0.0,0.0,50.0,15.0,4.0,0.0,1.0,...,0.0,0.0,0.409836,0.122951,0.032787,0.0,0.008197,0.032787,0.0,0.393443
1009749229,2.0,0.0,0.0,0.0,3.0,59.0,46.0,33.0,0.0,0.0,...,0.0,0.01145,0.225191,0.175573,0.125954,0.0,0.0,0.0,0.007634,0.446565
1013785220,0.0,0.0,0.0,1.0,1.0,22.0,20.0,0.0,0.0,0.0,...,0.014925,0.014925,0.328358,0.298507,0.0,0.0,0.0,0.119403,0.0,0.223881
102171691,1.0,0.0,4.0,0.0,3.0,91.0,506.0,3.0,1.0,0.0,...,0.0,0.003932,0.119266,0.663172,0.003932,0.001311,0.0,0.053735,0.001311,0.146789
102238997,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.1,0.0,0.0,0.2,0.0,0.0,0.0,0.7
102789488,0.0,0.0,0.0,0.0,0.0,8.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.615385,0.153846,0.0,0.0,0.0,0.0,0.0,0.230769


### Average of percent of mentions by type for each user
That is, for each user determine the percent of mentions by type. Then take the average of each type.

Thus, this mention analysis is on a per-user basis, accounting for how prolific a tweeter a user is. (That is, users who tweet more aren't weighted more heavily.)


In [19]:
# Keep only the columns whose name ends in "_percent" and average each one
# across users.
mention_summary_by_user_df.filter(axis=1, regex="_percent$").mean()

type
academic_percent             0.006640
business_percent             0.012752
cultural_percent             0.007745
foreign_political_percent    0.001134
government_percent           0.027075
journalists_percent          0.238426
media_percent                0.200212
ngo_percent                  0.012752
other_percent                0.002399
other_political_percent      0.008474
politicians_percent          0.100099
pundit_percent               0.006595
unknown_percent              0.375697
dtype: float64

## Approach 3: By count of users mentioning
The number of users that mentioned an account. Thus, each user counts as 1, even if that user made multiple mentions of the account.

This weights an account that is mentioned by 100 users more heavily than an account that is mentioned 100 times by a single user.

In [20]:
# One row per (mentioned account, mentioning user) pair, so a user counts once
# per account no matter how many times they mentioned it.
mention_user_id_per_user_df = dedupe_mention_df[['mention_user_id', 'user_id']].drop_duplicates()
mention_user_id_per_user_summary_df = pd.DataFrame(mention_user_id_per_user_df.groupby('mention_user_id').size(), columns=['users_mentioning_count'])
# Join with user id map
mention_user_per_user_summary_df = mention_user_id_per_user_summary_df.join(mention_user_id_lookup_df)
# Denominator: the number of distinct users that made at least one mention.
mention_user_per_user_summary_df['percent_of_users_mentioning'] = mention_user_per_user_summary_df.users_mentioning_count / user_id_lookup_df['screen_name'].count()
# Join with user type lookup
mention_type_per_user_summary_df = mention_user_per_user_summary_df.join(user_type_lookup_df, how='left').sort_values('users_mentioning_count', ascending=False)
# Assign the filled column back rather than calling fillna(inplace=True) on
# the column selection: that chained in-place call warns on pandas 2.2 and,
# under Copy-on-Write, leaves the frame unchanged.
mention_type_per_user_summary_df['type'] = mention_type_per_user_summary_df['type'].fillna('unknown')
# Join with mention_summary_user_id_df to get mention_count
mention_per_user_summary_df = mention_type_per_user_summary_df.join(mention_summary_user_id_df)
mention_per_user_summary_df[['mention_screen_name', 'type', 'users_mentioning_count', 'percent_of_users_mentioning', 'mention_count']].head(20)

Unnamed: 0,mention_screen_name,type,users_mentioning_count,percent_of_users_mentioning,mention_count
25073877,realDonaldTrump,politicians,502,0.27628,3229
807095,nytimes,media,333,0.183269,1022
822215679726100480,POTUS,politicians,329,0.181068,3028
818927131883356161,PressSec,politicians,289,0.159053,2507
18916432,SpeakerRyan,politicians,252,0.13869,1474
2467791,washingtonpost,media,239,0.131535,601
759251,CNN,media,232,0.127683,760
3108351,WSJ,media,225,0.12383,1963
9300262,politico,media,220,0.121079,811
93069110,maggieNYT,journalists,175,0.096313,584


### Account types (by count of users mentioning)

In [21]:
# Sum users_mentioning_count within each account type, then express every
# type as a fraction of the overall total.
mention_per_user_by_type_df = mention_per_user_summary_df.groupby('type')[['users_mentioning_count']].sum()
users_mentioning_total = mention_per_user_by_type_df['users_mentioning_count'].sum()
mention_per_user_by_type_df['type_percentage'] = mention_per_user_by_type_df['users_mentioning_count'] / users_mentioning_total
mention_per_user_by_type_df

Unnamed: 0_level_0,users_mentioning_count,type_percentage
type,Unnamed: 1_level_1,Unnamed: 2_level_1
academic,954,0.008329
business,1147,0.010014
cultural,805,0.007028
foreign_political,122,0.001065
government,2331,0.020352
journalists,29984,0.261789
media,8816,0.076972
ngo,1644,0.014354
other,323,0.00282
other_political,1649,0.014397


### Number of accounts mentioned by at least 1 user.

In [23]:
# Number of rows (one per mentioned account) with a non-null screen name.
mention_per_user_summary_df[['mention_screen_name']].count()

mention_screen_name    45206
dtype: int64

### Number of accounts mentioned by at least 10 users.

In [25]:
# Same count, restricted to accounts mentioned by 10 or more distinct users.
mention_per_user_summary_df[mention_per_user_summary_df.users_mentioning_count >= 10][['mention_screen_name']].count()

mention_screen_name    1878
dtype: int64

## Unknown accounts

### Top by mention count mentioned by at least 10 users

In [27]:
# Accounts with no match in the known-account lookup that were nonetheless
# mentioned by at least 10 distinct users, ordered by total mention count.
is_unknown_type = mention_per_user_summary_df.type == 'unknown'
has_ten_or_more_users = mention_per_user_summary_df.users_mentioning_count >= 10
unknown_candidates_df = mention_per_user_summary_df[is_unknown_type & has_ten_or_more_users]
top_not_known_mention_df = unknown_candidates_df.sort_values('mention_count', ascending=False)[['mention_screen_name', 'mention_count', 'users_mentioning_count']]
top_not_known_mention_df.head(50)

Unnamed: 0,mention_screen_name,mention_count,users_mentioning_count
312800783,AmericaNewsroom,104,13
293314112,DavidWright_CNN,58,27
14304618,AmandaWills,57,12
1976143068,EmmanuelMacron,57,15
20975060,johnrobertsFox,55,23
15673703,karenhandel,54,22
842072478834909184,SecretarySonny,53,14
63188873,PamelaBrownCNN,53,20
46681100,marthamaccallum,53,18
824797212425191425,ALT_uscis,42,13


### Write top accounts to file

In [28]:
# Write at most the first 3000 rows (the frame is already sorted by
# mention_count, descending) to unknown_mentions.csv, resolved relative to
# the current working directory.
top_not_known_mention_df.head(3000).to_csv('unknown_mentions.csv')