# Retweets and quotes

In [1]:
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import tweet_iter, tweet_type

# Lower the root logger's threshold to DEBUG so that progress messages
# (the "DEBUG:root:Loaded N" lines in the output below) are emitted.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Gzipped JSON tweet files to load, in this order.
filepaths = [
    'd59d27e2f2ed4778881573df2ecf2fad_001.json.gz',
    '25319652321b4bb498b250ffc53aa0f0_001.json.gz',
]

# Simplify the tweet on load
def retweet_transform(tweet):
    """Reduce a retweet or quote tweet to the fields used in this notebook.

    Args:
        tweet: A tweet as a dict (parsed tweet JSON).

    Returns:
        A dict holding the tweet id, the tweeting user's id and screen name,
        the id and screen name of the user of the retweeted/quoted tweet, and
        the parsed creation time of `tweet`. Returns None when the tweet has
        neither a truthy 'retweeted_status' nor a truthy 'quoted_status'.
    """
    # A retweeted status takes precedence; the quoted status is the fallback.
    original = tweet.get('retweeted_status') or tweet.get('quoted_status')
    if not original:
        return None

    return {
        'tweet_id': tweet['id_str'],
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'retweet_user_id': original['user']['id_str'],
        'retweet_screen_name': original['user']['screen_name'],
        'tweet_created_at': date_parse(tweet['created_at']),
    }

# Run every tweet in the input files through retweet_transform and collect the
# resulting records into one DataFrame.
# NOTE(review): tweet_iter is a project helper from utils; presumably it drops
# tweets for which the transform returns None - confirm.
retweet_records = tweet_iter(filepaths, tweet_transform_func=retweet_transform)
retweet_df = pd.DataFrame(retweet_records)
retweet_df.count()


DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
D

retweet_screen_name    1361711
retweet_user_id        1361711
screen_name            1361711
tweet_created_at       1361711
tweet_id               1361711
user_id                1361711
dtype: int64

In [2]:
# Preview the first rows of the extracted retweet/quote records.
retweet_df.head()

Unnamed: 0,retweet_screen_name,retweet_user_id,screen_name,tweet_created_at,tweet_id,user_id
0,paulconndc,64502388,A_Childers_,2017-03-31 14:41:35+00:00,847821180832804864,1638925448
1,azevin,14744078,A_Childers_,2017-03-31 14:15:34+00:00,847814632643473411,1638925448
2,TiffanyStecker,17679229,A_Childers_,2017-03-30 17:47:04+00:00,847505467995693057,1638925448
3,Calvinn_Hobbes,1579422614,A_Childers_,2017-03-30 13:50:02+00:00,847445818072317952,1638925448
4,business,34713362,A_Childers_,2017-03-30 13:17:17+00:00,847437576856330241,1638925448


### Remove duplicates

In [3]:
# Drop rows that are identical across every column (no subset is given), then
# show the remaining non-null count per column.
dedupe_retweet_df = retweet_df.drop_duplicates()
dedupe_retweet_df.count()

retweet_screen_name    1348290
retweet_user_id        1348290
screen_name            1348290
tweet_created_at       1348290
tweet_id               1348290
user_id                1348290
dtype: int64

In [4]:
# From the retweets, extract a map of retweeted user id -> screen name.
# For each retweet_user_id, take the screen name from that user's most recent
# row (the row with the maximum tweet_created_at).
latest_retweet_idx = dedupe_retweet_df.groupby('retweet_user_id')['tweet_created_at'].idxmax()

# `.ix` was removed in pandas 1.0; a single label-based `.loc` selects both
# the rows (index labels returned by idxmax) and the two columns needed.
retweet_user_id_lookup_df = (
    dedupe_retweet_df
    .loc[latest_retweet_idx, ['retweet_user_id', 'retweet_screen_name']]
    .set_index('retweet_user_id')
)
retweet_user_id_lookup_df.count()

retweet_screen_name    108098
dtype: int64

In [5]:
# Preview the user id -> screen name lookup.
retweet_user_id_lookup_df.head()

Unnamed: 0_level_0,retweet_screen_name
retweet_user_id,Unnamed: 1_level_1
1000010898,RoyScranton
100002112,whyyradiotimes
100003141,NCCDtweets
100005598,hotelkeys
100007369,signixsolutions


In [6]:
# Count rows per retweeted user id.
# The number of groups should equal the row count of retweet_user_id_lookup_df.
retweets_per_user = dedupe_retweet_df.groupby('retweet_user_id').size()
retweet_summary_user_id_df = retweets_per_user.to_frame('retweet_count')
retweet_summary_user_id_df.count()

retweet_count    108098
dtype: int64

In [7]:
# Preview the per-user retweet counts.
retweet_summary_user_id_df.head()

Unnamed: 0_level_0,retweet_count
retweet_user_id,Unnamed: 1_level_1
1000010898,2
100002112,37
100003141,5
100005598,9
100007369,1


In [8]:
# Attach each retweeted user's screen name to its retweet count.
# Both frames are indexed by retweet_user_id, so this is an index-on-index
# left join.
retweet_summary_df = retweet_summary_user_id_df.join(
    retweet_user_id_lookup_df,
    how='left',
)
retweet_summary_df.count()

retweet_count          108098
retweet_screen_name    108098
dtype: int64

In [9]:
# Preview the joined summary (retweet count plus screen name per user id).
retweet_summary_df.head()

Unnamed: 0_level_0,retweet_count,retweet_screen_name
retweet_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000010898,2,RoyScranton
100002112,37,whyyradiotimes
100003141,5,NCCDtweets
100005598,9,hotelkeys
100007369,1,signixsolutions


### Load known Twitter accounts

In [10]:
from utils import load_screen_name_lookup_df

# Load the table of known Twitter accounts via the project helper in utils.
# NOTE(review): presumably indexed by user id, since it is later joined on the
# index against retweeted user ids - confirm against utils.
screen_name_lookup_df = load_screen_name_lookup_df()
# Number of known accounts per account type.
screen_name_lookup_df['type'].value_counts()

media          5915
government     2959
reporters      1457
politicians     601
Name: type, dtype: int64

### Join the retweets and known Twitter accounts

In [11]:
# Left join keeps every retweeted account; accounts with no match in
# screen_name_lookup_df get NaN in the lookup columns.
#
# The cleanup steps build new objects instead of mutating in place:
# - `df['type'].fillna(..., inplace=True)` is a chained in-place assignment,
#   which does not update the frame under pandas Copy-on-Write;
# - assigning `index.name` can also rename an index object shared with
#   retweet_summary_df.
retweet_join_df = (
    retweet_summary_df
    .join(screen_name_lookup_df, how='left')
    .fillna({'type': 'unknown'})   # unmatched accounts get type 'unknown'
    .rename_axis('user_id')        # index holds the retweeted user's id
)
retweet_join_df.head()

Unnamed: 0_level_0,retweet_count,retweet_screen_name,screen_name,type,screen_name_lower
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000010898,2,RoyScranton,,unknown,
100002112,37,whyyradiotimes,,unknown,
100003141,5,NCCDtweets,,unknown,
100005598,9,hotelkeys,,unknown,
100007369,1,signixsolutions,,unknown,


In [12]:
# Keep only accounts that matched a known account (non-null screen_name),
# ordered from most to least retweeted.
is_known = retweet_join_df['screen_name'].notnull()
top_known_retweets_df = retweet_join_df[is_known].sort_values('retweet_count', ascending=False)
top_known_retweets_df[['retweet_screen_name', 'retweet_count', 'type']].head(20)

Unnamed: 0_level_0,retweet_screen_name,retweet_count,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
51241574,AP,8998,media
426802833,AP_Politics,8436,media
25073877,realDonaldTrump,8320,politicians
1917731,thehill,6307,media
9300262,politico,6024,media
2467791,washingtonpost,5552,media
15922214,rollcall,4769,media
807095,nytimes,4062,media
21316253,ZekeJMiller,3860,reporters
13524182,daveweigel,3437,reporters


### Number of matched accounts <----------
`retweet_screen_name` is the number of unique retweeted or quoted accounts. `screen_name` is the
number of those accounts that were matched to a known Twitter account.

In [13]:
# Non-null count per column; screen_name is only populated for rows that found
# a match in screen_name_lookup_df during the left join.
retweet_join_df.count()

retweet_count          108098
retweet_screen_name    108098
screen_name              3542
type                   108098
screen_name_lower        3542
dtype: int64

## Top accounts by retweets <----------
A `type` of `unknown` indicates that the account was not matched with a known Twitter account.

In [14]:
# Rank every retweeted account, matched or not, by retweet count (descending)
# and show the top 50.
top_retweets_df = retweet_join_df.sort_values(by='retweet_count', ascending=False)
top_retweets_df.loc[:, ['retweet_screen_name', 'retweet_count', 'type']].head(50)

Unnamed: 0_level_0,retweet_screen_name,retweet_count,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
51241574,AP,8998,media
426802833,AP_Politics,8436,media
25073877,realDonaldTrump,8320,politicians
90614279,EENewsUpdates,6548,unknown
2312829909,CQnow,6495,unknown
1917731,thehill,6307,media
9300262,politico,6024,media
93069110,maggieNYT,5751,unknown
2467791,washingtonpost,5552,media
15922214,rollcall,4769,media


## Retweets by account type <----------

In [15]:
# Total retweets per account type.
# Select retweet_count explicitly: the frame also holds string columns
# (retweet_screen_name, screen_name, screen_name_lower). Older pandas silently
# dropped them from sum(); pandas 2.x tries to sum them instead.
retweet_join_df.groupby('type')[['retweet_count']].sum()

Unnamed: 0_level_0,retweet_count
type,Unnamed: 1_level_1
government,10892
media,84886
politicians,24630
reporters,299580
unknown,928302


## Top (by retweets) accounts that are not known. <----------
These are the accounts that we will want to categorize.

In [16]:
# Accounts with no match in the known-accounts table, most retweeted first.
is_unknown = retweet_join_df['type'] == 'unknown'
top_not_known_retweets_df = retweet_join_df[is_unknown].sort_values('retweet_count', ascending=False)
top_not_known_retweets_df[['retweet_screen_name', 'retweet_count']].head(100)

Unnamed: 0_level_0,retweet_screen_name,retweet_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
90614279,EENewsUpdates,6548
2312829909,CQnow,6495
93069110,maggieNYT,5751
34713362,business,4105
299802277,BraddJaffy,3056
59331128,PhilipRucker,2843
14529929,jaketapper,2807
207660339,POLITICOPro,2729
14412533,CillizzaCNN,2567
17243582,blakehounshell,2509
