# Replies

In [1]:
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import tweet_iter, tweet_type

# Configure the root logger to emit DEBUG-and-above records, so progress
# messages logged on the root logger while loading are shown.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Gzipped JSON files to load, in the order they will be read.
filepaths = [
    'd59d27e2f2ed4778881573df2ecf2fad_001.json.gz',
    '25319652321b4bb498b250ffc53aa0f0_001.json.gz',
]

# Simplify the tweet on load
def reply_transform(tweet):
    """Reduce a tweet to the fields needed for the reply analysis.

    Args:
        tweet: dict-like tweet object. Only the keys read below are required.

    Returns:
        None when 'in_reply_to_status_id' is missing or falsy (the tweet is
        not a reply); otherwise a dict holding the replying tweet/user ids and
        screen name, the replied-to user/tweet ids and screen name, and the
        parsed creation time.
    """
    # Guard clause: anything that is not a reply is dropped.
    if not tweet.get('in_reply_to_status_id'):
        return None

    # The *_str variants of the ids are used, so ids stay strings.
    return dict(
        tweet_id=tweet['id_str'],
        user_id=tweet['user']['id_str'],
        screen_name=tweet['user']['screen_name'],
        reply_to_user_id=tweet['in_reply_to_user_id_str'],
        reply_to_screen_name=tweet['in_reply_to_screen_name'],
        reply_to_tweet_id=tweet['in_reply_to_status_id_str'],
        tweet_created_at=date_parse(tweet['created_at']),
    )

# Build one row per reply from the records that tweet_iter yields when it
# applies reply_transform to each tweet in the listed files.
# NOTE(review): presumably tweet_iter skips tweets whose transform returns
# None -- confirm in utils.
reply_df = pd.DataFrame(
    tweet_iter(filepaths, tweet_transform_func=reply_transform)
)
# Non-null value count per column.
reply_df.count()

DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
D

reply_to_screen_name    398593
reply_to_tweet_id       398593
reply_to_user_id        398593
screen_name             398593
tweet_created_at        398593
tweet_id                398593
user_id                 398593
dtype: int64

In [2]:
# Peek at the first rows to sanity-check the extracted reply columns.
reply_df.head()

Unnamed: 0,reply_to_screen_name,reply_to_tweet_id,reply_to_user_id,screen_name,tweet_created_at,tweet_id,user_id
0,davidbschultz,847622348777771008,53739928,A_Childers_,2017-03-31 01:52:09+00:00,847627543142219776,1638925448
1,davidbschultz,847587744830427137,53739928,A_Childers_,2017-03-30 23:52:23+00:00,847597404719267841,1638925448
2,AriPeskoe,847575250598494209,499013898,A_Childers_,2017-03-30 23:37:48+00:00,847593734896324608,1638925448
3,Pat_Ambrosio,847533984833777664,2497185313,A_Childers_,2017-03-30 19:41:27+00:00,847534254355599364,1638925448
4,ellisromance,847190236174176256,533335518,A_Childers_,2017-03-29 20:57:37+00:00,847191036527067136,1638925448


### Remove duplicates

In [3]:
# drop_duplicates() with no subset removes rows that repeat an earlier row in
# every column; reply_df itself is left untouched.
dedupe_reply_df = reply_df.drop_duplicates()
# Non-null value count per column after de-duplication.
dedupe_reply_df.count()

reply_to_screen_name    396296
reply_to_tweet_id       396296
reply_to_user_id        396296
screen_name             396296
tweet_created_at        396296
tweet_id                396296
user_id                 396296
dtype: int64

In [4]:
# From the replies, extract map of user ids to screen names.
# For each replied-to user id, idxmax() picks the row label of that user's
# most recent reply (latest tweet_created_at), so exactly one screen name is
# kept per user id (presumably to cope with screen-name changes -- confirm).
latest_reply_idx = dedupe_reply_df.groupby('reply_to_user_id')['tweet_created_at'].idxmax()
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; label-based `.loc`
# selects the same rows and columns.
reply_user_id_lookup_df = (
    dedupe_reply_df
    .loc[latest_reply_idx, ['reply_to_user_id', 'reply_to_screen_name']]
    .set_index(['reply_to_user_id'])
)
# Number of distinct replied-to user ids in the lookup.
reply_user_id_lookup_df.count()

reply_to_screen_name    74638
dtype: int64

In [5]:
# Preview the user id -> screen name lookup (indexed by reply_to_user_id).
reply_user_id_lookup_df.head()

Unnamed: 0_level_0,reply_to_screen_name
reply_to_user_id,Unnamed: 1_level_1
1000010898,RoyScranton
1000030188,jessieb747
100003141,NCCDtweets
100005598,hotelkeys
1000228238,adwooldridge


In [6]:
# Group by user_id: size() counts the de-duplicated reply rows per replied-to
# user id, and to_frame() turns that Series into a one-column frame.
# This count should match the reply_user_id map count
reply_summary_user_id_df = (
    dedupe_reply_df
    .groupby('reply_to_user_id')
    .size()
    .to_frame(name='reply_count')
)
reply_summary_user_id_df.count()

reply_count    74638
dtype: int64

In [7]:
# Preview the per-user reply counts (indexed by reply_to_user_id).
reply_summary_user_id_df.head()

Unnamed: 0_level_0,reply_count
reply_to_user_id,Unnamed: 1_level_1
1000010898,5
1000030188,8
100003141,2
100005598,18
1000228238,3


In [8]:
# Join with user id map
# Both frames are indexed by reply_to_user_id, so DataFrame.join aligns them on
# that index and attaches reply_to_screen_name to each reply_count row.
reply_summary_df = reply_summary_user_id_df.join(reply_user_id_lookup_df)
# Both columns should report the same non-null count if every user id matched.
reply_summary_df.count()

reply_count             74638
reply_to_screen_name    74638
dtype: int64

In [9]:
# Preview the joined summary: reply count plus screen name per replied-to user.
reply_summary_df.head()

Unnamed: 0_level_0,reply_count,reply_to_screen_name
reply_to_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000010898,5,RoyScranton
1000030188,8,jessieb747
100003141,2,NCCDtweets
100005598,18,hotelkeys
1000228238,3,adwooldridge


### Load known Twitter accounts

In [10]:
from utils import load_screen_name_lookup_df

# Load the known-accounts lookup via the project helper in utils.
# NOTE(review): judging from the join further down, this frame is presumably
# indexed by user id -- confirm in utils.load_screen_name_lookup_df.
screen_name_lookup_df = load_screen_name_lookup_df()
# Number of known accounts per value of the 'type' column.
screen_name_lookup_df['type'].value_counts()

media          5915
government     2959
reporters      1457
politicians     601
Name: type, dtype: int64

### Join the replies and known Twitter accounts

In [11]:
# Left-join the known accounts onto the reply summary by index, so every
# replied-to account is kept and unmatched ones get NaN in the lookup columns.
# NOTE(review): assumes screen_name_lookup_df is indexed by user id -- confirm.
reply_join_df = reply_summary_df.join(screen_name_lookup_df, how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form emits a FutureWarning in recent
# pandas and leaves 'type' unfilled once Copy-on-Write is enabled.
reply_join_df['type'] = reply_join_df['type'].fillna('unknown')
reply_join_df.index.name = 'user_id'
reply_join_df.head()

Unnamed: 0_level_0,reply_count,reply_to_screen_name,screen_name,type,screen_name_lower
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000010898,5,RoyScranton,,unknown,
1000030188,8,jessieb747,,unknown,
100003141,2,NCCDtweets,,unknown,
100005598,18,hotelkeys,,unknown,
1000228238,3,adwooldridge,,unknown,


### Top (by reply count) accounts that are matched against known Twitter accounts <----------

In [12]:
# Keep only the rows whose screen_name (from the known-accounts lookup) is
# populated, i.e. accounts that matched, and rank them by replies received.
top_known_reply_df = (
    reply_join_df
    .loc[reply_join_df['screen_name'].notnull()]
    .sort_values('reply_count', ascending=False)
)
top_known_reply_df[['reply_to_screen_name', 'reply_count', 'type']].head(20)

Unnamed: 0_level_0,reply_to_screen_name,reply_count,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3817401,ericgeller,1760,reporters
22891564,chrisgeidner,1652,reporters
398088661,MEPFuller,1533,reporters
906734342,KimberlyRobinsn,1525,reporters
118130765,dylanlscott,1501,reporters
103016675,AaronMehta,1278,reporters
46557945,StevenTDennis,1209,reporters
14597239,TonyRomm,1180,reporters
47758416,marissaaevans,1167,reporters
123738314,greggiroux,1167,reporters


### Number of matched accounts <----------
reply_to_screen_name is the number of unique replied-to accounts. screen_name is the
number of those accounts that matched a known Twitter account.

In [13]:
# count() tallies non-null values per column, so the columns that come from the
# known-accounts lookup show how many replied-to accounts were matched.
reply_join_df.count()

reply_count             74638
reply_to_screen_name    74638
screen_name              1763
type                    74638
screen_name_lower        1763
dtype: int64

## Top accounts by replies <----------
A type of "unknown" indicates that the account is not matched with a known Twitter account.

In [14]:
# Rank every replied-to account, matched or not, by replies received.
top_replies_df = reply_join_df.sort_values(by='reply_count', ascending=False)
# Show the 50 most replied-to accounts with their account type.
top_replies_df.loc[:, ['reply_to_screen_name', 'reply_count', 'type']].head(50)

Unnamed: 0_level_0,reply_to_screen_name,reply_count,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3817401,ericgeller,1760,reporters
22891564,chrisgeidner,1652,reporters
398088661,MEPFuller,1533,reporters
906734342,KimberlyRobinsn,1525,reporters
118130765,dylanlscott,1501,reporters
103016675,AaronMehta,1278,reporters
46557945,StevenTDennis,1209,reporters
14597239,TonyRomm,1180,reporters
123738314,greggiroux,1167,reporters
47758416,marissaaevans,1167,reporters


## Replies by account type <----------

In [15]:
# Total replies received per account type.
# reply_count is selected explicitly: newer pandas versions no longer silently
# drop non-numeric columns from groupby reductions, so a bare .sum() would try
# to sum the screen-name columns as well.
reply_join_df.groupby('type')[['reply_count']].sum()

Unnamed: 0_level_0,reply_count
type,Unnamed: 1_level_1
government,393
media,1557
politicians,882
reporters,113377
unknown,280087


## Top (by replies) accounts that are not known. <----------
These are the accounts that we will want to categorize.

In [16]:
# Accounts whose type was filled in as 'unknown' (no match in the known
# accounts lookup), ranked by replies received.
top_not_known_replies_df = (
    reply_join_df
    .loc[reply_join_df['type'] == 'unknown']
    .sort_values('reply_count', ascending=False)
)
# The 100 most replied-to unmatched accounts.
top_not_known_replies_df[['reply_to_screen_name', 'reply_count']].head(100)

Unnamed: 0_level_0,reply_to_screen_name,reply_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
154562655,KateMereand,732
16868756,ddiamond,515
142721190,elisefoley,468
14412533,CillizzaCNN,461
16244449,jbarro,427
97371315,LoganDobson,421
135575282,morningmoneyben,413
15446531,mattyglesias,406
17243582,blakehounshell,390
51462013,lizzieohreally,383
