# Tweet summary

## Prepare the tweet data

### Load the tweets

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt


logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

def tweet_transform(tweet):
    return {
        'tweet_id': tweet['id_str'], 
        'tweet_created_at': date_parse(tweet['created_at']),
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'user_created_at': date_parse(tweet['user']['created_at']),
        'tweets_to_date': tweet['user']['statuses_count'],
        'tweet_type': tweet_type(tweet)
    }

tweet_df = load_tweet_df(tweet_transform, ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 
                                           'user_created_at', 'tweets_to_date', 'tweet_type'])
tweet_df.count()

INFO:root:Loading from tweets/6eea2088e010437da4b6031c2abffdc9_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
INFO:root:Loading from tweets/a7bcdbde7a104285b92fe26e286f2543_001.json.gz
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
INFO:root:Loading from tweets/e1c824ff2b3c4c5a9a93a16e5036d09a_001.json.gz
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000


tweet_id            761078
user_id             761078
screen_name         761078
tweet_created_at    761078
user_created_at     761078
tweets_to_date      761078
tweet_type          761078
dtype: int64

### View the top of the data.

In [2]:
tweet_df.head()

Unnamed: 0,tweet_id,user_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
0,847428582821449730,780221130,loren_duggan,2017-03-30 12:41:33+00:00,2012-08-25 12:32:20+00:00,886,reply
1,847787664963239936,285772181,akesslerdc,2017-03-31 12:28:25+00:00,2011-04-21 19:15:21+00:00,8604,retweet
2,847634105118318594,285772181,akesslerdc,2017-03-31 02:18:13+00:00,2011-04-21 19:15:21+00:00,8604,quote
3,847617579627630592,285772181,akesslerdc,2017-03-31 01:12:33+00:00,2011-04-21 19:15:21+00:00,8604,retweet
4,847601029654880258,285772181,akesslerdc,2017-03-31 00:06:47+00:00,2011-04-21 19:15:21+00:00,8604,retweet


## Prepare the user data

### Tweets in dataset for each user

In [3]:
user_tweet_count_df = tweet_df[['user_id', 'tweet_type']].groupby(['user_id', 'tweet_type']).size().unstack()
user_tweet_count_df.fillna(0, inplace=True)
user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df.original + user_tweet_count_df.quote + user_tweet_count_df.reply + user_tweet_count_df.retweet
user_tweet_count_df['tweets_in_dataset_bin'] = pd.qcut(user_tweet_count_df.tweets_in_dataset, [0, .9, .99, 1.], labels=['Bottom 90%', 'Middle 9%', 'Top 1%'])
user_tweet_count_df.head()

tweet_type,original,quote,reply,retweet,tweets_in_dataset,tweets_in_dataset_bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100165378,53.0,21.0,54.0,68.0,196.0,Bottom 90%
1001991865,12.0,0.0,2.0,21.0,35.0,Bottom 90%
1002229862,47.0,9.0,5.0,92.0,153.0,Bottom 90%
100802089,6.0,4.0,8.0,3.0,21.0,Bottom 90%
100860790,153.0,18.0,13.0,244.0,428.0,Bottom 90%


### Load and join user info
This is information that was coded in the spreadsheet or looked up for each user via API.

In [4]:
user_info_df = pd.read_csv('user_info_lookup.csv', names=['screen_name', 'user_id', 'name', 'organization', 'position', 'gender',
                                            'race', 'followers_count', 'following_count', 'tweet_count',
                                            'user_created_at', 'verified', 'protected'],
                          dtype={'user_id': str}).set_index(['user_id'])
user_info_df.count()

screen_name        2319
name               2319
organization       2255
position           2207
gender             2317
race               2243
followers_count    2319
following_count    2319
tweet_count        2319
user_created_at    2319
verified           2319
protected          2319
dtype: int64

In [5]:
user_info_df.head()

Unnamed: 0_level_0,screen_name,name,organization,position,gender,race,followers_count,following_count,tweet_count,user_created_at,verified,protected
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1539675883,dcjournojo,"Anderson, Joanna",,,F,White,412,341,2314,Sat Jun 22 22:45:01 +0000 2013,False,False
146524387,saunique,"Anderson, Stacy",,,F,Black,93,102,3909,Fri May 21 17:06:52 +0000 2010,False,False
347395654,melissaattias,"Attias, Melissa",,,F,White,1527,1488,2468,Tue Aug 02 18:49:04 +0000 2011,False,False
16191760,marisol_bello,"Bello, Marisol",,,F,Hispanic,3538,1549,6589,Mon Sep 08 20:42:28 +0000 2008,True,False
442319975,autumnsan1,"Brewington, Autumn",,Freelance Reporter,F,White,2122,482,5194,Wed Dec 21 00:29:54 +0000 2011,True,False


In [6]:
# Join
user_summary_df = user_info_df.join(user_tweet_count_df, how='left')
user_summary_df.count()

screen_name              2319
name                     2319
organization             2255
position                 2207
gender                   2317
race                     2243
followers_count          2319
following_count          2319
tweet_count              2319
user_created_at          2319
verified                 2319
protected                2319
original                 1943
quote                    1943
reply                    1943
retweet                  1943
tweets_in_dataset        1943
tweets_in_dataset_bin    1943
dtype: int64

In [7]:
user_summary_df.head()

Unnamed: 0_level_0,screen_name,name,organization,position,gender,race,followers_count,following_count,tweet_count,user_created_at,verified,protected,original,quote,reply,retweet,tweets_in_dataset,tweets_in_dataset_bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1539675883,dcjournojo,"Anderson, Joanna",,,F,White,412,341,2314,Sat Jun 22 22:45:01 +0000 2013,False,False,1.0,0.0,0.0,0.0,1.0,Bottom 90%
146524387,saunique,"Anderson, Stacy",,,F,Black,93,102,3909,Fri May 21 17:06:52 +0000 2010,False,False,7.0,4.0,0.0,13.0,24.0,Bottom 90%
347395654,melissaattias,"Attias, Melissa",,,F,White,1527,1488,2468,Tue Aug 02 18:49:04 +0000 2011,False,False,21.0,0.0,0.0,96.0,117.0,Bottom 90%
16191760,marisol_bello,"Bello, Marisol",,,F,Hispanic,3538,1549,6589,Mon Sep 08 20:42:28 +0000 2008,True,False,60.0,4.0,16.0,172.0,252.0,Bottom 90%
442319975,autumnsan1,"Brewington, Autumn",,Freelance Reporter,F,White,2122,482,5194,Wed Dec 21 00:29:54 +0000 2011,True,False,20.0,19.0,10.0,4.0,53.0,Bottom 90%


## Prepare the organization data
This is for users that are members of each organization.

In [8]:
org_summary_df = user_summary_df[['organization', 'followers_count', 'following_count', 'tweet_count', 'tweets_in_dataset']].groupby('organization').agg([np.sum, np.size, np.average])
org_summary_df.count()

followers_count    sum        326
                   size       326
                   average    326
following_count    sum        326
                   size       326
                   average    326
tweet_count        sum        326
                   size       326
                   average    326
tweets_in_dataset  sum        296
                   size       326
                   average    233
dtype: int64

In [9]:
org_summary_df.head()

Unnamed: 0_level_0,followers_count,followers_count,followers_count,following_count,following_count,following_count,tweet_count,tweet_count,tweet_count,tweets_in_dataset,tweets_in_dataset,tweets_in_dataset
Unnamed: 0_level_1,sum,size,average,sum,size,average,sum,size,average,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
ABC News,591841,61,9702.311475,75106,61,1231.245902,357711,61,5864.114754,10932.0,61.0,
AOL Huffington Post,796,1,796.0,1047,1,1047.0,4416,1,4416.0,,1.0,
AP–Broadcast,7031,26,270.423077,8602,26,330.846154,18098,26,696.076923,613.0,26.0,
Afro American Newspapers,185,1,185.0,202,1,202.0,582,1,582.0,27.0,1.0,27.0
Agence France Presse (AFP–TV),3574,7,510.571429,3900,7,557.142857,12486,7,1783.714286,342.0,7.0,48.857143


### List of organizations <--- This probably requires some cleanup

In [10]:
org_summary_df.index.tolist()

['ABC News',
 'AOL Huffington Post',
 'AP–Broadcast',
 'Afro American Newspapers',
 'Agence France Presse (AFP–TV)',
 'Agence France-Presse',
 'Agri-Pulse',
 'Air Force Magazine',
 'Al-Arab News Channel',
 'Alaska Dispatch News',
 'Alaska Energy Desk',
 'Alaska Public Radio Network',
 'Albuquerque Journal',
 'Aljazeera America',
 'Aljazeera English',
 'Allentown Morning Call',
 'American Banker',
 'American Prospect',
 'Argus Media',
 'Army Times',
 'Artists & Writers Syndicate',
 'Associated Press',
 'Atlanta Journal-Consitution',
 'Austin American-Statesman',
 'Axios',
 'BBC',
 'BET Nightly News',
 'Balkan Insight',
 'Baltimore Sun',
 'Bankrate',
 'Bloomberg BNA',
 'Bloomberg Government',
 'Bloomberg News',
 'Bloomberg Radio & TV',
 'Bloomberg TV',
 'Bond Buyer',
 'Boston Globe',
 'Boston Herald',
 'Breitbart News',
 'Broadcasting & Cable',
 'Buffalo News',
 'BuzzFeed',
 'CBN News',
 'CBS News',
 'CDC Gaming Reports',
 'CEO Update',
 'CGTN America',
 'CNBC',
 'CNN',
 'CNN Internation

## Tweet summary
For tweets in dataset.

### Types of tweets

In [11]:
tweet_df['tweet_type'].value_counts()

retweet     300175
original    244983
reply       118570
quote        97350
Name: tweet_type, dtype: int64

## User tweet summary

### Types of tweets in dataset for each user

In [12]:
user_summary_df[['original', 'quote', 'reply', 'retweet']].describe()

Unnamed: 0,original,quote,reply,retweet
count,1943.0,1943.0,1943.0,1943.0
mean,125.39475,49.88317,60.521359,154.244982
std,204.46527,123.507619,257.789549,361.992793
min,0.0,0.0,0.0,0.0
25%,12.0,1.0,1.0,8.0
50%,50.0,8.0,6.0,42.0
75%,158.0,45.5,33.0,146.5
max,3210.0,1769.0,8009.0,5410.0


### 1/9/90 rule
For top 1%, 9%, 90% of tweeters, the number of tweets and types of tweets they account for.

In [13]:
user_summary_df[user_summary_df.tweets_in_dataset_bin == 'Top 1%']

Unnamed: 0_level_0,screen_name,name,organization,position,gender,race,followers_count,following_count,tweet_count,user_created_at,verified,protected,original,quote,reply,retweet,tweets_in_dataset,tweets_in_dataset_bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
456994513,maria_e_recio,"Recio, Maria",Austin American-Statesman,Political Reporter,F,White,1037,518,36922,Fri Jan 06 22:22:40 +0000 2012,False,False,380.0,556.0,115.0,4801.0,5852.0,Top 1%
22891564,chrisgeidner,"Geidner, Chris",BuzzFeed,Legal Editor & Supreme Court Correspondent,M,White,76308,4709,198543,Thu Mar 05 06:48:00 +0000 2009,True,False,955.0,695.0,4314.0,1280.0,7244.0,Top 1%
21810329,sdonnan,"Donnan, Shawn",Financial Times,Wolrd Trade Editor,M,White,11402,5379,74017,Tue Feb 24 23:10:17 +0000 2009,True,False,371.0,393.0,244.0,3979.0,4987.0,Top 1%
18678924,jmartnyt,"Martin, Jonathan",New York Times,National Political Correspondent,M,White,185223,5579,103861,Tue Jan 06 15:20:59 +0000 2009,True,False,536.0,488.0,219.0,2242.0,3485.0,Top 1%
3817401,ericgeller,"Geller, Eric",Politico,Cybersecurity Reporter,M,White,47478,734,196072,Sun Apr 08 20:27:11 +0000 2007,True,False,865.0,1769.0,8009.0,0.0,10643.0,Top 1%
593813785,donnayoungdc,"Young, Donna",S&P Global Market Intelligence,Senior Reporter,F,White,5559,1574,45025,Tue May 29 15:45:45 +0000 2012,False,False,1937.0,1104.0,16.0,1578.0,4635.0,Top 1%
104299137,davidmdrucker,"Drucker, David",Washington Examiner,Senior Political Correspondent,M,White,32409,2472,99574,Tue Jan 12 22:56:50 +0000 2010,True,False,710.0,1391.0,768.0,1408.0,4277.0,Top 1%
61734492,fahrenthold,"Fahrenthold, David",Washington Post,Political Reporter,M,White,398481,3275,24238,Fri Jul 31 09:29:37 +0000 2009,True,False,209.0,287.0,141.0,2833.0,3470.0,Top 1%
13524182,daveweigel,"Weigel, David",Washington Post,Political Reporter,M,White,311572,10042,165016,Fri Feb 15 17:58:23 +0000 2008,True,False,898.0,960.0,303.0,2767.0,4928.0,Top 1%
275207082,alexparkerdc,"Parker, Alexander M.",Bloomberg BNA,Tax Reporter,M,White,3717,3276,137792,Thu Mar 31 20:53:10 +0000 2011,False,False,1042.0,370.0,1833.0,917.0,4162.0,Top 1%


In [14]:
tweets_in_dataset_bin_summary_df = user_summary_df[['original', 'quote', 'reply', 'retweet', 'tweets_in_dataset', 'tweets_in_dataset_bin']].groupby('tweets_in_dataset_bin').sum()
tweets_in_dataset_bin_summary_df['percent_of_original'] = tweets_in_dataset_bin_summary_df.original / tweets_in_dataset_bin_summary_df.original.sum()
tweets_in_dataset_bin_summary_df['percent_of_quote'] = tweets_in_dataset_bin_summary_df.quote / tweets_in_dataset_bin_summary_df.quote.sum()
tweets_in_dataset_bin_summary_df['percent_of_reply'] = tweets_in_dataset_bin_summary_df.reply / tweets_in_dataset_bin_summary_df.reply.sum()
tweets_in_dataset_bin_summary_df['percent_of_retweets'] = tweets_in_dataset_bin_summary_df.retweet / tweets_in_dataset_bin_summary_df.retweet.sum()
tweets_in_dataset_bin_summary_df['percent_of_tweets_in_dataset'] = tweets_in_dataset_bin_summary_df.tweets_in_dataset / tweets_in_dataset_bin_summary_df.tweets_in_dataset.sum()
tweets_in_dataset_bin_summary_df['users_in_bin'] = user_summary_df[['tweets_in_dataset_bin', 'tweets_in_dataset']].groupby('tweets_in_dataset_bin').count()
tweets_in_dataset_bin_summary_df


Unnamed: 0_level_0,original,quote,reply,retweet,tweets_in_dataset,percent_of_original,percent_of_quote,percent_of_reply,percent_of_retweets,percent_of_tweets_in_dataset,users_in_bin
tweets_in_dataset_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Bottom 90%,144235.0,41865.0,40841.0,136573.0,363514.0,0.591996,0.431941,0.347308,0.455702,0.479661,1749
Middle 9%,83989.0,40849.0,53536.0,116748.0,295122.0,0.344723,0.421458,0.455265,0.389552,0.389417,174
Top 1%,15418.0,14209.0,23216.0,46377.0,99220.0,0.063281,0.146601,0.197427,0.154746,0.130922,20


## User summary

In [15]:
user_summary_df[['followers_count', 'following_count', 'tweet_count']].describe()

Unnamed: 0,followers_count,following_count,tweet_count
count,2319.0,2319.0,2319.0
mean,12283.73,1179.84088,7983.921087
std,71245.47,1693.152353,15591.792791
min,0.0,0.0,0.0
25%,508.5,345.5,640.5
50%,1840.0,837.0,2869.0
75%,5594.5,1482.5,8558.0
max,2070629.0,51513.0,198543.0


### Gender

In [16]:
user_summary_df['gender'].value_counts()

M    1335
F     982
Name: gender, dtype: int64

#### Race <--- Looks like we need some data cleanup.

In [17]:
user_summary_df['race'].value_counts()

White                     1897
Black                      109
Asian                       78
Middle Eastern              43
Hispanic                    41
Unsure                      36
W                           15
HIspanic                     4
white                        3
Armenian                     3
Asian/Pacific Islander       3
African American             2
B?                           1
Whiite                       1
Asian/White                  1
WHite                        1
Whitie                       1
H                            1
White                        1
N                            1
South Asian                  1
Name: race, dtype: int64

## Organization

### Top by average followers

In [18]:
org_summary_df[['followers_count']].sort_values([('followers_count', 'average')], ascending=False).head()

Unnamed: 0_level_0,followers_count,followers_count,followers_count
Unnamed: 0_level_1,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
MSNBC,1164351,3,388117.0
Toronto Star,158179,1,158179.0
New York,122971,1,122971.0
New Yorker,112182,1,112182.0
MTV News,99492,1,99492.0


### Top by average following

In [19]:
org_summary_df[['following_count']].sort_values([('following_count', 'average')], ascending=False).head()

Unnamed: 0_level_0,following_count,following_count,following_count
Unnamed: 0_level_1,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
White House Dossier,7416,1,7416.0
Snapchat,5994,1,5994.0
Bankrate,5692,1,5692.0
New York Daily News,4228,1,4228.0
Recode,4084,1,4084.0


### Top by average tweet count

In [20]:
org_summary_df[['tweet_count']].sort_values([('tweet_count', 'average')], ascending=False).head()

Unnamed: 0_level_0,tweet_count,tweet_count,tweet_count
Unnamed: 0_level_1,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
New Republic,93979,1,93979.0
Mic,91725,1,91725.0
MTV News,80350,1,80350.0
ProPublica,77557,1,77557.0
Toronto Star,67062,1,67062.0


### Top by number of tweets in dataset

In [21]:
org_summary_df[['tweets_in_dataset']].sort_values([('tweets_in_dataset', 'sum')], ascending=False).head()

Unnamed: 0_level_0,tweets_in_dataset,tweets_in_dataset,tweets_in_dataset
Unnamed: 0_level_1,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Politico,63236.0,115.0,
CNN,44387.0,168.0,
Washington Post,29798.0,64.0,
Bloomberg BNA,26994.0,120.0,
Bloomberg News,25849.0,79.0,


## First tweet for each user

In [22]:
# Get the first tweet for each user
first_tweet_df = tweet_df.loc[tweet_df.groupby('user_id')['tweet_created_at'].idxmin()].set_index(['user_id'])
first_tweet_df.count()

tweet_id            1951
screen_name         1951
tweet_created_at    1951
user_created_at     1951
tweets_to_date      1951
tweet_type          1951
dtype: int64

In [23]:
first_tweet_df.sort_values('tweet_created_at', ascending=False).head()

Unnamed: 0_level_0,tweet_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
16030469,864576328422617088,lynsea,2017-05-16 20:20:34+00:00,2008-08-28 18:19:06+00:00,612,original
198577938,864286970473848832,kalannigan,2017-05-16 01:10:46+00:00,2010-10-04 17:45:20+00:00,981,retweet
19049530,863783559273820160,padmananda,2017-05-14 15:50:23+00:00,2009-01-16 01:42:47+00:00,276,original
15727125,863548225961308160,jameygraydon,2017-05-14 00:15:15+00:00,2008-08-04 20:10:51+00:00,20,retweet
109639153,862679623724933120,tedbarrettcnn,2017-05-11 14:43:44+00:00,2010-01-29 19:10:04+00:00,25,original


### Most recent first tweet

In [24]:
first_tweet_df['tweet_created_at'].max()

Timestamp('2017-05-16 20:20:34+0000', tz='UTC')