# Tweet summary

### Load the data

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import tweet_load_iter, tweet_type
import matplotlib.pyplot as plt


# getLogger() with no name returns the root logger; setting it to DEBUG lets
# DEBUG-level (and higher) records pass the logger's level filter.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

def tweet_transform(tweet):
    """Flatten one raw tweet into the flat record used by this notebook.

    Args:
        tweet: mapping for a single tweet. Must provide 'id_str',
            'created_at' and a nested 'user' mapping with 'id_str',
            'screen_name', 'created_at' and 'statuses_count'.

    Returns:
        dict with the tweet id, the parsed tweet creation time, the author's
        id / screen name / parsed account creation time / status count, and
        the value returned by tweet_type(tweet).
    """
    tweet_id = tweet['id_str']
    tweeted_at = date_parse(tweet['created_at'])
    # Look the author sub-document up once and reuse it below.
    author = tweet['user']
    record = {
        'tweet_id': tweet_id,
        'tweet_created_at': tweeted_at,
        'user_id': author['id_str'],
        'screen_name': author['screen_name'],
        'user_created_at': date_parse(author['created_at']),
        'tweets_to_date': author['statuses_count'],
        'tweet_type': tweet_type(tweet),
    }
    return record

# Explicit column order for the tweet table.
tweet_columns = ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at',
                 'user_created_at', 'tweets_to_date', 'tweet_type']
# Presumably yields one tweet_transform() result per loaded tweet -- confirm in utils.
tweet_records = tweet_load_iter(tweet_transform_func=tweet_transform)
tweet_df = pd.DataFrame(tweet_records, columns=tweet_columns)
# Non-null count for every column.
tweet_df.count()

INFO:root:Loading from tweets/6eea2088e010437da4b6031c2abffdc9_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
INFO:root:Loading from tweets/a7bcdbde7a104285b92fe26e286f2543_001.json.gz
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
INFO:root:Loading from tweets/e1c824ff2b3c4c5a9a93a16e5036d09a_001.json.gz
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000


tweet_id            765036
user_id             765036
screen_name         765036
tweet_created_at    765036
user_created_at     765036
tweets_to_date      765036
tweet_type          765036
dtype: int64

### View the top of the data.

In [2]:
# Preview the first five rows of the loaded tweets.
tweet_df.head(n=5)

Unnamed: 0,tweet_id,user_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
0,847428582821449730,780221130,loren_duggan,2017-03-30 12:41:33+00:00,2012-08-25 12:32:20+00:00,886,reply
1,847787664963239936,285772181,akesslerdc,2017-03-31 12:28:25+00:00,2011-04-21 19:15:21+00:00,8604,retweet
2,847634105118318594,285772181,akesslerdc,2017-03-31 02:18:13+00:00,2011-04-21 19:15:21+00:00,8604,quote
3,847617579627630592,285772181,akesslerdc,2017-03-31 01:12:33+00:00,2011-04-21 19:15:21+00:00,8604,retweet
4,847601029654880258,285772181,akesslerdc,2017-03-31 00:06:47+00:00,2011-04-21 19:15:21+00:00,8604,retweet


### Remove duplicates
Duplicates occur both as an artifact of collecting data from the Twitter API and because some reporters may exist in multiple lists.

In [3]:
# Number of distinct tweet ids; dropna=False counts a missing id as its own
# value, matching the length of .unique().
tweet_df['tweet_id'].nunique(dropna=False)

761078

In [4]:
# Keep one row per tweet id; when an id repeats, the last occurrence is kept.
dedupe_tweet_df = tweet_df.drop_duplicates(subset=['tweet_id'], keep='last')
# Row count after de-duplication.
dedupe_tweet_df.shape[0]

761078

### Create lookup of screen names

In [6]:
# From the tweets, extract a map of user ids to screen names.
# For each user, take the screen name from their most recent tweet in the dataset.
# Note: the original used the `.ix` indexer, which was deprecated in pandas 0.20
# and removed in pandas 1.0; `.loc` performs the same label-based selection.
latest_tweet_idx = dedupe_tweet_df.groupby('user_id')['tweet_created_at'].idxmax()
user_id_lookup_df = (
    dedupe_tweet_df
    .loc[latest_tweet_idx, ['user_id', 'screen_name']]
    .set_index(['user_id'])
)
user_id_lookup_df.count()

screen_name    1951
dtype: int64

In [7]:
# Preview the first five user id -> screen name entries.
user_id_lookup_df.head(n=5)

Unnamed: 0_level_0,screen_name
user_id,Unnamed: 1_level_1
100165378,ChristineSisto
1001991865,FredSchulte
1002229862,HMRothmandc
100802089,ayesharascoe
100860790,DionNissenbaum


## Tweets in dataset for each user

In [8]:
# Count tweets per (user, tweet type) and pivot the types into columns;
# a user with no tweets of a given type gets 0 instead of NaN.
tweet_count_df = (
    dedupe_tweet_df[['user_id', 'tweet_type']]
    .groupby(['user_id', 'tweet_type'])
    .size()
    .unstack()
    .fillna(0)
)

# Total tweets per user across the four tweet types.
type_columns = ['original', 'quote', 'reply', 'retweet']
tweet_count_df['tweets_in_dataset'] = tweet_count_df[type_columns].sum(axis=1)

# Bucket users by quantile of their total tweet count.
bin_edges = [0, .9, .99, 1.]
bin_labels = ['Bottom 90%', 'Middle 9%', 'Top 1%']
tweet_count_df['bin'] = pd.qcut(tweet_count_df['tweets_in_dataset'], q=bin_edges, labels=bin_labels)
tweet_count_df.head()

tweet_type,original,quote,reply,retweet,tweets_in_dataset,bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100165378,53.0,21.0,54.0,68.0,196.0,Bottom 90%
1001991865,12.0,0.0,2.0,21.0,35.0,Bottom 90%
1002229862,47.0,9.0,5.0,92.0,153.0,Bottom 90%
100802089,6.0,4.0,8.0,3.0,21.0,Bottom 90%
100860790,153.0,18.0,13.0,244.0,428.0,Bottom 90%


### Statistics on number of tweets in dataset for each user

In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) of the per-user tweet counts.
tweet_count_df.describe()

tweet_type,original,quote,reply,retweet,tweets_in_dataset
count,1951.0,1951.0,1951.0,1951.0,1951.0
mean,125.567914,49.897488,60.773962,153.856996,390.096361
std,204.25668,123.289215,257.559753,361.313261,717.134366
min,0.0,0.0,0.0,0.0,1.0
25%,12.0,1.0,1.0,8.0,36.0
50%,50.0,8.0,6.0,42.0,136.0
75%,159.0,46.0,34.0,145.5,449.0
max,3210.0,1769.0,8009.0,5410.0,10643.0


## Types of tweets
With all of the tweets loaded, summarize how many tweets there are of each type.

In [10]:
# Number of de-duplicated tweets of each tweet type, most frequent first.
dedupe_tweet_df['tweet_type'].value_counts()

retweet     300175
original    244983
reply       118570
quote        97350
Name: tweet_type, dtype: int64

## 1/9/90 rule
For the top 1%, middle 9%, and bottom 90% of tweeters (ranked by number of tweets in the dataset), show the number and types of tweets that each group accounts for.

In [11]:
# Users whose total tweet count falls in the top 1% bin.
tweet_count_df.loc[tweet_count_df['bin'] == 'Top 1%']

tweet_type,original,quote,reply,retweet,tweets_in_dataset,bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
103016675,448.0,780.0,1089.0,1341.0,3658.0,Top 1%
104299137,710.0,1391.0,768.0,1408.0,4277.0,Top 1%
13524182,898.0,960.0,303.0,2767.0,4928.0,Top 1%
14529929,1253.0,134.0,781.0,1220.0,3388.0,Top 1%
15146659,856.0,432.0,1746.0,1113.0,4147.0,Top 1%
15730608,367.0,486.0,589.0,1909.0,3351.0,Top 1%
16459325,870.0,720.0,818.0,1657.0,4065.0,Top 1%
18678924,536.0,488.0,219.0,2242.0,3485.0,Top 1%
18825339,1237.0,1554.0,414.0,3078.0,6283.0,Top 1%
191964162,595.0,102.0,314.0,4279.0,5290.0,Top 1%


In [12]:
# Sum the per-user counts within each bin.
tweet_count_bin_summary_df = tweet_count_df.groupby('bin').sum()

# For each count column, add the fraction of that column's grand total that
# each bin accounts for. The pairs are spelled out because the target names
# are not a uniform transformation of the source names ('retweet' -> 'percent_of_retweets').
share_columns = [
    ('original', 'percent_of_original'),
    ('quote', 'percent_of_quote'),
    ('reply', 'percent_of_reply'),
    ('retweet', 'percent_of_retweets'),
    ('tweets_in_dataset', 'percent_of_tweets_in_dataset'),
]
for count_col, share_col in share_columns:
    bin_totals = tweet_count_bin_summary_df[count_col]
    tweet_count_bin_summary_df[share_col] = bin_totals / bin_totals.sum()

# Number of users that fall into each bin.
tweet_count_bin_summary_df['user_count'] = tweet_count_df.groupby('bin')['tweets_in_dataset'].count()
tweet_count_bin_summary_df


tweet_type,original,quote,reply,retweet,tweets_in_dataset,percent_of_original,percent_of_quote,percent_of_reply,percent_of_retweets,percent_of_tweets_in_dataset,user_count
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Bottom 90%,145086.0,42151.0,41227.0,136964.0,365428.0,0.592229,0.432984,0.347702,0.456281,0.480145,1756
Middle 9%,84479.0,40990.0,54127.0,116834.0,296430.0,0.344836,0.421058,0.456498,0.38922,0.389487,175
Top 1%,15418.0,14209.0,23216.0,46377.0,99220.0,0.062935,0.145958,0.1958,0.1545,0.130368,20


## First tweet for each user

In [13]:
# Get the first tweet for each user
first_tweet_df = dedupe_tweet_df.loc[dedupe_tweet_df.groupby('user_id')['tweet_created_at'].idxmin()].set_index(['user_id'])
first_tweet_df.count()

tweet_id            1951
screen_name         1951
tweet_created_at    1951
user_created_at     1951
tweets_to_date      1951
tweet_type          1951
dtype: int64

In [14]:
# The five users whose earliest tweet in the dataset is the most recent.
first_tweet_df.sort_values(by='tweet_created_at', ascending=False).head(n=5)

Unnamed: 0_level_0,tweet_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
16030469,864576328422617088,lynsea,2017-05-16 20:20:34+00:00,2008-08-28 18:19:06+00:00,612,original
198577938,864286970473848832,kalannigan,2017-05-16 01:10:46+00:00,2010-10-04 17:45:20+00:00,981,retweet
19049530,863783559273820160,padmananda,2017-05-14 15:50:23+00:00,2009-01-16 01:42:47+00:00,276,original
15727125,863548225961308160,jameygraydon,2017-05-14 00:15:15+00:00,2008-08-04 20:10:51+00:00,20,retweet
109639153,862679623724933120,tedbarrettcnn,2017-05-11 14:43:44+00:00,2010-01-29 19:10:04+00:00,25,original


### Most recent first tweet

In [15]:
# Latest timestamp among the users' earliest tweets.
first_tweet_df['tweet_created_at'].max()

Timestamp('2017-05-16 20:20:34+0000', tz='UTC')