# Tweet summary

### Load the data

In [35]:
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import tweet_iter, tweet_type
import matplotlib.pyplot as plt


# Use the root logger at DEBUG level so that DEBUG records are not filtered
# out while the tweet files are being loaded.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Gzipped JSON tweet files to load.
filepaths = [
    'd59d27e2f2ed4778881573df2ecf2fad_001.json.gz',
    '25319652321b4bb498b250ffc53aa0f0_001.json.gz',
]

def tweet_transform(tweet):
    """Reduce a raw tweet dict to the flat record used in this summary.

    Args:
        tweet: mapping with at least 'id_str', 'created_at' and a nested
            'user' mapping holding 'id_str', 'screen_name', 'created_at'
            and 'statuses_count'.

    Returns:
        dict with keys tweet_id, tweet_created_at, user_id, screen_name,
        user_created_at, tweets_to_date and tweet_type. Both *_created_at
        values are parsed with date_parse; tweet_type is whatever the
        imported tweet_type() helper returns for the tweet.
    """
    tweet_id = tweet['id_str']
    tweeted_at = date_parse(tweet['created_at'])
    author = tweet['user']
    return dict(
        tweet_id=tweet_id,
        tweet_created_at=tweeted_at,
        user_id=author['id_str'],
        screen_name=author['screen_name'],
        user_created_at=date_parse(author['created_at']),
        tweets_to_date=author['statuses_count'],
        tweet_type=tweet_type(tweet),
    )

# Build one row per loaded tweet; `columns` fixes the column order of the
# records produced by tweet_transform.
tweet_df = pd.DataFrame(
    tweet_iter(filepaths, tweet_transform_func=tweet_transform),
    columns=[
        'tweet_id',
        'user_id',
        'screen_name',
        'tweet_created_at',
        'user_created_at',
        'tweets_to_date',
        'tweet_type',
    ],
)
# Non-null count per column.
tweet_df.count()

DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
D

tweet_id            3364440
user_id             3364440
screen_name         3364440
tweet_created_at    3364440
user_created_at     3364440
tweets_to_date      3364440
tweet_type          3364440
dtype: int64

### View the top of the data.

In [2]:
# Preview the first five loaded rows.
tweet_df.head(n=5)

Unnamed: 0,tweet_id,user_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
0,847821180832804864,1638925448,A_Childers_,2017-03-31 14:41:35+00:00,2013-08-01 21:44:28+00:00,6071,retweet
1,847814632643473411,1638925448,A_Childers_,2017-03-31 14:15:34+00:00,2013-08-01 21:44:28+00:00,6071,retweet
2,847627543142219776,1638925448,A_Childers_,2017-03-31 01:52:09+00:00,2013-08-01 21:44:28+00:00,6071,reply
3,847597404719267841,1638925448,A_Childers_,2017-03-30 23:52:23+00:00,2013-08-01 21:44:28+00:00,6071,reply
4,847593734896324608,1638925448,A_Childers_,2017-03-30 23:37:48+00:00,2013-08-01 21:44:28+00:00,6071,reply


### Remove duplicates
Duplicates occur both as an artifact of collecting data from the Twitter API and because some reporters may exist in multiple lists.

In [3]:
# Number of distinct tweet ids (a missing id, if any, counts as one value).
tweet_df['tweet_id'].nunique(dropna=False)

99985

In [4]:
# Keep a single row per tweet id; when an id repeats, the last occurrence wins.
dedupe_tweet_df = tweet_df.drop_duplicates(subset='tweet_id', keep='last')
dedupe_tweet_df.shape[0]

99985

In [5]:
### Create lookup of screen names

In [6]:
# From the tweets, extract a map of user ids to screen names.
# For each user, select the row of the tweet with the greatest
# tweet_created_at (idxmax), so the lookup holds the screen name seen on
# that user's most recent tweet in the dataset.
# A single `.loc[rows, cols]` replaces the former `.loc[rows].ix[:, cols]`:
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0.
user_id_lookup_df = (
    dedupe_tweet_df
    .loc[dedupe_tweet_df.groupby('user_id')['tweet_created_at'].idxmax(),
         ['user_id', 'screen_name']]
    .set_index('user_id')
)
user_id_lookup_df.count()

screen_name    847
dtype: int64

In [7]:
# Preview the first five user id -> screen name entries.
user_id_lookup_df.head(n=5)

Unnamed: 0_level_0,screen_name
user_id,Unnamed: 1_level_1
100165378,ChristineSisto
1001991865,FredSchulte
1002229862,HMRothmandc
100802089,ayesharascoe
100860790,DionNissenbaum


## Tweets in dataset for each user

In [8]:
# Count deduplicated tweets per user.
# The counts Series is renamed explicitly before conversion to a frame
# instead of relying on value_counts() naming its result after the source
# column: pandas >= 2.0 names the result 'count', which made the former
# rename(columns={'user_id': ...}) a silent no-op and left the frame
# without a 'tweets_in_dataset' column.
tweet_count_df = (
    dedupe_tweet_df['user_id']
    .value_counts()
    .rename('tweets_in_dataset')
    .to_frame()
)
tweet_count_df.index.name = 'user_id'
tweet_count_df.count()

tweets_in_dataset    847
dtype: int64

### Statistics on number of tweets in dataset for each user <---------------

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# per-user tweet counts.
tweet_count_df.describe()

Unnamed: 0,tweets_in_dataset
count,847.0
mean,118.046045
std,453.243787
min,1.0
25%,7.0
50%,22.0
75%,57.5
max,3540.0


### Grouping number of tweets in dataset for each user

In [14]:
# Split users into 10 quantile-based bins by their tweet count and report
# how many users fall into each bin, in bin order rather than by frequency.
binned_tweet_count_count_df = (
    pd.qcut(tweet_count_df['tweets_in_dataset'], q=10)
    .value_counts(sort=False)
)
binned_tweet_count_count_df


[1, 2.6]        85
(2.6, 5]        87
(5, 10]        102
(10, 15]        81
(15, 22]        75
(22, 31]        79
(31, 46]        88
(46, 73.6]      80
(73.6, 151]     86
(151, 3540]     84
Name: tweets_in_dataset, dtype: int64

## First tweet for each user

In [32]:
# Earliest tweet per user: idxmin over tweet_created_at gives, for each
# user_id group, the row label of that user's oldest tweet in the dataset.
first_tweet_df = (
    dedupe_tweet_df
    .loc[dedupe_tweet_df.groupby('user_id')['tweet_created_at'].idxmin()]
    .set_index('user_id')
)
first_tweet_df.count()

tweet_id            847
screen_name         847
tweet_created_at    847
user_created_at     847
tweets_to_date      847
tweet_type          847
dtype: int64

In [33]:
# Users whose earliest tweet in the dataset is the most recent, newest first.
(
    first_tweet_df
    .sort_values(by='tweet_created_at', ascending=False)
    .head(n=5)
)

Unnamed: 0_level_0,tweet_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
414936779,847824382441533440,abdulshaz,2017-03-31 14:54:19+00:00,2011-11-17 17:48:34+00:00,4894,original
137789852,847641018178519042,JanieVelencia,2017-03-31 02:45:41+00:00,2010-04-27 19:36:12+00:00,8816,retweet
2193026041,847550172980801537,Schank_A,2017-03-30 20:44:42+00:00,2013-11-13 22:13:29+00:00,559,reply
27068141,847502075898478595,sswestfall,2017-03-30 17:33:35+00:00,2009-03-27 18:48:36+00:00,197,retweet
589198817,847500358989381633,reuterslambert,2017-03-30 17:26:46+00:00,2012-05-24 15:22:54+00:00,1819,original


### Most recent first tweet

In [34]:
# Latest timestamp among the rows of first_tweet_df.
first_tweet_df['tweet_created_at'].max()

Timestamp('2017-03-31 14:54:19+0000', tz='UTC')

## Types of tweets <----------
While all the tweets are loaded, summarize the types of tweets.

In [25]:
# Number of deduplicated tweets for each tweet_type value.
dedupe_tweet_df['tweet_type'].value_counts()

original    39188
retweet     36688
reply       14208
quote        9901
Name: tweet_type, dtype: int64