# Tweet summary

## Prepare the tweet data

### Load the tweets

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type
import matplotlib.pyplot as plt


logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Set float format so doesn't display scientific notation
pd.options.display.float_format = '{:20,.2f}'.format

def tweet_transform(tweet):
    return {
        'tweet_id': tweet['id_str'], 
        'tweet_created_at': date_parse(tweet['created_at']),
        'user_id': tweet['user']['id_str'],
        'screen_name': tweet['user']['screen_name'],
        'user_created_at': date_parse(tweet['user']['created_at']),
        'tweets_to_date': tweet['user']['statuses_count'],
        'tweet_type': tweet_type(tweet)
    }

tweet_df = load_tweet_df(tweet_transform, ['tweet_id', 'user_id', 'screen_name', 'tweet_created_at', 
                                           'user_created_at', 'tweets_to_date', 'tweet_type'])
tweet_df.count()

INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_002.json.gz
INFO:root:Loading from tweets/7bff8603fb4a49d5953197361d548346_001.json.gz
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
INFO:root:Loading from tweets/b3f330f5b6cc4572b6d7dabc3752b2b9_001.json.gz
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000


tweet_id            650350
user_id             650350
screen_name         650350
tweet_created_at    650350
user_created_at     650350
tweets_to_date      650350
tweet_type          650350
dtype: int64

### View the top of the data.

In [2]:
tweet_df.head()

Unnamed: 0,tweet_id,user_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
0,859463382042378240,2343897943,AmberCStrong,2017-05-02 17:43:32+00:00,2014-02-14 17:33:36+00:00,1701,original
1,859803200152588288,307982591,JaxAlemany,2017-05-03 16:13:51+00:00,2011-05-30 16:43:13+00:00,6328,original
2,859788527705493504,307982591,JaxAlemany,2017-05-03 15:15:33+00:00,2011-05-30 16:43:13+00:00,6328,quote
3,859788479076732930,307982591,JaxAlemany,2017-05-03 15:15:22+00:00,2011-05-30 16:43:13+00:00,6328,original
4,859781841955500032,307982591,JaxAlemany,2017-05-03 14:48:59+00:00,2011-05-30 16:43:13+00:00,6328,retweet


## Prepare the user data

### Tweets in dataset for each user

In [3]:
user_tweet_count_df = tweet_df[['user_id', 'tweet_type']].groupby(['user_id', 'tweet_type']).size().unstack()
user_tweet_count_df.fillna(0, inplace=True)
user_tweet_count_df['tweets_in_dataset'] = user_tweet_count_df.original + user_tweet_count_df.quote + user_tweet_count_df.reply + user_tweet_count_df.retweet
user_tweet_count_df['tweets_in_dataset_bin'] = pd.qcut(user_tweet_count_df.tweets_in_dataset, [0, .9, .99, 1.], labels=['Bottom 90%', 'Middle 9%', 'Top 1%'])
user_tweet_count_df.head()

tweet_type,original,quote,reply,retweet,tweets_in_dataset,tweets_in_dataset_bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1001991865,12.0,1.0,3.0,35.0,51.0,Bottom 90%
1002229862,35.0,5.0,2.0,99.0,141.0,Bottom 90%
100802089,4.0,3.0,5.0,12.0,24.0,Bottom 90%
100860790,117.0,19.0,9.0,215.0,360.0,Bottom 90%
1009749229,79.0,85.0,34.0,156.0,354.0,Bottom 90%


### Load and join user info
This is information that was coded in the spreadsheet or looked up for each user via API.

In [4]:
user_info_df = pd.read_csv('source_data/user_info_lookup.csv', names=['screen_name', 'user_id', 'name', 'organization', 'position',
                                            'gender', 'followers_count', 'following_count', 'tweet_count',
                                            'user_created_at', 'verified', 'protected'],
                          dtype={'user_id': str}).set_index(['user_id'])
user_info_df.count()

screen_name        2484
name               2484
organization       2455
position           2481
gender             2483
followers_count    2484
following_count    2484
tweet_count        2484
user_created_at    2484
verified           2484
protected          2484
dtype: int64

In [5]:
user_info_df.head()

Unnamed: 0_level_0,screen_name,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20711445,ninglin,"Glinski, Nina",,Freelance Reporter,F,968,507,909,Thu Feb 12 20:00:53 +0000 2009,False,False
258917371,davidjenders,"Enders, David",,Journalist,M,1451,480,6299,Mon Feb 28 19:52:03 +0000 2011,True,False
297046834,mattbarakat,"Barakat, Matthew",Associated Press,Northern Virginia Correspondent,M,754,349,620,Wed May 11 20:55:24 +0000 2011,True,False
455585786,kimberlyeatkins,"Atkins, Kimberly",Boston Herald,Chief Washington Reporter/Columnist,F,2399,2661,5846,Thu Jan 05 08:26:46 +0000 2012,True,False
42584840,toulavlahou,"Vlahou, Toula",CQ Roll Call,Editor & Podcast Producer,F,2713,198,6325,Tue May 26 07:41:38 +0000 2009,False,False


In [6]:
# Join
user_summary_df = user_info_df.join(user_tweet_count_df, how='left')
# Fill Nans
user_summary_df['organization'].fillna('', inplace=True)
user_summary_df['original'].fillna(0, inplace=True)
user_summary_df['quote'].fillna(0, inplace=True)
user_summary_df['reply'].fillna(0, inplace=True)
user_summary_df['retweet'].fillna(0, inplace=True)
user_summary_df['tweets_in_dataset'].fillna(0, inplace=True)
user_summary_df.count()

screen_name              2484
name                     2484
organization             2484
position                 2481
gender                   2483
followers_count          2484
following_count          2484
tweet_count              2484
user_created_at          2484
verified                 2484
protected                2484
original                 2484
quote                    2484
reply                    2484
retweet                  2484
tweets_in_dataset        2484
tweets_in_dataset_bin    2272
dtype: int64

In [7]:
user_summary_df.head()

Unnamed: 0_level_0,screen_name,name,organization,position,gender,followers_count,following_count,tweet_count,user_created_at,verified,protected,original,quote,reply,retweet,tweets_in_dataset,tweets_in_dataset_bin
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
20711445,ninglin,"Glinski, Nina",,Freelance Reporter,F,968,507,909,Thu Feb 12 20:00:53 +0000 2009,False,False,0.0,0.0,0.0,0.0,0.0,
258917371,davidjenders,"Enders, David",,Journalist,M,1451,480,6299,Mon Feb 28 19:52:03 +0000 2011,True,False,0.0,0.0,0.0,0.0,0.0,
297046834,mattbarakat,"Barakat, Matthew",Associated Press,Northern Virginia Correspondent,M,754,349,620,Wed May 11 20:55:24 +0000 2011,True,False,12.0,0.0,0.0,2.0,14.0,Bottom 90%
455585786,kimberlyeatkins,"Atkins, Kimberly",Boston Herald,Chief Washington Reporter/Columnist,F,2399,2661,5846,Thu Jan 05 08:26:46 +0000 2012,True,False,228.0,144.0,39.0,196.0,607.0,Bottom 90%
42584840,toulavlahou,"Vlahou, Toula",CQ Roll Call,Editor & Podcast Producer,F,2713,198,6325,Tue May 26 07:41:38 +0000 2009,False,False,32.0,25.0,0.0,25.0,82.0,Bottom 90%


### Write to file as output/user_summary.csv

In [8]:
user_summary_df.to_csv('output/user_summary.csv')

## Prepare the organization data
This is for users that are members of each organization.

In [9]:
org_summary_df = user_summary_df[['organization', 'followers_count', 'following_count', 'tweet_count', 'tweets_in_dataset']].groupby('organization').agg([np.sum, np.size, np.average])
org_summary_df.count()

followers_count    sum        347
                   size       347
                   average    347
following_count    sum        347
                   size       347
                   average    347
tweet_count        sum        347
                   size       347
                   average    347
tweets_in_dataset  sum        347
                   size       347
                   average    347
dtype: int64

In [10]:
org_summary_df.head()

Unnamed: 0_level_0,followers_count,followers_count,followers_count,following_count,following_count,following_count,tweet_count,tweet_count,tweet_count,tweets_in_dataset,tweets_in_dataset,tweets_in_dataset
Unnamed: 0_level_1,sum,size,average,sum,size,average,sum,size,average,sum,size,average
organization,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
,57347,29,1977.48,30788,29,1061.66,151441,29,5222.1,2767.0,29.0,95.41
ABC 7,889,1,889.0,1092,1,1092.0,1946,1,1946.0,464.0,1.0,464.0
ABC News,602790,52,11592.12,72154,52,1387.58,372200,52,7157.69,8629.0,52.0,165.94
AP–Broadcast,5305,15,353.67,7974,15,531.6,16794,15,1119.6,527.0,15.0,35.13
Afro American Newspapers,189,1,189.0,202,1,202.0,596,1,596.0,14.0,1.0,14.0


### Write to file as output/organization_summary.csv

In [11]:
org_df.to_csv('output/organization_summary.csv')

NameError: name 'org_df' is not defined

### List of organizations <--- This probably requires some cleanup

In [None]:
org_summary_df.index.tolist()

## Tweet summary
For tweets in dataset.

### Types of tweets

In [None]:
tweet_df['tweet_type'].value_counts()

## User tweet summary

### Types of tweets in dataset for each user

In [None]:
user_summary_df[['original', 'quote', 'reply', 'retweet']].describe()

### 1/9/90 rule
For top 1%, 9%, 90% of tweeters, the number of tweets and types of tweets they account for.

In [None]:
user_summary_df[user_summary_df.tweets_in_dataset_bin == 'Top 1%']

In [None]:
tweets_in_dataset_bin_summary_df = user_summary_df[['original', 'quote', 'reply', 'retweet', 'tweets_in_dataset', 'tweets_in_dataset_bin']].groupby('tweets_in_dataset_bin').sum()
tweets_in_dataset_bin_summary_df['percent_of_original'] = tweets_in_dataset_bin_summary_df.original / tweets_in_dataset_bin_summary_df.original.sum()
tweets_in_dataset_bin_summary_df['percent_of_quote'] = tweets_in_dataset_bin_summary_df.quote / tweets_in_dataset_bin_summary_df.quote.sum()
tweets_in_dataset_bin_summary_df['percent_of_reply'] = tweets_in_dataset_bin_summary_df.reply / tweets_in_dataset_bin_summary_df.reply.sum()
tweets_in_dataset_bin_summary_df['percent_of_retweets'] = tweets_in_dataset_bin_summary_df.retweet / tweets_in_dataset_bin_summary_df.retweet.sum()
tweets_in_dataset_bin_summary_df['percent_of_tweets_in_dataset'] = tweets_in_dataset_bin_summary_df.tweets_in_dataset / tweets_in_dataset_bin_summary_df.tweets_in_dataset.sum()
tweets_in_dataset_bin_summary_df['users_in_bin'] = user_summary_df[['tweets_in_dataset_bin', 'tweets_in_dataset']].groupby('tweets_in_dataset_bin').count()
tweets_in_dataset_bin_summary_df


## User summary

In [None]:
user_summary_df[['followers_count', 'following_count', 'tweet_count']].describe()

### Gender

In [None]:
user_summary_df['gender'].value_counts()

## Organization

### Top by average followers

In [None]:
org_summary_df[['followers_count']].sort_values([('followers_count', 'average')], ascending=False).head()

### Top by average following

In [None]:
org_summary_df[['following_count']].sort_values([('following_count', 'average')], ascending=False).head()

### Top by average tweet count

In [None]:
org_summary_df[['tweet_count']].sort_values([('tweet_count', 'average')], ascending=False).head()

### Top by number of tweets in dataset

In [None]:
org_summary_df[['tweets_in_dataset']].sort_values([('tweets_in_dataset', 'sum')], ascending=False).head()

## First tweet for each user

In [None]:
# Get the first tweet for each user
first_tweet_df = tweet_df.loc[tweet_df.groupby('user_id')['tweet_created_at'].idxmin()].set_index(['user_id'])
first_tweet_df.count()

In [None]:
first_tweet_df.sort_values('tweet_created_at', ascending=False).head()

### Most recent first tweet

In [None]:
first_tweet_df['tweet_created_at'].max()