# Summary Statistics

**CS109a**: Fall 2018

**Authors**: Gordon Hew, Wenqin Hu, Blair Leduc

**TF**: Ken Arnold

## Set up

### Load spaCy, a natural language processing library

In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm


[93m    Linking successful[0m
    /Users/blair/.pyenv/versions/3.6.7/lib/python3.6/site-packages/en_core_web_sm
    -->
    /Users/blair/.pyenv/versions/3.6.7/lib/python3.6/site-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



### Import common libraries that we will be using

In [2]:
import numpy as np
import pandas as pd
import spacy
import os
from pathlib import Path

pd.set_option('max_seq_items', 4000)
pd.set_option('max_rows',20)

### File locations for input/output of this notebook

In [3]:
final_users_df_gz_file = os.path.join('tmp',
                                      'users_final_df.pkl.gz')
final_tweets_df_gz_file = os.path.join('tmp',
                                       'tweets_final_df.pkl.gz')
clean_tweets_df_gz_file = os.path.join('tmp', 
                                       'tweets_clean_df.pkl.gz')
users_summary_df_gz_file = os.path.join('data',
                                        'users_final_agg_df.pkl.gz')

## Load User and Tweet Dataframes

In [4]:
# Reload from binary file
users_pkl_df = pd.read_pickle(final_users_df_gz_file, 
                              compression='gzip')
tweets_pkl_df = pd.read_pickle(final_tweets_df_gz_file, 
                               compression='gzip')

In [5]:
# Make a copy here just incase we corrupt the datasets, we can 
# start over quickly
users_df = users_pkl_df.copy(deep=True)
tweets_df = tweets_pkl_df.copy(deep=True)

### Smoke test to make sure we loaded things correctly

In [6]:
print(f'Total number of users: {len(users_df)}')
unique_users = tweets_df.user_id.unique()
print(f'Number of unique users in tweets: {len(unique_users)}')
print(f"User {unique_users[0]} has " \
      + "{len(tweets_df[tweets_df['user_id'] == unique_users[0]])}")

Total number of users: 1000
Number of unique users in tweets: 1000
User 934576158305345536 has {len(tweets_df[tweets_df['user_id'] == unique_users[0]])}


### Change the index of users_df to the unique id of the user

In [7]:
users_df = users_df.set_index('id')

### Add columns to tweets_df that will speed up processing

In [8]:
if not Path(clean_tweets_df_gz_file).exists():
    tweets_df['clean_text'] = tweets_df['text']\
                                .apply(lambda x: ' '.join(\
                                   [t for t in x.split() 
                                    if (t[0].isalpha() or t[0]=='(')
                                        and not t.lower()\
                                                    .startswith('http') 
                                        and not t == 'RT']))

    tweets_df['created_at_hour'] = tweets_df['created_at']\
                                    .apply(lambda x: x[11:13])\
                                    .astype(int)

### Natural Language Processing

Preprocess named entities once, here, for use in the rules to follow.

In [9]:
if not Path(clean_tweets_df_gz_file).exists():
    nlp = spacy.load('en_core_web_sm')
    # This column will have an list of named entities
    # We can filter on this later
    tweets_df['named_entities'] = tweets_df['clean_text']\
        .apply(lambda x: [e.label_ for e in nlp(x).ents])

### Save clean and digested tweets_df to save time

_Or load the previously saved DataFrame to save time_

In [10]:
if not Path(clean_tweets_df_gz_file).exists():
    tweets_df.to_pickle(clean_tweets_df_gz_file, 
                        compression='gzip')
else:
    tweets_df = pd.read_pickle(clean_tweets_df_gz_file, 
                               compression='gzip')    

## Code to process tweets to and add metrics to users

In [11]:
class CollectSummaryMetrics:
    
    def __init__(self):
        self.columns = []
        
    def add_column(self, name, processor):
        self.columns.append({'name': name, 'func': processor})
        
    def run_processor(self, processor, s, df):
        for i in s.index:
            s[i] = processor(df[df['user_id'] == i])
        return s
    
    def run(self, users_df, tweets_df):
        df = users_df.copy()
        print('Processing:')
        unique_users = tweets_df.user_id.unique();
        for column in self.columns:
            print(f"Adding column {column['name']}...")
            
            df[column['name']] = self.run_processor(column['func'], 
                        pd.Series(index = unique_users), tweets_df)
        print('Done.')
        return df
    

## Create Summary Metrics
* tweets per hour &#x2713;
* histogram array for tweets per hour &#x2713;
* average number of links per Tweet &#x2713;
* average number of contributors per Tweet &#x2717; (no contribs in any tweet)
* average tweet status word length per Tweet &#x2713;
* average number of hashtags per Tweet &#x2713;
* average user mentions per Tweet &#x2713;
* average favorite count per Tweet &#x2713;
* average media per Tweet &#x2713;
* average symbols per Tweet &#x2713;
* average retweet count per Tweet &#x2713;
* average number of truncated tweets &#x2713;
* Total links for each account per Pew reserach Category &#x2713;
* Retweet ratio &#x2713;
* Natural Language Processing columns (PERSON, NORP, ORG, GPE, PRODUCT, EVENT, LAW, MONEY) &#x2713;

In [12]:
new_metrics = CollectSummaryMetrics()

### Tweets per hour

In [13]:
def count_on_hour(df, hour):
    s = df['created_at_hour'].apply(lambda x: x[11:13]).astype(int)
    return s[s == hour].count() 

for hour in range(0,24):
    new_metrics.add_column(f'tweets_per_hour_{hour:02}', 
                           lambda df, hour=hour: 
                               df['created_at_hour'][\
                                df['created_at_hour'] == hour].count())

### Tweets per hour histogram array

In [14]:
new_metrics.add_column('tweets_per_hour', 
                       lambda df: 
                           np.histogram(df['created_at_hour'], 
                                        range(0,24)))

### Mean links per tweet

In [15]:
new_metrics.add_column('mean_links_per_tweet', 
                       lambda df: 
                           df['entities.urls'].apply(lambda x: 
                                                     len(x)).mean())

### Mean number of words per tweet

In [16]:
new_metrics.add_column('mean_words_per_tweet',
                       lambda df: 
                           df['clean_text'].apply(lambda x: 
                                                 len(x.split())).mean())

### Mean number of hashtags per Tweet

In [17]:
new_metrics.add_column('mean_hashtags_per_tweet', 
                       lambda df: 
                           df['entities.hashtags'].apply(lambda x: 
                                                         len(x)).mean())

### Mean user mentions per Tweet

In [18]:
new_metrics.add_column('mean_user_mentions_per_tweet', 
                       lambda df: 
                           df['entities.user_mentions']\
                               .apply(lambda x: 
                                      len(x)).mean())

### Mean favorite count per Tweet

In [19]:
new_metrics.add_column('mean_favourites_per_tweet', 
                       lambda df: 
                           df['favorite_count'].mean())

### Mean media per Tweet

In [20]:
new_metrics.add_column('mean_media_per_tweet', 
                       lambda df: 
                           df['entities.media'].apply(lambda x: 
                                                    0 if x != x 
                                                    else len(x)).mean())

### Mean symbols per Tweet

In [21]:
new_metrics.add_column('mean_user_symbols_per_tweet', 
                       lambda df: 
                           df['entities.symbols'].apply(lambda x: 
                                                    0 if x != x 
                                                    else len(x)).mean())

### Mean retweet count per tweet

In [22]:
new_metrics.add_column('mean_retweets_per_tweet', 
                       lambda df: df['retweet_count'].mean())

### Mean truncated text per tweets

In [23]:
new_metrics.add_column('mean_truncations_per_tweet', 
                       lambda df: 
                           df['truncated'].apply(lambda x: 
                                                 1 if x else 0).mean())

### Mean number of links per tweet source

In [24]:
sources = [c[9:] for c in tweets_df.columns if c.startswith('links_to')]

for source in sources:
    new_metrics.add_column(f'mean_links_to_{source}', 
                           lambda df, source=source: 
                               df[f'links_to_{source}'].mean())

### Retweet ratio

In [25]:
new_metrics.add_column('retweet_ratio', 
                       lambda df: df['text']\
                           .apply(lambda x: 
                                  (1 if x.strip().startswith('RT @') 
                                     else 0)).mean())

### Natural Language Processing

We will collect statistics on these named entities:
- **PERSON**: People, including fictional.
- **NORP**: Nationalities or religious or political groups.
- **ORG**: Companies, agencies, institutions, etc.
- **GPE**: Countries, cities, states.
- **PRODUCT**: Objects, vehicles, foods, etc. (Not services.)
- **LAW**: Named documents made into laws.
- **MONEY**: Monetary values, including unit.

In [26]:
named_entities = ['PERSON', 'NORP', 'ORG', 'GPE', 
                  'PRODUCT', 'LAW', 'MONEY']

for entity in named_entities:
    new_metrics.add_column(f'mean_ref_to_{entity.lower()}', 
                          lambda df,entity=entity: df['named_entities']\
                               .apply(lambda x: x.count(entity)).mean())

## Add Summary Metrics Columns

In [27]:
users_summary_df = new_metrics.run(users_df, tweets_df)

Processing:
Adding column tweets_per_hour_00...
Adding column tweets_per_hour_01...
Adding column tweets_per_hour_02...
Adding column tweets_per_hour_03...
Adding column tweets_per_hour_04...
Adding column tweets_per_hour_05...
Adding column tweets_per_hour_06...
Adding column tweets_per_hour_07...
Adding column tweets_per_hour_08...
Adding column tweets_per_hour_09...
Adding column tweets_per_hour_10...
Adding column tweets_per_hour_11...
Adding column tweets_per_hour_12...
Adding column tweets_per_hour_13...
Adding column tweets_per_hour_14...
Adding column tweets_per_hour_15...
Adding column tweets_per_hour_16...
Adding column tweets_per_hour_17...
Adding column tweets_per_hour_18...
Adding column tweets_per_hour_19...
Adding column tweets_per_hour_20...
Adding column tweets_per_hour_21...
Adding column tweets_per_hour_22...
Adding column tweets_per_hour_23...
Adding column tweets_per_hour...
Adding column mean_links_per_tweet...
Adding column mean_words_per_tweet...
Adding column m

### Smoke test to check to see if columns added with correct content

In [28]:
users_summary_df.head()

Unnamed: 0_level_0,contributors_enabled,created_at,default_profile,default_profile_image,description,entities.description.urls,entities.url.urls,favourites_count,follow_request_sent,followers_count,...,mean_links_to_top_sports,mean_links_to_top_adult,retweet_ratio,mean_ref_to_person,mean_ref_to_norp,mean_ref_to_org,mean_ref_to_gpe,mean_ref_to_product,mean_ref_to_law,mean_ref_to_money
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
934576158305345536,False,Sun Nov 26 00:14:54 +0000 2017,True,False,"Unapologetic advocate of common law, constitut...","[{'url': 'https://t.co/oOeCSGuJbs', 'expanded_...","[{'url': 'https://t.co/oOeCSGuJbs', 'expanded_...",101647,False,1063,...,0.0,0.0,0.61,0.28,0.05,0.27,0.07,0.0,0.0,0.0
3133965632,False,Fri Apr 03 02:54:56 +0000 2015,True,False,definitely a real human woman and not three du...,[],,2805,False,55,...,0.0,0.0,0.57,0.11,0.01,0.15,0.02,0.0,0.0,0.0
3167730160,False,Wed Apr 15 00:56:50 +0000 2015,True,False,"where i end, you’ll begin.",[],,70639,False,161,...,0.0,0.0,1.0,0.26,0.03,0.19,0.19,0.02,0.0,0.0
893957155,False,Sat Oct 20 20:11:16 +0000 2012,False,False,it's Kah-doom • UNC • Nigerian • it’s not by f...,[],"[{'url': 'https://t.co/jl4BtTkGyL', 'expanded_...",56239,False,429,...,0.0,0.0,0.84,0.22,0.04,0.15,0.11,0.01,0.0,0.01
540553113,False,Fri Mar 30 02:47:09 +0000 2012,False,False,21.,[],,11727,False,454,...,0.0,0.0,0.39,0.09,0.03,0.23,0.04,0.0,0.0,0.0


## Save updated users' dataframe for the next step

In [29]:
users_summary_df.to_pickle(users_summary_df_gz_file, compression='gzip')