# Exploring the Complete Twitter Dataset

* The purpose of this notebook is to explore the full dataset of 400k tweets relating to #bcpoli
* Tweet creation dates range from August 14, 2020 to November 19, 2020
* Columns not required for analysis will be dropped here.
 * The remaining data will be exported for preprocessing in "classify_unlabelled_tweets.ipynb"

In [1]:
import sys  
sys.path.insert(0, 'PATH/TO/data_bootcamp/data-science-final-project/scripts/')

import pandas as pd
import numpy as np
import pdpipe as pdp

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from IPython.display import JSON
import matplotlib.pyplot as plt
import pickle

# Pandas display settings (optional)

# Show the full contents of each column (e.g. the entire tweet text) instead of truncating
pd.set_option('display.max_colwidth', None)
#pd.set_option("display.max_columns", 30)

# Import custom project helpers from the scripts directory added to sys.path above.
# NOTE(review): the star import hides where names come from. The helpers used in this
# notebook (extract_username, lower_case, covid_mention, is_retweet, preprocess,
# top_bigrams) presumably all come from here — consider importing them explicitly.
from functions import *

## ~398K Tweets from August 14th, 2020 - November 19th, 2020

In [2]:
%%time

# Load the line-delimited JSON export (lines=True: one JSON record per line) into a DataFrame.
# NOTE: the path points at an external volume on the author's machine; change it to your copy.
# This is slow — the recorded run took roughly six minutes of wall time.
df = pd.read_json('/Volumes/My Passport/Tweets/bcpoli_400k_extended.jsonl', lines=True)

CPU times: user 2min 9s, sys: 2min 29s, total: 4min 38s
Wall time: 5min 58s


In [3]:
# Summarise columns, non-null counts and dtypes; memory_usage='deep' asks pandas to
# measure the real memory held by object columns instead of estimating it.
df.info(memory_usage='deep')
# 384,221 rows were loaded versus the ~398K tweets expected: it appears that over
# ten thousand tweets have been deleted since August.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384221 entries, 0 to 384220
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype              
---  ------                     --------------   -----              
 0   created_at                 384221 non-null  datetime64[ns, UTC]
 1   id                         384221 non-null  int64              
 2   id_str                     384221 non-null  int64              
 3   full_text                  384221 non-null  object             
 4   truncated                  384221 non-null  bool               
 5   display_text_range         384221 non-null  object             
 6   entities                   384221 non-null  object             
 7   source                     384221 non-null  object             
 8   in_reply_to_status_id      26530 non-null   float64            
 9   in_reply_to_status_id_str  26530 non-null   float64            
 10  in_reply_to_user_id        29115 non-null   float64     

In [43]:
%%time
# Build the working frame `raw` from the imported data, indexed by the unique tweet ID.
# Only the columns needed downstream are selected *before* re-indexing, so the other
# columns of `df` are never copied. `df` itself is left untouched and nothing is
# mutated in place, which keeps this cell safe to re-run.

raw = df[['id_str', 'created_at', 'user', 'full_text', 'retweet_count']].set_index('id_str')

# Extract features from the `user` column (a dict per tweet, read with .get) using the
# project helper. Judging by later output it adds a `user_name` column — confirm in
# the project's functions module.

raw = extract_username(raw)

CPU times: user 467 ms, sys: 19.2 ms, total: 486 ms
Wall time: 486 ms


## Creating a Data Processing Pipeline

In [45]:
%%time

# pdpipe processing pipeline, declared as one ordered list of stages.
# Each ApplyByCols stage runs a project helper over the values of the source column and
# stores the result under `result_columns`; `drop` controls whether the source column
# is removed afterwards. What each helper computes is defined in the functions module.

pipeline = pdp.PdPipeline([
    # the raw `user` dict is no longer needed once the username has been extracted
    pdp.ColDrop('user'),
    # temporary lower-cased text, consumed (and dropped) by the covid_mention stage
    pdp.ApplyByCols('full_text', lower_case, result_columns='full_lower', drop=False),
    pdp.ApplyByCols('full_lower', covid_mention, result_columns='covid_mention', drop=True),
    pdp.ApplyByCols('full_text', is_retweet, result_columns='is_retweet', drop=False),
    pdp.ApplyByCols('full_text', preprocess, result_columns='full_clean', drop=False),
    pdp.ApplyByCols(
        'full_text',
        lambda text: preprocess(text, hashtags=True),
        result_columns='no_hashtags',
        drop=False,
    ),
])
raw = pipeline(raw)

CPU times: user 2min 16s, sys: 1.38 s, total: 2min 17s
Wall time: 2min 18s


## Create new DataFrames for separate analysis

In [49]:
# Original tweets only: keep the rows whose is_retweet flag is 0

df_no_rt = raw.loc[raw['is_retweet'].eq(0)]
df_no_rt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101571 entries, 1296949926543335424 to 1328850997704237056
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   created_at     101571 non-null  datetime64[ns, UTC]
 1   full_text      101571 non-null  object             
 2   no_hashtags    101571 non-null  object             
 3   full_clean     101571 non-null  object             
 4   is_retweet     101571 non-null  int64              
 5   covid_mention  101571 non-null  int64              
 6   retweet_count  101571 non-null  int64              
 7   user_name      101571 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 7.0+ MB


In [50]:
# Original tweets that do NOT mention covid (covid_mention flag is 0).
# Used to gauge covid's impact on sentiment.

df_no_rt_no_covid = df_no_rt.loc[df_no_rt['covid_mention'].eq(0)]
df_no_rt_no_covid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73394 entries, 1296950909239357440 to 1328850935632719872
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   created_at     73394 non-null  datetime64[ns, UTC]
 1   full_text      73394 non-null  object             
 2   no_hashtags    73394 non-null  object             
 3   full_clean     73394 non-null  object             
 4   is_retweet     73394 non-null  int64              
 5   covid_mention  73394 non-null  int64              
 6   retweet_count  73394 non-null  int64              
 7   user_name      73394 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 5.0+ MB


In [51]:
# Original tweets that DO mention covid (covid_mention flag is 1).
# Used to gauge covid's impact on sentiment.

df_no_rt_covid_mention = df_no_rt.loc[df_no_rt['covid_mention'].eq(1)]
df_no_rt_covid_mention.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28177 entries, 1296949926543335424 to 1328850997704237056
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   created_at     28177 non-null  datetime64[ns, UTC]
 1   full_text      28177 non-null  object             
 2   no_hashtags    28177 non-null  object             
 3   full_clean     28177 non-null  object             
 4   is_retweet     28177 non-null  int64              
 5   covid_mention  28177 non-null  int64              
 6   retweet_count  28177 non-null  int64              
 7   user_name      28177 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 1.9+ MB


## Examining some metrics

In [52]:
# Number of the 384221 loaded tweets that are flagged as retweets

raw['is_retweet'].sum()

282650

In [53]:
# Estimated number of the 384221 loaded tweets that mention covid / the pandemic

raw['covid_mention'].sum()

84096

In [60]:
# Re-display the summary of the original-tweets frame; its row count is the
# denominator for the covid-mention percentage.
df_no_rt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101571 entries, 1296949926543335424 to 1328850997704237056
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   created_at     101571 non-null  datetime64[ns, UTC]
 1   full_text      101571 non-null  object             
 2   no_hashtags    101571 non-null  object             
 3   full_clean     101571 non-null  object             
 4   is_retweet     101571 non-null  int64              
 5   covid_mention  101571 non-null  int64              
 6   retweet_count  101571 non-null  int64              
 7   user_name      101571 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 7.0+ MB


In [55]:
# Estimated covid/pandemic mentions among the original (non-retweet) tweets.
# The tweet count is read from the frame itself rather than hard-coded, so the ratio
# and the printed total cannot drift apart from the data (the previous text said
# 101569 while the frame holds 101571 rows).

total_no_rt = len(df_no_rt)
covid_no_rt = df_no_rt.covid_mention.sum()
# Guard against an empty frame so the cell does not divide by zero
mention_ratio_no_rt = (covid_no_rt / total_no_rt) * 100 if total_no_rt else 0.0

print('Estimated percentage of tweets related to #bcpoli that mention covid or the pandemic in some way:', '%0.2f' % mention_ratio_no_rt, '%')
print('Total of', total_no_rt, 'original tweets related to #bcpoli that mention covid or the pandemic in some way:', covid_no_rt)

Estimated percentage of tweets related to #bcpoli that mention covid or the pandemic in some way: 27.74 %
Total of 101569 original tweets related to #bcpoli that mention covid or the pandemic in some way: 28177


In [56]:
# Most frequent bigrams across all tweets (original tweets and retweets).
# Per the author's note, hashtags and stop words are removed before counting —
# presumably inside top_bigrams or the cleaned text columns; verify in the functions module.

top_bigrams(raw, 20)

(bc, liberals)           11017
(john, horgan)            8474
(bc, liberal)             7886
(bonnie, henry)           7668
(dr, bonnie)              7358
(new, cases)              6807
(bc, ndp)                 6088
(andrew, wilkinson)       5667
(british, columbians)     5046
(dr, henry)               4971
(british, columbia)       4951
(bc, election)            3401
(henry, says)             3088
(snap, election)          2597
(public, health)          2476
(health, care)            2363
(site, c)                 2316
(fraser, health)          2273
(old, growth)             2070
(bc, greens)              2042
dtype: int64

In [57]:
# Top 20 bigrams from original tweets only (retweets excluded)

top_bigrams(df_no_rt,20)

(bc, liberals)            2448
(bonnie, henry)           1709
(dr, bonnie)              1697
(john, horgan)            1548
(bc, liberal)             1535
(bc, ndp)                 1493
(new, cases)              1466
(dr, henry)               1305
(british, columbians)     1180
(british, columbia)       1067
(andrew, wilkinson)       1027
(bc, election)             916
(snap, election)           858
(public, health)           797
(active, cases)            769
(henry, says)              699
(health, care)             630
(provincial, election)     614
(fraser, health)           604
(old, growth)              574
dtype: int64

In [58]:
# Top 20 bigrams from original tweets only, excluding tweets that mention covid
# This is a good example of when stemming is beneficial - see the pluralized words in the output
# Also a great example of why trigrams are useful - (old, growth)  (growth, forests)

top_bigrams(df_no_rt_no_covid,20)

(bc, liberals)            2258
(bc, liberal)             1434
(bc, ndp)                 1331
(john, horgan)            1198
(andrew, wilkinson)        935
(british, columbians)      868
(bc, election)             783
(british, columbia)        725
(old, growth)              562
(provincial, election)     534
(snap, election)           523
(green, party)             521
(bc, greens)               441
(bc, green)                434
(site, c)                  407
(climate, change)          400
(mental, health)           398
(sign, petition)           391
(health, care)             385
(growth, forests)          374
dtype: int64

In [59]:
# Pickle DataFrames for later use
# NOTE: every export below is commented out, so running this cell writes nothing.
# The target paths are machine-specific (two use '~', two an absolute /Users/... path);
# adjust them before uncommenting.

#df_no_rt.to_pickle('~/data_bootcamp/data-science-final-project/data/df_original_tweets.pkl')

#df_no_rt_covid_mention.to_pickle('~/data_bootcamp/data-science-final-project/data/df_original_tweets_covid_mention.pkl')

#df_no_rt_no_covid.to_pickle('/Users/lclark/data_bootcamp/data-science-final-project/data/df_original_tweets_no_covid.pkl')

#raw.to_pickle('/Users/lclark/data_bootcamp/data-science-final-project/data/df_filtered_tweets_384k.pkl')