# Exploring the Complete Twitter Dataset

* The purpose of this notebook is to explore the full dataset of 400k tweets relating to #bcpoli
* Tweet created dates range from August 14, 2020 to November 19, 2020
* Columns not required for analysis will be dropped here.
 * The remaining data will be exported for preprocessing in "classify_unlabelled_tweets.ipynb"

In [1]:
import os
import sys

# Make the project's helper scripts importable.
# sys.path entries are used verbatim by the import system and "~" is NOT
# expanded, so the home directory has to be resolved explicitly.
sys.path.insert(0, os.path.expanduser('~/data_bootcamp/data-science-final-project/scripts/'))

import pandas as pd
import numpy as np
import pdpipe as pdp

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from IPython.display import JSON
import matplotlib.pyplot as plt
import pickle

# Pandas Display Settings, if you wish

#pd.set_option('display.max_colwidth', None)
#pd.set_option("display.max_columns", 30)

# Import custom functions 
from functions import *

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lclark/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## ~398K Tweets from August 14th, 2020 - November 19th, 2020

In [2]:
%%time

# Load the tweet export; lines=True tells pandas the file holds one JSON object per line.
# NOTE(review): the path is an absolute location on an external volume - adjust before re-running.
df = pd.read_json('/Volumes/My Passport/Tweets/bcpoli_400k_extended.jsonl', lines=True)

CPU times: user 2min 19s, sys: 3min 14s, total: 5min 34s
Wall time: 7min 25s


In [3]:
df.info(memory_usage='deep')
# It appears that over ten thousand tweets have been deleted since August

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384221 entries, 0 to 384220
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype              
---  ------                     --------------   -----              
 0   created_at                 384221 non-null  datetime64[ns, UTC]
 1   id                         384221 non-null  int64              
 2   id_str                     384221 non-null  int64              
 3   full_text                  384221 non-null  object             
 4   truncated                  384221 non-null  bool               
 5   display_text_range         384221 non-null  object             
 6   entities                   384221 non-null  object             
 7   source                     384221 non-null  object             
 8   in_reply_to_status_id      26530 non-null   float64            
 9   in_reply_to_status_id_str  26530 non-null   float64            
 10  in_reply_to_user_id        29115 non-null   float64     

In [4]:
%%time

# Make copy of imported data and set index to unique tweet ID
raw = df.copy()
raw = raw[~raw.index.duplicated(keep='first')]
# Filter out columns
raw = col_filter(raw)
# Extract features from user column dict with .get
raw = extract_username(raw)
# Create is_retweet column
raw['is_retweet'] = raw['full_text'].apply(is_retweet) # This was originally for pdpipe and could be rewrittten
# Create new col "rt_full_text" from dict column "retweet_status"
raw = extract_full_text(raw)
# Repalce truncated retweet full_text
raw = replace_retweet_text(raw)

ValueError: cannot reindex from a duplicate axis

## Creating a Data Processing Pipeline

In [27]:
%%time

# Pandas Processing Pipeline

pipeline = pdp.ColDrop('user')
pipeline+= pdp.ApplyByCols('full_text', lower_case, 'full_lower', drop=False)
pipeline+= pdp.ApplyByCols('full_lower', covid_mention, 'covid_mention', drop=True)
pipeline+= pdp.ApplyByCols('full_text', preprocess, 'full_clean', drop=False)
pipeline+= pdp.ApplyByCols('full_text', (lambda x: preprocess(x, hashtags=True)), 'no_hashtags', drop=False) 
pipeline+= pdp.ApplyByCols('full_text', vader_preprocess, 'vader_text', drop=False)
pipeline+= pdp.ColDrop('retweeted_status') 
pipeline+= pdp.ColDrop('rt_full_text')
raw = pipeline(raw)
raw.sample(n=5)

CPU times: user 276 ms, sys: 30 ms, total: 306 ms
Wall time: 307 ms


Unnamed: 0_level_0,created_at,full_text,vader_text,no_hashtags,full_clean,covid_mention,retweet_count,user_name,is_retweet
id_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1306334233523179520,2020-09-16 20:48:46+00:00,In the summer Boot destroyed all of the bandan...,In the summer Boot destroyed all of the bandan...,"[summer, boot, destroyed, bandannas, confedera...","[summer, boot, destroyed, bandannas, confedera...",0,8,7adamandrews,1
1327113858251571200,2020-11-13 04:59:35+00:00,All that money they're blowing on the #SiteC d...,All that money they're blowing on the #SiteC d...,"[money, theyre, blowing, dam, yeah, gon, na, pay]","[money, theyre, blowing, sitec, dam, yeah, gon...",0,54,koenigcomm,1
1318789405692239872,2020-10-21 05:41:10+00:00,"Douglas Todd: Finally, the party's over for no...","Douglas Todd: Finally, the party's over for no...","[douglas, todd, finally, partys, nolimit, poli...","[douglas, todd, finally, partys, nolimit, poli...",0,4,suestroud,1
1318947440611844096,2020-10-21 16:09:09+00:00,In BC its #frackingLNG #deforestation #sitec i...,In BC its #frackingLNG #deforestation #sitec i...,"[bc, industries, pls, sake, amp, vote, riding]","[bc, frackinglng, deforestation, sitec, indust...",0,1,cindian1,0
1309232401504063488,2020-09-24 20:45:03+00:00,I am so excited and so honoured to be the @bcn...,I am so excited and so honoured to be the @bcn...,"[excited, honoured, candidate, victoriabeacon,...","[excited, honoured, candidate, victoriabeacon,...",0,44,briancampbellC1,1


In [64]:
# Number of distinct values in the user_name column
raw.user_name.nunique()

41935

## Create new DataFrames for separate analysis

In [65]:
# Using the updated DataFrame of tweets.
# df_filtered_tweets_master has been processed identically to the steps above
# df_filtered_tweets_master will always be the most current DataFrame
# Reproducibility is still possible with /data/tweet_ids.txt. It is updated with the tweet_ids from df_filtered_tweets_master
# NOTE: from here on `raw` is replaced by the pickled master DataFrame.
raw = pd.read_pickle('~/data_bootcamp/data-science-final-project/data/df_filtered_tweets_master.pkl')

In [66]:
# Add a `lemma` column by running lemmatize_text over each row of `no_hashtags`.
# Lemmatizing first should drastically improve the quality and variance of the ngrams.

raw['lemma'] = raw['no_hashtags'].apply(lemmatize_text)

In [86]:
# Preview the first rows, including the new lemma column
raw.head()

Unnamed: 0_level_0,created_at,full_text,vader_text,no_hashtags,full_clean,covid_mention,retweet_count,user_name,is_retweet,lemma
id_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1294232573636304896,2020-08-14 11:21:05+00:00,Dr. Bonnie Henry is one of the greatest leader...,Dr. Bonnie Henry is one of the greatest leader...,"[dr, bonnie, henry, one, greatest, leaders, ti...","[dr, bonnie, henry, one, greatest, leaders, ti...",1,0,bcpoli,0,"[dr, bonnie, henry, one, greatest, leader, tim..."
1294233211262783488,2020-08-14 11:23:37+00:00,"""the child is already feeling better... But in...","""the child is already feeling better... But in...","[child, already, feeling, better, family, pare...","[child, already, feeling, better, family, pare...",0,0,nspector4,0,"[child, already, feeling, better, family, pare..."
1294233585491144704,2020-08-14 11:25:07+00:00,@RickAnderson Better BC news... #bcpoli #cdnpo...,@RickAnderson Better BC news... #bcpoli #cdnpoli,"[better, bc, news]","[better, bc, news, bcpoli, cdnpoli]",0,2,nspector4,0,"[better, bc, news]"
1294235457040142336,2020-08-14 11:32:33+00:00,"#Fortnite developer #EpicGames sues #Apple, #G...","#Fortnite developer #EpicGames sues #Apple, #G...","[developer, sues, removal, video, game, app, s...","[fortnite, developer, epicgames, sues, apple, ...",0,0,Tammy_Richard,0,"[developer, sue, removal, video, game, app, st..."
1294237971592261632,2020-08-14 11:42:32+00:00,Sweden never shut down and it is doing better ...,Sweden never shut down and it is doing better ...,"[sweden, never, shut, better, us, canadas, dam...","[sweden, never, shut, better, us, canadas, dam...",0,0,bcpoli,0,"[sweden, never, shut, better, u, canada, damag..."


In [67]:
# Original tweets only: keep the rows whose is_retweet flag is 0

df_no_rt = raw.loc[raw.is_retweet.eq(0)]
df_no_rt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112734 entries, 1294232573636304896 to 1333143355165913088
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   created_at     112734 non-null  datetime64[ns, UTC]
 1   full_text      112734 non-null  object             
 2   vader_text     112734 non-null  object             
 3   no_hashtags    112734 non-null  object             
 4   full_clean     112734 non-null  object             
 5   covid_mention  112734 non-null  int64              
 6   retweet_count  112734 non-null  int64              
 7   user_name      112734 non-null  object             
 8   is_retweet     112734 non-null  int64              
 9   lemma          112734 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(6)
memory usage: 9.5+ MB


In [68]:
# Original tweets that do NOT mention covid (covid_mention flag is 0).
# Used to gauge covid's impact on sentiment.

df_no_rt_no_covid = df_no_rt.loc[df_no_rt.covid_mention.eq(0)]
df_no_rt_no_covid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 79311 entries, 1294233211262783488 to 1333143355165913088
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   created_at     79311 non-null  datetime64[ns, UTC]
 1   full_text      79311 non-null  object             
 2   vader_text     79311 non-null  object             
 3   no_hashtags    79311 non-null  object             
 4   full_clean     79311 non-null  object             
 5   covid_mention  79311 non-null  int64              
 6   retweet_count  79311 non-null  int64              
 7   user_name      79311 non-null  object             
 8   is_retweet     79311 non-null  int64              
 9   lemma          79311 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(6)
memory usage: 6.7+ MB


In [69]:
# Original tweets that DO mention covid (covid_mention flag is 1).
# Used to gauge covid's impact on sentiment.

df_no_rt_covid_mention = df_no_rt.loc[df_no_rt.covid_mention.eq(1)]
df_no_rt_covid_mention.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33423 entries, 1294232573636304896 to 1333146115932209152
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   created_at     33423 non-null  datetime64[ns, UTC]
 1   full_text      33423 non-null  object             
 2   vader_text     33423 non-null  object             
 3   no_hashtags    33423 non-null  object             
 4   full_clean     33423 non-null  object             
 5   covid_mention  33423 non-null  int64              
 6   retweet_count  33423 non-null  int64              
 7   user_name      33423 non-null  object             
 8   is_retweet     33423 non-null  int64              
 9   lemma          33423 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(6)
memory usage: 2.8+ MB


## Examining some metrics

In [84]:
# Total number of retweets among all tweets in `raw` (sum of the is_retweet flag)

raw.is_retweet.sum()

307900

In [85]:
# Estimated total covid/pandemic mentions across all tweets in `raw`, retweets included

raw.covid_mention.sum()

128746

In [75]:
# Row count, columns and dtypes of the full DataFrame
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 420634 entries, 1294232573636304896 to 1333143090723319808
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   created_at     420634 non-null  datetime64[ns, UTC]
 1   full_text      420634 non-null  object             
 2   vader_text     420634 non-null  object             
 3   no_hashtags    420634 non-null  object             
 4   full_clean     420634 non-null  object             
 5   covid_mention  420634 non-null  int64              
 6   retweet_count  420634 non-null  int64              
 7   user_name      420634 non-null  object             
 8   is_retweet     420634 non-null  int64              
 9   lemma          420634 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(6)
memory usage: 35.3+ MB


In [76]:
# Estimated share of original (non-retweet) tweets that mention covid/the pandemic

no_rt_count = len(df_no_rt)
no_rt_covid_count = df_no_rt['covid_mention'].sum()
mention_ratio_no_rt = (no_rt_covid_count / no_rt_count) * 100

print(f'Estimated percentage of tweets related to #bcpoli that mention covid or the pandemic in some way: {mention_ratio_no_rt:0.2f} %')
print(f'Total of {no_rt_count} original tweets related to #bcpoli that mention covid or the pandemic in some way: {no_rt_covid_count}')

Estimated percentage of tweets related to #bcpoli that mention covid or the pandemic in some way: 29.65 %
Total of 112734 original tweets related to #bcpoli that mention covid or the pandemic in some way: 33423


## Bigrams, Trigrams and Topics

A new column of lemmatized words should be created before extracting ngrams; otherwise the ngram counts are diluted across plural and non-plural forms of the same word.

In [77]:
# Most frequent bigrams, hashtags removed, stop words removed - Includes original tweets and retweets
# This will be more interesting with data grouped by week

top_ngrams(raw, n=2, ngrams=20)

(bc, liberal)             22869
(bonnie, henry)           11302
(dr, bonnie)              10327
(john, horgan)            10315
(new, case)                9851
(british, columbia)        8730
(british, columbians)      8511
(andrew, wilkinson)        7733
(dr, henry)                7407
(bc, ndp)                  7385
(bc, green)                5034
(henry, say)               4342
(public, health)           4088
(site, c)                  4043
(bc, election)             3826
(fraser, health)           3812
(snap, election)           3617
(health, care)             3600
(wear, mask)               3211
(physical, distancing)     3163
dtype: int64

In [78]:
# Top 20 bigrams from original tweets only (retweets excluded)

top_ngrams(df_no_rt, n=2, ngrams=20)

(bc, liberal)             4093
(bonnie, henry)           2229
(dr, bonnie)              2030
(new, case)               1744
(dr, henry)               1640
(john, horgan)            1630
(bc, ndp)                 1545
(british, columbia)       1346
(british, columbians)     1293
(andrew, wilkinson)       1183
(bc, green)                964
(bc, election)             964
(public, health)           927
(snap, election)           915
(active, case)             869
(henry, say)               845
(fraser, health)           742
(health, care)             675
(provincial, election)     661
(look, like)               650
dtype: int64

In [79]:
# Top bigrams from original tweets only, without covid mentioned
# This is a good example of when stemming is beneficial - See pluralized words below

top_ngrams(df_no_rt_no_covid, n=2, ngrams=20)

(bc, liberal)             3796
(bc, ndp)                 1366
(john, horgan)            1252
(andrew, wilkinson)       1079
(british, columbians)      945
(british, columbia)        890
(bc, green)                881
(bc, election)             819
(old, growth)              624
(provincial, election)     577
(snap, election)           556
(green, party)             553
(election, bc)             493
(look, like)               492
(site, c)                  472
(mental, health)           443
(climate, change)          428
(growth, forest)           427
(sign, petition)           405
(health, care)             396
dtype: int64

In [80]:
# Most frequent trigrams, hashtags removed, stop words removed - Includes original tweets and retweets
# This will be more interesting with data grouped by week

top_ngrams(raw, n=3, ngrams=20)

(dr, bonnie, henry)              10066
(dr, henry, say)                  2484
(premier, john, horgan)           1996
(bc, liberal, candidate)          1590
(new, case, covid19)              1367
(bc, liberal, government)         1353
(bc, liberal, party)              1333
(old, growth, forest)             1324
(bonnie, henry, say)              1146
(new, covid19, case)              1130
(breaking, dr, bonnie)            1102
(leader, john, horgan)            1068
(british, columbia, death)        1031
(ndp, leader, john)               1023
(death, far, year)                1016
(288, overdoses, 1202)            1012
(columbia, death, far)            1012
(coronavirus, 288, overdoses)     1012
(year, coronavirus, 288)          1012
(far, year, coronavirus)          1012
dtype: int64

In [81]:
# Top 20 trigrams from original tweets only (retweets excluded)

top_ngrams(df_no_rt, n=3, ngrams=20)

(dr, bonnie, henry)          1949
(dr, henry, say)              456
(old, growth, forest)         430
(one, best, thing)            339
(protecting, old, growth)     338
(sign, petition, protect)     335
(last, giant, tree)           334
(petition, protect, last)     333
(protect, last, giant)        333
(thing, mitigate, impact)     333
(best, thing, mitigate)       333
(giant, tree, logging)        333
(mitigate, impact, sign)      332
(impact, sign, petition)      332
(forest, one, best)           332
(growth, forest, one)         328
(bc, liberal, party)          314
(going, back, school)         264
(bc, green, party)            257
(back, school, september)     247
dtype: int64

In [82]:
# Top trigrams from original tweets only, without covid mentioned
# This is a good example of when stemming is beneficial - See pluralized words below
# Also a great example of why trigrams are useful - (old, growth)  (growth, forests)

top_ngrams(df_no_rt_no_covid, n=3, ngrams=20)

(old, growth, forest)          426
(protecting, old, growth)      338
(one, best, thing)             337
(sign, petition, protect)      335
(last, giant, tree)            334
(protect, last, giant)         333
(thing, mitigate, impact)      333
(petition, protect, last)      333
(best, thing, mitigate)        333
(giant, tree, logging)         333
(forest, one, best)            332
(impact, sign, petition)       332
(mitigate, impact, sign)       332
(growth, forest, one)          328
(bc, liberal, party)           299
(bc, green, party)             250
(bc, liberal, candidate)       229
(bc, liberal, leader)          209
(leader, andrew, wilkinson)    188
(premier, john, horgan)        171
dtype: int64

In [88]:
# Pickle DataFrames for later use

#df_no_rt.to_pickle('~/data_bootcamp/data-science-final-project/data/df_original_tweets.pkl')

#df_no_rt_covid_mention.to_pickle('~/data_bootcamp/data-science-final-project/data/df_original_tweets_covid_mention.pkl')

#df_no_rt_no_covid.to_pickle('~/data_bootcamp/data-science-final-project/data/df_original_tweets_no_covid.pkl')