# Exploring the Complete Twitter Dataset

* The purpose of this notebook is to explore the full dataset of 400k tweets relating to #bcpoli
* Tweet created dates range from August 14, 2020 to November 19, 2020
* Columns not required for analysis will be dropped here.
 * The remaining data will be exported for preprocessing in "classify_unlabelled_tweets.ipynb"

In [2]:
import sys  
sys.path.insert(0, '/Users/lclark/data_bootcamp/data-science-final-project/scripts/')

import pandas as pd
import numpy as np
import pdpipe as pdp

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from IPython.display import JSON
import matplotlib.pyplot as plt
import pickle

# Pandas Display Settings, if you wish

#pd.set_option('display.max_colwidth', None)
#pd.set_option("display.max_columns", 30)

# Import custom functions 
from functions import *

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lclark/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## ~398K Tweets from August 14th, 2020 - November 19th, 2020

In [2]:
%%time

df = pd.read_json('/Volumes/My Passport/Tweets/bcpoli_400k_extended.jsonl', lines=True)

CPU times: user 2min 19s, sys: 3min 14s, total: 5min 34s
Wall time: 7min 25s


In [3]:
df.info(memory_usage='deep')
# It appears that over ten thousand. tweets have been deleted since August

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384221 entries, 0 to 384220
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype              
---  ------                     --------------   -----              
 0   created_at                 384221 non-null  datetime64[ns, UTC]
 1   id                         384221 non-null  int64              
 2   id_str                     384221 non-null  int64              
 3   full_text                  384221 non-null  object             
 4   truncated                  384221 non-null  bool               
 5   display_text_range         384221 non-null  object             
 6   entities                   384221 non-null  object             
 7   source                     384221 non-null  object             
 8   in_reply_to_status_id      26530 non-null   float64            
 9   in_reply_to_status_id_str  26530 non-null   float64            
 10  in_reply_to_user_id        29115 non-null   float64     

In [4]:
%%time

# Make copy of imported data and set index to unique tweet ID
raw = df.copy()
raw = raw[~raw.index.duplicated(keep='first')]
# Filter out columns
raw = col_filter(raw)
# Extract features from user column dict with .get
raw = extract_username(raw)
# Create is_retweet column
raw['is_retweet'] = raw['full_text'].apply(is_retweet) # This was originally for pdpipe and could be rewrittten
# Create new col "rt_full_text" from dict column "retweet_status"
raw = extract_full_text(raw)
# Repalce truncated retweet full_text
raw = replace_retweet_text(raw)

ValueError: cannot reindex from a duplicate axis

## Creating a Data Processing Pipeline

In [27]:
%%time

# Pandas Processing Pipeline

pipeline = pdp.ColDrop('user')
pipeline+= pdp.ApplyByCols('full_text', lower_case, 'full_lower', drop=False)
pipeline+= pdp.ApplyByCols('full_lower', covid_mention, 'covid_mention', drop=True)
pipeline+= pdp.ApplyByCols('full_text', preprocess, 'full_clean', drop=False)
pipeline+= pdp.ApplyByCols('full_text', (lambda x: preprocess(x, hashtags=True)), 'no_hashtags', drop=False) 
pipeline+= pdp.ApplyByCols('full_text', vader_preprocess, 'vader_text', drop=False)
pipeline+= pdp.ColDrop('retweeted_status') 
pipeline+= pdp.ColDrop('rt_full_text')
raw = pipeline(raw)
raw.sample(n=5)

CPU times: user 276 ms, sys: 30 ms, total: 306 ms
Wall time: 307 ms


Unnamed: 0_level_0,created_at,full_text,vader_text,no_hashtags,full_clean,covid_mention,retweet_count,user_name,is_retweet
id_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1306334233523179520,2020-09-16 20:48:46+00:00,In the summer Boot destroyed all of the bandan...,In the summer Boot destroyed all of the bandan...,"[summer, boot, destroyed, bandannas, confedera...","[summer, boot, destroyed, bandannas, confedera...",0,8,7adamandrews,1
1327113858251571200,2020-11-13 04:59:35+00:00,All that money they're blowing on the #SiteC d...,All that money they're blowing on the #SiteC d...,"[money, theyre, blowing, dam, yeah, gon, na, pay]","[money, theyre, blowing, sitec, dam, yeah, gon...",0,54,koenigcomm,1
1318789405692239872,2020-10-21 05:41:10+00:00,"Douglas Todd: Finally, the party's over for no...","Douglas Todd: Finally, the party's over for no...","[douglas, todd, finally, partys, nolimit, poli...","[douglas, todd, finally, partys, nolimit, poli...",0,4,suestroud,1
1318947440611844096,2020-10-21 16:09:09+00:00,In BC its #frackingLNG #deforestation #sitec i...,In BC its #frackingLNG #deforestation #sitec i...,"[bc, industries, pls, sake, amp, vote, riding]","[bc, frackinglng, deforestation, sitec, indust...",0,1,cindian1,0
1309232401504063488,2020-09-24 20:45:03+00:00,I am so excited and so honoured to be the @bcn...,I am so excited and so honoured to be the @bcn...,"[excited, honoured, candidate, victoriabeacon,...","[excited, honoured, candidate, victoriabeacon,...",0,44,briancampbellC1,1


In [7]:
%%time

# Pandas Processing Pipeline

pipeline = pdp.ColDrop('user')
pipeline+= pdp.ApplyByCols('full_text', lower_case, 'full_lower', drop=False)
pipeline+= pdp.ApplyByCols('full_lower', covid_mention, 'covid_mention', drop=True)
pipeline+= pdp.ApplyByCols('full_text', is_retweet, 'is_retweet', drop=False)
pipeline+= pdp.ApplyByCols('full_text', preprocess, 'full_clean', drop=False)
pipeline+= pdp.ApplyByCols('full_text', (lambda x: preprocess(x, hashtags=True)), 'no_hashtags', drop=False) 
raw = pipeline(raw)

CPU times: user 2.04 s, sys: 98.2 ms, total: 2.14 s
Wall time: 2.14 s


## Create new DataFrames separate analysis

In [4]:
# Using the updated DataFrame of tweet.
# df_filtered_tweets_master has been processed identically as above
# df_filtered_tweets_master will always be the most current DataFrame
# Reproduciibility still possible with /data/tweet_ids.txt. It is updated with the tweet_ids from df_filtered_tweets_master
raw = pd.read_pickle('~/data_bootcamp/data-science-final-project/data/df_filtered_tweets_master.pkl')

In [5]:
# Create new DataFrame of only original tweets

df_no_rt = raw[raw['is_retweet'] == 0]
df_no_rt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106077 entries, 1294232573636304896 to 1329212767929200640
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   created_at     106077 non-null  datetime64[ns, UTC]
 1   full_text      106077 non-null  object             
 2   vader_text     106077 non-null  object             
 3   no_hashtags    106077 non-null  object             
 4   full_clean     106077 non-null  object             
 5   covid_mention  106077 non-null  int64              
 6   retweet_count  106077 non-null  int64              
 7   user_name      106077 non-null  object             
 8   is_retweet     106077 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(3), object(5)
memory usage: 8.1+ MB


In [6]:
# Create a new DatFrame with only original non-covid tweets
# This will be used to guage the covids impact on sentiment

df_no_rt_no_covid = df_no_rt[df_no_rt['covid_mention'] == 0]
df_no_rt_no_covid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75462 entries, 1294233211262783488 to 1329212785058836480
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   created_at     75462 non-null  datetime64[ns, UTC]
 1   full_text      75462 non-null  object             
 2   vader_text     75462 non-null  object             
 3   no_hashtags    75462 non-null  object             
 4   full_clean     75462 non-null  object             
 5   covid_mention  75462 non-null  int64              
 6   retweet_count  75462 non-null  int64              
 7   user_name      75462 non-null  object             
 8   is_retweet     75462 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(3), object(5)
memory usage: 5.8+ MB


In [7]:
# Create a new DatFrame with only original covid mentioning tweets
# This will be used to guage the covids impact on sentiment

df_no_rt_covid_mention = df_no_rt[df_no_rt['covid_mention'] == 1]
df_no_rt_covid_mention.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30615 entries, 1294232573636304896 to 1329212767929200640
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   created_at     30615 non-null  datetime64[ns, UTC]
 1   full_text      30615 non-null  object             
 2   vader_text     30615 non-null  object             
 3   no_hashtags    30615 non-null  object             
 4   full_clean     30615 non-null  object             
 5   covid_mention  30615 non-null  int64              
 6   retweet_count  30615 non-null  int64              
 7   user_name      30615 non-null  object             
 8   is_retweet     30615 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(3), object(5)
memory usage: 2.3+ MB


## Examining some metrics

In [8]:
# Total retweet count of all 384221 tweets

raw.is_retweet.sum()

293514

In [9]:
# Estimated total covid/pandemic mentions

raw.covid_mention.sum()

118140

In [10]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 399591 entries, 1294232573636304896 to 1329212767929200640
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   created_at     399591 non-null  datetime64[ns, UTC]
 1   full_text      399591 non-null  object             
 2   vader_text     399591 non-null  object             
 3   no_hashtags    399591 non-null  object             
 4   full_clean     399591 non-null  object             
 5   covid_mention  399591 non-null  int64              
 6   retweet_count  399591 non-null  int64              
 7   user_name      399591 non-null  object             
 8   is_retweet     399591 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(3), object(5)
memory usage: 30.5+ MB


In [10]:
no_rt_count = df_no_rt.shape[0]

In [11]:
# Estimated total covid/pandemic mentions in 101569 original tweets

no_rt_count = df_no_rt.shape[0]
no_rt_covid_count = df_no_rt.covid_mention.sum()
mention_ratio_no_rt = (no_rt_covid_count/no_rt_count) * 100

print('Estimated percentage of tweets related to #bcpoli that mention covid or the pandemic in some way:', '%0.2f'% mention_ratio_no_rt,'%')
print('Total of 101569 original tweets related to #bcpoli that mention covid or the pandemic in some way:', no_rt_covid_count)

Estimated percentage of tweets related to #bcpoli that mention covid or the pandemic in some way: 28.86 %
Total of 101569 original tweets related to #bcpoli that mention covid or the pandemic in some way: 30615


In [65]:
# Most frequent bigrams, hastags removed, stop words removed - Includes original tweets and retweets
# This will be more interesting with data grouped by week

top_ngrams(raw, n=2, ngrams=20)

(bc, liberals)            13170
(john, horgan)            10070
(bc, liberal)              9567
(bonnie, henry)            9481
(dr, bonnie)               8994
(new, cases)               8608
(british, columbians)      8291
(british, columbia)        7433
(bc, ndp)                  7159
(andrew, wilkinson)        6915
(dr, henry)                6103
(site, c)                  3798
(public, health)           3787
(bc, election)             3667
(henry, says)              3598
(health, care)             3498
(fraser, health)           3452
(snap, election)           3407
(physical, distancing)     3125
(old, growth)              3025
dtype: int64

In [66]:
# Top bigrams from original tweets only

top_ngrams(df_no_rt, n=2, ngrams=20)

(bc, liberals)            2481
(bonnie, henry)           1888
(dr, bonnie)              1860
(bc, liberal)             1569
(john, horgan)            1568
(new, cases)              1546
(bc, ndp)                 1507
(dr, henry)               1413
(british, columbians)     1230
(british, columbia)       1127
(andrew, wilkinson)       1058
(bc, election)             917
(snap, election)           871
(public, health)           851
(active, cases)            801
(henry, says)              767
(fraser, health)           659
(health, care)             654
(provincial, election)     627
(old, growth)              615
dtype: int64

In [67]:
# Top bigrams from original tweets only, without covid mentioned
# This is a good example of when stemming is benficial - See pluralized words below

top_ngrams(df_no_rt_no_covid, n=2, ngrams=20)

(bc, liberals)            2290
(bc, liberal)             1466
(bc, ndp)                 1339
(john, horgan)            1204
(andrew, wilkinson)        963
(british, columbians)      902
(bc, election)             784
(british, columbia)        751
(old, growth)              603
(provincial, election)     547
(snap, election)           526
(green, party)             523
(bc, greens)               443
(bc, green)                436
(site, c)                  426
(climate, change)          409
(mental, health)           402
(sign, petition)           399
(growth, forests)          389
(health, care)             389
dtype: int64

In [14]:
# Most frequent trigrams, hastags removed, stop words removed - Includes original tweets and retweets
# This will be more interesting with data grouped by week

top_ngrams(raw, n=3, ngrams=20)

(dr, bonnie, henry)              8433
(dr, henry, says)                2288
(premier, john, horgan)          1895
(bc, liberal, government)        1292
(bc, liberal, party)             1244
(bc, liberal, candidate)         1237
(new, cases, covid19)            1199
(old, growth, forests)           1122
(breaking, dr, bonnie)           1102
(leader, john, horgan)           1067
(ndp, leader, john)              1022
(deaths, far, year)              1015
(british, columbia, deaths)      1013
(coronavirus, 288, overdoses)    1011
(columbia, deaths, far)          1011
(288, overdoses, 1202)           1011
(year, coronavirus, 288)         1011
(far, year, coronavirus)         1011
(new, covid19, cases)            1010
(site, c, dam)                    933
dtype: int64

In [15]:
# Top trigrams from original tweets only

top_ngrams(df_no_rt, n=3, ngrams=20)

(dr, bonnie, henry)            1654
(dr, henry, says)               420
(old, growth, forests)          393
(one, best, things)             338
(protecting, old, growth)       337
(sign, petition, protect)       334
(last, giant, trees)            333
(giant, trees, logging)         332
(protect, last, giant)          332
(things, mitigate, impacts)     332
(petition, protect, last)       332
(best, things, mitigate)        332
(impacts, sign, petition)       331
(mitigate, impacts, sign)       331
(forests, one, best)            331
(growth, forests, one)          327
(bc, liberal, party)            284
(going, back, school)           263
(back, school, september)       247
(everyone, going, back)         246
dtype: int64

In [16]:
# Top trigrams from original tweets only, without covid mentioned
# This is a good example of when stemming is benficial - See pluralized words below
# Also a great example of why trigrams are useful - (old, growth)  (growth, forests)

top_ngrams(df_no_rt_no_covid, n=3, ngrams=20)

(old, growth, forests)         389
(protecting, old, growth)      337
(one, best, things)            336
(sign, petition, protect)      334
(last, giant, trees)           333
(petition, protect, last)      332
(giant, trees, logging)        332
(protect, last, giant)         332
(best, things, mitigate)       332
(things, mitigate, impacts)    332
(forests, one, best)           331
(impacts, sign, petition)      331
(mitigate, impacts, sign)      331
(growth, forests, one)         327
(bc, liberal, party)           271
(bc, green, party)             238
(bc, liberal, leader)          192
(leader, andrew, wilkinson)    179
(bc, liberal, candidate)       172
(premier, john, horgan)        152
dtype: int64

In [68]:
# Pickle DataFrames for later use

#df_no_rt.to_pickle('/Users/lclark/data_bootcamp/data-science-final-project/data/df_original_tweets.pkl')

#df_no_rt_covid_mention.to_pickle('/Users/lclark/data_bootcamp/data-science-final-project/data/df_original_tweets_covid_mention.pkl')

#df_no_rt_no_covid.to_pickle('/Users/lclark/data_bootcamp/data-science-final-project/data/df_original_tweets_no_covid.pkl')