# Exploring the Complete Twitter Dataset

* The purpose of this notebook is to explore the full dataset of 400k tweets relating to #bcpoli
* Tweet creation dates range from August 14, 2020 to November 19, 2020
* Columns not required for analysis will be dropped here.
 * The remaining data will be exported for preprocessing in "classify_unlabelled_tweets.ipynb"

In [1]:
import sys  
sys.path.insert(0, 'PATH/TO/data_bootcamp/data-science-final-project/scripts/')

import pandas as pd
import numpy as np
import pdpipe as pdp

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from IPython.display import JSON
import matplotlib.pyplot as plt
import pickle

# Pandas display options (optional)

# Show the full contents of text columns instead of truncating them
pd.options.display.max_colwidth = None
#pd.options.display.max_columns = 30

# Import custom functions 
from functions import *

## ~384K Tweets (384,221 rows loaded) from August 14th, 2020 - November 19th, 2020

In [2]:
%%time

# Read the line-delimited JSON export (lines=True: one JSON record per line)
# into a single DataFrame. This is the slow step of the notebook.
df = pd.read_json(
    path_or_buf='/Volumes/My Passport/Tweets/bcpoli_400k_extended.jsonl',
    lines=True,
)

CPU times: user 2min 11s, sys: 2min 48s, total: 5min
Wall time: 6min 37s


In [5]:
# Column overview of the freshly loaded frame; memory_usage='deep' makes pandas
# inspect object columns so the reported memory figure reflects their real size.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384221 entries, 0 to 384220
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype              
---  ------                     --------------   -----              
 0   created_at                 384221 non-null  datetime64[ns, UTC]
 1   id                         384221 non-null  int64              
 2   id_str                     384221 non-null  int64              
 3   full_text                  384221 non-null  object             
 4   truncated                  384221 non-null  bool               
 5   display_text_range         384221 non-null  object             
 6   entities                   384221 non-null  object             
 7   source                     384221 non-null  object             
 8   in_reply_to_status_id      26530 non-null   float64            
 9   in_reply_to_status_id_str  26530 non-null   float64            
 10  in_reply_to_user_id        29115 non-null   float64     

In [3]:
%%time
# Build the working frame `raw`, indexed by the tweet ID column (`id_str`),
# keeping only the columns needed for the analysis.
#
# The needed columns are selected first, so only those five columns are ever
# copied (rather than the whole imported frame), and every step returns a new
# frame instead of mutating one in place. `df` is left untouched, so this cell
# can be re-run at any time and always produces the same `raw`.

raw = (
    df[['id_str', 'created_at', 'user', 'full_text', 'retweet_count']]
    .set_index('id_str')
    .assign(
        # Extract the screen name from the per-tweet `user` dict with .get
        user_name=lambda frame: frame['user'].apply(lambda x: x.get('screen_name')),
        # Remove case sensitivity
        full_text=lambda frame: frame['full_text'].str.lower(),
    )
    # The `user` dict is no longer needed once the screen name is extracted
    .drop(columns='user')
)

CPU times: user 1.51 s, sys: 4.04 s, total: 5.55 s
Wall time: 9.42 s


## Create a Data Cleaning Pipeline

In [68]:
%%time

# pdpipe processing pipeline; the cell is timed so it can be compared with plain .apply.
#
# Each entry is (new column, function). Every function is applied to the
# `full_text` column, which is kept alongside the result (drop=False).

text_features = [
    ('covid_mention', covid_mention),
    ('is_retweet', is_retweet),
    ('full_clean', preprocess),
    ('no_hashtags', (lambda x: preprocess(x, hashtags=True))),
]

pipeline = pdp.PdPipeline([
    pdp.ApplyByCols('full_text', func, new_col, drop=False)
    for new_col, func in text_features
])
raw = pipeline(raw)

CPU times: user 1.51 s, sys: 1.29 s, total: 2.8 s
Wall time: 3.61 s


In [89]:
# Column overview after the cleaning pipeline; memory_usage='deep' makes pandas
# inspect object columns so the reported memory figure reflects their real size.
raw.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 384221 entries, 1296949558409416704 to 1328850840669495296
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   created_at     384221 non-null  datetime64[ns, UTC]
 1   full_text      384221 non-null  object             
 2   covid_mention  384221 non-null  int64              
 3   no_hashtags    384221 non-null  object             
 4   full_clean     384221 non-null  object             
 5   is_retweet     384221 non-null  int64              
 6   retweet_count  384221 non-null  int64              
 7   user_name      384221 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 298.0 MB


In [60]:
# 20 most frequent bigrams across all tweets in `raw` (retweets included).
# Per the original note: hashtags removed, stop words removed (done inside the imported helper).
top_bigrams(raw, 20)

(bc, liberals)           11017
(john, horgan)            8474
(bc, liberal)             7886
(bonnie, henry)           7668
(dr, bonnie)              7358
(new, cases)              6807
(bc, ndp)                 6088
(andrew, wilkinson)       5667
(british, columbians)     5046
(dr, henry)               4971
(british, columbia)       4951
(bc, election)            3401
(henry, says)             3088
(snap, election)          2597
(public, health)          2476
(health, care)            2363
(site, c)                 2316
(fraser, health)          2273
(old, growth)             2070
(bc, greens)              2042
dtype: int64

In [73]:
# Number of retweets among the 384221 tweets (sum of the is_retweet flag column)
raw['is_retweet'].sum()

282652

In [74]:
# Estimated number of covid/pandemic mentions among the 384221 tweets
# (sum of the covid_mention flag column)
raw['covid_mention'].sum()

84035

In [70]:
# Keep only original tweets: rows whose is_retweet flag is 0
df_no_rt = raw.loc[raw['is_retweet'].eq(0)]

In [71]:
# Column overview of the original-tweets-only frame (retweets filtered out)
df_no_rt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101569 entries, 1296949926543335424 to 1328850997704237056
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   created_at     101569 non-null  datetime64[ns, UTC]
 1   full_text      101569 non-null  object             
 2   covid_mention  101569 non-null  int64              
 3   no_hashtags    101569 non-null  object             
 4   full_clean     101569 non-null  object             
 5   is_retweet     101569 non-null  int64              
 6   retweet_count  101569 non-null  int64              
 7   user_name      101569 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 7.0+ MB


In [83]:
# Estimated share of original (non-retweet) tweets that mention covid or the pandemic.
# The number of original tweets is taken from the frame itself rather than typed
# in by hand, so the ratio and the printed message stay correct if the data or
# the filters that produce `df_no_rt` change.

n_original = len(df_no_rt)
covid_no_rt = df_no_rt.covid_mention.sum()
mention_ratio_no_rt = (covid_no_rt / n_original) * 100

print('Estimated percentage of tweets related to #bcpoli that mention covid or the pandemic in some way:', '%0.2f'% mention_ratio_no_rt,'%')
print(f'Total of {n_original} original tweets related to #bcpoli that mention covid or the pandemic in some way:', covid_no_rt)

Estimated percentage of tweets related to #bcpoli that mention covid or the pandemic in some way: 27.72 %
Total of 101569 original tweets related to #bcpoli that mention covid or the pandemic in some way: 28152


In [87]:
# 20 most frequent bigrams from original tweets only (retweets excluded)

top_bigrams(df_no_rt,20)

(bc, liberals)            2448
(bonnie, henry)           1709
(dr, bonnie)              1697
(john, horgan)            1548
(bc, liberal)             1535
(bc, ndp)                 1493
(new, cases)              1466
(dr, henry)               1305
(british, columbians)     1180
(british, columbia)       1067
(andrew, wilkinson)       1027
(bc, election)             916
(snap, election)           858
(public, health)           797
(active, cases)            769
(henry, says)              699
(health, care)             630
(provincial, election)     614
(fraser, health)           604
(old, growth)              574
dtype: int64

In [92]:
# Original tweets with no covid/pandemic mention (covid_mention flag is 0).
# This DataFrame will be used to gauge covid's impact on sentiment.

df_no_rt_no_covid = df_no_rt.loc[df_no_rt['covid_mention'].eq(0)]
df_no_rt_no_covid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73417 entries, 1296950909239357440 to 1328850935632719872
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   created_at     73417 non-null  datetime64[ns, UTC]
 1   full_text      73417 non-null  object             
 2   covid_mention  73417 non-null  int64              
 3   no_hashtags    73417 non-null  object             
 4   full_clean     73417 non-null  object             
 5   is_retweet     73417 non-null  int64              
 6   retweet_count  73417 non-null  int64              
 7   user_name      73417 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 5.0+ MB


In [93]:
# Original tweets that do mention covid or the pandemic (covid_mention flag is 1).
# This DataFrame will be used to gauge covid's impact on sentiment.

df_no_rt_covid_mention = df_no_rt.loc[df_no_rt['covid_mention'].eq(1)]
df_no_rt_covid_mention.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28152 entries, 1296949926543335424 to 1328850997704237056
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   created_at     28152 non-null  datetime64[ns, UTC]
 1   full_text      28152 non-null  object             
 2   covid_mention  28152 non-null  int64              
 3   no_hashtags    28152 non-null  object             
 4   full_clean     28152 non-null  object             
 5   is_retweet     28152 non-null  int64              
 6   retweet_count  28152 non-null  int64              
 7   user_name      28152 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 1.9+ MB
