# Exploring the Complete Twitter Dataset

* The purpose of this notebook is to explore the full dataset of 400k tweets relating to #bcpoli
* Tweet created dates range from August 14, 2020 to November 19, 2020
* Columns not required for analysis will be dropped here.
 * The remaining data will be exported for preprocessing in "classify_unlabelled_tweets.ipynb"

In [1]:
# Make the project's helper scripts importable. The path below is a placeholder:
# point it at the local scripts/ directory before running.
import sys  
sys.path.insert(0, 'PATH/TO/data_bootcamp/data-science-final-project/scripts/')

import pandas as pd
import numpy as np
import pdpipe as pdp

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stop-word set built from NLTK's stopwords corpus.
stop_words = set(stopwords.words('english'))

from IPython.display import JSON
import matplotlib.pyplot as plt
import pickle

# Pandas display settings (optional): max_colwidth=None shows full cell contents,
# so long tweet text is not truncated in rendered frames.

pd.set_option('display.max_colwidth', None)
#pd.set_option("display.max_columns", 30)

# Import custom functions from functions.py (found via the sys.path entry above).
# NOTE(review): the wildcard import hides where names come from; the cells in this
# notebook rely on at least covid_mention, is_retweet and preprocess coming from it.
from functions import *

## ~384K Tweets from August 14th, 2020 - November 19th, 2020

In [2]:
%%time

df = pd.read_json('/Volumes/My Passport/Tweets/bcpoli_400k_extended.jsonl', lines=True)

CPU times: user 2min 11s, sys: 2min 48s, total: 5min
Wall time: 6min 37s


In [5]:
# Summarise dtypes and non-null counts; memory_usage='deep' also measures the real
# size of object columns instead of estimating it.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384221 entries, 0 to 384220
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype              
---  ------                     --------------   -----              
 0   created_at                 384221 non-null  datetime64[ns, UTC]
 1   id                         384221 non-null  int64              
 2   id_str                     384221 non-null  int64              
 3   full_text                  384221 non-null  object             
 4   truncated                  384221 non-null  bool               
 5   display_text_range         384221 non-null  object             
 6   entities                   384221 non-null  object             
 7   source                     384221 non-null  object             
 8   in_reply_to_status_id      26530 non-null   float64            
 9   in_reply_to_status_id_str  26530 non-null   float64            
 10  in_reply_to_user_id        29115 non-null   float64     

In [3]:
%%time
# Make copy of imported data and set index to unique tweet ID

raw = df.copy()
raw.set_index('id_str', inplace=True)

# Filter out columns

raw = raw[['created_at','user','full_text','retweet_count']]

# Extract features from user column dict with .get

raw['user_name'] = raw['user'].apply(lambda x: x.get('screen_name'))

# Drop user column

raw.drop('user', axis=1, inplace=True)

# Remove case sensitivity

raw["full_text"] = raw["full_text"].str.lower()

CPU times: user 1.51 s, sys: 4.04 s, total: 5.55 s
Wall time: 9.42 s


## Processing with Functions from functions.py

In [14]:
%%time

# This may be refactored into an sklearn pipeline

# Create binary column for covid_mention

raw['covid_mention'] = raw['full_text'].apply(covid_mention)

# Create binary column for is_retweet
raw['is_retweet'] = raw['full_text'].apply(is_retweet)

# Create a list of all the tweets
raw['full_clean'] = raw['full_text'].apply(preprocess)

# Create column without hastags
raw['no_hashtags'] = raw['full_text'].apply(lambda x: preprocess(x, hashtags=True))


CPU times: user 2min 18s, sys: 1.71 s, total: 2min 20s
Wall time: 2min 20s


In [23]:
%%time

# Pandas Processing Pipeline # Timed to compare against pure .apply above
# Currently, when using pdpipe, the preprocess argument 'hashtags' isn't working within the pipeline. 

pipeline = pdp.ApplyByCols('full_text', covid_mention, 'covid_mention', drop=False)
pipeline+= pdp.ApplyByCols('full_text', is_retweet, 'is_retweet', drop=False)
pipeline+= pdp.ApplyByCols('full_text', preprocess, 'full_clean', drop=False)
# Create column without hastags
raw['no_hashtags'] = raw['full_text'].apply(lambda x: preprocess(x, hashtags=True))
#pipeline+= pdp.ApplyByCols('full_text', preprocess, 'no_hashtags', drop=False)
raw = pipeline(raw)

CPU times: user 1.2 s, sys: 6.19 ms, total: 1.2 s
Wall time: 1.2 s


In [17]:
# Preview the first rows to sanity-check the derived columns.
raw.head()

Unnamed: 0_level_0,created_at,full_text,full_clean,is_retweet,covid_mention,retweet_count,user_name,no_hashtags
id_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1296949558409416704,2020-08-21 23:17:25+00:00,"rt @davemccr: hey @jjhorgan not sure how close gatherings of students in schools is any different than 30 people at a party, sure we’ll hav…","[hey, sure, close, gatherings, students, schools, different, 30, people, party, sure, well, hav]",1,0,14,crickets01,"[hey, sure, close, gatherings, students, schools, different, 30, people, party, sure, well, hav]"
1296949926543335424,2020-08-21 23:18:53+00:00,"hot take:\n\nif teachers were paid what they're actually worth, they wouldn't be pissing their pants about going back to work right now.\n\n#bced\n#bcpoli\n#cdnpoli\n#covid19psyop","[hot, take, teachers, paid, theyre, actually, worth, wouldnt, pissing, pants, going, back, work, right, bced, bcpoli, cdnpoli, covid19psyop]",0,1,0,ie_mack,"[hot, take, teachers, paid, theyre, actually, worth, wouldnt, pissing, pants, going, back, work, right]"
1296949948483682304,2020-08-21 23:18:58+00:00,"rt @bcedprobs: if bc’s school superintendents stood up &amp; declared that what they are being told to do puts their students, staff &amp; communit…","[bcs, school, superintendents, stood, amp, declared, told, puts, students, staff, amp, communit]",1,0,37,TanisMaxfield,"[bcs, school, superintendents, stood, amp, declared, told, puts, students, staff, amp, communit]"
1296950296657068032,2020-08-21 23:20:21+00:00,5 months after starting containment measures and we're back to where we began.\n\n5 months wasted. all our hard work down the drain because govt is rushing to re-open. @cdcofbc \n#bcpoli #covid19 https://t.co/2rznis3zjy,"[5, months, starting, containment, measures, back, began, 5, months, wasted, hard, work, drain, govt, rushing, reopen, bcpoli, covid19]",0,1,0,DavidWa59907969,"[5, months, starting, containment, measures, back, began, 5, months, wasted, hard, work, drain, govt, rushing, reopen]"
1296950387337920512,2020-08-21 23:20:43+00:00,rt @impishchimp: how about the ministry of education not force kids and teachers into dangerously packed classes with no distancing during…,"[ministry, education, force, kids, teachers, dangerously, packed, classes, distancing]",1,0,18,McdonaldMcdo54,"[ministry, education, force, kids, teachers, dangerously, packed, classes, distancing]"


In [18]:
# Number of retweets among the 384221 tweets (sum of the binary is_retweet column)

raw['is_retweet'].sum()

282652

In [24]:
# Estimated number of covid/pandemic mentions among the 384221 tweets
# (sum of the binary covid_mention column)

raw['covid_mention'].sum()

77873

In [9]:
# New DataFrame holding only original tweets, i.e. rows where is_retweet is 0
df_no_rt = raw.loc[raw['is_retweet'].eq(0)]

In [11]:
# Check the row count and dtypes of the frame left after removing retweets.
df_no_rt.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101569 entries, 1296949926543335424 to 1328850997704237056
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   created_at     101569 non-null  datetime64[ns, UTC]
 1   full_text      101569 non-null  object             
 2   retweet_count  101569 non-null  int64              
 3   user_name      101569 non-null  object             
 4   covid_mention  101569 non-null  int64              
 5   is_retweet     101569 non-null  int64              
 6   full_clean     101569 non-null  object             
 7   no_hashtags    101569 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(3), object(4)
memory usage: 7.0+ MB


In [49]:
# Estimated share of covid/pandemic mentions among the original (non-retweet) tweets

covid_no_rt = df_no_rt.covid_mention.sum()

# Divide by the actual number of rows rather than a hard-coded count, so the
# percentage stays correct if the dataset or the retweet filter changes.
mention_ratio_no_rt = (covid_no_rt / len(df_no_rt)) * 100

# Single f-string; the printed text is identical to the previous mixed
# f-string / %-format version.
print(f'Percentage of Tweets related to bcpoli that mention covid or the pandemic in some way: {mention_ratio_no_rt:0.2f} %')

Percentage of Tweets related to bcpoli that mention covid or the pandemic in some way: 26.36 %
