# Twitter Data - Exploring a Sample of 160k Tweets

* The purpose of this notebook is to take a glance at sample of the collected twitter data.
* Minimal text processing will take place here.
* Columns not required for analysis will be dropped here.
 * The remaining data will be exported for preprocessing in "classify_unlabelled_tweets.ipynb"
* The full ~400K tweet dataset will be processed in another notebook

In [2]:
import pandas as pd
import numpy as np

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from IPython.display import JSON
import matplotlib.pyplot as plt
import pickle

# Pandas Display Settings, if you wish

pd.set_option('display.max_colwidth', None)
#pd.set_option("display.max_columns", 30)

## 164K Tweets - August 14th, 2020 - October 1, 2020

In [3]:
df = pd.read_json('/Volumes/My Passport/Tweets/bc_tweets.jsonl', lines=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164548 entries, 0 to 164547
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype              
---  ------                     --------------   -----              
 0   created_at                 164548 non-null  datetime64[ns, UTC]
 1   id                         164548 non-null  int64              
 2   id_str                     164548 non-null  int64              
 3   full_text                  164548 non-null  object             
 4   truncated                  164548 non-null  bool               
 5   display_text_range         164548 non-null  object             
 6   entities                   164548 non-null  object             
 7   source                     164548 non-null  object             
 8   in_reply_to_status_id      11682 non-null   float64            
 9   in_reply_to_status_id_str  11682 non-null   float64            
 10  in_reply_to_user_id        12791 non-null   float64     

In [5]:
# Make copy of imported data and set index to unique tweet ID

raw = df.copy()
raw.set_index('id_str', inplace=True)

# Filter out columns

raw = raw[['created_at','user','full_text','retweet_count']]

In [6]:
# Extract features from user column dict with .get

raw['user_name'] = raw['user'].apply(lambda x: x.get('screen_name'))

# Drop user column

raw.drop('user', axis=1, inplace=True)

In [7]:
# Is covid or anything pandemic related mentioned in the tweet?

# All lowercase*
# This list of terms may be expanded.
covid_list = ['covid','virus', 'corona','ncov','sars', 'super spread', 'super-spread', 'pandemic', 'epidemic', 'outbreak', 'new case', 'new death', 'active case', 'community spread', 'contact trac', 'social distanc','self isolat', 'self-isolat', 'mask', 'ppe', 'quarantine', 'lockdown', 'symptomatic', 'vaccine', 'bonnie']

# For tweet, if any term in covid_list is present in tweet, return 1. If not, return 0
def covid_mention(text, synonyms=covid_list):
    for term in synonyms:
        if term in text:
            return 1
        continue
    return 0

In [8]:
# Create binary column for is_retweet

rt_regex = "^rt"

def is_retweet(text):
    if re.match(rt_regex, text) is not None:
        return 1
    return 0

In [9]:
# Find most common n-grams
## Remove stop words, URLs, usernames and punctuation. I am considering leaving the plain text usernames

def preprocess(text, hashtags=False, join=False):
    text = text.lower()
    if hashtags:
        text = ' '.join(re.sub(r"\#\w*[a-zA-Z]+\w*","",text).split())
    text = ' '.join(re.sub("((www\.[\S]+)|(https?://[\S]+))","",text).split())
    text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)","",text).split())
    text = ' '.join(re.sub("^\n","",text).split())
    text = ' '.join(re.sub("^rt","",text).split())
    punc = ''.join([char for char in text if char not in string.punctuation])
    tokens = word_tokenize(punc)
    stops = [word for word in tokens if word not in stop_words]
    if join:
        stops = (' ').join(stops)
    return stops

# Join Tokenized strings

def joiner(text):
    string = (' ').join(text)
    return string

In [10]:
# Remove case sensitivity

raw["full_text"] = raw["full_text"].str.lower()

# Create binary column for covid_mention

raw['covid_mention'] = raw['full_text'].apply(covid_mention)

# Create binary column for is_retweet
raw['is_retweet'] = raw['full_text'].apply(is_retweet)

In [11]:
# Create a list of all the tweets
raw['full_clean'] = raw['full_text'].apply(preprocess)

# Create column without hastags
raw['no_hashtags'] = raw['full_text'].apply(lambda x: preprocess(x, hashtags=True))


In [12]:
print('Estimated Mentions of Covid-19 or the Pandemic in 164K Tweets:', raw.covid_mention.sum())

Estimated Mentions of Covid-19 or the Pandemic in 164K Tweets: 33937


In [206]:
# Most frequent bigrams - Hashtags included, stop words removed

words = preprocess(''.join(str(raw['full_clean'].tolist())))
(pd.Series(nltk.ngrams(words, 2)).value_counts())[:20]

(bced, bcpoli)           5682
(cdnpoli, bcpoli)        5387
(john, horgan)           3924
(bcpoli, cdnpoli)        3683
(bc, liberals)           3567
(bonnie, henry)          3503
(dr, bonnie)             3241
(new, cases)             3082
(dr, henry)              2562
(bcpoli, bcelxn2020)     2529
(british, columbians)    2186
(bc, liberal)            2120
(bcpoli, bc)             2094
(bcpoli, covid19)        2061
(covid19, cases)         1888
(bc, ndp)                1723
(breaking, bc)           1619
(bcpoli, bced)           1591
(bcpoli, vanpoli)        1588
(henry, says)            1578
dtype: int64

In [13]:
# Most frequent bigrams, hastags removed, stop words removed

words_no_ht = preprocess(''.join(str(raw['no_hashtags'].tolist())))
(pd.Series(nltk.ngrams(words_no_ht, 2)).value_counts())[:20]

(john, horgan)           3924
(bc, liberals)           3568
(bonnie, henry)          3500
(new, cases)             3329
(dr, bonnie)             3239
(dr, henry)              2562
(british, columbians)    2187
(bc, liberal)            2116
(bc, ndp)                1726
(henry, says)            1579
(snap, election)         1512
(andrew, wilkinson)      1505
(british, columbia)      1482
(back, school)           1258
(banquet, halls)         1225
(active, cases)          1209
(public, health)         1135
(breaking, dr)           1056
(fall, election)         1054
(premier, john)          1036
dtype: int64

In [14]:
# Top individual terms
(pd.Series(nltk.ngrams(words_no_ht, 1)).value_counts())[:20]

(bc,)            39293
(election,)      15240
(new,)           13075
(amp,)           12859
(cases,)         11342
(people,)         9795
(government,)     8493
(says,)           8447
(school,)         8382
(ndp,)            8237
(health,)         8232
(horgan,)         8117
(one,)            7573
(time,)           6756
(henry,)          6719
(would,)          6587
(schools,)        6582
(dr,)             6561
(need,)           6335
(teachers,)       6271
dtype: int64