<h1>Twitter Data</h1>

<h3>Installation and import of libraries</h3>

In [33]:
folder_path = "../Twitter_Data/"
output_folder_path = "./"
file1 = "Twitter_Covid-19_Lockdown_5000.csv"
file2 = "Twitter_Jan_Mar_5000.csv"
file3 = "Twitter_Mar_5000.csv"
file4 = "Twitter_May_Nov_5000.csv"

In [34]:
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [35]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import nltk
nltk.download('stopwords') #download if don't have yet
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h3>Viewing Data</h3>

In [36]:
twitter_lockdown_df = pd.read_csv(folder_path + file1)
twitter_jan_mar_df = pd.read_csv(folder_path + file2)
twitter_mar_df = pd.read_csv(folder_path + file3)
twitter_may_nov_df = pd.read_csv(folder_path + file4)

print(twitter_lockdown_df.columns)
print(twitter_jan_mar_df.columns)
print(twitter_mar_df.columns)
print(twitter_may_nov_df.columns)

Index(['author_id', 'conversation_id', 'id', 'edit_history_tweet_ids', 'text',
       'created_at', 'withheld.copyright', 'withheld.country_codes',
       'author_name', 'orginal_text', 'original_text'],
      dtype='object')
Index(['Datetime', 'Tweet Id', 'Text', 'Username', 'Coordinates', 'Place'], dtype='object')
Index(['Datetime', 'Tweet Id', 'Text', 'Username', 'Coordinates', 'Place'], dtype='object')
Index(['Datetime', 'Tweet Id', 'Text', 'Username', 'Coordinates', 'Place'], dtype='object')


In [37]:
# Drop columns mentioning Bheed trailer
twitter_lockdown_df = twitter_lockdown_df[ twitter_lockdown_df["text"].str.contains("Bheed")==False]
twitter_jan_mar_df = twitter_jan_mar_df[ twitter_jan_mar_df["Text"].str.contains("Bheed")==False]
twitter_mar_df = twitter_mar_df[ twitter_mar_df["Text"].str.contains("Bheed")==False]
twitter_may_nov_df = twitter_may_nov_df[ twitter_may_nov_df["Text"].str.contains("Bheed")==False]

<h3>Cleaning Data</h3>

In [38]:
def remove_user_mentions(text):
    return re.sub("@[A-Za-z0-9_]+","", text)

def remove_links(text):
    return re.sub(r'http\S+', '', text)

def remove_digit_strings(text):
    return re.sub(r'\d+', '', text)

def remove_special_chars(text):
    remove_chars = '[0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
    return re.sub(remove_chars, ' ', text)

def clean_text(text):
    result_text = text
    result_text = remove_user_mentions(result_text)
    result_text = remove_links(result_text)
    result_text = remove_digit_strings(result_text)
    result_text = remove_special_chars(result_text)
    result_text = result_text.lower()
    return result_text

In [39]:
'''
Clean text, and check for empty strings / strings containing only whitespace
'''
texts = twitter_lockdown_df["text"].tolist() + twitter_jan_mar_df["Text"].tolist() + twitter_mar_df["Text"].tolist() + twitter_may_nov_df["Text"].tolist()
for i in range(len(texts)):
    texts[i] = clean_text(texts[i])

In [40]:
print(len(texts))
print(texts[0])
print(texts[1])

14144
rt   today marks three years since the uk went into lockdown   

on this national day of reflection  i visited the  
 i used play fortnite a lot on my android in covid   lockdown and chapter  midas revenge was the best one


<h3>Tokenizing Data</h3>

In [41]:
# Stopwords
stop_words = stopwords.words('english')
exclude_words = stop_words

#exclude common words 
exclude_words_extra = ["RT","still","covid","coronavirus","lockdown","lockdo","pandemic","let","get","ago","go","im","ive","would","one","also","to","tag"]

exclude_words.extend(exclude_words_extra)

In [42]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(texts))

In [43]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['rt_today_marks', 'three_years_since', 'the', 'uk', 'went_into', 'lockdown', 'on', 'this', 'national', 'day', 'of', 'reflection', 'visited', 'the']


In [44]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in exclude_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [45]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['go', 'national', 'day', 'reflection', 'visit']]


In [46]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


[[('day', 1), ('go', 1), ('national', 1), ('reflection', 1), ('visit', 1)]]

<h3>LDA Model</h3>

In [47]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

# supporting function 2
def compute_perplexity_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    perplexity_score = lda_model.log_perplexity(corpus_sets[i])
    
    return perplexity_score

In [48]:
# Build LDA model
num_topics = 4
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [49]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.031*"day" + 0.015*"tier" + 0.015*"need" + 0.015*"time" + 0.015*"go" + '
  '0.013*"government" + 0.013*"end" + 0.013*"see" + 0.013*"week" + '
  '0.013*"come"'),
 (1,
  '0.047*"people" + 0.023*"death" + 0.016*"home" + 0.015*"make" + '
  '0.014*"think" + 0.013*"due" + 0.011*"health" + 0.010*"impose" + '
  '0.010*"close" + 0.010*"state"'),
 (2,
  '0.022*"say" + 0.016*"die" + 0.014*"country" + 0.011*"rise" + '
  '0.010*"interview" + 0.010*"read" + 0.009*"number" + 0.009*"get" + '
  '0.009*"spread" + 0.008*"good"'),
 (3,
  '0.019*"new" + 0.016*"case" + 0.011*"year" + 0.011*"restriction" + '
  '0.010*"take" + 0.010*"work" + 0.009*"help" + 0.009*"amp" + 0.009*"woman" + '
  '0.009*"violence"')]


In [50]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.081663422356895

Coherence Score:  0.3471080792640732


In [51]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


In [52]:
optimal_model = lda_model
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

[(0,
  '0.031*"day" + 0.015*"tier" + 0.015*"need" + 0.015*"time" + 0.015*"go" + '
  '0.013*"government" + 0.013*"end" + 0.013*"see" + 0.013*"week" + '
  '0.013*"come"'),
 (1,
  '0.047*"people" + 0.023*"death" + 0.016*"home" + 0.015*"make" + '
  '0.014*"think" + 0.013*"due" + 0.011*"health" + 0.010*"impose" + '
  '0.010*"close" + 0.010*"state"'),
 (2,
  '0.022*"say" + 0.016*"die" + 0.014*"country" + 0.011*"rise" + '
  '0.010*"interview" + 0.010*"read" + 0.009*"number" + 0.009*"get" + '
  '0.009*"spread" + 0.008*"good"'),
 (3,
  '0.019*"new" + 0.016*"case" + 0.011*"year" + 0.011*"restriction" + '
  '0.010*"take" + 0.010*"work" + 0.009*"help" + 0.009*"amp" + 0.009*"woman" + '
  '0.009*"violence"')]


In [53]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    accept a ldamodel, atopic number and topn vocabs of interest
    prints a formatted list of the topn terms
    """
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    
    return terms

In [54]:
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(4):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
#     print tmp[:5]
    topic_summaries += [tmp[:5]]
    print

term                 frequency

Topic 0 |---------------------

day                  0.031
tier                 0.015
need                 0.015
time                 0.015
go                   0.015
government           0.013
end                  0.013
see                  0.013
week                 0.013
come                 0.013
Topic 1 |---------------------

people               0.047
death                0.023
home                 0.016
make                 0.015
think                0.014
due                  0.013
health               0.011
impose               0.010
close                0.010
state                0.010
Topic 2 |---------------------

say                  0.022
die                  0.016
country              0.014
rise                 0.011
interview            0.010
read                 0.010
number               0.009
get                  0.009
spread               0.009
good                 0.008
Topic 3 |---------------------

new                  0.019
cas