In [125]:
folder_path = "../Twitter_Data/"
cn_folder_path = "../Twitter_Data/Twitter_Data_Chinese/"

visuals_output_path = "./visuals/"
output_path = "./output/"

file1 = "Twitter_Covid-19_Lockdown_5000.csv"
file2 = "Twitter_Jan_Mar_5000.csv"
file3 = "Twitter_Mar_5000.csv"
file4 = "Twitter_May_Nov_5000.csv"

cn_file1 = "en.Twitter_Covid-19_Lockdown_5000_chinese.csv"
cn_file2 = "en.Twitter_Jan_Apr_2020_5000_chinese.csv"
cn_file3 = "en.Twitter_May_June_2022_5000_chinese.csv"

In [126]:
!pip install pyLDAvis



In [127]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import nltk
nltk.download('stopwords') #download if don't have yet
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lindy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [128]:
# Read in files
df1 = pd.read_csv(folder_path + file1)
df2 = pd.read_csv(folder_path + file2)
df3 = pd.read_csv(folder_path + file3)
df4 = pd.read_csv(folder_path + file3)

cn_df1 = pd.read_csv(cn_folder_path + cn_file1)
cn_df2 = pd.read_csv(cn_folder_path + cn_file2)
cn_df3 = pd.read_csv(cn_folder_path + cn_file3)

<h1>Cleaning Data</h1>

In [129]:
# Drop columns mentioning Bheed trailer
df1 = df1[df1["text"].str.contains("Bheed")==False]
df2 = df2[df2["Text"].str.contains("Bheed")==False]
df3 = df3[df3["Text"].str.contains("Bheed")==False]
df4 = df4[df4["Text"].str.contains("Bheed")==False]

In [130]:
list1 = df1["text"].tolist()
list2 = df2["Text"].tolist()
list3 = df3["Text"].tolist()
list4 = df4["Text"].tolist()

cn_list1 = cn_df1["text"].tolist()
cn_list2 = cn_df2["text"].tolist()
cn_list3 = cn_df3["text"].tolist()

In [131]:
def remove_user_mentions(text):
    return re.sub("@[A-Za-z0-9_]+","", text)

def remove_links(text):
    return re.sub(r'http\S+', '', text)

def remove_digit_strings(text):
    return re.sub(r'\d+', '', text)

def remove_special_chars(text):
    remove_chars = '[0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
    return re.sub(remove_chars, ' ', text)

def clean_text(text):
    result_text = text
    result_text = remove_user_mentions(result_text)
    result_text = remove_links(result_text)
    result_text = remove_digit_strings(result_text)
    result_text = remove_special_chars(result_text)
    result_text = result_text.lower()
    return result_text

def remove_cn_chars(text):
    result_text = re.sub(r'([\u4e00-\u9fff]+', '', text)
    return result_text

In [132]:
for i in range(len(list1)):
    list1[i] = clean_text(list1[i])

for i in range(len(list2)):
    list2[i] = clean_text(list2[i])

for i in range(len(list3)):
    list3[i] = clean_text(list3[i])
    
for i in range(len(list4)):
    list4[i] = clean_text(list4[i])
    
for i in range(len(cn_list1)):
#     cn_list1[i] = remove_cn_chars(cn_list1[i])
    cn_list1[i] = clean_text(cn_list1[i])

for i in range(len(cn_list2)):
#     cn_list2[i] = remove_cn_chars(cn_list2[i])
    cn_list2[i] = clean_text(str(cn_list2[i]))

for i in range(len(cn_list3)):
#      cn_list3[i] = remove_cn_chars(cn_list3[i])
    cn_list3[i] = clean_text(str(cn_list3[i]))

In [133]:
df1["cleaned_text"] = list1
df2["cleaned_text"] = list2
df3["cleaned_text"] = list3
df4["cleaned_text"] = list4

cn_df1["cleaned_text"] = cn_list1
cn_df2["cleaned_text"] = cn_list2
cn_df3["cleaned_text"] = cn_list3

In [134]:
# Create dictionary of period of tweets to information associated with tweet

tweets = {}
keys = ["covid_lockdown", "jan_mar", "mar", "may_nov", "cn_covid_lockdown", "cn_jan_apr_2020", "cn_may_june_2022"]
for key in keys:
    tweets[key] = {}

tweets["covid_lockdown"]["text"] = df1["cleaned_text"].tolist()
tweets["jan_mar"]["text"] = df2["cleaned_text"].tolist()
tweets["mar"]["text"] = df3["cleaned_text"].tolist()
tweets["may_nov"]["text"] = df4["cleaned_text"].tolist()

tweets["cn_covid_lockdown"]["text"] = cn_df1["cleaned_text"].tolist()
tweets["cn_jan_apr_2020"]["text"] = cn_df2["cleaned_text"].tolist()
tweets["cn_may_june_2022"]["text"] = cn_df3["cleaned_text"].tolist()

<h1>Tokenizing Data</h1>

In [135]:
# Get custom stopwords
content = []
f = open("./stopwords.txt", encoding = 'utf-8')
# perform file operations
for line in f:
    content.append(line)
f.close()

custom_stopwords = []
for line in content:
    wordlist = line.split(",")
    for word in wordlist:
        custom_stopwords.append(word)

In [136]:
# Stopwords
stop_words = stopwords.words('english')
exclude_words = stop_words

exclude_words_extra = ["RT","still","covid","coronavirus","lockdown","lockdo","pandemic","epidemic","let","get","ago","go","im","ive","would","one","also","to","tag"]

# Exclude custom stopwords
exclude_words.extend(custom_stopwords)
exclude_words.extend(exclude_words_extra)

In [137]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

for period in tweets.keys():
    tweets[period]["data_words"] = list(sent_to_words(tweets[period]["text"]))

In [138]:
def bigram_trigram_models(period):
    data_words = tweets[period]["data_words"]
    
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
    
    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    tweets[period]["bigram"] = bigram
    tweets[period]["trigram"] = trigram
    tweets[period]["bigram_mod"] = bigram_mod
    tweets[period]["trigram_mod"] = trigram_mod

for period in tweets.keys():
    bigram_trigram_models(period)

In [139]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in exclude_words] for doc in texts]

def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts, bigram_mod, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [140]:
def combined(period):
    data_words = tweets[period]["data_words"]

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)
    tweets[period]["data_words_nostops"] = data_words_nostops 

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, tweets[period]["bigram_mod"])
    tweets[period]["data_words_bigrams"] = data_words_bigrams

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    tweets[period]["data_lemmatized"] = data_lemmatized

for period in tweets.keys():
    combined(period)

In [141]:
# for period in tweets.keys():
#     data_lemmatized = tweets[period]["data_lemmatized"]
#     for arr in data_lemmatized:
#         for word in arr:
#             if word in exclude_words:
#                 arr.remove(word)

In [142]:
def tokenize(period):
    data_lemmatized = tweets[period]["data_lemmatized"]

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    tweets[period]["id2word"] = id2word

    # Create Corpus
    texts = data_lemmatized
    tweets[period]["texts"] = texts

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    tweets[period]["corpus"] = corpus

    # Human readable format of corpus (term-frequency)
    tweets[period]["corpus_readable"] = [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

for period in tweets.keys():
    tokenize(period)

In [143]:
index = 1   # change this to see diff output
print(tweets[list(tweets.keys())[index]]["corpus_readable"])

[[('home', 1), ('place', 1)]]


<h1>LDA Model</h1>

In [144]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

# supporting function 2
def compute_perplexity_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    perplexity_score = lda_model.log_perplexity(corpus_sets[i])
    
    return perplexity_score

In [145]:
def lda_model(period):
    corpus = tweets[period]["corpus"]
    id2word = tweets[period]["id2word"]

    # Build LDA model
    num_topics = 4
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=num_topics, 
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

    tweets[period]["lda_model"] = lda_model

for period in tweets.keys():
    lda_model(period)

In [146]:
index = 1   # change this to see diff output
print(tweets[list(tweets.keys())[index]]["lda_model"])

LdaModel<num_terms=2890, num_topics=4, decay=0.5, chunksize=100>


In [147]:
def keyword_in_topics(period):
    corpus = tweets[period]["corpus"]
    lda_model = tweets[period]["lda_model"]

    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]
    tweets[period]["doc_lda"] = doc_lda

for period in tweets.keys():
    keyword_in_topics(period)

[(0,
  '0.067*"year" + 0.035*"first" + 0.020*"today" + 0.019*"government" + '
  '0.016*"come" + 0.015*"issue" + 0.015*"third" + 0.012*"see" + 0.012*"series" '
  '+ 0.012*"due"'),
 (1,
  '0.025*"support" + 0.018*"happen" + 0.017*"mental" + 0.016*"former" + '
  '0.016*"boris_johnson" + 0.015*"hlth" + 0.015*"collect" + '
  '0.014*"expenditure_data" + 0.013*"party" + 0.013*"truth"'),
 (2,
  '0.022*"make" + 0.021*"impact" + 0.017*"post" + 0.017*"population" + '
  '0.016*"mask" + 0.016*"wear" + 0.015*"inequality" + 0.014*"people" + '
  '0.012*"world" + 0.011*"lockdown"'),
 (3,
  '0.055*"first" + 0.043*"year" + 0.041*"day" + 0.035*"anniversary" + '
  '0.030*"today" + 0.025*"die" + 0.025*"national" + 0.024*"mark" + '
  '0.020*"time" + 0.019*"remember"')]
[(0,
  '0.015*"week" + 0.011*"news" + 0.011*"life" + 0.010*"post" + 0.010*"call" + '
  '0.009*"second" + 0.009*"force" + 0.009*"disease" + 0.008*"video" + '
  '0.008*"mask"'),
 (1,
  '0.049*"people" + 0.042*"day" + 0.023*"case" + 0.020*"city" 

In [148]:
index = 1   # change this to see diff output
print(tweets[list(tweets.keys())[index]]["doc_lda"])

<gensim.interfaces.TransformedCorpus object at 0x00000175D41758B0>


In [149]:
def perplexity_coherence(period):
    corpus = tweets[period]["corpus"]
    data_lemmatized = tweets[period]["data_lemmatized"]
    id2word = tweets[period]["id2word"]
    lda_model = tweets[period]["lda_model"]

    # Compute Perplexity
    tweets[period]["perplexity"] = lda_model.log_perplexity(corpus)
    # print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    tweets[period]["coherence_model_lda"] = coherence_model_lda
    tweets[period]["coherence_lda"] = coherence_lda

    # print('\nCoherence Score: ', coherence_lda)

for period in tweets.keys():
    perplexity_coherence(period)

In [150]:
index = 1   # change this to see diff output
print(tweets[list(tweets.keys())[index]]["perplexity"])
print(tweets[list(tweets.keys())[index]]["coherence_lda"])

-7.721058392952905
0.3630860495547645


In [151]:
def optimal_model(period): 
    lda_model = tweets[period]["lda_model"]

    optimal_model = lda_model
    model_topics = optimal_model.show_topics(formatted=False)

    tweets[period]["model_topics"] = model_topics

for period in tweets.keys():
    optimal_model(period)

In [152]:
index = 1   # change this to see diff output
print(tweets[list(tweets.keys())[index]]["model_topics"])

[(0, [('week', 0.015394689), ('news', 0.011458637), ('life', 0.010509352), ('post', 0.010279826), ('call', 0.01011382), ('second', 0.009262819), ('force', 0.009057287), ('disease', 0.00899129), ('video', 0.008359657), ('mask', 0.008124177)]), (1, [('people', 0.04904595), ('day', 0.041766297), ('case', 0.02298288), ('city', 0.019750832), ('town', 0.016668594), ('country', 0.016286071), ('outbreak', 0.0151690105), ('resident', 0.014788403), ('new', 0.014545913), ('say', 0.013076739)]), (2, [('ncov', 0.023394438), ('put', 0.021885443), ('spread', 0.018528098), ('leave', 0.016608331), ('case', 0.015789146), ('first', 0.013514718), ('quarantine', 0.013407771), ('population', 0.013169179), ('virus', 0.012353554), ('due', 0.012002673)]), (3, [('report', 0.019369626), ('quarantine', 0.018293725), ('measure', 0.015736118), ('take', 0.015252979), ('apartment_amidst', 0.011982421), ('amp', 0.010973084), ('novel', 0.010697662), ('city', 0.010030347), ('base', 0.009814688), ('outbreak', 0.009599294

In [153]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    accept a ldamodel, atopic number and topn vocabs of interest
    prints a formatted list of the topn terms
    """
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    
    return terms

In [154]:
def explore_topic_by_title(period):
    lda_model = tweets[period]["lda_model"]
    topic_summaries = []

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
    print(period)
    for i in range(4):
        print('Topic '+str(i)+' |---------------------\n')
        tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
        #     print tmp[:5]
        topic_summaries += [tmp[:5]]
        # print

    tweets[period]["topic_summaries"] = topic_summaries

for period in tweets.keys():
    explore_topic_by_title(period)

term                 frequency

covid_lockdown
Topic 0 |---------------------

year                 0.067
first                0.035
today                0.020
government           0.019
come                 0.016
issue                0.015
third                0.015
see                  0.012
series               0.012
due                  0.012
Topic 1 |---------------------

support              0.025
happen               0.018
mental               0.017
former               0.016
boris_johnson        0.016
hlth                 0.015
collect              0.015
expenditure_data     0.014
party                0.013
truth                0.013
Topic 2 |---------------------

make                 0.022
impact               0.021
post                 0.017
population           0.017
mask                 0.016
wear                 0.016
inequality           0.015
people               0.014
world                0.012
lockdown             0.011
Topic 3 |---------------------

first          

<h1>Data Output - Dominant Topic</h1>

In [155]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

for period in tweets.keys():
    lda_model = tweets[period]["lda_model"]
    corpus = tweets[period]["corpus"]
    texts = tweets[period]["texts"]
    
    df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, texts)
    
    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    
    tweets[period]["df_dominant_topic"] = df_dominant_topic
    
    df_dominant_topic.to_csv(output_path + period + ".csv")

<h1>Data Visualization</h1>

<h3>PyLDAVis</h3>

In [156]:
import pyLDAvis.gensim_models as gensimvis
import pickle
import string

# Visualize the topics
for period in tweets.keys():
    lda_model = tweets[period]["lda_model"]
    corpus = tweets[period]["corpus"]
    id2word = tweets[period]["id2word"]

    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    
    file_path = visuals_output_path + period + '.html'
    pyLDAvis.save_html(LDAvis_prepared, file_path)
    
    with open(file_path, 'r+') as f:
        content = f.read()
        f.seek(0, 0)
        f.write("<h1>" + period + "</h1>" + '\n' + content)
        f.close()

  default_term_info = default_term_info.sort_values(
