In [3]:
folder_path = "../Reddit_Scraped_Comments/"
visuals_output_path = "./visuals_by_post/"
output_path = "./output_by_post/"
file1 = "anti_lockdown_comments.csv"
file2 = "corona_lockdown_comments.csv"
file3 = "covid19_lockdown_comments.csv"

In [4]:
!pip install pyLDAvis



In [5]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import nltk
nltk.download('stopwords') #download if don't have yet
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lindy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Read in files
df1 = pd.read_csv(folder_path + file1)
df2 = pd.read_csv(folder_path + file2)
df3 = pd.read_csv(folder_path + file3)

<h1>Cleaning Data</h1>

In [7]:
# Drop columns with removed comments
df1 = df1[df1.comment != "[removed]"]
df2 = df2[df2.comment != "[removed]"]
df3 = df3[df3.comment != "[removed]"]

In [8]:
list1 = df1["comment"].tolist()
list2 = df2["comment"].tolist()
list3 = df3["comment"].tolist()

In [9]:
def remove_special_chars(text):
    remove_chars = '[0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
    return re.sub(remove_chars, ' ', text)

def remove_digit_strings(text):
    return re.sub(r'\d+', '', text)

def remove_links(text):
    return re.sub(r'http\S+', '', text)

def clean_text(text):
    result_text = remove_links(text)
    result_text = remove_digit_strings(result_text)
    result_text = remove_special_chars(result_text)
    result_text = result_text.lower()
    return result_text

In [10]:
for i in range(len(list1)):
    list1[i] = clean_text(list1[i])

for i in range(len(list2)):
    list2[i] = clean_text(list2[i])

for i in range(len(list3)):
    list3[i] = clean_text(list3[i])

In [11]:
df1["cleaned_text"] = list1
df2["cleaned_text"] = list2
df3["cleaned_text"] = list3

In [12]:
titles_list1 = df1["title"].unique()
titles_list2 = df2["title"].unique()
titles_list3 = df3["title"].unique()

# Create dictionary of title to topic information associated with title
title_comments = {}

# Populate dictionary
for title in titles_list1:
    title_comments[title] = {}
    df = df1.loc[df1['title'] == title]
    title_comments[title]["text"] = df["cleaned_text"].tolist()

for title in titles_list2:
    title_comments[title] = {}
    df = df2.loc[df2['title'] == title]
    title_comments[title]["text"] = df["cleaned_text"].tolist()

for title in titles_list3:
    title_comments[title] = {}
    df = df3.loc[df3['title'] == title]
    title_comments[title]["text"] = df["cleaned_text"].tolist()

<h1>Tokenizing Data</h1>

In [13]:
# Get custom stopwords
content = []
f = open("./stopwords.txt", encoding = 'utf-8')
# perform file operations
for line in f:
    content.append(line)
f.close()

custom_stopwords = []
for line in content:
    wordlist = line.split(",")
    for word in wordlist:
        custom_stopwords.append(word)

In [14]:
# Stopwords
stop_words = stopwords.words('english')
exclude_words = stop_words

exclude_words_extra = ["covid","lockdown","corona","pandemic","let","im","ive","would","one","also","to","say","day","well","month","thing","take","see","get","go"]

# Exclude custom stopwords
exclude_words.extend(custom_stopwords)
exclude_words.extend(exclude_words_extra)

In [15]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

for title in title_comments.keys():
    title_comments[title]["data_words"] = list(sent_to_words(title_comments[title]["text"]))
# data_words = list(sent_to_words(comments))

In [26]:
def bigram_trigram_models(title_key):
    data_words = title_comments[title_key]["data_words"]
    
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
    
    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # # See trigram example
    # print(trigram_mod[bigram_mod[data_words[0]]])

    title_comments[title_key]["bigram"] = bigram
    title_comments[title_key]["trigram"] = trigram
    title_comments[title_key]["bigram_mod"] = bigram_mod
    title_comments[title_key]["trigram_mod"] = trigram_mod

for title in title_comments.keys():
    bigram_trigram_models(title)

In [27]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in exclude_words] for doc in texts]

def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts, bigram_mod, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [28]:
def combined(title_key):
    data_words = title_comments[title_key]["data_words"]

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)
    data_words_nostops = data_words
    title_comments[title_key]["data_words_nostops"] = data_words_nostops 

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, title_comments[title_key]["bigram_mod"])
    title_comments[title_key]["data_words_bigrams"] = data_words_bigrams

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    title_comments[title_key]["data_lemmatized"] = data_lemmatized

    # print(data_lemmatized[:1])

for title in title_comments.keys():
    combined(title)

In [50]:
print(title_comments[list(title_comments.keys())[0]].keys())

dict_keys(['text', 'data_words', 'bigram', 'trigram', 'bigram_mod', 'trigram_mod', 'data_words_nostops', 'data_words_bigrams', 'data_lemmatized', 'id2word', 'corpus', 'corpus_readable', 'lda_model', 'doc_lda', 'perplexity', 'coherence_model_lda', 'coherence_lda', 'model_topics', 'topic_summaries', 'file_title'])


In [51]:
def tokenize(title_key):
    data_lemmatized = title_comments[title_key]["data_lemmatized"]

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    title_comments[title_key]["id2word"] = id2word

    # Create Corpus
    texts = data_lemmatized
    title_comments[title_key]["texts"] = texts

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    title_comments[title_key]["corpus"] = corpus

    # View
    # print(corpus[:1])

    # Human readable format of corpus (term-frequency)
    title_comments[title_key]["corpus_readable"] = [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

for title in title_comments.keys():
    tokenize(title)

In [52]:
index = 1   # change this to see diff output
print(title_comments[list(title_comments.keys())[index]]["corpus_readable"])

[[('all', 1), ('charge', 2), ('die', 1), ('endangerment', 1), ('homicide', 1), ('misconduct', 1), ('people', 1), ('reckless', 1), ('result', 1)]]


<h1>LDA Model</h1>

In [53]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

# supporting function 2
def compute_perplexity_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    perplexity_score = lda_model.log_perplexity(corpus_sets[i])
    
    return perplexity_score

In [54]:
def lda_model(title_key):
    corpus = title_comments[title_key]["corpus"]
    id2word = title_comments[title_key]["id2word"]

    # Build LDA model
    num_topics = 4
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=num_topics, 
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

    title_comments[title_key]["lda_model"] = lda_model

for title in title_comments.keys():
    lda_model(title)

In [55]:
index = 1   # change this to see diff output
print(title_comments[list(title_comments.keys())[index]]["lda_model"])

LdaModel<num_terms=846, num_topics=4, decay=0.5, chunksize=100>


In [56]:
def keyword_in_topics(title_key):
    corpus = title_comments[title_key]["corpus"]
    lda_model = title_comments[title_key]["lda_model"]

    # Print the Keyword in the 10 topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]
    title_comments[title_key]["doc_lda"] = doc_lda

for title in title_comments.keys():
    keyword_in_topics(title)

[(0,
  '0.028*"so" + 0.023*"moron" + 0.021*"stupidity" + 0.020*"disease" + '
  '0.019*"american" + 0.017*"literally" + 0.016*"free" + 0.016*"disabled" + '
  '0.016*"market" + 0.016*"meanwhile"'),
 (1,
  '0.031*"think" + 0.029*"hoax" + 0.019*"let" + 0.018*"know" + 0.015*"people" '
  '+ 0.015*"intelligent" + 0.014*"foot" + 0.014*"cuz" + 0.014*"virus" + '
  '0.013*"comment"'),
 (2,
  '0.026*"people" + 0.021*"just" + 0.010*"go" + 0.010*"make" + 0.008*"think" + '
  '0.008*"need" + 0.007*"more" + 0.007*"world" + 0.007*"know" + '
  '0.006*"medium"'),
 (3,
  '0.034*"protest" + 0.028*"man" + 0.028*"get" + 0.025*"crack" + 0.018*"come" '
  '+ 0.017*"time" + 0.017*"week" + 0.017*"mask" + 0.016*"death" + '
  '0.015*"back"')]
[(0,
  '0.015*"comment" + 0.013*"subreddit" + 0.013*"coronavirus" + '
  '0.011*"discussion" + 0.011*"remove" + 0.011*"concern" + 0.010*"want" + '
  '0.009*"allow" + 0.009*"post" + 0.009*"political"'),
 (1,
  '0.018*"people" + 0.013*"more" + 0.012*"just" + 0.012*"nurse" + '
  '0

In [57]:
index = 1   # change this to see diff output
print(title_comments[list(title_comments.keys())[index]]["doc_lda"])

<gensim.interfaces.TransformedCorpus object at 0x00000206D613CE20>


In [58]:
def perplexity_coherence(title_key):
    corpus = title_comments[title_key]["corpus"]
    data_lemmatized = title_comments[title_key]["data_lemmatized"]
    id2word = title_comments[title_key]["id2word"]
    lda_model = title_comments[title_key]["lda_model"]

    # Compute Perplexity
    title_comments[title_key]["perplexity"] = lda_model.log_perplexity(corpus)
    # print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    title_comments[title_key]["coherence_model_lda"] = coherence_model_lda
    title_comments[title_key]["coherence_lda"] = coherence_lda

    # print('\nCoherence Score: ', coherence_lda)

for title in title_comments.keys():
    perplexity_coherence(title)

In [59]:
index = 1   # change this to see diff output
print(title_comments[list(title_comments.keys())[index]]["perplexity"])
print(title_comments[list(title_comments.keys())[index]]["coherence_lda"])

-6.932837000200108
0.5299064377495655


In [60]:
def optimal_model(title_key): 
    lda_model = title_comments[title_key]["lda_model"]

    optimal_model = lda_model
    model_topics = optimal_model.show_topics(formatted=False)

    title_comments[title_key]["model_topics"] = model_topics

for title in title_comments.keys():
    optimal_model(title)

In [61]:
index = 1   # change this to see diff output
print(title_comments[list(title_comments.keys())[index]]["model_topics"])

[(0, [('comment', 0.015130016), ('subreddit', 0.012740157), ('coronavirus', 0.012740157), ('discussion', 0.011380486), ('remove', 0.011380486), ('concern', 0.011379844), ('want', 0.009773806), ('allow', 0.008905278), ('post', 0.008635865), ('political', 0.008635865)]), (1, [('people', 0.018022068), ('more', 0.0128353555), ('just', 0.011825321), ('nurse', 0.011659771), ('infect', 0.011143839), ('let', 0.0105321165), ('so', 0.010481239), ('stupid', 0.010232635), ('child', 0.008301792), ('healthcare', 0.008258962)]), (2, [('protest', 0.02080872), ('people', 0.020408772), ('right', 0.014061776), ('just', 0.01384189), ('get', 0.010964323), ('think', 0.008228627), ('seriously', 0.008134189), ('hope', 0.007265694), ('wave', 0.0070253676), ('covid', 0.0066533373)]), (3, [('people', 0.04763542), ('get', 0.022073563), ('other', 0.021901375), ('right', 0.01811826), ('say', 0.010293005), ('traffic', 0.009393783), ('see', 0.008731382), ('water', 0.008515408), ('poor', 0.008515198), ('break', 0.0077

In [62]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """
    accept a ldamodel, atopic number and topn vocabs of interest
    prints a formatted list of the topn terms
    """
    terms = []
    for term, frequency in lda_model.show_topic(topic_number, topn=topn):
        terms += [term]
        if output:
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))
    
    return terms

In [63]:
def explore_topic_by_title(title_key):
    lda_model = title_comments[title_key]["lda_model"]
    topic_summaries = []

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
    print(title_key)
    for i in range(4):
        print('Topic '+str(i)+' |---------------------\n')
        tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
        #     print tmp[:5]
        topic_summaries += [tmp[:5]]
        # print

    title_comments[title_key]["topic_summaries"] = topic_summaries

for title in title_comments.keys():
    explore_topic_by_title(title)

term                 frequency

10,000 anti-lockdown protesters gather in London to claim coronavirus is ‘a hoax’
Topic 0 |---------------------

so                   0.028
moron                0.023
stupidity            0.021
disease              0.020
american             0.019
literally            0.017
free                 0.016
disabled             0.016
market               0.016
meanwhile            0.016
Topic 1 |---------------------

think                0.031
hoax                 0.029
let                  0.019
know                 0.018
people               0.015
intelligent          0.015
foot                 0.014
cuz                  0.014
virus                0.014
comment              0.013
Topic 2 |---------------------

people               0.026
just                 0.021
go                   0.010
make                 0.010
think                0.008
need                 0.008
more                 0.007
world                0.007
know                 0.007
medium 

people               0.027
just                 0.013
get                  0.013
isolate              0.012
lockdown             0.011
test                 0.011
go                   0.008
covid                0.008
positive             0.008
contact              0.008
Topic 1 |---------------------

comment              0.022
remove               0.015
more                 0.013
discussion           0.012
political_post       0.010
lock                 0.009
way                  0.008
life                 0.008
easy                 0.008
information          0.007
Topic 2 |---------------------

work                 0.030
go                   0.018
people               0.018
kid                  0.011
think                0.010
tell                 0.009
get                  0.009
home                 0.008
just                 0.008
week                 0.008
Topic 3 |---------------------

people               0.027
lockdown             0.022
go                   0.011
government   

sweden               0.020
people               0.016
swedish              0.014
think                0.013
dead                 0.011
economy              0.010
just                 0.010
swede                0.009
here                 0.009
death                0.008
term                 frequency

Austria approves lockdown for unvaccinated
Topic 0 |---------------------

vaccine              0.016
antivaxxer           0.012
austria              0.011
bot                  0.009
action               0.009
concern              0.009
contact              0.008
want                 0.008
comment              0.008
moderator            0.008
Topic 1 |---------------------

people               0.028
get                  0.028
vaccinate            0.019
don                  0.013
take                 0.013
just                 0.012
so                   0.010
country              0.010
measure              0.010
government           0.009
Topic 2 |---------------------

get                

still                0.018
people               0.012
immunity             0.012
week                 0.012
see                  0.011
time                 0.009
uptick               0.009
even                 0.009
really               0.009
mean                 0.009
Topic 1 |---------------------

people               0.013
video                0.013
mask                 0.013
case                 0.010
lockdown             0.010
store                0.010
more                 0.008
as                   0.008
country              0.008
keep                 0.008
Topic 2 |---------------------

antibody             0.015
population           0.011
test                 0.011
herd                 0.011
wave                 0.011
also                 0.011
already              0.011
theory               0.009
see                  0.009
immunity             0.008
Topic 3 |---------------------

test                 0.019
sensitivity          0.018
use                  0.016
sample       

<h1>Data Output - Dominant Topic</h1>

In [64]:
# Format filenames from titles
import string
for title in title_comments.keys():
    file_title = title.translate(str.maketrans('', '', string.punctuation))
    file_title = file_title.replace(" ", "_")
    file_title = file_title[0:10]
    title_comments[title]["file_title"] = file_title

In [65]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

for title in title_comments.keys():
    lda_model = title_comments[title]["lda_model"]
    corpus = title_comments[title]["corpus"]
    texts = title_comments[title]["texts"]
    file_name = title_comments[title]["file_title"]
    
    df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, texts)
    
    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    
    title_comments[title]["df_dominant_topic"] = df_dominant_topic
    
    df_dominant_topic.to_csv(output_path + file_name + ".csv")

<h1>Data Visualization</h1>

<h3>PyLDAVis</h3>

In [68]:
import pyLDAvis.gensim_models as gensimvis
import pickle
import string

# Visualize the topics
for title in title_comments.keys():
    lda_model = title_comments[title]["lda_model"]
    corpus = title_comments[title]["corpus"]
    id2word = title_comments[title]["id2word"]
    file_name = title_comments[title]["file_title"]

    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    
    file_path = visuals_output_path + file_name + '.html'
    pyLDAvis.save_html(LDAvis_prepared, file_path)
    
    with open(file_path, 'r+') as f:
        content = f.read()
        f.seek(0, 0)
        f.write("<h1>" + title.rstrip('\r\n') + "</h1>" + '\n' + content)
        f.close()