<h1>Reddit Scraped Comments</h1>

<h3>Installation and import of libraries</h3>

In [1]:
!pip install pyLDAvis



In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

import nltk
nltk.download('stopwords') #download if don't have yet
from nltk.tokenize import word_tokenize, RegexpTokenizer # tokenize words
from nltk.corpus import stopwords

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lindy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h3>Viewing Data</h3>

In [3]:
# Folder holding the scraped Reddit comment CSVs, and the file name of each data set
folder_path = "../Reddit_Scraped_Comments/"
file1, file2, file3 = (
    "anti_lockdown_comments.csv",
    "corona_lockdown_comments.csv",
    "covid19_lockdown_comments.csv",
)

In [4]:
# Load the anti-lockdown comments; the bare name on the last line displays the frame
anti_lockdown_comments_df = pd.read_csv(f"{folder_path}{file1}")
anti_lockdown_comments_df

Unnamed: 0.1,Unnamed: 0,id,author,title,score,comms_num,created,timestamp,commenter,comment,top_lvl
0,0,ij071t,casualphilosopher1,"10,000 anti-lockdown protesters gather in Lond...",3039,600,1.598735e+09,2020-08-30 05:05:25,schu4KSU,People like this everywhere in the world. Dif...,1
1,0,ij071t,casualphilosopher1,"10,000 anti-lockdown protesters gather in Lond...",3039,600,1.598735e+09,2020-08-30 05:05:25,StupidizeMe,"Well, there's certainly enough of them to caus...",0
2,0,ij071t,casualphilosopher1,"10,000 anti-lockdown protesters gather in Lond...",3039,600,1.598735e+09,2020-08-30 05:05:25,Eltharion-the-Grim,They are largely absent from Asia. The only pl...,0
3,0,ij071t,casualphilosopher1,"10,000 anti-lockdown protesters gather in Lond...",3039,600,1.598735e+09,2020-08-30 05:05:25,Thisam,"Yup, the percentage of population who are easi...",0
4,0,ij071t,casualphilosopher1,"10,000 anti-lockdown protesters gather in Lond...",3039,600,1.598735e+09,2020-08-30 05:05:25,,Short answer: Yes. Look at the USA and how Bre...,0
...,...,...,...,...,...,...,...,...,...,...,...
705,0,g4kdfs,vostok-Abdullah,Counter-Protesters in Scrubs Block Some Anti-L...,1483,192,1.587349e+09,2020-04-20 10:16:05,,[removed],1
706,0,g4kdfs,vostok-Abdullah,Counter-Protesters in Scrubs Block Some Anti-L...,1483,192,1.587349e+09,2020-04-20 10:16:05,AutoModerator,Your comment has been removed because\n\n* **I...,0
707,0,g4kdfs,vostok-Abdullah,Counter-Protesters in Scrubs Block Some Anti-L...,1483,192,1.587349e+09,2020-04-20 10:16:05,Ameriican,They broke quarintine to tell others to not br...,1
708,0,g4kdfs,vostok-Abdullah,Counter-Protesters in Scrubs Block Some Anti-L...,1483,192,1.587349e+09,2020-04-20 10:16:05,AshingiiAshuaa,If the protesters' goal is to clog up streets ...,1


In [5]:
# Load the corona-lockdown comments; the bare name on the last line displays the frame
corona_lockdown_comments_df = pd.read_csv(f"{folder_path}{file2}")
corona_lockdown_comments_df

Unnamed: 0.1,Unnamed: 0,id,author,title,score,comms_num,created,timestamp,commenter,comment,top_lvl
0,0,g1hpwu,Johari82,Ending coronavirus lockdowns will be a dangero...,6967,1484,1.586912e+09,2020-04-15 08:46:34,Skooter_McGaven,I wish we had an understanding of where the ma...,1
1,0,g1hpwu,Johari82,Ending coronavirus lockdowns will be a dangero...,6967,1484,1.586912e+09,2020-04-15 08:46:34,Richandler,> Is it overly family spread?\n\nI believe the...,0
2,0,g1hpwu,Johari82,Ending coronavirus lockdowns will be a dangero...,6967,1484,1.586912e+09,2020-04-15 08:46:34,,This sounds horrible and I hate myself a littl...,0
3,0,g1hpwu,Johari82,Ending coronavirus lockdowns will be a dangero...,6967,1484,1.586912e+09,2020-04-15 08:46:34,lcbk,My husband and I are not yet confirmed to have...,0
4,0,g1hpwu,Johari82,Ending coronavirus lockdowns will be a dangero...,6967,1484,1.586912e+09,2020-04-15 08:46:34,ZombiGrn,In my neighborhood people started throwing par...,0
...,...,...,...,...,...,...,...,...,...,...,...
455,0,gr29as,frequenttimetraveler,Strict Physical Distancing May Be More Efficie...,909,352,1.590517e+09,2020-05-27 02:10:44,stillobsessed,A surprise lockdown period of 30 days wouldn't...,0
456,0,gr29as,frequenttimetraveler,Strict Physical Distancing May Be More Efficie...,909,352,1.590517e+09,2020-05-27 02:10:44,thisrockismyboone,Society would cease to function without infras...,0
457,0,gr29as,frequenttimetraveler,Strict Physical Distancing May Be More Efficie...,909,352,1.590517e+09,2020-05-27 02:10:44,SamH123,\- this depends a bit on what stage of spread ...,0
458,0,gr29as,frequenttimetraveler,Strict Physical Distancing May Be More Efficie...,909,352,1.590517e+09,2020-05-27 02:10:44,reini_urban,Strict physical distance may be statistically ...,1


<h3>Cleaning Data</h3>

In [6]:
# Drop rows (not columns) whose comment is the "[removed]" placeholder or is missing.
# Missing values are dropped as well because the cleaning helpers pass every comment to
# re.sub, which raises TypeError for NaN.
anti_lockdown_comments_df = anti_lockdown_comments_df[
    anti_lockdown_comments_df["comment"].notna()
    & (anti_lockdown_comments_df["comment"] != "[removed]")
]
corona_lockdown_comments_df = corona_lockdown_comments_df[
    corona_lockdown_comments_df["comment"].notna()
    & (corona_lockdown_comments_df["comment"] != "[removed]")
]

In [7]:
# Merge the comment text of both data sets into one flat list of documents
list1 = anti_lockdown_comments_df["comment"].tolist()
list2 = corona_lockdown_comments_df["comment"].tolist()
comments = [*list1, *list2]

# Sanity check: corpus size, then the first two raw comments
print(len(comments))
for sample_idx in (0, 1):
    print(comments[sample_idx])

1095
People like this everywhere in the world.  Difference is, are there enough to gain political power?
Well, there's certainly enough of them to cause a massive surge in COVID-19 cases and deaths, and to take some of us with them.


In [8]:
def remove_special_chars(text):
    """Replace every run of characters listed in the pattern with a single space.

    The character class covers the digits 0-9, most ASCII punctuation and several
    CJK / typographic marks. A run of consecutive matches becomes one space; the
    text is not deleted outright.
    """
    special_char_run = re.compile('[0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+')
    return special_char_run.sub(' ', text)

def remove_digit_strings(text):
    """Delete every run of consecutive digits from ``text``; nothing is inserted in its place."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def remove_links(text):
    """Delete every occurrence of ``http`` followed by one or more non-whitespace characters."""
    link_token = re.compile(r'http\S+')
    return link_token.sub('', text)

def clean_text(text):
    """Normalize one comment: strip links, then digit runs, then special characters, then lower-case.

    The order matters: links are removed before the special-character pass would
    split them apart.
    """
    without_links = remove_links(text)
    without_digits = remove_digit_strings(without_links)
    return remove_special_chars(without_digits).lower()

In [9]:
# Clean every comment, keeping the original order
comments = [clean_text(raw_comment) for raw_comment in comments]

In [10]:
# Show the first two comments after cleaning
for sample_idx in (0, 1):
    print(comments[sample_idx])

people like this everywhere in the world   difference is  are there enough to gain political power 
well  there s certainly enough of them to cause a massive surge in covid  cases and deaths  and to take some of us with them 


<h3>Tokenizing Data</h3>

In [11]:
# Stopwords
stop_words = stopwords.words('english')
# Copy the list: a plain assignment would alias `stop_words`, so extending
# `exclude_words` below would silently add the extra words to `stop_words` too.
exclude_words = list(stop_words)

# Extra common words to exclude on top of the NLTK English stopwords
exclude_words_extra = ["covid","lockdown", "pandemic","get","go","let","im","ive","would","one","also","to","say","day","well","month","thing"]

exclude_words.extend(exclude_words_extra)

In [12]:
def sent_to_words(sentences):
    """Lazily yield one token list per sentence, produced by gensim's ``simple_preprocess``.

    Each sentence is coerced to ``str`` first. ``deacc=True`` makes gensim strip
    accent marks from the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

# Materialize the generator: one list of tokens per cleaned comment
data_words = list(sent_to_words(comments))

In [13]:
# Build the bigram and trigram models
# The bigram model is trained on the tokenized comments; the trigram model is trained on
# those comments after the bigram model has been applied to them.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example: the first comment passed through the bigram model, then the trigram model
print(trigram_mod[bigram_mod[data_words[0]]])

['people', 'like', 'this', 'everywhere', 'in', 'the', 'world', 'difference', 'is', 'are', 'there', 'enough', 'to', 'gain', 'political', 'power']


In [14]:
def remove_stopwords(texts):
    """Return ``texts`` with every word listed in the module-level ``exclude_words`` removed.

    Each document is converted with ``str()`` and re-tokenized by gensim's
    ``simple_preprocess`` before filtering, exactly as before.

    Args:
        texts: iterable of documents (token lists or strings).

    Returns:
        A list with one list of kept tokens per document.
    """
    # Build the lookup set once per call: membership tests against the
    # `exclude_words` list are linear in its length and ran for every token.
    excluded = set(exclude_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in excluded] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to every document in ``texts``."""
    with_bigrams = []
    for tokens in texts:
        with_bigrams.append(bigram_mod[tokens])
    return with_bigrams

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``."""
    with_trigrams = []
    for tokens in texts:
        bigrammed = bigram_mod[tokens]
        with_trigrams.append(trigram_mod[bigrammed])
    return with_trigrams

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each document and keep only tokens whose part of speech is allowed.

    Every token list is joined with spaces, run through the module-level spaCy
    pipeline ``nlp``, and reduced to the lemmas of tokens whose ``pos_`` is in
    ``allowed_postags``.

    Returns:
        A list with one list of lemmas per input document.
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(sent) for sent in texts]

In [15]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the spaCy 'en_core_web_sm' pipeline with the parser and ner components disabled
# If the model is missing: python3 -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the lemmas kept for the first comment
print(data_lemmatized[:1])

[['people', 'everywhere', 'world', 'difference', 'enough', 'gain', 'political', 'power']]


In [16]:
# Map every unique lemma to an integer id
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is built from
texts = data_lemmatized

# Bag-of-words corpus: one list of (token id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# First document as raw (id, count) pairs
print(corpus[:1])

# Same document with the ids resolved back to tokens (term-frequency view)
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]]


[[('difference', 1),
  ('enough', 1),
  ('everywhere', 1),
  ('gain', 1),
  ('people', 1),
  ('political', 1),
  ('power', 1),
  ('world', 1)]]

<h3>LDA Model</h3>

In [19]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train an LDA model with the given hyper-parameters and return its c_v coherence.

    Args:
        corpus: bag-of-words corpus the model is trained on.
        dictionary: id-to-word mapping used for training and for the coherence model.
        k: number of topics.
        a: value passed to LdaMulticore as ``alpha``.
        b: value passed to LdaMulticore as ``eta``.
        texts: tokenized documents for the coherence computation; when omitted the
            module-level ``data_lemmatized`` is used, as before.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    if texts is None:
        texts = data_lemmatized

    # Use the `dictionary` argument here; the previous code ignored it and read the
    # module-level `id2word`, which is only correct when the two happen to be the same.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()

# supporting function 2
def compute_perplexity_values(corpus, dictionary, k, a, b):
    """Train an LDA model with the given hyper-parameters and return its log-perplexity bound.

    Args:
        corpus: bag-of-words corpus used both to train and to score the model.
        dictionary: id-to-word mapping passed to LdaMulticore.
        k: number of topics.
        a: value passed to LdaMulticore as ``alpha``.
        b: value passed to LdaMulticore as ``eta``.

    Returns:
        The value of ``lda_model.log_perplexity(corpus)``.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)

    # Score the corpus that was passed in. The previous code read the globals
    # `corpus_sets[i]`, so the function only worked inside the grid-search loop
    # and ignored its own `corpus` argument.
    perplexity_score = lda_model.log_perplexity(corpus)

    return perplexity_score

In [None]:
import numpy as np
import tqdm

grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 11
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets (only the full corpus is active; add a matching entry to
# `corpus_title` for every subset that is enabled)
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
               # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.5)),
               # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
               corpus]
corpus_title = ['100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': [],
                 'Perplexity': []
                }

# Can take a long time to run: each parameter combination calls both helper
# functions, and each of them trains its own LDA model.
run_grid_search = True
if run_grid_search:
    # Derive the progress total from the grid instead of hard-coding it, so the bar
    # stays correct when any of the ranges above changes.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

    # The context manager closes the progress bar even if a run raises.
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpuses
        # (`i` is kept as a module-level index; helpers in this notebook may read `corpus_sets[i]`)
        for i in range(len(corpus_sets)):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence and perplexity scores for the given parameters
                        cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word,
                                                      k=k, a=a, b=b)
                        p = compute_perplexity_values(corpus=corpus_sets[i], dictionary=id2word,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)
                        model_results['Perplexity'].append(p)

                        pbar.update(1)
    pd.DataFrame(model_results).to_csv('./lda_tuning_results.csv', index=False)


  0%|▎                                                                             | 1/271 [02:45<12:24:36, 165.47s/it][A

  0%|▎                                                                              | 1/271 [01:46<7:57:28, 106.10s/it][A
  1%|▌                                                                              | 2/271 [03:32<7:55:54, 106.15s/it][A
  1%|▊                                                                              | 3/271 [05:16<7:49:50, 105.19s/it][A
  1%|█▏                                                                             | 4/271 [06:58<7:43:15, 104.10s/it][A
  2%|█▍                                                                             | 5/271 [08:43<7:42:05, 104.23s/it][A
  2%|█▋                                                                             | 6/271 [10:34<7:50:51, 106.61s/it][A

In [None]:
# Build LDA model
# Fit the final LDA model on the full bag-of-words corpus
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

In [None]:
# Print the keywords of the topics (lda_model is built with num_topics=4, not 10)
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# log_perplexity returns the per-word likelihood bound (a log-scale value), not the
# perplexity itself; gensim defines perplexity as 2 ** (-bound). The old label and its
# "lower the better" note therefore described the wrong quantity.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)  # higher (closer to zero) is better
print('Perplexity: ', 2 ** (-per_word_bound))  # lower is better

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
# Build the pyLDAvis view from the trained model, the corpus and the dictionary;
# the bare `vis` on the last line is what the cell displays.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

In [None]:
# Treat the model trained above as the chosen model (same object, second name)
optimal_model = lda_model
# Unformatted topics as returned by show_topics(formatted=False)
model_topics = optimal_model.show_topics(formatted=False)
# Pretty-print every topic with its 10 highest-weighted words
pprint(optimal_model.print_topics(num_words=10))

In [None]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """Return the ``topn`` highest-ranked terms of one topic, optionally printing them.

    Args:
        lda_model: model exposing ``show_topic(topic_number, topn=...)`` that yields
            ``(term, weight)`` pairs.
        topic_number: index of the topic to inspect.
        topn: number of terms to fetch.
        output: when true, print one ``term  weight`` line per term, with the weight
            shown to three decimals.

    Returns:
        The list of terms, in the order the model returned them.
    """
    ranked_terms = list(lda_model.show_topic(topic_number, topn=topn))
    if output:
        for word, weight in ranked_terms:
            print(u'{:20} {:.3f}'.format(word, round(weight, 3)))
    return [word for word, _ in ranked_terms]

In [None]:
# Print the top 10 terms of every topic and keep the first 5 of each as a short summary
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
# Iterate over the model's own topic count instead of a hard-coded 4
for i in range(lda_model.num_topics):
    print('Topic '+str(i)+' |---------------------\n')
    tmp = explore_topic(lda_model,topic_number=i, topn=10, output=True )
    topic_summaries += [tmp[:5]]
    # Blank line between topics; the bare `print` used before is a no-op in Python 3
    print()