In [1]:
# Load the gensim / nltk tooling used throughout the notebook and fix the random seed.
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
# NOTE(review): wildcard import; the visible cells only use names that are imported
# explicitly above -- confirm nothing else relies on it before removing.
from nltk.stem.porter import *
import numpy as np
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
# Seed NumPy's global random generator once, before any model is trained.
# NOTE(review): presumably this is what makes LdaModel runs without an explicit
# random_state repeatable after "Restart & Run All" -- confirm.
np.random.seed(2018)
import nltk
# Download the WordNet data (presumably required by WordNetLemmatizer, which is
# used in lemmatize_stemming).
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Min\Anaconda3\lib\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Domain-specific stop words, applied by preprocess() on top of gensim's STOPWORDS.
#
# preprocess() compares this list against single tokens produced by
# gensim.utils.simple_preprocess, which lower-cases the text and emits
# alphabetic-only tokens (e.g. the title "Covid-19" ends up as the token 'covid').
# Entries containing upper-case letters, digits, hyphens or spaces
# ('2019nCoV', 'Covid-19', 'wuhan pneumonia', 'novel coronavirus', 'corona virus')
# can therefore never match a token. They are kept for backward compatibility,
# and the token forms they were meant to block are listed explicitly at the end.
stopWords = [
    'author', 'virus', 'wuhan', 'coronavirus', '2019nCoV', 'Covid-19', 'covid', 'singapore',
    'wuhan pneumonia', 'novel coronavirus', 'corona virus', 'wuhanvirus', 'https', 'http', 'leea',
    # Token forms of the unmatched entries above:
    'ncov',    # what '2019nCoV' is tokenised to
    'corona',  # first half of 'corona virus' ('virus' is already listed)
]

In [3]:
# Tokenise a raw post and filter its tokens (lemmatization/stemming is currently disabled)
def preprocess(text):
    """Tokenise ``text`` and drop stop words and short tokens.

    The text is tokenised with ``gensim.utils.simple_preprocess``. A token is kept
    only if it is longer than 3 characters, is not in gensim's built-in
    ``STOPWORDS`` and is not in the notebook-level ``stopWords`` list.
    Tokens are returned as-is; ``lemmatize_stemming`` is not applied.

    Parameters
    ----------
    text : str
        Raw text of one post.

    Returns
    -------
    list of str
        The surviving tokens in their original order. An empty list is returned
        when ``text`` is not text at all (e.g. NaN, None or an already-tokenised
        list), matching the previous best-effort behaviour for bad rows.
    """
    try:
        tokens = gensim.utils.simple_preprocess(text)
    except TypeError:
        # Only non-text input is tolerated here. Every other failure (for example
        # a missing `stopWords` definition, or a keyboard interrupt) must surface
        # instead of silently producing an empty document.
        return []

    builtin_stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in tokens
        if len(token) > 3
        and token not in builtin_stop_words
        and token not in stopWords
    ]
def lemmatize_stemming(text):
    """Return the English Snowball stem of the verb lemma of ``text``.

    The word is first lemmatized as a verb (``pos='v'``) with WordNetLemmatizer,
    and the resulting lemma is then stemmed with an English SnowballStemmer.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return SnowballStemmer("english").stem(verb_lemma)


In [4]:
# Load the scraped Reddit posts and blank out missing values in a single step,
# so that the later string concatenation of title/body does not hit NaN.
posts = pd.read_csv('covid_Reddit_output_21_03_2020.csv').replace(np.nan, ' ', regex=True)
posts.head(3)

Unnamed: 0,author,title,score,id,url,comms_num,created,body,timestamp
0,satoshigekkouga2303,Singapore reports two deaths from Covid-19,1201,fm8vm4,https://www.straitstimes.com/singapore/singapo...,254,1584791000.0,,2020-03-21 19:50:47
1,SadKaleidoscope2,"[Update] SG COVID-19 cases by cluster, as of 2...",83,fm6r6m,https://i.redd.it/qn3182yvfxn41.png,6,1584782000.0,,2020-03-21 17:16:26
2,justmewayne,PASSING OF TWO PATIENTS WITH COVID-19 INFECTION,63,fm90ns,https://www.moh.gov.sg/news-highlights/details...,30,1584792000.0,,2020-03-21 20:01:51


In [6]:
# Build one text field per post by joining its title and body with a space.
# NOTE(review): the trailing ' ' looks like the separator for the disabled
# top_main_comment column below -- confirm before removing either.
posts['combined_body'] = posts['title'] + ' ' + posts['body'] + ' ' 
# + posts['top_main_comment']

In [7]:
# Spot-check the combined text of the post whose index label is 2
posts.loc[2, 'combined_body']

'PASSING OF TWO PATIENTS WITH COVID-19 INFECTION   '

In [8]:
# Tokenise every post. This cell overwrites its own input column, so it must be
# safe to run more than once: rows that are already token lists are left alone.
# Without the guard, a second run would pass those lists to preprocess(), which
# returns [] for non-text input and would silently empty the whole corpus.
posts['combined_body'] = posts['combined_body'].map(
    lambda doc: doc if isinstance(doc, list) else preprocess(doc)
)
print('Pre-processing is done.')

Pre-processing is done.


In [9]:
# Build the gensim dictionary from the tokenised posts and the bag-of-words corpus used for LDA
dictionary = gensim.corpora.Dictionary(posts['combined_body'])
print(dictionary)
corpus = list(map(dictionary.doc2bow, posts['combined_body']))

# Preview the first 21 (id, token) pairs of the dictionary
count = 0
for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
    print(k, v)
    if count > 20:
        break
print('Dictionary is built.')

Dictionary(2431 unique tokens: ['covid', 'deaths', 'reports', 'singapore', 'cases']...)
0 covid
1 deaths
2 reports
3 singapore
4 cases
5 cluster
6 update
7 infection
8 passing
9 patients
10 airline
11 airlines
12 announced
13 anymore
14 barrier
15 block
16 blocking
17 bringing
18 cause
19 considering
20 curious
Dictionary is built.


In [10]:
from pprint import pprint
from gensim import corpora, models

# Bag-of-words vector for every tokenised post
bow_corpus = list(map(dictionary.doc2bow, posts['combined_body']))

# TF-IDF model fitted on the bag-of-words corpus, then applied to it
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Show the TF-IDF weights of the first document only
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.00805468636067354),
 (1, 0.7717216829654875),
 (2, 0.6181712120032168),
 (3, 0.1491479759132541)]


In [11]:
def compute_coherence_values(dictionary, corpus, texts, limit, start, step, coherence, random_state=None):
    """
    Train one LDA model per candidate number of topics and score each with a coherence measure.

    The candidate topic counts are ``range(start, limit, step)``; an empty range
    yields two empty lists.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive, as in ``range``)
    start : Smallest number of topics to try
    step : Increment between consecutive numbers of topics
    coherence : Name of the coherence measure handed to CoherenceModel (e.g. 'u_mass')
    random_state : Optional seed forwarded to every LdaModel so the sweep can be
        reproduced independently of global random state. The default (None)
        keeps the previous behaviour.

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [12]:
# Numbers of topics to evaluate: range(start, limit, step) -> 5, 10, ..., 40
start = 5
limit = 41
step = 5

# Errors are deliberately not caught here. Printing the exception and carrying on
# would leave `coherence_values_umass` undefined, and the plotting cell would then
# fail with an unrelated NameError that hides the real cause.
model_list, coherence_values_umass = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus,
    texts=posts['combined_body'],
    start=start,
    limit=limit,
    step=step,
    coherence='u_mass',
)

In [13]:
# Show graph
import matplotlib.pyplot as plt

# start / limit / step are reused from the coherence-evaluation cell, so the
# x axis always matches the topic counts that were actually scored.
x = range(start, limit, step)
# Label the line itself. The previous legend call passed the bare string
# ("coherence_values") -- parentheses without a comma do not make a tuple -- so
# matplotlib treated each character as a label and the legend showed only "c".
plt.plot(x, coherence_values_umass, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score for u_mass")
plt.legend(loc='best')
plt.show()

<Figure size 640x480 with 1 Axes>

In [14]:
# Train the final LDA model on the bag-of-words corpus.
# Edit num_topics below to change the number of topics generated.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=8,
    random_state=100,
    alpha='symmetric',
    passes=10,
    iterations=100,
    chunksize=10,
    update_every=1,
    per_word_topics=True,
)


In [15]:
# Print each topic id together with its weighted top words
# (num_topics=-1 asks gensim for every topic)
for idx, topic in lda_model.print_topics(num_topics=-1):
    message = 'Topic: {} \nWords: {}'.format(idx, topic)
    print(message)

Topic: 0 
Words: 0.041*"know" + 0.017*"covid" + 0.015*"airlines" + 0.014*"tickets" + 0.013*"thanks" + 0.013*"counts" + 0.013*"queued" + 0.012*"flights" + 0.010*"real" + 0.010*"recently"
Topic: 1 
Words: 0.103*"covid" + 0.064*"singapore" + 0.039*"discharged" + 0.029*"infection" + 0.022*"confirmed" + 0.016*"student" + 0.012*"confirms" + 0.010*"couple" + 0.010*"ward" + 0.009*"restaurant"
Topic: 2 
Words: 0.051*"singapore" + 0.019*"covid" + 0.015*"malaysia" + 0.014*"going" + 0.013*"supply" + 0.011*"travel" + 0.011*"near" + 0.011*"getting" + 0.010*"https" + 0.010*"south"
Topic: 3 
Words: 0.021*"countries" + 0.021*"case" + 0.017*"think" + 0.015*"food" + 0.014*"sure" + 0.013*"month" + 0.013*"work" + 0.012*"chinese" + 0.011*"world" + 0.011*"currently"
Topic: 4 
Words: 0.079*"people" + 0.064*"covid" + 0.063*"cases" + 0.031*"feel" + 0.017*"safra" + 0.015*"cluster" + 0.014*"says" + 0.013*"jurong" + 0.013*"including" + 0.011*"linked"
Topic: 5 
Words: 0.051*"situation" + 0.044*"covid" + 0.026*"test

In [16]:
# Interactive topic visualisation.
# pyLDAvis 3.3 renamed its gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old one
# so the cell runs on both old and current installations.
import pyLDAvis
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
# Keep this as the last expression so the notebook renders the prepared view
gensimvis.prepare(lda_model, corpus, dictionary)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
