# Load and Prepare Data

In [40]:
#load data
import pandas as pd
path = "Downloads/"
dat = pd.read_pickle(path+'webhose_cat.pkl')

In [41]:
#prepare data for model
import re
import string
import nltk as nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora

# Filter non-English news
dat = dat[dat['language']=='english'].reset_index(drop=True)

# Remove special characters to avoid problems with analysis
dat = dat['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))


stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

dat_clean = [clean(doc).split() for doc in dat] 

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(dat_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in dat_clean]

# Build LDA model

In [42]:
#number of topics = 3
from gensim import models
import pyLDAvis.gensim

Lda = gensim.models.ldamodel.LdaModel
numtopics = 3
ldamodel3 = Lda(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)
print(*ldamodel3.print_topics(num_topics=numtopics, num_words=3), sep='\n')
print(*ldamodel3.print_topics(num_topics=numtopics, num_words=5), sep='\n')
print(*ldamodel3.print_topics(num_topics=numtopics, num_words=10), sep='\n')

(0, '0.014*"market" + 0.010*"tax" + 0.008*"plant"')
(1, '0.008*"share" + 0.007*"jan" + 0.007*"caterpillar"')
(2, '0.007*"amazon" + 0.006*"sphere" + 0.006*"seattle"')
(0, '0.014*"market" + 0.010*"tax" + 0.008*"plant" + 0.008*"u" + 0.005*"china"')
(1, '0.008*"share" + 0.007*"jan" + 0.007*"caterpillar" + 0.006*"inc" + 0.006*"company"')
(2, '0.007*"amazon" + 0.006*"sphere" + 0.006*"seattle" + 0.004*"work" + 0.004*"company"')
(0, '0.014*"market" + 0.010*"tax" + 0.008*"plant" + 0.008*"u" + 0.005*"china" + 0.005*"case" + 0.005*"company" + 0.005*"global" + 0.005*"year" + 0.004*"industry"')
(1, '0.008*"share" + 0.007*"jan" + 0.007*"caterpillar" + 0.006*"inc" + 0.006*"company" + 0.005*"2017" + 0.005*"estimate" + 0.005*"median" + 0.005*"city" + 0.004*"university"')
(2, '0.007*"amazon" + 0.006*"sphere" + 0.006*"seattle" + 0.004*"work" + 0.004*"company" + 0.004*"space" + 0.004*"employee" + 0.004*"2018" + 0.003*"caterpillar" + 0.003*"monday"')


In [43]:
lda_display3 = pyLDAvis.gensim.prepare(ldamodel3, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [44]:
numtopics = 10
ldamodel10 = Lda(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

In [45]:
lda_display10 = pyLDAvis.gensim.prepare(ldamodel10, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [46]:
numtopics = 20
ldamodel20 = Lda(doc_term_matrix, num_topics=numtopics, id2word = dictionary, passes=50)

In [47]:
lda_display20 = pyLDAvis.gensim.prepare(ldamodel20, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display20)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# Use Coherence Scores

In [48]:
#build a few more models to see how coherence socre change
ldamodel5 = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)
ldamodel7 = Lda(doc_term_matrix, num_topics=7, id2word = dictionary, passes=50)
ldamodel13 = Lda(doc_term_matrix, num_topics=13, id2word = dictionary, passes=50)
ldamodel15 = Lda(doc_term_matrix, num_topics=15, id2word = dictionary, passes=50)
ldamodel20 = Lda(doc_term_matrix, num_topics=20, id2word = dictionary, passes=50)

In [49]:
from gensim.models import CoherenceModel
data = {'Model':['lda3', 'lda5', 'lda7', 'lda10', 'lda13', 'lda15', 'lda20'], 
        'Scores':[CoherenceModel(model=ldamodel3, texts=dat_clean, dictionary=dictionary, coherence='c_v').get_coherence(), 
                  CoherenceModel(model=ldamodel5, texts=dat_clean, dictionary=dictionary, coherence='c_v').get_coherence(), 
                  CoherenceModel(model=ldamodel7, texts=dat_clean, dictionary=dictionary, coherence='c_v').get_coherence(), 
                  CoherenceModel(model=ldamodel10, texts=dat_clean, dictionary=dictionary, coherence='c_v').get_coherence(),
                  CoherenceModel(model=ldamodel13, texts=dat_clean, dictionary=dictionary, coherence='c_v').get_coherence(),
                  CoherenceModel(model=ldamodel15, texts=dat_clean, dictionary=dictionary, coherence='c_v').get_coherence(),
                  CoherenceModel(model=ldamodel20, texts=dat_clean, dictionary=dictionary, coherence='c_v').get_coherence()]} 


In [50]:
pd.DataFrame(data)

Unnamed: 0,Model,Scores
0,lda3,0.340653
1,lda5,0.370999
2,lda7,0.454944
3,lda10,0.5438
4,lda13,0.458448
5,lda15,0.513863
6,lda20,0.507762


From the coherence scores list above we can see that the score increase as number of topics increase until around N=10, then the coherence score decreases as N grows larger. Even though there is slight trend of increase in coherence score when N increase at 15, the increase is not significant and does not surpass the score when N=10.  Therefore, we selected N=10 as our best numbers of topics. 

As we look at the graphs produced previously, when N=3, the bubbles are far away from each others, and the top terms are more general and does not contain a lot of distinct terms. For N=10, the bubbles are more well-spread and not overlapping, and the top terms are more specific and contain distinct information in them. For N=20, the bubbles are slightly overlapping, and the top terms are similar to the top terms in N=10. Therefore, it makes sense to choose N=10 for our best number of topics. 