This notebook follows the topic-modeling approach described in this blog post:
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [1]:
from gensim import corpora, models 
import numpy as np
import sys
import os
import gensim
import pickle
import nltk
from collections import Counter
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
from gensim.models import CoherenceModel
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Put the local ./scripts directory first on the module search path.
python_root = './scripts'
sys.path.insert(0, python_root)



#### Load data

In [2]:
# Load the pre-processed corpus. The context manager guarantees the file
# handle is closed once the load finishes (or fails).
# NOTE: unpickling can execute arbitrary code; only load a trusted file.
with open('./data/processed_corpus.p', "rb") as corpus_file:
    paras = pickle.load(corpus_file)

In [3]:
# Show the first entry of the loaded corpus as a sanity check.
print(paras[0])

['1', 'as', 'a', 'small', 'open', 'tourism', 'base', 'economy', 'st._lucia', 'be', 'highly_vulnerable_to_exogenous_shock', 'tourism', 'account', 'for', 'over', 'three_quarter', 'of', 'export', 'and', 'the', 'import_content', 'of', 'both', 'consumption', 'and', 'foreign_direct_investment', 'fdi', 'be', 'very', 'high', 'figure_1', 'the', 'economy', 'have', 'be', 'buffet', 'by', 'the', 'global', 'economic', 'downturn', 'which', 'have', 'hobble', 'the', 'tourism', 'and', 'construction', 'sector', 'with', 'potential', 'spillover', 'to', 'the', 'financial', 'sector']


#### Use Gensim for topic modeling

In [4]:
## remove stop words
# Start from NLTK's English stop words and add corpus-specific filler tokens.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.extend(['-PRON-','percent','’s','would','also',
                      'year','continue','include','give','may','new',
                      'however','well','help','since'])
# A set gives constant-time membership tests; testing against the list
# would rescan every stop word for every token in the corpus.
stopword_set = set(stopword_list)
norm_tokenized_corpus = [[token for token in p if token not in stopword_set] for p in paras]

In [5]:
def _flatten(container):
    for i in container:
        if isinstance(i, (list,tuple)):
            for j in _flatten(i):
                yield j
        else:
            yield i

In [6]:
# Flatten the filtered corpus into one token list and count frequencies.
words = [token for token in _flatten(norm_tokenized_corpus)]
wc = Counter()
wc.update(words)
wc.most_common(30)

[('authority', 118184),
 ('growth', 106963),
 ('fiscal', 106076),
 ('bank', 102460),
 ('sector', 98047),
 ('policy', 97760),
 ('gdp', 88619),
 ('increase', 84369),
 ('financial', 81209),
 ('government', 80711),
 ('reform', 77477),
 ('debt', 74619),
 ('public', 71985),
 ('rate', 69359),
 ('staff', 69139),
 ('high', 62180),
 ('remain', 58921),
 ('need', 57091),
 ('tax', 56207),
 ('program', 55408),
 ('external', 54081),
 ('economic', 52831),
 ('support', 52515),
 ('improve', 52027),
 ('market', 49549),
 ('risk', 46722),
 ('exchange', 46034),
 ('project', 45026),
 ('investment', 44747),
 ('revenue', 44276)]

In [7]:
# Build the vocabulary from the stop-word-filtered corpus.
dictionary = corpora.Dictionary(norm_tokenized_corpus)
# Prune the vocabulary in place with the thresholds below.
dictionary.filter_extremes(
    no_below=5,
    no_above=0.5,
    keep_n=10000,
)
# Convert every document into its bag-of-words representation.
corpus_bow = list(map(dictionary.doc2bow, norm_tokenized_corpus))
# Compute TF-IDF feature vectors from the bag-of-words corpus.
# smartirs = 'atc' https://radimrehurek.com/gensim/models/tfidfmodel.html
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

In [8]:
# Vocabulary size after filter_extremes.
len(dictionary)

10000

In [9]:
## a better way to print
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    """Print the leading terms of the first ``total_topics`` topics.

    Parameters
    ----------
    topic_model : object exposing ``show_topic(index, topn=...)`` that
        returns ``(word, weight)`` pairs.
    total_topics : int
        Number of topics to print, starting at topic index 0.
    weight_threshold : float
        Terms whose absolute weight is below this value are dropped.
    display_weights : bool
        If True print ``(word, weight)`` pairs (weights rounded to 4
        decimals); otherwise print only the words.
    num_terms : int or None
        Passed to ``show_topic`` as ``topn`` and used to cap the printed
        list; a falsy value prints every term that survives the threshold.

    Returns
    -------
    None. Output is written to stdout, with a blank line after each topic.
    """
    for index in range(total_topics):
        topic = topic_model.show_topic(index, topn=num_terms)
        topic = [(word, round(wt, 4))
                 for word, wt in topic
                 if abs(wt) >= weight_threshold]
        if display_weights:
            print('Topic #'+str(index+1)+' with weights')
            print(topic[:num_terms] if num_terms else topic)
        else:
            print('Topic #'+str(index+1)+' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)
        # A bare `print` is a no-op expression in Python 3; call it so a
        # blank line actually separates consecutive topics.
        print()

####  Latent Dirichlet Allocation

In [10]:
# Number of topics for the baseline LDA model.
total_topics = 7

In [11]:
# Baseline LDA model with `total_topics` topics.
# random_state is fixed (same seed as compute_coherence_values) so the
# topics and the scores computed from this model are repeatable across
# kernel restarts.
# NOTE(review): the model is trained on TF-IDF weights (corpus_tfidf) but is
# evaluated and visualised with corpus_bow elsewhere in this notebook —
# confirm this mix is intended.
lda = models.LdaMulticore(corpus = corpus_tfidf,
                          id2word = dictionary,
                          random_state = 1,
                          iterations = 1000,
                          num_topics = total_topics)

In [12]:
# Compute Perplexity
# NOTE(review): per the gensim docs, log_perplexity returns the per-word
# likelihood bound (perplexity = 2^(-bound)), so for the value printed here
# higher (closer to 0) should be better — confirm before comparing models.
print('\nPerplexity: ', lda.log_perplexity(corpus_bow))  # a measure of how good the model is.


Perplexity:  -7.573914344961725


In [13]:
# Compute the c_v coherence score of the baseline LDA model.
coherence_model_lda = CoherenceModel(
    model=lda,
    texts=norm_tokenized_corpus,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.37673968309529193


In [14]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=2,
                             workers=25, iterations=1000, random_state=1):
    """
    Train one LDA model per topic count and compute its c_v coherence.

    Topic counts are taken from ``range(start, limit, step)``, so ``limit``
    itself is excluded.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Exclusive upper bound on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics
    workers : Number of worker processes passed to LdaMulticore
    iterations : Value passed to LdaMulticore as ``iterations``
    random_state : Seed passed to LdaMulticore

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    n_topics : Number of topics used for each model, aligned with the two lists above
    """
    coherence_values = []
    model_list = []
    n_topics = []
    for num_topics in range(start, limit, step):
        model = models.LdaMulticore(corpus=corpus,
                                    id2word=dictionary,
                                    random_state=random_state,
                                    iterations=iterations,
                                    num_topics=num_topics,
                                    workers=workers)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        n_topics.append(num_topics)
        # Progress line so long sweeps show which topic count just finished.
        print("{}: {}".format(num_topics, coherence_values[-1]))

    return model_list, coherence_values, n_topics

In [None]:
# Can take a long time to run: one LDA model is trained per topic count
# in range(5, 40).
model_list, coherence_values, n_topics = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus_tfidf,
    texts=norm_tokenized_corpus,
    start=5,
    limit=40,
    step=1,
)

5: 0.3670669920593702
6: 0.4048856903927633
7: 0.43304287873667796
8: 0.4271937504602833
9: 0.435415536813317
10: 0.46305101244318625
11: 0.4546452876134362
12: 0.4536783969909961
13: 0.446779069744696


In [None]:
# Coherence score as a function of the number of topics. Labels and a title
# are set so the figure can be read without the surrounding code.
fig, ax = plt.subplots()
ax.plot(n_topics, coherence_values)
ax.set_xlabel('Number of topics')
ax.set_ylabel('Coherence score (c_v)')
ax.set_title('LDA coherence by number of topics')
plt.show()

In [None]:
# Pick the model with the highest coherence score; model_list and
# coherence_values are appended in lockstep by compute_coherence_values.
best_model = model_list[np.argmax(coherence_values)]

In [None]:
# Print every topic of the selected model. The topic count is read from the
# model itself: the global `total_topics` belongs to the baseline model and
# would truncate the output, or index past the last topic, whenever the best
# model was trained with a different number of topics.
print_topics_gensim(topic_model=best_model,
                    total_topics=best_model.num_topics,
                    num_terms=10,
                    display_weights=True)

In [None]:
# Save the selected model to disk under data/lda_res.
lda_model_filepath = 'data/lda_res'
best_model.save(lda_model_filepath)

In [None]:
# Show the tokens of the document at index 3 as a single space-joined string.
print(' '.join(paras[3]))

In [None]:
# Topic scores for the test document (index 3 of the corpus), highest score first.
doc_topics = best_model[corpus_tfidf[3]]
for index, score in sorted(doc_topics, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, best_model.print_topic(index, 10)))

In [None]:
# Apply the selected model to the TF-IDF vector of the document at index 1.
best_model[corpus_tfidf[1]]

In [None]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

In [None]:
# Prepare the pyLDAvis data from the bag-of-words corpus and dictionary.
# NOTE(review): this visualises `lda` (the baseline model), not the
# `best_model` selected by coherence — confirm this is intended.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus_bow, dictionary,n_jobs=2)

In [None]:
# Display the prepared pyLDAvis object as the cell output.
vis

In [None]:
# Write the prepared visualisation to ./data/topic_lda.html.
pyLDAvis.save_html(vis, './data/topic_lda.html')