This notebook follows the topic-modeling walkthrough from this blog post:
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [1]:
from gensim import corpora, models 
import numpy as np
import sys
import os
import gensim
import pickle
import nltk
from collections import Counter
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

# Put ./scripts at the front of the module search path so that modules
# located there can be imported from this notebook.
python_root = './scripts'
sys.path.insert(0, python_root)



#### Load data

In [2]:
# Load the pre-processed corpus (a list of documents, each a list of tokens).
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
# The context manager guarantees the file handle is closed once loading finishes.
with open('./data/processed_corpus.p', "rb") as corpus_file:
    paras = pickle.load(corpus_file)

In [3]:
# Peek at the first document to check the corpus format (a list of token strings).
print(paras[0])

['1', 'as', 'a', 'small', 'open', 'tourism', 'base', 'economy', 'st._lucia', 'be', 'highly_vulnerable_to_exogenous_shock', 'tourism', 'account', 'for', 'over', 'three_quarter', 'of', 'export', 'and', 'the', 'import_content', 'of', 'both', 'consumption', 'and', 'foreign_direct_investment', 'fdi', 'be', 'very', 'high', 'figure_1', 'the', 'economy', 'have', 'be', 'buffet', 'by', 'the', 'global', 'economic', 'downturn', 'which', 'have', 'hobble', 'the', 'tourism', 'and', 'construction', 'sector', 'with', 'potential', 'spillover', 'to', 'the', 'financial', 'sector']


#### Use Gensim for topic modeling

In [4]:
## remove stop words
# Work on the first 10,000 documents only.
norm_tokenized_corpus = paras[:10000]

# Start from NLTK's English stop words and add corpus-specific filler tokens.
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.extend(['-PRON-','percent','’s','would','also',
                      'year','continue','include','give','may','new',
                      'however','well','help','since'])

# Membership tests on a set are O(1); testing every token against the list
# would rescan the whole stop-word list for each token in the corpus.
stopword_set = set(stopword_list)
norm_tokenized_corpus = [[token for token in p if token not in stopword_set]
                         for p in norm_tokenized_corpus]

In [5]:
def _flatten(container):
    for i in container:
        if isinstance(i, (list,tuple)):
            for j in _flatten(i):
                yield j
        else:
            yield i

In [6]:
# Tally token frequencies over the whole filtered corpus and
# display the 30 most frequent tokens.
words = [token for token in _flatten(norm_tokenized_corpus)]
wc = Counter(words)
wc.most_common(30)

[('authority', 4601),
 ('fiscal', 4207),
 ('growth', 4181),
 ('bank', 4093),
 ('sector', 3601),
 ('policy', 3544),
 ('gdp', 3406),
 ('financial', 3275),
 ('increase', 3213),
 ('government', 3122),
 ('reform', 2966),
 ('public', 2860),
 ('debt', 2853),
 ('high', 2640),
 ('staff', 2619),
 ('tax', 2466),
 ('remain', 2390),
 ('rate', 2385),
 ('need', 2150),
 ('program', 2112),
 ('economic', 2074),
 ('market', 1995),
 ('risk', 1974),
 ('support', 1958),
 ('external', 1929),
 ('improve', 1914),
 ('low', 1818),
 ('revenue', 1721),
 ('investment', 1664),
 ('measure', 1638)]

In [7]:
# Build the token -> integer id dictionary from the filtered corpus.
dictionary = corpora.Dictionary(norm_tokenized_corpus)
# Prune the vocabulary: drop tokens that appear in fewer than 5 documents or in
# more than 50% of documents, then keep at most the 10,000 most frequent ones.
# This must run before doc2bow so the bag-of-words ids match the pruned dictionary.
dictionary.filter_extremes(no_below=5,no_above=0.5, keep_n=10000)
# Convert each document into a bag-of-words vector of (token_id, count) pairs.
corpus_bow = [dictionary.doc2bow(text) for text in norm_tokenized_corpus]
# Compute TF-IDF feature vectors from the bag-of-words corpus.
# NOTE(review): corpus_tfidf is not used by any later cell visible here; the LDA
# model is trained on corpus_bow.
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

In [8]:
# Vocabulary size of the dictionary at this point.
len(dictionary)

6395

In [9]:
## a better way to print
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    """Print the terms of the first ``total_topics`` topics of a topic model.

    Parameters
    ----------
    topic_model : object
        Any model exposing ``show_topic(topic_id, topn=...)`` that returns
        ``(word, weight)`` pairs.
    total_topics : int
        Number of topics to print, starting from topic id 0.
    weight_threshold : float
        Terms whose absolute weight is below this value are omitted.
    display_weights : bool
        If True print ``(word, weight)`` pairs with weights rounded to four
        decimals; otherwise print the words only.
    num_terms : int or None
        Maximum number of terms per topic. The value is forwarded unchanged
        to ``show_topic`` as ``topn``; a falsy value applies no extra cap here.

    Each topic is followed by a blank line. Topics are labelled starting at 1.
    """
    for index in range(total_topics):
        topic = topic_model.show_topic(index, topn=num_terms)
        topic = [(word, round(wt, 4))
                 for word, wt in topic
                 if abs(wt) >= weight_threshold]
        if num_terms:
            topic = topic[:num_terms]

        if display_weights:
            print('Topic #' + str(index + 1) + ' with weights')
            print(topic)
        else:
            print('Topic #' + str(index + 1) + ' without weights')
            print([term for term, wt in topic])
        # A bare `print` is a no-op expression in Python 3; call it so the
        # blank separator line between topics is actually emitted.
        print()

####  Latent Dirichlet Allocation

In [10]:
# Number of LDA topics to fit; also used when printing the topics.
total_topics = 7

In [11]:
# Fit an LDA topic model on the bag-of-words corpus.
# random_state pins the model's random initialisation so that re-running the
# notebook gives comparable topics.
# NOTE(review): with several worker processes the results may still vary
# slightly between runs; confirm if exact reproducibility is required.
lda = models.LdaMulticore(corpus=corpus_bow,
                          id2word=dictionary,
                          iterations=1000,
                          num_topics=total_topics,
                          random_state=42)

In [12]:
# Show the 20 highest-weighted terms, with weights, for each fitted topic.
print_topics_gensim(topic_model=lda,
                   total_topics = total_topics,
                   num_terms=20,
                   display_weights=True) 

Topic #1 with weights
[('fiscal', 0.0121), ('gdp', 0.0118), ('bank', 0.0105), ('authority', 0.0092), ('growth', 0.0074), ('policy', 0.0069), ('increase', 0.0066), ('sector', 0.0065), ('staff', 0.0065), ('financial', 0.0064), ('government', 0.0059), ('risk', 0.0051), ('rate', 0.0051), ('debt', 0.005), ('reform', 0.0048), ('market', 0.0047), ('support', 0.0046), ('deficit', 0.0046), ('need', 0.0043), ('public', 0.0042)]
Topic #2 with weights
[('authority', 0.0126), ('bank', 0.0105), ('government', 0.0103), ('debt', 0.0097), ('fiscal', 0.0087), ('public', 0.0083), ('sector', 0.0072), ('staff', 0.0065), ('exchange', 0.0058), ('gdp', 0.0058), ('rate', 0.0057), ('policy', 0.0056), ('reform', 0.0055), ('need', 0.0054), ('fund', 0.005), ('financial', 0.0049), ('program', 0.0046), ('growth', 0.0045), ('high', 0.0045), ('budget', 0.0043)]
Topic #3 with weights
[('growth', 0.0139), ('policy', 0.0092), ('increase', 0.0081), ('authority', 0.0071), ('financial', 0.0069), ('sector', 0.0068), ('high',

In [13]:
# Persist the trained LDA model to disk at the given path.
lda_model_filepath = 'data/lda_res'
lda.save(lda_model_filepath)

In [33]:
# Topic distribution inferred for document 900, as (topic_id, probability) pairs.
lda[corpus_bow[900]]

[(1, 0.952198)]

In [14]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

In [23]:
# Enable inline pyLDAvis rendering in the notebook, then build the
# visualisation data from the fitted model, the corpus it was trained on and
# the dictionary, using 2 parallel jobs.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus_bow, dictionary,n_jobs=2)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [24]:
# Render the prepared visualisation as the cell output.
vis

In [21]:
# Export the visualisation to an HTML file under ./data.
pyLDAvis.save_html(vis, './data/topic_lda.html')

In [25]:
# Explicitly display the prepared visualisation via pyLDAvis.
pyLDAvis.display(vis)