### This notebook adds result analysis and visualization for a tuned LDA model

#### Terminology:
1. raw_doc: unprocessed raw document from txt file
2. docs: lemmatized corpus
3. corpus_bow: bag of words corpus
4. corpus_tfidf: tfidf corpus

#### Changes from the earlier version:
1. filter out documents with too few words

### Load dictionary and pre-built functions

In [1]:
from gensim import corpora, models
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel 
import numpy as np
import sys
import os
import gensim
import pickle
#from collections import Counter
#import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import pyLDAvis
import pyLDAvis.gensim

In [2]:
## Global folder paths, all derived from `data_folder` (relative to the
## notebook's working directory).
data_folder = '../../data/'
# Raw corpus text file, read line by line further down.
raw_data_path = os.path.join(data_folder,'raw/article_IV_corpus.txt')
# Folder passed to prepare_data(); it reads 'lemma_corpus.p' from here.
data_processed_folder = os.path.join(data_folder,'processed')
# Output folder for analysis artefacts (referenced only by the commented-out
# export cells below).
results_folder = os.path.join(data_folder,'results','temp_results')
## Path to the Mallet executable handed to the gensim LdaMallet wrapper.
## This is an absolute, machine-specific path.
mallet_path = '/mnt/packages/Mallet/bin/mallet' # update this path

In [3]:
# %load topic_models.py
# python_root = './scripts'
# sys.path.insert(0, python_root)

#%%
def prepare_data(data_folder,save=True):
    """Load the lemmatized corpus and build the dictionary and corpora.

    Parameters
    ----------
    data_folder : str
        Folder that contains ``lemma_corpus.p``. When ``save`` is true the
        dictionary and both serialized corpora are written to this folder too.
    save : bool, default True
        If true, write ``dictionary.dict``, ``corpus_bow.mm`` and
        ``corpus_tfidf.mm`` into ``data_folder``.

    Returns
    -------
    docs : list of list
        One flat token list per entry of the pickled corpus (the sentence
        level is flattened away).
    dictionary : corpora.Dictionary
        Dictionary built from ``docs`` after ``filter_extremes``.
    corpus_bow : list
        ``dictionary.doc2bow`` applied to every document in ``docs``.
    corpus_tfidf
        ``corpus_bow`` wrapped by a ``TfidfModel`` fitted on ``corpus_bow``.
    """
    ## read and transform data
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # a trusted, locally produced file.
    corpus_path = os.path.join(data_folder, 'lemma_corpus.p')
    # Context manager so the file handle is closed even if unpickling fails.
    with open(corpus_path, "rb") as corpus_file:
        contents = pickle.load(corpus_file)
    print('length of lemmentized corpus: {}'.format(len(contents)))

    # Each entry is iterated as sentences of tokens; flatten it into a single
    # token list per entry.
    docs = [[w for sentence in paragraph for w in sentence]
            for paragraph in contents]

    # build dictionary
    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=10000)
    # convert document into bow
    corpus_bow = [dictionary.doc2bow(text) for text in docs]
    ## compute tfidf feature vectors
    tfidf = models.TfidfModel(corpus_bow) # smartirs = 'atc' https://radimrehurek.com/gensim/models/tfidfmodel.html
    corpus_tfidf = tfidf[corpus_bow]

    ## save dictionary and corpora
    if save:
        dictionary_save_path = os.path.join(data_folder, 'dictionary.dict')
        dictionary.compactify()
        dictionary.save(dictionary_save_path)
        corpora.MmCorpus.serialize(os.path.join(data_folder, 'corpus_bow.mm'), corpus_bow)
        corpora.MmCorpus.serialize(os.path.join(data_folder, 'corpus_tfidf.mm'), corpus_tfidf)
        #print(len(dictionary))
    return docs,dictionary,corpus_bow,corpus_tfidf

#%%

   
def basic_lda(total_topics,corpus,dictionary,docs,score=False):
    """Train a gensim ``LdaModel`` and optionally score its c_v coherence.

    Parameters
    ----------
    total_topics : int
        Number of topics passed to ``LdaModel`` as ``num_topics``.
    corpus
        Corpus the model is trained on.
    dictionary
        Passed to the model as ``id2word`` and to the coherence model.
    docs
        Texts used by ``CoherenceModel``; only read when ``score`` is true.
    score : bool, default False
        When true, also compute and print the 'c_v' coherence.

    Returns
    -------
    The trained model when ``score`` is false, otherwise the tuple
    ``(model, coherence)``.
    """
    print('Training for {} documents ......'.format(len(corpus)))

    # Fixed-seed random state handed to the model.
    rng = np.random.RandomState(seed=2)
    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=total_topics,
        alpha='auto',
        eta='auto',
        random_state=rng,
    )

    if not score:
        return lda

    # Compute Coherence Score
    print('calculating coherence socre for {} documents ......'.format(len(docs)))
    coherence_lda = CoherenceModel(
        model=lda,
        texts=docs,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    return lda, coherence_lda

### Load and process data

#### Load original text to look through later

In [4]:
# Stream the raw corpus line by line: keep only lines longer than 50
# characters (measured before stripping) and trim leading/trailing spaces and
# newlines from the lines that are kept.
with open(raw_data_path, mode='r', encoding='utf8') as f:
    raw_doc = [line.strip(' \n') for line in f if len(line) > 50]

print('Length of raw documents {}'.format(len(raw_doc)))

Length of raw documents 142564


#### Load lemmatized corpus

In [5]:

rerun = True
if rerun:
    # Rebuild docs, dictionary and corpora from the pickled lemmatized corpus.
    # save=False: nothing is written back to disk here.
    mode = 'all'
    docs, dictionary, corpus_bow, corpus_tfidf = prepare_data(
        data_processed_folder,
        save=False,
    )
    # corpus_bow = [c for c in corpus_bow_full if len(c)>0]

# NOTE: corpus_bow is only defined by the branch above, so this line needs
# rerun to be truthy on a fresh kernel.
print('Length of length of bag-of-word corpus: {}'.format(len(corpus_bow)))

length of lemmentized corpus: 142564
Length of length of bag-of-word corpus: 142564


#### Filter out paragraphs that have 20 or fewer words or contain the `<Title>` marker

In [6]:
# raw_doc, corpus_bow and docs are filtered in lockstep below, so they must be
# index-aligned. zip() would otherwise silently truncate to the shortest
# sequence and pair raw paragraphs with the wrong vectors.
assert len(raw_doc) == len(corpus_bow) == len(docs), (
    'raw_doc ({}), corpus_bow ({}) and docs ({}) are not aligned'.format(
        len(raw_doc), len(corpus_bow), len(docs)))

# Keep a paragraph only when its raw text has more than 20 whitespace-separated
# words and does not contain the '<Title>' marker.
tuple_temp = [(x, y, z) for (x, y, z) in zip(raw_doc, corpus_bow, docs)
              if len(x.split()) > 20 and ('<Title>' not in x)]

# Fail with a readable message instead of the cryptic unpacking error that
# zip(*[]) would produce when nothing survives the filter.
if not tuple_temp:
    raise ValueError('No paragraph has more than 20 words and no "<Title>" marker')

# Transpose back into three parallel tuples.
raw_doc_new, corpus_bow_new, docs_new = zip(*tuple_temp)

print('Length of corpus without "<Title>" and has more than 20 words: {}'.format(len(raw_doc_new)))

raw_doc_new[0]

Length of corpus without "<Title>" and has more than 20 words: 123908


'1. The Russian economy proved to be more resilient than expected to the dual shocks of lower oil prices and sanctions. Output fell sharply in 2015, by 2.8 percent (revised from an initial estimate of 3.7 percent) but stabilized in 2016, contracting by only 0.2 percent. The relatively modest response to the large external shocks reflects the authorities’ effective policy response—floating exchange rate, banking system liquidity support and capital injections, and limited fiscal stimulus coupled with restrictive incomes policies—and was enabled by robust buffers.'

### Run 1 LDA model and get results for each document

In [7]:
# n_topics = 40 # number of topics assumed
# n_words = 20  # number of key words interested
# np.random.seed(seed=1)
# model, score = basic_lda(total_topics=n_topics,corpus=corpus_bow_new,dictionary=dictionary,docs=docs_new,score=True)

# # View: given document, get its topics
# doc_id = 0
# tp = model.get_document_topics(bow= corpus_bow_new[doc_id], minimum_probability= 0.17)
# print('tp: {}'.format(tp))
# for i in tp:
#     print('topic: {}'.format(i))
#     print(model.show_topic(topicid=i[0],topn=n_words))

# # Create a topic-key word table
# topic_df = pd.DataFrame(data = np.zeros((n_topics, n_words)), columns= ['word'+ str(x) for x in range(n_words)])
# for i in range(n_topics):
#     topic_df.iloc[i] = pd.DataFrame(model.show_topic(topicid= i, topn= n_words))[0].tolist()

# topic_df

# # Now create a document-topic dataframe
# docs_df = pd.DataFrame(data = np.zeros(len(docs_new)), columns=['paragraph'])
# docs_df['paragraph'] = raw_doc_new

# col_names = ['T'+ str(i) for i in np.array(range(n_topics))]
# for col in col_names:
#     docs_df[col]= 0 

# docs_df.head()

# for row in range(docs_df.shape[0]):
#     tp = model.get_document_topics(bow= corpus_bow_new[row], minimum_probability= 0.2)
#     for x in tp:
#         docs_df.loc[row, 'T'+ str(x[0])] = x[1]

### Some Analysis

In [8]:
# # Test: given topic, find most possible document
# top_document_per_topic = []

# for t_id in range(n_topics):
#     t = 'T'+ str(t_id)
#     print("Topic {}:".format(t_id))
#     print(model.show_topic(topicid= t_id, topn=n_words))
#     print(str(raw_doc_new[docs_df[t].idxmax()]))
#     top_document_per_topic.append(str(raw_doc_new[docs_df[t].idxmax()]))

# # Make a table for better visualization

# top_document_per_topic_df = pd.DataFrame(data = topic_df[topic_df.columns[0:]].apply(
#     lambda x: ','.join(x.astype(str)), axis =1),
#                                          columns =['topic'])
# top_document_per_topic_df['top_document']= top_document_per_topic 
# top_document_per_topic_df

# # save results to excel
# now = datetime.datetime.now()
# now = now.strftime("%Y_%m_%d")
# print(now)
# writer = pd.ExcelWriter(path = os.path.join(results_folder,'Analysis_{}_topics_{}.xlsx'.format(n_topics, now)))
# top_document_per_topic_df.to_excel(writer,'Topic and Top Document')

# # save toipc-key word to excel
# topic_df.to_excel(writer, 'Toipc and Key Word')

# # save document-toipc mapping to excel
# # transform from wide to long format
# docs_df_long = pd.melt(docs_df,id_vars=['paragraph'], value_vars=['T'+ str(i) for i in range(50)] )
# docs_df_long= docs_df_long[docs_df_long['value']>0]

# docs_df_long.rename(columns={'variable':'topic','value':'probabiilty'}, inplace= True)
# docs_df_long.topic = docs_df_long.topic.apply(lambda x: x.replace('T',''))
# docs_df_long.to_excel(writer, 'Document and Topic')

# writer.save()

# # Create html visulaization using pyLDAvis
# viz_data= pyLDAvis.gensim.prepare(model, corpus_bow_new, dictionary, sort_topics= False)
# # pyLDAvis.prepare
# #pyLDAvis.enable_notebook()
# #pyLDAvis.display(viz_data)

# #pyLDAvis.save_json(viz_data, '../../../analysis/Analysis_{}_topics_{}.js'.format(n_topics, now) )
# pyLDAvis.save_html(viz_data,  os.path.join(results_folder,'Analysis_{}_topics_{}.html'.format(n_topics, now))) 

### Run LDA using Mallet 

In [11]:
# Number of topics requested from Mallet; also reused below when printing topics.
n_topics = 40
# Train an LDA model through gensim's Mallet wrapper on the filtered
# bag-of-words corpus, using the executable at `mallet_path`.
# NOTE(review): no random seed is passed here, so results may differ between
# runs — confirm against the LdaMallet wrapper's defaults.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_bow_new, num_topics=n_topics, id2word= dictionary)

In [12]:
## a better way to print 
def print_topics_gensim(topic_model, total_topics=1,
                        weight_threshold=0.0001,
                        display_weights=False,
                        num_terms=None):
    """Print the terms of the first ``total_topics`` topics of a topic model.

    Parameters
    ----------
    topic_model
        Object exposing ``show_topic(index, topn=...)`` that returns
        ``(term, weight)`` pairs.
    total_topics : int, default 1
        Topics ``0 .. total_topics - 1`` are printed, labelled from 1.
    weight_threshold : float, default 0.0001
        Terms whose absolute weight is below this value are dropped.
    display_weights : bool, default False
        Print ``(term, weight)`` pairs (weights rounded to 4 decimals) when
        true, bare terms otherwise.
    num_terms : int or None, default None
        Forwarded to ``show_topic`` as ``topn``; when truthy it also caps the
        number of printed entries.
    """
    # The header wording only depends on the display mode, so build it once.
    header_suffix = ' with weights' if display_weights else ' without weights'

    for topic_idx in range(total_topics):
        entries = []
        for term, weight in topic_model.show_topic(topic_idx, topn=num_terms):
            if abs(weight) >= weight_threshold:
                entries.append((term, round(weight, 4)))

        if not display_weights:
            entries = [term for term, _ in entries]
        if num_terms:
            entries = entries[:num_terms]

        print('Topic #' + str(topic_idx + 1) + header_suffix)
        print(entries)
        print()
     

In [13]:
# Score the Mallet model with the 'c_v' coherence measure, using the filtered
# token lists (docs_new) as reference texts.
print('calculating coherence socre for {} documents ......'.format(len(docs_new)))
coherence_model_lda = CoherenceModel(model=ldamallet, texts=docs_new, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

calculating coherence socre for 123908 documents ......

Coherence Score:  0.629796149683


In [14]:
# Show up to 10 terms (weights hidden) for each of the n_topics Mallet topics.
print_topics_gensim(ldamallet,total_topics=n_topics,display_weights=False,num_terms=10)

Topic #1 without weights
['financial', 'financial_sector', 'institution', 'supervision', 'strengthen', 'system', 'regulatory', 'regulation', 'banking', 'stability']

Topic #2 without weights
['law', 'rule', 'establish', 'legal', 'legislation', 'governance', 'act', 'transparency', 'framework', 'issue']

Topic #3 without weights
['government', 'public', 'finance', 'level', 'central', 'transfer', 'local', 'include', 'state', 'general']

Topic #4 without weights
['sector', 'service', 'economy', 'market', 'competition', 'small', 'activity', 'good', 'industry', 'firm']

Topic #5 without weights
['problem', 'address', 'challenge', 'face', 'significant', 'concern', 'difficult', 'recent', 'give', 'constraint']

Topic #6 without weights
['suggest', 'impact', 'large', 'change', 'effect', 'estimate', 'potential', 'factor', 'gap', 'output']

Topic #7 without weights
['authority', 'note', 'agree', 'mission', 'stress', 'emphasize', 'importance', 'concern', 'recognize', 'point']

Topic #8 without weig

In [None]:
# def fine_tune_lda(corpus, dictionary, texts, limit, start=2, step=2):
#     """
#     Compute c_v coherence for various number of topics
#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
#     texts : List of input texts
#     limit : Max num of topics

#     Returns:
#     -------
#     model_list : List of LDA topic models
#     coherence_values : Coherence values corresponding to the LDA model with respective number of topics
#     n_topics : numbmber of topics
#     """
#     coherence_values = []
#     model_list = []
#     n_topics = []
#     for num_topics in range(start, limit, step):
#         print('\nTraing with n_topics = {}, training sample = {}.'.format(num_topics,len(corpus)))
#         model = LdaModel(corpus = corpus,
#                           id2word = dictionary,
#                           random_state = 2,
#                           alpha='auto',
#                           eta = 'auto',
#                           num_topics = num_topics)#
#                           #distributed = True)  # alpha='auto' is not implenented in distributed lda
#         model_list.append(model)
#         print('Calculating coherence score based on {} samples.'.format(len(texts)))
#         coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
#         coherence_values.append(coherencemodel.get_coherence())
#         n_topics.append(num_topics)
#         print("{}: {}".format(num_topics,coherence_values[-1]))
        

#     return model_list, coherence_values,n_topics

In [None]:
# #%%
# if __name__== "__main__":
    
#     save = True  ## save gensim objects, corpus, dictionary, and lda model
#     mode = 'all'
#     docs,dictionary,corpus_bow,corpus_tfidf = prepare_data(save=save)
#     corpus_bow = [c for c in corpus_bow if len(c)>0]
    
#     if mode == 'lda' or mode=='all':
#         n_topics = 25
#         model, score = basic_lda(total_topics=n_topics,corpus=corpus_bow,dictionary=dictionary,docs=docs,score=True)
#         print(score)
#         print_topics_gensim(topic_model=model,
#                            total_topics = n_topics,
#                            num_terms=20,
#                            display_weights=True)
#     if mode =='seed_lda' or mode=='all':
#         n_topics = 25
#         boost = 1000
#         seed_topic_list = [['mpm','MPM','CFM','cfm','ltv','LTC','DSTI','dsti','lcr','LCR',
#                             'capital_buffer','macroprudential','capital_flow','prudential'],
#                             ['population','ageing','pension','productivity','migration','migrat']]
            
#         seed_model = seeded_lda(n_topics,corpus_bow,dictionary,docs,seed_topic_list, boost, score=False)
#         ## for some reason keeps buging out when calculating coherence score 
        
#         print_topics_gensim(topic_model=seed_model,
#                            total_topics = n_topics,
#                            num_terms=20,
#                            display_weights=True)
    
#     if mode == 'fine_tune' or mode =='all':
        
#         model_list, coherence_values,n_topics = fine_tune_lda(dictionary=dictionary, corpus=corpus_bow,
#                                                             texts=docs, start=15, limit=35, step=1)
        
#         best_model = model_list[np.argmax(coherence_values)]
#         best_topic_n = best_model.get_topics().shape[0]
        
#         plt.plot(n_topics, coherence_values)
#         plt.show()
        
#         print_topics_gensim(topic_model=best_model,
#                        total_topics = best_topic_n,
#                        num_terms=10,
#                        display_weights=True)
#         if save:
#             lda_model_filepath = '../data/lda_res'
#             best_model.save(lda_model_filepath)


In [None]:
# def mallet_lda(model_path,total_topics,corpus,dictionary,docs,score=False):
#     """
#     https://radimrehurek.com/gensim/models/wrappers/ldamallet.html
#     sudo apt-get install default-jdk
#     sudo apt-get install ant
#     git clone git@github.com:mimno/Mallet.git
#     cd Mallet/
#     ant
    
#     we don't have those packages in server environment
#     """
#     lda = gensim.models.wrappers.LdaMallet(model_path, corpus=corpus, num_topics=total_topics, id2word=dictionary)
#     if score:
#         print('calculating coherence socre for {} documents ......'.format(len(docs)))
#         coherence_model = CoherenceModel(model=lda, texts=docs, dictionary=dictionary, coherence='c_v')
#         coherence_score = coherence_model.get_coherence()
#         print('\nCoherence Score: ', coherence_score)
#         return lda,coherence_score
    
# def hdp(corpus,dictionary,docs,score=False):
#     print('Traiing for {} documents ......'.format(len(corpus)))
#     hdpmodel = HdpModel(corpus = corpus,id2word = dictionary)
#     if score:
#         print('calculating coherence socre for {} documents ......'.format(len(docs)))
#         coherence_model = CoherenceModel(model=hdpmodel, texts=docs, dictionary=dictionary, coherence='c_v')
#         coherence_score = coherence_model.get_coherence()
#         print('\nCoherence Score: ', coherence_score)
#         return hdpmodel,coherence_score
#     return hdpmodel
    
# def lsi(total_topics, corpus,dictionary,docs,score=False):
#     print('Traiing for {} documents ......'.format(len(corpus)))
#     lsimodel = LsiModel(corpus = corpus,id2word = dictionary,num_topics=total_topics)
#     if score:
#         print('calculating coherence socre for {} documents ......'.format(len(docs)))
#         coherence_model = CoherenceModel(model=lsimodel, texts=docs, dictionary=dictionary, coherence='c_v')
#         coherence_score = coherence_model.get_coherence()
#         print('\nCoherence Score: ', coherence_score)
#         return lsimodel,coherence_score
#     return lsimodel

# def seeded_lda(total_topics,corpus,dictionary,docs,seed_topic_list, boost, score=False):
#     print('Modify beta prior ......')
#     _model = LdaModel(corpus = corpus_bow, id2word = dictionary,random_state = 2,alpha='auto',num_topics = total_topics,iterations=0)
#     beta_matrix = _model.expElogbeta
#     for t_id, st in enumerate(seed_topic_list):
#         for word in st:
#             try:
#                 w_id = dictionary.token2id[word]
#                 beta_matrix[t_id,w_id] = boost
#                 print('{} : {} : {}'.format(t_id,w_id,word))
#             except:
#                 continue
#     print('Training for {} documents ......'.format(len(corpus)))
#     seed_model = LdaModel(corpus = corpus_bow,
#                                   id2word = dictionary,
#                                   num_topics = total_topics,
#                                   eta = beta_matrix,
#                                   random_state=2)
#     # Compute Coherence Score
#     if score:
#         print('calculating coherence socre for {} documents ......'.format(len(docs)))
#         coherence_model_lda = CoherenceModel(model=seed_model, texts=docs, dictionary=dictionary, coherence='c_v')
#         coherence_lda = coherence_model_lda.get_coherence()
#         print('\nCoherence Score: ', coherence_lda)

#         return seed_model,coherence_lda
    
#     return seed_model