## Train mallet lda model using best tuning results, convert to gensim model, and save LDAvis and excel file

#### Terminology:
1. raw_doc: unprocessed raw documents read from the txt file
2. docs: lemmatized corpus
3. corpus_bow: bag-of-words corpus
4. corpus_tfidf: TF-IDF corpus

#### Changes from earlier version:
1. filter out documents with too few words
2. use mallet

### Load dictionary and pre-built functions

In [12]:
from gensim import corpora, models
import numpy as np
import sys
import os
import gensim
import pickle
#from collections import Counter
#import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import datetime
import pyLDAvis
import pyLDAvis.gensim as gensim_vis

In [2]:
## global folder paths (relative to the notebook's working directory)
data_folder = '../../data/'
model_folder = '../../model/'
raw_data_path = os.path.join(data_folder,'raw/article_IV_corpus.txt')
data_processed_folder = os.path.join(data_folder,'processed')
results_folder = os.path.join(data_folder,'results','temp_results')
## binary file for mallet model
# The Mallet launcher location is machine-specific: set the MALLET_PATH
# environment variable to override the default below instead of editing the notebook.
mallet_path = os.environ.get('MALLET_PATH', '/mnt/packages/Mallet/bin/mallet')

In [3]:
# %load topic_models.py
# python_root = './scripts'
# sys.path.insert(0, python_root)

#%%
def prepare_data(data_folder,save=True):
    """Build the gensim dictionary, bag-of-words corpus and TF-IDF corpus from the pickled lemmatized corpus.

    Parameters
    ----------
    data_folder : str
        Folder that contains ``lemma_corpus.p``. When ``save`` is true the
        dictionary and both corpora are written into this same folder.
    save : bool, default True
        If true, write ``dictionary.dict``, ``corpus_bow.mm`` and
        ``corpus_tfidf.mm`` to ``data_folder``.

    Returns
    -------
    tuple
        ``(docs, dictionary, corpus_bow, corpus_tfidf)`` where ``docs`` is a
        list of token lists (one per paragraph), ``corpus_bow`` is the list of
        ``dictionary.doc2bow`` results and ``corpus_tfidf`` is the TF-IDF
        transformation of ``corpus_bow``.
    """
    ## read and transform data
    corpus_path = os.path.join(data_folder,'lemma_corpus.p')
    # NOTE: unpickling can execute arbitrary code; only load files produced by this project.
    # The context manager makes sure the file handle is closed after loading.
    with open(corpus_path, "rb") as corpus_file:
        contents = pickle.load(corpus_file)
    print('length of lemmentized corpus: {}'.format(len(contents)))

    # flatten every paragraph (a sequence of sentences) into one flat token list
    docs = [[word for sentence in paragraph for word in sentence] for paragraph in contents]

    # build dictionary and prune it with gensim's filter_extremes thresholds
    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=5,no_above=0.5, keep_n=10000)
    # convert document into bow
    corpus_bow = [dictionary.doc2bow(text) for text in docs]
    ## compute tfidf feature vectors
    tfidf = models.TfidfModel(corpus_bow) # smartirs = 'atc' https://radimrehurek.com/gensim/models/tfidfmodel.html
    corpus_tfidf = tfidf[corpus_bow]

    ## save dictionary and corpora
    if save:
        dictionary_save_path = os.path.join(data_folder,'dictionary.dict')
        dictionary.compactify()
        dictionary.save(dictionary_save_path)
        corpora.MmCorpus.serialize( os.path.join(data_folder,'corpus_bow.mm'), corpus_bow)
        corpora.MmCorpus.serialize( os.path.join(data_folder,'corpus_tfidf.mm'), corpus_tfidf)
        #print(len(dictionary))
    return docs,dictionary,corpus_bow,corpus_tfidf

#%%

### Load and process data

#### Load original text to look through later

In [4]:
# Read the raw corpus line by line. Only lines longer than 50 characters
# (measured before stripping) are kept, then surrounding spaces/newlines are trimmed.
with open(raw_data_path,'r',encoding='utf8') as raw_file:
    raw_doc = [line.strip(' \n') for line in raw_file if len(line)>50]

print('Length of raw documents {}'.format(len(raw_doc)))

Length of raw documents 142564


#### Load lemmatized corpus

In [5]:
rerun = True
if rerun:
    # rebuild docs, dictionary and both corpora in memory; save=False means nothing is written to disk
    mode = 'all'
    docs,dictionary,corpus_bow,corpus_tfidf = prepare_data(data_processed_folder,save=False)
    # corpus_bow = [c for c in corpus_bow_full if len(c)>0]

bow_size = len(corpus_bow)
print('Length of length of bag-of-word corpus: {}'.format(bow_size))

length of lemmentized corpus: 142564
Length of length of bag-of-word corpus: 142564


#### Filter out paragraphs with 20 or fewer words or containing '<Title>'

In [6]:
# raw_doc, corpus_bow and docs are matched purely by position. zip() silently
# truncates to the shortest input, so check the alignment explicitly first.
if not (len(raw_doc) == len(corpus_bow) == len(docs)):
    raise ValueError(
        'raw_doc, corpus_bow and docs are not aligned: {} / {} / {} items'.format(
            len(raw_doc), len(corpus_bow), len(docs)))

# keep a paragraph only if its raw text has more than 20 whitespace-separated
# words and does not contain the '<Title>' marker
tuple_temp = [
    (raw_text, bow, tokens)
    for (raw_text, bow, tokens) in zip(raw_doc, corpus_bow, docs)
    if len(raw_text.split())>20 and ('<Title>' not in raw_text)
]

if not tuple_temp:
    # zip(*[]) would otherwise fail with an opaque unpacking error
    raise ValueError('No paragraph passed the length / "<Title>" filter')

# unzip back into three parallel tuples
raw_doc_new, corpus_bow_new, docs_new = zip(*tuple_temp)

print('Length of corpus without "<Title>" and has more than 20 words: {}'.format(len(raw_doc_new)))

raw_doc_new[0]

Length of corpus without "<Title>" and has more than 20 words: 123908


'1. The Russian economy proved to be more resilient than expected to the dual shocks of lower oil prices and sanctions. Output fell sharply in 2015, by 2.8 percent (revised from an initial estimate of 3.7 percent) but stabilized in 2016, contracting by only 0.2 percent. The relatively modest response to the large external shocks reflects the authorities’ effective policy response—floating exchange rate, banking system liquidity support and capital injections, and limited fiscal stimulus coupled with restrictive incomes policies—and was enabled by robust buffers.'

### Train Mallet LDA model

In [7]:
# Number of topics to fit and number of key words reported per topic
# (both are reused by later cells).
n_topics = 50
n_words = 20
# NOTE(review): Mallet is launched through mallet_path as an external program, so this
# NumPy seed presumably does not influence training; random_seed below is the value
# handed to the wrapper — confirm.
np.random.seed(seed=1)
# Train through gensim's Mallet wrapper; files created by the wrapper are prefixed
# with "<model_folder>/mallet_<n_topics>_topics_".
lda_mallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_bow_new, num_topics=n_topics, 
                                             id2word= dictionary, alpha= 1, optimize_interval=10, 
                                             iterations = 2000, random_seed = 1, 
                                             prefix=os.path.join(model_folder,"mallet_{}_topics_".format(n_topics)))
#initial alpha = 5/ n_topics = 5/ 50 = 0.1
# NOTE(review): the line above does not match the alpha=1 passed to the wrapper; it looks
# like it describes Mallet's default alpha sum of 5 rather than this run — confirm which
# value is intended.

In [8]:
# Convert the trained Mallet wrapper model into a gensim model object; later cells use
# lda_gensim for coherence, pyLDAvis, get_document_topics and saving.
lda_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

In [None]:
# CoherenceModel is accessed through the already-imported gensim package: the bare name
# is never imported in this notebook, so using it directly fails on a fresh kernel.
print('calculating coherence socre for {} documents ......'.format(len(docs_new)))
coherence_model_lda = gensim.models.CoherenceModel(model=lda_gensim, texts=docs_new, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

### Visualization

In [13]:
# Test if gensim model conversion gives the same viz- slight difference in ranking due to topic probability. However, positions are correct.
# Turn on inline rendering, then build the LDAvis data from the converted model,
# the filtered bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
vis_data = gensim_vis.prepare(lda_gensim,corpus= corpus_bow_new, dictionary= dictionary)
# last expression of the cell: displays the interactive visualisation
vis_data

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [14]:
# key: gensim topic id (vis_data.topic_order entry minus 1, the id later looked up from
#      get_document_topics), value: LDAvis topic id = 1-based position in topic_order
#      (ranked by topic size)
# The length comes from topic_order itself, so the map stays complete for any n_topics
# instead of being capped at 50 entries.
id_map = {
    gensim_id: viz_id
    for viz_id, gensim_id in enumerate(np.array(vis_data.topic_order) - 1, start=1)
}

### Save Model and Results

In [124]:
def save_results_to_excel(today, model = lda_gensim, corpus_bow_new = corpus_bow_new, n_topics = n_topics, n_words = n_words, id_map = id_map, paragraphs = None):
    '''Save topic key words, per-document topic probabilities and raw paragraphs to one Excel workbook.

    The workbook is written to ``results_folder`` as
    ``Mallet_<n_topics>_topics_<today>.xlsx`` with three sheets.

    Parameters
    ----------
    today : str
        Date stamp inserted into the output file name.
    model :
        Topic model providing ``show_topic`` and ``get_document_topics``.
    corpus_bow_new :
        Bag-of-words documents to score with ``model``.
    n_topics : int
        Number of topics to list in the key-word table.
    n_words : int
        Number of key words listed per topic.
    id_map : dict
        Maps a gensim topic id to the topic id shown in LDAvis.
    paragraphs : sequence of str, optional
        Raw paragraph texts for the 'Paragraph' sheet. Defaults to the
        notebook-level ``raw_doc_new``.

    Notes
    -----
    The default arguments are bound when this cell is executed; re-run the
    cell (or pass the arguments explicitly) after retraining the model.

    Returns
    -------
    None
    '''
    import time

    if paragraphs is None:
        paragraphs = raw_doc_new

    # 1. Make a topic-key word table (one row per gensim topic id, one column per word rank)

    word_columns = ['word'+ str(x) for x in range(n_words)]
    # show_topic yields (word, probability) pairs; only the words are kept
    topic_rows = [
        [word for word, _ in model.show_topic(topicid= i, topn= n_words)]
        for i in range(n_topics)
    ]
    topic_word_df = pd.DataFrame(data = topic_rows, columns = word_columns)
    # the row index is the gensim topic id, which is the key of id_map
    topic_word_df['viz_topic_id'] = [id_map[key] for key in topic_word_df.index.to_list()]

    # 2. Make dataframe for doc_topic (documents are scored one after another; timed for reference)

    start_time = time.time()
    doc_topic = [model.get_document_topics(bow) for bow in corpus_bow_new]
    print("--- %s seconds ---" % (time.time() - start_time))

    # one row per document, one column per (topic, probability) slot; melt to long format
    doc_topic_df = pd.DataFrame(doc_topic)
    doc_topic_df['paragraph_id'] = doc_topic_df.index.to_list()
    doc_topic_df = doc_topic_df.melt(id_vars='paragraph_id')
    # .copy() so the column assignments below act on an independent frame, not a filtered view
    doc_topic_df = doc_topic_df[doc_topic_df['value'].notnull()].copy()
    doc_topic_df['gensim_topic'] = doc_topic_df['value'].apply(lambda x: x[0])
    doc_topic_df['probability'] = doc_topic_df['value'].apply(lambda x: x[1])
    doc_topic_df = doc_topic_df[['paragraph_id','gensim_topic', 'probability']].copy()
    doc_topic_df['viz_topic_id'] = doc_topic_df['gensim_topic'].apply(lambda x: id_map[x])

    # 3. Make paragraph table

    paragraph_df = pd.DataFrame(columns=['Paragraph'], data = list(paragraphs))

    # 4. save results to excel
    # The context manager writes and closes the workbook even if a sheet fails.
    # Sheet names are kept exactly as before so existing readers of the file still work.

    output_path = os.path.join(results_folder,'Mallet_{}_topics_{}.xlsx'.format(n_topics, today))
    with pd.ExcelWriter(path = output_path) as writer:
        topic_word_df.to_excel(writer, sheet_name='Toipc and Key Word')
        doc_topic_df.to_excel(writer, sheet_name='Document and Topic')
        paragraph_df.to_excel(writer, sheet_name='Paragraph')

    return

In [None]:
# date stamp (YYYY_MM_DD) shared by every artefact written in this cell
now = datetime.datetime.now().strftime("%Y_%m_%d")

# persist the converted gensim model
gensim_weights_path = os.path.join(model_folder,'mallet_as_gensim_weights_{}_{}'.format(n_topics, now))
lda_gensim.save(gensim_weights_path)
# test loading model
#from gensim.models import LdaModel
#test = LdaModel.load(os.path.join(model_folder,'mallet_as_gensim_weights_{}_{}'.format(n_topics, now)))

# export the interactive LDAvis page as a standalone html file
ldavis_html_path = os.path.join(results_folder,'mallet_as_gensim_{}_topics_{}.html'.format(n_topics, now))
pyLDAvis.save_html(vis_data, ldavis_html_path)

# write the Excel workbook with the same date stamp
save_results_to_excel(today = now)

--- 73.21768116950989 seconds ---
