In [1]:
from gensim import corpora
from gensim.models import LdaModel
import numpy as np
import sys
import os
import gensim
import pickle
import pandas as pd
import datetime
import time 

In [2]:
## load text data 
def load_data(data_processed_folder, processed_file_name, dictionary=None):
    """Load the pickled paragraph corpus and return it as a DataFrame.

    Each pickled row is expected to hold six fields in the order
    ``id, country, year, title, tokens, text`` (taken from the column names
    assigned below), where ``tokens`` (index 4) is a list of sentences and each
    sentence is a list of tokens. The sentences are flattened into one token
    list, and a bag-of-words built with ``dictionary.doc2bow`` is appended.

    Parameters
    ----------
    data_processed_folder : str
        Folder that contains the pickle file.
    processed_file_name : str
        Name of the pickle file inside ``data_processed_folder``.
    dictionary : object with a ``doc2bow(tokens)`` method, optional
        Dictionary used to build the bag-of-words. When omitted, the
        notebook-level ``vocab_dict`` is used, which must already be defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, country, year, title, tokens, text, bow, length``, where
        ``length`` is the number of flattened tokens.
    """
    if dictionary is None:
        # Backward-compatible fallback for callers that rely on the global.
        dictionary = vocab_dict

    file_path = os.path.join(data_processed_folder, processed_file_name)

    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    with open(file_path, 'rb') as f:
        raw_rows = pickle.load(f)

    records = []
    for raw_row in raw_rows:
        record = list(raw_row)  # copy so the unpickled rows are not mutated
        record[4] = [w for sentence in record[4] for w in sentence]
        record.append(dictionary.doc2bow(record[4]))
        records.append(record)

    ## transformed to dataframe
    dataset = pd.DataFrame(
        records,
        columns=['id', 'country', 'year', 'title', 'tokens', 'text', 'bow'],
    )
    dataset['length'] = dataset.tokens.apply(len)

    return dataset

def get_topics(model, bow):
    """Return whatever ``model[bow]`` yields for the given bag-of-words."""
    topics = model[bow]
    return topics

def get_topics_list(model, bows):
    """Call ``model.get_document_topics`` on every item of ``bows``.

    Returns a list with one result per input item, in input order.
    """
    return [model.get_document_topics(bow) for bow in bows]


def merge_transform_topic_df(topic_model_res, dataset,
                             keep_columns=('id','country','year','title','text','length')):
    """Reshape per-document topic results to long format and join document metadata.

    Parameters
    ----------
    topic_model_res : list of list of (topic, probability) pairs
        One inner list per document, in the same order as the rows of
        ``dataset``; inner lists may have different lengths.
    dataset : pandas.DataFrame
        Document metadata. Its index is matched against the positional
        document number (0, 1, 2, ...) of ``topic_model_res``.
    keep_columns : sequence of str, optional
        Columns of ``dataset`` to carry into the result.

    Returns
    -------
    pandas.DataFrame
        One row per (document, topic) pair: the ``keep_columns`` followed by
        ``paragraph_id``, ``gensim_topic`` and ``probability``.
    """
    wide_df = pd.DataFrame(topic_model_res)
    wide_df['paragraph_id'] = wide_df.index.to_list()

    # reshape from wide to long; shorter documents leave null cells, drop those
    long_df = wide_df.melt(id_vars='paragraph_id')
    long_df = long_df[long_df['value'].notnull()]

    # assign() returns a new frame, so nothing is written onto a filtered slice
    long_df = long_df.assign(
        gensim_topic=long_df['value'].apply(lambda pair: pair[0]),
        probability=long_df['value'].apply(lambda pair: pair[1]),
    )[['paragraph_id', 'gensim_topic', 'probability']]

    ## merge database
    # list(...) so a tuple of names selects several columns, not one tuple key
    doc_topic_full_df = pd.merge(dataset[list(keep_columns)], long_df,
                                 left_index=True, right_on=['paragraph_id'])

    return doc_topic_full_df

def get_topic_words_df(model, top_n=30):
    """Build a table of the top words of every topic in ``model``.

    Parameters
    ----------
    model : object exposing ``num_topics`` and ``show_topic(topicid, topn)``
        ``show_topic`` must return ``(word, probability)`` pairs.
    top_n : int, optional
        Number of words requested per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic and columns ``word0`` ... ``word{top_n-1}``. If a
        topic yields fewer than ``top_n`` words, the remaining cells are null.
    """
    # Use the model's own topic count rather than a notebook-level global.
    topic_words = [
        [word for word, _ in model.show_topic(topicid=topic_id, topn=top_n)]
        for topic_id in range(model.num_topics)
    ]

    # reindex pads missing trailing columns so the labels below always fit
    topic_words_df = pd.DataFrame(topic_words).reindex(columns=list(range(top_n)))
    topic_words_df.columns = ['word{}'.format(str(i)) for i in range(top_n)]

    return topic_words_df


In [3]:
## input / output locations
data_folder = '../../data/'
model_folder = '../../model/'
# Build the model path from model_folder instead of repeating the folder literal.
model_path = os.path.join(model_folder, 'mallet_as_gensim_weights_50_2019_02_12')
data_processed_folder = os.path.join(data_folder, 'processed')
processed_file_name = 'lemma_corpus_with_n_with_meta.p'
results_folder = os.path.join(data_folder, 'results', 'temp_results')

## gensim dictionary file
dictionary_path = os.path.join(data_processed_folder, 'dictionary.dict')

## load model
lda_gensim = LdaModel.load(model_path)
# vocab_dict is read as a global by load_data, so it must exist before that call.
vocab_dict = corpora.Dictionary.load(dictionary_path)
print('Topic Model loaded successfully...')


## set up parameters
n_topics = lda_gensim.num_topics
n_words = 30

## test flag
test = False

## load data
dataset = load_data(data_processed_folder, processed_file_name)

# Preview only the first rows rather than rendering the whole corpus.
dataset.head(10)

Topic Model loaded successfully...


Unnamed: 0,id,country,year,title,tokens,text,bow,length
0,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[context, leverage, tailwind, high, stable, oi...",<Title> Context: Leveraging the Tailwinds of H...,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]",6
1,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[economy, prove, resilient, expect, dual, shoc...",1. The Russian economy proved to be more resil...,"[(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11,...",41
2,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[recovery, oil_price, support, exit, recession...",2. The recovery in oil prices is supporting th...,"[(1, 1), (3, 2), (12, 2), (35, 1), (38, 1), (4...",64
3,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[need, new_growth_model, accelerate, income, c...",3. The need for a new growth model to accelera...,"[(3, 1), (8, 1), (12, 2), (18, 1), (22, 2), (4...",66
4,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[economic, recovery, gain, pace, gdp, expand, ...",4. The economic recovery is gaining pace (Figu...,"[(46, 1), (79, 2), (85, 1), (87, 2), (120, 1),...",41
5,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[current_account, surplus, decline, recovery, ...",5. The current account surplus declined as the...,"[(3, 2), (12, 1), (15, 1), (30, 1), (55, 1), (...",70
6,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[high, oil_price, easy, financial, condition, ...","6. Higher oil prices, easier financial conditi...","[(0, 1), (1, 2), (3, 2), (12, 1), (17, 2), (33...",75
7,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[medium-term, prospect, subdue, unlike, past, ...","7. However, medium-term prospects are subdued....","[(3, 1), (17, 1), (22, 1), (30, 1), (39, 1), (...",45
8,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[short-term_risk, decline, risk, outlook, pers...",8. Short-term risks have declined. Risks to th...,"[(1, 1), (7, 1), (17, 2), (25, 2), (27, 1), (3...",82
9,102,Russian Federation,2017,Russian Federation : 2017 Article IV Consultat...,"[authority, agree, recovery, year, risk, decli...",9. The authorities agreed with staff that 2017...,"[(3, 1), (6, 1), (9, 1), (12, 4), (15, 2), (16...",82


In [3]:
if __name__ == '__main__':
    # End-to-end export: load the model and corpus, infer topics for every
    # paragraph, and write the topic/keyword and document/topic tables to Excel.

    start_time = time.time()

    ## global folder path
    data_folder = '../../data/'
    model_folder = '../../model/'
    # Build the model path from model_folder instead of repeating the folder literal.
    model_path = os.path.join(model_folder, 'mallet_as_gensim_weights_50_2019_02_12')
    data_processed_folder = os.path.join(data_folder, 'processed')
    processed_file_name = 'lemma_corpus_with_n_with_meta.p'
    results_folder = os.path.join(data_folder, 'results', 'temp_results')

    ## gensim dictionary file
    dictionary_path = os.path.join(data_processed_folder, 'dictionary.dict')

    ## load model
    lda_gensim = LdaModel.load(model_path)
    # vocab_dict is read as a global by load_data, so it must exist before that call.
    vocab_dict = corpora.Dictionary.load(dictionary_path)
    print('Topic Model loaded successfully...')

    ## set up parameters
    n_topics = lda_gensim.num_topics
    n_words = 30

    ## test flag
    test = False

    ## load data
    dataset = load_data(data_processed_folder, processed_file_name)
    if test:
        dataset = dataset.head(5)
        ## run one test
        try:
            # Pass a single bag-of-words, not a one-row Series.
            # NOTE(review): gensim appears to treat a Series of bows as a corpus
            # and return a lazy wrapper, so no inference would run -- confirm.
            x = get_topics(lda_gensim, dataset['bow'].iloc[0])
            print('Data Loaded, and topic model is valid.')
        except Exception as exc:
            # Chain the original error so the real cause stays in the traceback.
            raise Exception('your lda model weights are probably corrupted, please update weights.') from exc

    ## generate results
    ## document/topic probabilities (second sheet)
    topic_model_res = get_topics_list(lda_gensim, dataset['bow'])
    doc_topic_df = merge_transform_topic_df(topic_model_res, dataset)

    used_time = time.gmtime(time.time() - start_time)
    print('\nTime used to process topic probability: {}'.format(time.strftime("%H:%M:%S", used_time)))

    ## topic key words (first sheet)
    topic_words_df = get_topic_words_df(lda_gensim, top_n=20)

    ## export to excel
    now = datetime.datetime.now().strftime("%Y_%m_%d")
    # Make sure the output folder exists before the writer opens the file.
    os.makedirs(results_folder, exist_ok=True)
    output_path = os.path.join(results_folder,
                               'Mallet_{}_topics_with_country_year_{}.xlsx'.format(n_topics, now))
    # The context manager saves and closes the workbook, even on error;
    # ExcelWriter.save() is no longer available in pandas 2.x.
    with pd.ExcelWriter(path=output_path) as writer:
        # NOTE(review): 'Toipc' looks like a typo; left unchanged in case
        # downstream consumers look the sheet up by name.
        topic_words_df.to_excel(writer, sheet_name='Toipc and Key Word')
        doc_topic_df.to_excel(writer, sheet_name='Document and Topic')
    used_time = time.gmtime(time.time() - start_time)

    ## print out time used
    print('Total time used for processing: {}'.format(time.strftime("%H:%M:%S", used_time)))
    

Topic Model loaded successfully...

Time used to process topic probability: 00:02:13
Total time used for processing: 00:04:38
