### Efficient version by using mallet2gensim conversion for inference

In [1]:
import sys
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy

from gensim import corpora, models
from gensim.models import LdaModel

from docx import Document

In [2]:
model_path = os.path.join('../../model/mallet_weights_50_2019_01_15')
dictionary_path = os.path.join('../../data/processed/dictionary.dict')
label_definition_path = os.path.join('../../data/processed/Topic Definition_2019_01_15.npy')
text_file_path = "../../documentation/sample_docs/5138964-v5-Brazil_2013_Article_IV_Consultation_-_Policy_Note.DOCX"
processed_file_path = os.path.join('../../data/processed/', text_file_path.split(sep='/')[-1].split(sep ='.')[0] + '.csv')
print(os.getcwd())
print(model_path)
print(dictionary_path)

/mnt/notebook/poc
../../model/mallet_weights_50_2019_01_15
../../data/processed/dictionary.dict


#### Import Model, Dictionary, and Label (manually created)

In [3]:
lda_model = LdaModel.load(model_path)
old_dict = corpora.Dictionary.load(dictionary_path)



In [4]:
label_topic_dict = np.load(label_definition_path)
label_topic_dict = dict(label_topic_dict.tolist())

#### Create Model Topic Dictionary (Topic ID ~ Word List)

In [5]:
model_topic_list = lda_model.show_topics(num_topics= 50, num_words= 15, formatted= False)
model_topic_list = dict(model_topic_list)

model_topic_dict = dict()

for key, value in model_topic_list.items():
    word_list, prob = zip(*value)
    model_topic_dict[key] = list(word_list)

#### Generate Topic-Lable Mapping by applying IOU to manually-created labels

In [6]:
def calculate_intersection_over_union(list_a, list_b):
    
    inter_set = list(set(list_a) & set(list_b))
    union_set = list(set(list_a) | set(list_b))
    
    return len(inter_set)/len(union_set)

def map_topic_label(model_dict, label_dict):
    
    new_list =dict()
    
    for model_key, model_value in model_dict.items():
        iou_list =[]
        for label_key, label_value in label_dict.items():
            iou_list.append(calculate_intersection_over_union(model_value, label_value))
        max_id = np.array(iou_list).argmax()
        new_list[model_key] = list(label_dict.keys())[max_id]
    
    return new_list

topic_label_dict = map_topic_label(model_dict= model_topic_dict, label_dict= label_topic_dict )

#### Load Text File

In [7]:
def read_doc(f_path,word_length_filter=20):
    if os.path.isfile(f_path):
        doc = Document(f_path)
        text_list = [p.text for p in doc.paragraphs if len(p.text)>10]#[3:]
        text_list = [p.replace('\xa0',' ') for p in text_list] # some clean up 
        text_list = [p for p in text_list if len(p.split()) > word_length_filter]
    else:
        raise Exception('File does not exist: {}'.format(f_path))

    return text_list

new_text = read_doc(text_file_path)

new_text[2]

'Policies. With supply-side constraints restraining short-term growth, the overall policy stance should gradually withdraw stimulus, embed policies in Brazil’s long-standing frameworks, and emphasize monetary policy as the main counter-cyclical tool. Exchange rate flexibility and judicious use of CFMs can continue to help address the challenges posed by volatile capital flows. The renewed focus on supply-side policies is welcome but further efforts are needed to mobilize domestic saving, increase investment, and enhance productivity and competitiveness.'

In [8]:
nlp = spacy.load('en') 

def infer_single_paragraph(paragraph, ldaModel):
    '''Load raw paragraph and model, return cleaned paragraph and topic_label with highest probability'''
    #### Process text using Spacy for Tokenization/Lemmentization and loaded dictionary for bag-of-words
    new_text = nlp(paragraph)
    new_doc = [word.lemma_ for word in new_text]
    new_bow = old_dict.doc2bow(new_doc)
    
    ## Make inference using gensim_lda model (converted from mallet) and retrieve Top ID
    ldaModel = models.wrappers.ldamallet.malletmodel2ldamodel(ldaModel)
    topic_prob = ldaModel[new_bow]
    n, prob = zip(*topic_prob)
    top_id = np.array(n)[np.array(prob).argmax()]
    
    return new_text, top_id

In [9]:
import time
start_time = time.time()
result = [infer_single_paragraph(paragraph, lda_model) for paragraph in new_text]
print("--- %s seconds ---" % (time.time() - start_time))

--- 7.17311954498291 seconds ---


In [10]:
p, topic_id = zip(*result)

result = pd.DataFrame({'Paragraph': p, 'Topic ID' : topic_id})

result.Paragraph.apply(str)
result['Label'] = result['Topic ID'].apply(lambda x: topic_label_dict[x])

result.to_csv(processed_file_path)