# LDA Models

Run several LDA models to see whether they are fit for use. Base the evaluation on coherence scores, so the results can be compared against all other models generated. 

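Coherence is scored with the `c_v` measure (`coherence='c_v'` in gensim's `CoherenceModel`). [Background on topic-coherence measures, including the UCI/extrinsic approach](http://qpleple.com/topic-coherence-to-evaluate-topic-models/)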

Use pyLDAvis to visualize and interpret results. 

A total of 6 models were tested, but 4 of them are excluded here for brevity; the excluded models all had poorer coherence scores. The parameters varied between models were the number of topics and the chunk size.

In [6]:
import numpy as np 
import pandas as pd
import re

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel, ldamodel
from gensim.models.ldamodel import LdaModel

import warnings
warnings.filterwarnings("ignore")

from gensim.test.utils import datapath
import spacy
import pickle
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
from time import time  # To time our operations

In [2]:
# Load the pickled documents used to build the dictionary and corpus below
# (presumably lists of lemmatized tokens produced by an earlier spaCy step -- confirm).
# The `with` block guarantees the file handle is closed once loading finishes.
# NOTE(security): pickle.load can execute arbitrary code; only load files you created yourself.
with open("D:/Capstone/dataset/spacy_lemv2.pickle", "rb") as pickle_in:
    spacy_text = pickle.load(pickle_in)

In [4]:
# Documents to model (same object as the loaded pickle).
texts = spacy_text

# Token <-> integer-id mapping built from every document.
id2word = corpora.Dictionary(texts)

# Bag-of-words representation: one doc2bow() result per document.
corpus = list(map(id2word.doc2bow, texts))

In [8]:
# Model 1: 6 topics, chunks of 4500 documents, 30 passes over the corpus.
# random_state is fixed so the run is repeatable.
lda_model1 = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=6,
    random_state=31,
    update_every=2,
    chunksize=4500,
    passes=30,
    alpha='auto',
    per_word_topics=True,
)

In [7]:
# Model 3: 7 topics, chunks of 2000 documents, 30 passes over the corpus.
# Differs from model 1 only in num_topics and chunksize; same fixed seed.
lda_model3 = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=7,
    random_state=31,
    update_every=2,
    chunksize=2000,
    passes=30,
    alpha='auto',
    per_word_topics=True,
)

In [9]:
# Show each topic of model 1 as an (id, weighted-terms) tuple, one per paragraph.
for i in lda_model1.print_topics():
    topic_summary = i
    print('\n', topic_summary)


 (0, '0.020*"gas" + 0.009*"capacity" + 0.007*"day" + 0.007*"pipeline" + 0.006*"plant" + 0.006*"area" + 0.005*"time" + 0.005*"energy" + 0.005*"point" + 0.005*"contract"')

 (1, '0.032*"message" + 0.019*"intended_recipient" + 0.019*"receive" + 0.018*"information" + 0.014*"corp" + 0.013*"confidential" + 0.012*"click" + 0.011*"contact" + 0.010*"delete" + 0.010*"reply"')

 (2, '0.012*"market" + 0.009*"company" + 0.008*"energy" + 0.008*"power" + 0.006*"service" + 0.006*"business" + 0.005*"issue" + 0.005*"state" + 0.005*"new" + 0.004*"year"')

 (3, '0.022*"deal" + 0.010*"price" + 0.010*"gas" + 0.009*"change" + 0.009*"day" + 0.008*"contract" + 0.007*"trade" + 0.007*"let" + 0.006*"month" + 0.006*"power"')

 (4, '0.013*"agreement" + 0.011*"attach" + 0.010*"meeting" + 0.009*"let" + 0.009*"send" + 0.007*"work" + 0.006*"question" + 0.006*"review" + 0.006*"time" + 0.006*"draft"')

 (5, '0.009*"good" + 0.008*"time" + 0.008*"go" + 0.007*"day" + 0.007*"think" + 0.006*"want" + 0.005*"will" + 0.005*"com

In [10]:
# Evaluate model 1 on the corpus it was trained on.
# NOTE(review): gensim documents log_perplexity() as returning the per-word
# likelihood bound (hence the negative value), not the perplexity itself.
print('\nPerplexity: ', lda_model1.log_perplexity(corpus))

# c_v coherence of the model's topics, scored against the tokenized documents.
coherence_model_lda = CoherenceModel(
    model=lda_model1,
    texts=spacy_text,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.186360707886864

Coherence Score:  0.512405444623492


In [18]:
# Show each topic of model 3 as an (id, weighted-terms) tuple, one per paragraph.
for i in lda_model3.print_topics():
    topic_summary = i
    print('\n', topic_summary)


 (0, '0.011*"company" + 0.011*"market" + 0.011*"energy" + 0.008*"power" + 0.006*"business" + 0.006*"year" + 0.005*"state" + 0.005*"service" + 0.004*"issue" + 0.004*"new"')

 (1, '0.018*"click" + 0.012*"information" + 0.009*"service" + 0.009*"new" + 0.009*"time" + 0.009*"receive" + 0.008*"free" + 0.008*"report" + 0.008*"message" + 0.008*"send"')

 (2, '0.017*"meeting" + 0.010*"schedule" + 0.010*"time" + 0.010*"pm" + 0.009*"date" + 0.009*"houston" + 0.008*"conference" + 0.007*"attend" + 0.007*"travel" + 0.007*"room"')

 (3, '0.012*"good" + 0.010*"go" + 0.009*"think" + 0.008*"time" + 0.008*"want" + 0.008*"day" + 0.007*"will" + 0.007*"work" + 0.006*"come" + 0.006*"look"')

 (4, '0.021*"deal" + 0.018*"gas" + 0.012*"price" + 0.011*"day" + 0.010*"contract" + 0.008*"month" + 0.007*"market" + 0.007*"change" + 0.006*"volume" + 0.006*"power"')

 (5, '0.013*"agreement" + 0.012*"attach" + 0.011*"let" + 0.010*"send" + 0.008*"work" + 0.008*"question" + 0.007*"change" + 0.006*"review" + 0.006*"forwar

In [19]:
# Evaluate model 3 on the corpus it was trained on.
# NOTE(review): gensim documents log_perplexity() as returning the per-word
# likelihood bound (hence the negative value), not the perplexity itself.
print('\nPerplexity: ', lda_model3.log_perplexity(corpus))

# c_v coherence of the model's topics, scored against the tokenized documents.
coherence_model_lda = CoherenceModel(
    model=lda_model3,
    texts=spacy_text,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.205993772788032

Coherence Score:  0.5514598107680275


In [15]:
# Destination files for the two trained models.
# Plain path strings are used instead of gensim.test.utils.datapath(): that helper
# joins the name onto gensim's bundled test-data directory and is meant for test
# fixtures, so it only produced these paths because they carry a drive letter.
# NOTE(review): model 1 goes to dataset/ while model 3 goes to dataset/ldamodel/ --
# confirm this is intentional and that the ldamodel/ directory exists before saving.
temp_file1 = 'D:/Capstone/dataset/lda_model1.model'
temp_file3 = 'D:/Capstone/dataset/ldamodel/lda_model3.model'

In [16]:
# Persist both trained models to their destination files, model 1 first.
for trained_model, target_path in (
    (lda_model1, temp_file1),
    (lda_model3, temp_file3),
):
    trained_model.save(target_path)

In [11]:
# Interactive pyLDAvis view of model 1, rendered inline in the notebook.
pyLDAvis.enable_notebook(sort=True)
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model1,
    corpus=corpus,
    dictionary=id2word,
)
pyLDAvis.display(vis)

In [None]:
# Human-readable label for each of model 1's six topics, keyed 0-5.
# NOTE(review): several labels do not obviously match the top words printed for the
# same gensim topic id above (e.g. id 0 is gas/pipeline) -- confirm which topic
# ordering these keys follow before using them to label documents.
topic_dict_lda1 = dict(enumerate((
    'legal',
    'operational compliance',
    'social',
    'trading',
    'contracts',
    'energy operations',
)))

In [12]:
# Interactive pyLDAvis view of model 3, rendered inline in the notebook.
pyLDAvis.enable_notebook(sort=True)
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model3,
    corpus=corpus,
    dictionary=id2word,
)
pyLDAvis.display(vis)

In [None]:
# Human-readable label for each of model 3's seven topics, keyed 0-6.
# NOTE(review): several labels do not obviously match the top words printed for the
# same gensim topic id above (e.g. id 2 is meeting/schedule) -- confirm which topic
# ordering these keys follow before using them to label documents.
topic_dict_lda3 = dict(enumerate((
    'office communications',
    'industry expertise',
    'social',
    'operational compliance',
    'trading',
    'legal',
    'spam & html communique',
)))

In [15]:

# Reload model 3 from the file it was saved to (temp_file3) to confirm the saved model can be read back.
loaded_model = LdaModel.load(temp_file3)