In [1]:
# importing our libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# libraries that will let us do LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# library that will let us go ahead and visualize LDA results
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.sklearn


# helpful library to allow for computing coherence using Gensim
import tmtoolkit

# what we're using to pickle with
import pickle

# importing json to serialize the thingy
import json

# import timing library
import time

In [2]:
start = time.time()

In [3]:
# function to override the pre-proc that occurs within the vectorizer
# just returns the original string -> because I already had it clean
def dummy_func(x):   
    return x

In [4]:
def cust_tokenizer(x):
    return x.split()

In [5]:
%%time
# reading in the data
df = pd.read_json("min_act_lang_filtered_pre_proc.ndjson", lines=True)

CPU times: total: 3min 13s
Wall time: 3min 16s


In [6]:
# initializing a TFIDF vectorizer with unigram representation
# it's cheaper and it will allow for relative pruning (those terms appearing within fewer than .05% of docs or in more than 99%)
# feeding in our own functions for splitting and cleaning because otherwise it will mess up our plan
tfidf_vectorizer = TfidfVectorizer(
    min_df =.005, max_df = .99,  preprocessor=dummy_func, tokenizer=cust_tokenizer)

In [7]:
%%time
# fitting the vectorizer
tf = tfidf_vectorizer.fit_transform(df["bo"])

CPU times: total: 2min 19s
Wall time: 2min 19s


In [8]:
# values [7, 10, 15, 20, 25, 30, 40, 60, 80, 100]

In [9]:
# making a tokenized representation of the cleaned column -> this is needed for the coherence calculation
df["tokens"] = df["bo"].str.split()

In [10]:
# testing at various levels of K [7, 10, 15, 20, 25, 30, 40, 60, 80, 100]

In [11]:
# 1st time just trying from 10-50 to see how that works

In [12]:
%%time
# initializing the LDA model with a k of 7
lda_k7 = LatentDirichletAllocation(
    n_components=7,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k7.fit(tf)

# computing the coherence of the topic model with K=7
k7_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k7.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k7, open("min_act_lda_k7.pk", "wb"))

  from scipy.linalg.special_matrices import triu


CPU times: total: 17min 11s
Wall time: 1h 26min 17s


In [13]:
%%time
# initializing the LDA model with a k of 10
lda_k10 = LatentDirichletAllocation(
    n_components=10,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k10.fit(tf)

# computing the coherence of the topic model with K=10
k10_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k10.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k10, open("min_act_lda_k10.pk", "wb"))

CPU times: total: 17min 59s
Wall time: 1h 22min 56s


In [14]:
%%time
# initializing the LDA model with a k of 15
lda_k15 = LatentDirichletAllocation(
    n_components=15,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k15.fit(tf)

# computing the coherence of the topic model with K=10
k15_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k15.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k15, open("min_act_lda_k15.pk", "wb"))

CPU times: total: 18min 53s
Wall time: 1h 20min 54s


In [15]:
%%time
# initializing the LDA model with a k of 20
lda_k20 = LatentDirichletAllocation(
    n_components=20,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k20.fit(tf)

# computing the coherence of the topic model with K=20
k20_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k20.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k20, open("min_act_lda_k20.pk", "wb"))

CPU times: total: 19min 29s
Wall time: 1h 19min 54s


In [16]:
%%time
# initializing the LDA model with a k of 25
lda_k25 = LatentDirichletAllocation(
    n_components=25,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k25.fit(tf)

# computing the coherence of the topic model with K=25
k25_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k25.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k25, open("min_act_lda_k25.pk", "wb"))

CPU times: total: 20min 5s
Wall time: 1h 19min 28s


In [17]:
%%time
# initializing the LDA model with a k of 30
lda_k30 = LatentDirichletAllocation(
    n_components=30,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k30.fit(tf)

# computing the coherence of the topic model with K=30
k30_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k30.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k30, open("min_act_lda_k30.pk", "wb"))

CPU times: total: 21min 3s
Wall time: 1h 19min 20s


In [18]:
%%time
# initializing the LDA model with a k of 40
lda_k40 = LatentDirichletAllocation(
    n_components=40,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k40.fit(tf)

# computing the coherence of the topic model with K=40
k40_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k40.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k40, open("min_act_lda_k40.pk", "wb"))

CPU times: total: 22min
Wall time: 1h 18min 29s


In [19]:
%%time
# initializing the LDA model with a k of 60
lda_k60 = LatentDirichletAllocation(
    n_components=60,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k60.fit(tf)

# computing the coherence of the topic model with K=40
k60_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k60.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k60, open("min_act_lda_k60.pk", "wb"))

CPU times: total: 23min 44s
Wall time: 1h 27min 22s


In [20]:
%%time
# initializing the LDA model with a k of 80
lda_k80 = LatentDirichletAllocation(
    n_components=80,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k80.fit(tf)

# computing the coherence of the topic model with K=40
k80_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k80.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k80, open("min_act_lda_k80.pk", "wb"))

CPU times: total: 25min 6s
Wall time: 1h 29min 33s


In [None]:
%%time
# initializing the LDA model with a k of 100
lda_k100 = LatentDirichletAllocation(
    n_components=100,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k100.fit(tf)

In [37]:
# saving the k100 model (b/c it got angry during coherence calculation (memory issues))
pickle.dump(lda_k100, open("min_act_lda_k100.pk", "wb"))

In [21]:
coh_perp_dict = {}

In [22]:
# setting an entry 
coh_perp_dict.setdefault("k7", {})

# calculating perplexity
perplexity = lda_k7.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=7:", lda_k7.perplexity(tf))
coh_perp_dict["k7"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=7:", k7_coh)
coh_perp_dict["k7"].setdefault("coherence", k7_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k7, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak7_pyldavis.html")

ERROR! Session/line number was not unique in database. History logging moved to new session 513
Perplexity at K=7: 341.13637105754117
Coherence at K=7: [0.5123705802925321, 0.5600361551715582, 0.4646701220074818, 0.468973953046122, 0.3668484267436407, 0.4276540135840512, 0.44490309714029197]


  default_term_info = default_term_info.sort_values(


In [23]:
# setting an entry 
coh_perp_dict.setdefault("k10", {})

# calculating perplexity
perplexity = lda_k10.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=10:", lda_k10.perplexity(tf))
coh_perp_dict["k10"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=10:", k10_coh)
coh_perp_dict["k10"].setdefault("coherence", k10_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k10, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak10_pyldavis.html")

Perplexity at K=10: 392.6578657869459
Coherence at K=10: [0.44928052052107753, 0.5870318763092695, 0.49203942774128706, 0.49017830144810437, 0.43308165395573006, 0.41542065002646283, 0.4913483792609603, 0.5861142703704031, 0.4649887089652231, 0.38188149365058005]


  default_term_info = default_term_info.sort_values(


In [24]:
# setting an entry 
coh_perp_dict.setdefault("k15", {})

# calculating perplexity
perplexity = lda_k15.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=15:", lda_k15.perplexity(tf))
coh_perp_dict["k15"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=15:", k15_coh)
coh_perp_dict["k15"].setdefault("coherence", k15_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k15, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak15_pyldavis.html")

Perplexity at K=15: 470.128374859617
Coherence at K=15: [0.46865903286978716, 0.613997309214653, 0.4793665564626367, 0.48310602884360393, 0.4437977399201305, 0.4823084818565156, 0.5459531381101509, 0.591948106962888, 0.49572340661296466, 0.4014054367674835, 0.4747786926441583, 0.4480175119963062, 0.5602666995340048, 0.46077995844845326, 0.43630945766843554]


  default_term_info = default_term_info.sort_values(


In [25]:
# setting an entry 
coh_perp_dict.setdefault("k20", {})

# calculating perplexity
perplexity = lda_k20.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=20:", lda_k20.perplexity(tf))
coh_perp_dict["k20"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=20:", k20_coh)
coh_perp_dict["k20"].setdefault("coherence", k20_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k20, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak20_pyldavis.html")

Perplexity at K=20: 540.505461938051
Coherence at K=20: [0.4969279707378436, 0.5341753563672381, 0.5182183487904384, 0.5313740007126243, 0.46252197765086533, 0.48781013258191946, 0.5459543592841053, 0.5798280157293888, 0.5294497140583942, 0.42839021849888753, 0.4801832760326407, 0.46660358945457736, 0.5477684105637548, 0.4840430327035433, 0.4600931041473597, 0.5099425079576099, 0.6012583257634511, 0.5044340836767573, 0.44980744939255324, 0.4369006770715288]


  default_term_info = default_term_info.sort_values(


In [26]:
# setting an entry 
coh_perp_dict.setdefault("k25", {})

# calculating perplexity
perplexity = lda_k25.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=25:", lda_k25.perplexity(tf))
coh_perp_dict["k25"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=25:", k25_coh)
coh_perp_dict["k25"].setdefault("coherence", k25_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k25, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak25_pyldavis.html")

Perplexity at K=25: 612.5863926854026
Coherence at K=25: [0.478196674399773, 0.5549206949535486, 0.5061438067011512, 0.5109319113716678, 0.4507465154170339, 0.4839048538401934, 0.5119740367052873, 0.5696034876038543, 0.5180541542790074, 0.5060990283875013, 0.5102516532166843, 0.4545815439779977, 0.5536216428469788, 0.4772068661339369, 0.4560207639399364, 0.5080670263567529, 0.545411978534512, 0.5086707011294922, 0.44334730488175766, 0.455964296619815, 0.4893062459645063, 0.44422081432601485, 0.3930329699628306, 0.5089539493490676, 0.4591217089179801]


  default_term_info = default_term_info.sort_values(


In [27]:
# setting an entry 
coh_perp_dict.setdefault("k30", {})

# calculating perplexity
perplexity = lda_k30.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=30:", lda_k30.perplexity(tf))
coh_perp_dict["k30"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=30:", k30_coh)
coh_perp_dict["k30"].setdefault("coherence", k30_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k30, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak30_pyldavis.html")

Perplexity at K=30: 677.4449957087228
Coherence at K=30: [0.5401932357827812, 0.5786367078489684, 0.4539459375938849, 0.5090271295188388, 0.4366398931950942, 0.4705215639704069, 0.47889241074610156, 0.5815233429121788, 0.5509941711604067, 0.5295555736431116, 0.4984953554863468, 0.4531463182003114, 0.5322322092556045, 0.46725707561924096, 0.494270260343941, 0.5081427198718346, 0.5477762539854223, 0.5088700057434151, 0.4900989759624763, 0.4687364634275203, 0.5140339512756367, 0.4712210855818806, 0.4495911447379163, 0.5062137419946923, 0.46314711785930135, 0.4436535749571974, 0.4974392828654084, 0.450667953677198, 0.5485455684884136, 0.47164027985045454]


  default_term_info = default_term_info.sort_values(


In [28]:
# setting an entry 
coh_perp_dict.setdefault("k40", {})

# calculating perplexity
perplexity = lda_k40.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=40:", lda_k40.perplexity(tf))
coh_perp_dict["k40"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=40:", k40_coh)
coh_perp_dict["k40"].setdefault("coherence", k40_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k40, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak40_pyldavis.html")

Perplexity at K=40: 810.6749566447656
Coherence at K=40: [0.5471366068938177, 0.5548199268767771, 0.5202066271294573, 0.5026316078255137, 0.4368206038744746, 0.4920161643929495, 0.43716617441229166, 0.5238645938949315, 0.5299055826501491, 0.5230874557760254, 0.4865080020785869, 0.4637658927819328, 0.5237638609782623, 0.4860576398248645, 0.4681595760324324, 0.48751376693235604, 0.48128570054949416, 0.500780459652435, 0.5511830100515026, 0.4608143797318885, 0.48248344046380076, 0.47449687753168224, 0.45164174198517915, 0.5228103963361, 0.4423257991413454, 0.38588082947256164, 0.4689329350704455, 0.459461727573691, 0.48875821229094935, 0.5081889600708951, 0.39628704458183495, 0.4778592028471815, 0.44698125876946193, 0.5485915132688041, 0.48831657498968734, 0.5257843495363052, 0.5081253398769212, 0.46664981635144837, 0.5194490965954935, 0.46750573307113197]


  default_term_info = default_term_info.sort_values(


In [29]:
# setting an entry 
coh_perp_dict.setdefault("k60", {})

# calculating perplexity
perplexity = lda_k60.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=60:", lda_k60.perplexity(tf))
coh_perp_dict["k60"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=60:", k60_coh)
coh_perp_dict["k60"].setdefault("coherence", k60_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k60, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak60_pyldavis.html")

Perplexity at K=60: 1298.8439602757503
Coherence at K=60: [0.5057010537108281, 0.5617596414362902, 0.517156370104176, 0.438832620771859, 0.4754873194599364, 0.4631811104711372, 0.47641826358223127, 0.4713946630480136, 0.5219099105730491, 0.47377572565380904, 0.46950629775027686, 0.4328049307522603, 0.523592511281315, 0.41564726852890593, 0.5184580874457184, 0.48687358963552363, 0.4843653152255246, 0.5049331390876517, 0.5509038225654639, 0.44869546699138807, 0.4752604763243397, 0.48797075245415744, 0.4605723157874266, 0.4872818742126596, 0.47200937909510426, 0.35640146142473833, 0.455840483803993, 0.4657936518206497, 0.5057906098632887, 0.5005854755729278, 0.4399209659793219, 0.4562847493991713, 0.46022547842477735, 0.5400764169539944, 0.5465594903248954, 0.4907484024893177, 0.5369420168267967, 0.46300338869379154, 0.5368163333892134, 0.4739900378209293, 0.434904939568166, 0.43890601991973294, 0.49222366384600946, 0.47063588776053467, 0.48860344264361577, 0.5272595981029136, 0.480987422

  default_term_info = default_term_info.sort_values(


In [30]:
# setting an entry 
coh_perp_dict.setdefault("k80", {})

# calculating perplexity
perplexity = lda_k80.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=80:", lda_k80.perplexity(tf))
coh_perp_dict["k80"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=80:", k80_coh)
coh_perp_dict["k80"].setdefault("coherence", k80_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k80, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak80_pyldavis.html")

Perplexity at K=80: 2047.6577626188505
Coherence at K=80: [0.4753756746661836, 0.4980040698047585, 0.5122364145664362, 0.5045920803295105, 0.4506504578082796, 0.46767861981929737, 0.4398780428718886, 0.4915194288362925, 0.4789040775539305, 0.49773972981177544, 0.47114495218928676, 0.44553256019320847, 0.5014364015415063, 0.42873270359326704, 0.5257358480062522, 0.47160329988908706, 0.48155967135121314, 0.5377457844326579, 0.545692383546016, 0.4451079727359365, 0.443741931386742, 0.45913829353867686, 0.45135707775001743, 0.48075337621814535, 0.4670324877067422, 0.4017590796678162, 0.44021294308500625, 0.4808317425804212, 0.469771421473088, 0.501654032580548, 0.4305365636460669, 0.5308264657945282, 0.48594571184664453, 0.5871476821353165, 0.4898984321441649, 0.5110286762951533, 0.45847007664908795, 0.47228178550645994, 0.48780072930573626, 0.48623450800925916, 0.47821690255639293, 0.4388618065533912, 0.4992818341395847, 0.4962783780310206, 0.47820072823799, 0.4652862252454952, 0.48251248

  default_term_info = default_term_info.sort_values(


In [None]:
# serializing the dict
# saving the the test run coherence and perplexity for each of the models
with open('coh_perp.json', 'w') as outfile:
    json.dump(coh_perp_dict, outfile)

In [None]:
end = time.time()

In [None]:
runtime = end - start

In [None]:
print(runtime)