In [1]:
# importing our libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# libraries that will let us do LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# library that will let us go ahead and visualize LDA results
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.sklearn


# helpful library to allow for computing coherence using Gensim
import tmtoolkit

# what we're using to pickle with
import pickle

# importing json to serialize the thingy
import json

# import timing library
import time

In [2]:
start = time.time()

In [3]:
# function to override the pre-proc that occurs within the vectorizer
# just returns the original string -> because I already had it clean
def dummy_func(x):   
    return x

In [4]:
def cust_tokenizer(x):
    return x.split()

In [5]:
%%time
# reading in the data
df = pd.read_json("ver_act_lang_filtered_pre_proc.ndjson", lines=True)

CPU times: total: 1min 7s
Wall time: 1min 8s


In [6]:
# initializing a TFIDF vectorizer with unigram representation
# it's cheaper and it will allow for relative pruning (those terms appearing within fewer than .05% of docs or in more than 99%)
# feeding in our own functions for splitting and cleaning because otherwise it will mess up our plan
tfidf_vectorizer = TfidfVectorizer(
    min_df =.005, max_df = .99,  preprocessor=dummy_func, tokenizer=cust_tokenizer)

In [7]:
%%time
# fitting the vectorizer
tf = tfidf_vectorizer.fit_transform(df["bo"])

CPU times: total: 1min 2s
Wall time: 1min 2s


In [8]:
# values [7, 10, 15, 20, 25, 30, 40, 60, 80, 100]

In [9]:
# making a tokenized representation of the cleaned column -> this is needed for the coherence calculation
df["tokens"] = df["bo"].str.split()

In [10]:
# testing at various levels of K [7, 10, 15, 20, 25, 30, 40, 60, 80, 100]

In [11]:
%%time
# initializing the LDA model with a k of 7
lda_k7 = LatentDirichletAllocation(
    n_components=7,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k7.fit(tf)

# computing the coherence of the topic model with K=7
k7_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k7.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k7, open("ver_act_lda_k7.pk", "wb"))

  from scipy.linalg.special_matrices import triu


CPU times: total: 7min 21s
Wall time: 34min 33s


In [12]:
%%time
# initializing the LDA model with a k of 10
lda_k10 = LatentDirichletAllocation(
    n_components=10,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k10.fit(tf)

# computing the coherence of the topic model with K=10
k10_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k10.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k10, open("ver_act_lda_k10.pk", "wb"))

CPU times: total: 7min 31s
Wall time: 33min 32s


In [13]:
%%time
# initializing the LDA model with a k of 15
lda_k15 = LatentDirichletAllocation(
    n_components=15,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k15.fit(tf)

# computing the coherence of the topic model with K=10
k15_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k15.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k15, open("ver_act_lda_k15.pk", "wb"))

CPU times: total: 7min 54s
Wall time: 31min 50s


In [14]:
%%time
# initializing the LDA model with a k of 20
lda_k20 = LatentDirichletAllocation(
    n_components=20,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k20.fit(tf)

# computing the coherence of the topic model with K=20
k20_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k20.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k20, open("ver_act_lda_k20.pk", "wb"))

CPU times: total: 8min 6s
Wall time: 31min 42s


In [15]:
%%time
# initializing the LDA model with a k of 25
lda_k25 = LatentDirichletAllocation(
    n_components=25,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k25.fit(tf)

# computing the coherence of the topic model with K=25
k25_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k25.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k25, open("ver_act_lda_k25.pk", "wb"))

CPU times: total: 8min 17s
Wall time: 32min 11s


In [16]:
%%time
# initializing the LDA model with a k of 30
lda_k30 = LatentDirichletAllocation(
    n_components=30,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k30.fit(tf)

# computing the coherence of the topic model with K=30
k30_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k30.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k30, open("ver_act_lda_k30.pk", "wb"))

CPU times: total: 8min 23s
Wall time: 34min 53s


In [17]:
%%time
# initializing the LDA model with a k of 40
lda_k40 = LatentDirichletAllocation(
    n_components=40,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k40.fit(tf)

# computing the coherence of the topic model with K=40
k40_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k40.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k40, open("ver_act_lda_k40.pk", "wb"))

CPU times: total: 8min 42s
Wall time: 34min 46s


In [18]:
%%time
# initializing the LDA model with a k of 60
lda_k60 = LatentDirichletAllocation(
    n_components=60,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k60.fit(tf)

# computing the coherence of the topic model with K=40
k60_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k60.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k60, open("ver_act_lda_k60.pk", "wb"))

CPU times: total: 9min 13s
Wall time: 34min 51s


In [19]:
%%time
# initializing the LDA model with a k of 80
lda_k80 = LatentDirichletAllocation(
    n_components=80,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k80.fit(tf)

# computing the coherence of the topic model with K=40
k80_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k80.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k80, open("ver_act_lda_k80.pk", "wb"))

CPU times: total: 10min 2s
Wall time: 35min 51s


In [20]:
%%time
# initializing the LDA model with a k of 100
lda_k100 = LatentDirichletAllocation(
    n_components=100,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k100.fit(tf)

# computing the coherence of the topic model with K=40
k100_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k100.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k100, open("ver_act_lda_k100.pk", "wb"))

CPU times: total: 10min 21s
Wall time: 36min 20s


In [21]:
coh_perp_dict = {}

In [22]:
# setting an entry 
coh_perp_dict.setdefault("k7", {})

# calculating perplexity
perplexity = lda_k7.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=7:", lda_k7.perplexity(tf))
coh_perp_dict["k7"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=7:", k7_coh)
coh_perp_dict["k7"].setdefault("coherence", k7_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k7, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak7_pyldavis.html")

Perplexity at K=7: 102.52220015165013
Coherence at K=7: [0.41077576993083487, 0.40024461330155026, 0.44332191834528994, 0.5362008866971468, 0.48959043885466047, 0.37191821226844807, 0.5361527964535225]


  default_term_info = default_term_info.sort_values(


In [23]:
# setting an entry 
coh_perp_dict.setdefault("k10", {})

# calculating perplexity
perplexity = lda_k10.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=10:", lda_k10.perplexity(tf))
coh_perp_dict["k10"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=10:", k10_coh)
coh_perp_dict["k10"].setdefault("coherence", k10_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k10, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak10_pyldavis.html")

Perplexity at K=10: 131.76892384732218
Coherence at K=10: [0.4156246898918637, -0.047286816212240915, 0.39226170511651415, 0.5010238037867992, 0.444415297241061, 0.3905083049241566, 0.26806680123628995, 0.5138613334336888, 0.3138691030289914, 0.5561293337171453]


  default_term_info = default_term_info.sort_values(


In [24]:
# setting an entry 
coh_perp_dict.setdefault("k15", {})

# calculating perplexity
perplexity = lda_k15.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=15:", lda_k15.perplexity(tf))
coh_perp_dict["k15"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=15:", k15_coh)
coh_perp_dict["k15"].setdefault("coherence", k15_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k15, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak15_pyldavis.html")

Perplexity at K=15: 136.0256279540631
Coherence at K=15: [0.48158750770666553, 0.27082728868436534, 0.48656256076077814, 0.5571830180784885, 0.43120028660436305, 0.43591089018319507, 0.539212462399936, 0.557407871527375, 0.3845946531265512, 0.52969325294833, 0.39399999896429483, 0.5975242259796298, 0.44464327130422754, 0.46866260258848735, 0.5741323367192149]


  default_term_info = default_term_info.sort_values(


In [25]:
# setting an entry 
coh_perp_dict.setdefault("k20", {})

# calculating perplexity
perplexity = lda_k20.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=20:", lda_k20.perplexity(tf))
coh_perp_dict["k20"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=20:", k20_coh)
coh_perp_dict["k20"].setdefault("coherence", k20_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k20, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak20_pyldavis.html")

Perplexity at K=20: 154.05148620067166
Coherence at K=20: [0.47809070801547077, 0.024262566754694202, 0.33408689982887263, 0.5854391493502626, 0.5597847122989255, 0.39793889750172023, 0.26724335253024145, 0.5655869782731475, 0.556043361554313, 0.5244272938348222, 0.393901527446778, 0.5151280152942019, 0.41906063146009503, 0.535686095652186, 0.5171308659388476, 0.5019107363075781, 0.451702634689072, 0.42663916550121145, 0.46098683184247474, 0.40855944421418205]


  default_term_info = default_term_info.sort_values(


In [26]:
# setting an entry 
coh_perp_dict.setdefault("k25", {})

# calculating perplexity
perplexity = lda_k25.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=25:", lda_k25.perplexity(tf))
coh_perp_dict["k25"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=25:", k25_coh)
coh_perp_dict["k25"].setdefault("coherence", k25_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k25, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak25_pyldavis.html")

Perplexity at K=25: 207.28143029106323
Coherence at K=25: [0.5374748147673905, 0.3835619904970965, 0.40003956358473414, 0.5594907397788268, 0.5368080495702101, 0.401328096733801, 0.40189436462712563, 0.5574438169439673, 0.44931714532275596, 0.4989228163156859, 0.39427160252639687, 0.5843553015392878, 0.4214850555436679, 0.47345063689532874, 0.5127678640554928, 0.4578111444375609, 0.4223949776273681, 0.459730321502148, 0.49284441274428203, 0.41499919032125404, 0.5345801592422308, 0.36421046385578665, 0.5635330907348529, 0.3652364367661295, 0.5007981333356119]


  default_term_info = default_term_info.sort_values(


In [27]:
# setting an entry 
coh_perp_dict.setdefault("k30", {})

# calculating perplexity
perplexity = lda_k30.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=30:", lda_k30.perplexity(tf))
coh_perp_dict["k30"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=30:", k30_coh)
coh_perp_dict["k30"].setdefault("coherence", k30_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k30, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak30_pyldavis.html")

Perplexity at K=30: 387.02633863423284
Coherence at K=30: [0.5379864589637602, 0.3992286912108813, 0.4004122533105917, 0.5518219514276671, 0.5158467248298141, 0.398673222242567, 0.36550185285631404, 0.5901645943414978, 0.5196816265043395, 0.5385083416023242, 0.39696941707335953, 0.5746076184980438, 0.45129516795661884, 0.4957017985179605, 0.3909880295273026, 0.4266067093846798, 0.46205511583664016, 0.4343380179263834, 0.5066961886691657, 0.42717710215255034, 0.49851537015764646, 0.4203456974817816, 0.5695722273817687, 0.4703168415026154, 0.5170904295184571, 0.23642302733020734, 0.3726829051921724, 0.5626512352079265, 0.3090720989460439, 0.3706997834134988]


  default_term_info = default_term_info.sort_values(


In [28]:
# setting an entry 
coh_perp_dict.setdefault("k40", {})

# calculating perplexity
perplexity = lda_k40.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=40:", lda_k40.perplexity(tf))
coh_perp_dict["k40"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=40:", k40_coh)
coh_perp_dict["k40"].setdefault("coherence", k40_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k40, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak40_pyldavis.html")

Perplexity at K=40: 849.5943092094645
Coherence at K=40: [0.5261372110056319, 0.4813500821758291, 0.4004347702671689, 0.5465706416727893, 0.5258638358224685, 0.24235256245969455, 0.3655139079079848, 0.6037880628628978, 0.5348518276406414, 0.5103588343474058, 0.3902446016456175, 0.5251519932867939, 0.45533069328868275, 0.5028121454831489, 0.368257222973987, 0.525741232671562, 0.47269234252437825, 0.44915129046748775, 0.45391346576269215, 0.4323673841034689, 0.47392886691888797, 0.4498282853207762, 0.5882422020128941, 0.46074401411595634, 0.5261189515551965, 0.37806473646772504, 0.2946758010359202, 0.5618651543669105, 0.3946762748582032, 0.4319033999709121, 0.5431887104380158, 0.3689336418980044, 0.44350585330606423, 0.5636582237942098, 0.3902446016456174, 0.39699173642414587, 0.4744399704251682, 0.4652762617274301, 0.45924897889313154, 0.3902446016456174]


  default_term_info = default_term_info.sort_values(


In [29]:
# setting an entry 
coh_perp_dict.setdefault("k60", {})

# calculating perplexity
perplexity = lda_k60.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=60:", lda_k60.perplexity(tf))
coh_perp_dict["k60"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=60:", k60_coh)
coh_perp_dict["k60"].setdefault("coherence", k60_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k60, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak60_pyldavis.html")

Perplexity at K=60: 1914.9139713714649
Coherence at K=60: [0.5337627873177111, 0.5400718959056847, 0.39024460164561725, 0.5540999602447167, 0.5157609706106958, 0.3671460279723656, 0.3655139079079847, 0.5963378065303417, 0.5353850288137226, 0.528929558506948, 0.39024460164561725, 0.5809693802901269, 0.28690343838970045, 0.5042759717999885, 0.5322046848429006, 0.5429862429354676, 0.5310438094187024, 0.5062531064708903, 0.5112781217843027, 0.28045928096524136, 0.4992926019246554, 0.38631704743260126, 0.5460540006940823, 0.5320926813678166, 0.5495747740865903, 0.5870555373318984, 0.4050330195250365, 0.5486454129655777, 0.513122932207015, 0.43650359874496114, 0.5417107927442767, 0.4758105164887301, 0.4710347681256006, 0.553336638113894, 0.3902446016456173, 0.39699173642414587, 0.2865292759367074, 0.43764404567121834, 0.5294224318732134, 0.3902446016456173, 0.4978308435931996, 0.5499043846363112, 0.5194744082360316, 0.3494567569075622, 0.5281880264338688, 0.4192509495853689, 0.53994039410747

  default_term_info = default_term_info.sort_values(


In [30]:
# setting an entry 
coh_perp_dict.setdefault("k80", {})

# calculating perplexity
perplexity = lda_k80.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=80:", lda_k80.perplexity(tf))
coh_perp_dict["k80"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=80:", k80_coh)
coh_perp_dict["k80"].setdefault("coherence", k80_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k80, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak80_pyldavis.html")

Perplexity at K=80: 6577.939970089888
Coherence at K=80: [0.5120509030594952, 0.42826593252485845, 0.39024460164561725, 0.5207181994997352, 0.4689810667177447, 0.39024460164561725, 0.39024460164561725, 0.5138974906305637, 0.5452895059923353, 0.4454401919785734, 0.3902446016456173, 0.5468026835439264, 0.4475547349695016, 0.4765194170759477, 0.38356391624664593, 0.5146264216361821, 0.4779275781556052, 0.4541522627083018, 0.5056786874667801, 0.4340763993404179, 0.5104763963991873, 0.3385032779701345, 0.5381900176199652, 0.5491883856706651, 0.458403881280148, 0.4127755238181837, 0.451365811215892, 0.5463064891014325, 0.5270133606822902, 0.43320048247499504, 0.4930766702596384, 0.45742753416022186, 0.4713235738402964, 0.5484934142297747, 0.39024460164561725, 0.39699173642414587, 0.3510600945460519, 0.4708641248387256, 0.47681768950786846, 0.39024460164561725, 0.492464108673816, 0.4878327409508181, 0.43739208563389553, 0.3051931267099055, 0.5493997358589388, 0.419250949585369, 0.541558903255

  default_term_info = default_term_info.sort_values(


In [31]:
# setting an entry 
coh_perp_dict.setdefault("k100", {})

# calculating perplexity
perplexity = lda_k100.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=100:", lda_k100.perplexity(tf))
coh_perp_dict["k100"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=100:", k100_coh)
coh_perp_dict["k100"].setdefault("coherence", k100_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k100, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak100_pyldavis.html")

Perplexity at K=100: 44720.58534564494
Coherence at K=100: [0.5097483684709139, 0.5630578730231208, 0.3902446016456175, 0.4883501579392543, 0.5446746826996771, 0.3902446016456175, 0.3902446016456175, 0.5392816876709159, 0.5575095251670439, 0.4881608379585449, 0.3902446016456175, 0.516668607350702, 0.46755646268695433, 0.49771006410513696, 0.39671008807087077, 0.5420642584531038, 0.4615652346327567, 0.5009258984011108, 0.528694657437087, 0.4575140347467343, 0.531664098341229, 0.4348459962258627, 0.5440934437005583, 0.5655229728439616, 0.4441212542005603, 0.374249922941155, 0.46432093824964404, 0.5466630662334329, 0.4504617480833072, 0.44019784948034013, 0.5147523866886157, 0.45694592231253195, 0.48949520993478507, 0.5561760199925702, 0.3902446016456174, 0.39699173642414587, 0.4277203554247924, 0.517740776357269, 0.48022402236319583, 0.3902446016456175, 0.5645067379202935, 0.5383111627397686, 0.4295348825640404, 0.3098525996884237, 0.49893461396780436, 0.4192509495853688, 0.5336720450498

  default_term_info = default_term_info.sort_values(


In [32]:
# serializing the dict
# saving the the test run coherence and perplexity for each of the models
with open('coh_perp.json', 'w') as outfile:
    json.dump(coh_perp_dict, outfile)

In [33]:
end = time.time()

In [34]:
runtime = end - start

In [35]:
print(runtime)

33450.30321073532
