In [41]:
# importing our libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# libraries that will let us do LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# library that will let us go ahead and visualize LDA results
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.sklearn


# helpful library to allow for computing coherence using Gensim
import tmtoolkit

# what we're using to pickle with
import pickle

# importing json to serialize the thingy
import json

# import timing library
import time

In [42]:
start = time.time()

In [7]:
# function to override the pre-proc that occurs within the vectorizer
# just returns the original string -> because I already had it clean
def dummy_func(x):   
    return x

In [8]:
def cust_tokenizer(x):
    return x.split()

In [9]:
%%time
# reading in the data
df = pd.read_json("ext_act_lang_filtered_pre_proc.ndjson", lines=True)

CPU times: total: 11.2 s
Wall time: 11.6 s


In [10]:
# initializing a TFIDF vectorizer with unigram representation
# it's cheaper and it will allow for relative pruning (those terms appearing within fewer than .05% of docs or in more than 99%)
# feeding in our own functions for splitting and cleaning because otherwise it will mess up our plan
tfidf_vectorizer = TfidfVectorizer(
    min_df =.005, max_df = .99,  preprocessor=dummy_func, tokenizer=cust_tokenizer)

In [11]:
%%time
# fitting the vectorizer
tf = tfidf_vectorizer.fit_transform(df["bo"])

CPU times: total: 9.89 s
Wall time: 10.1 s


In [12]:
# values [7, 10, 15, 20, 25, 30, 40, 60, 80, 100]

In [13]:
# making a tokenized representation of the cleaned column -> this is needed for the coherence calculation
df["tokens"] = df["bo"].str.split()

In [14]:
# testing at various levels of K [7, 10, 15, 20, 25, 30, 40, 60, 80, 100]

In [15]:
# 1st time just trying from 10-50 to see how that works

In [16]:
%%time
# initializing the LDA model with a k of 7
lda_k7 = LatentDirichletAllocation(
    n_components=7,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k7.fit(tf)

# computing the coherence of the topic model with K=7
k7_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k7.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k7, open("ext_act_lda_k7.pk", "wb"))

  from scipy.linalg.special_matrices import triu


CPU times: total: 1min
Wall time: 5min 31s


In [17]:
%%time
# initializing the LDA model with a k of 10
lda_k10 = LatentDirichletAllocation(
    n_components=10,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k10.fit(tf)

# computing the coherence of the topic model with K=10
k10_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k10.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k10, open("ext_act_lda_k10.pk", "wb"))

CPU times: total: 1min 2s
Wall time: 5min


In [18]:
%%time
# initializing the LDA model with a k of 15
lda_k15 = LatentDirichletAllocation(
    n_components=15,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k15.fit(tf)

# computing the coherence of the topic model with K=10
k15_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k15.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k15, open("ext_act_lda_k15.pk", "wb"))

CPU times: total: 1min 8s
Wall time: 4min 52s


In [19]:
%%time
# initializing the LDA model with a k of 20
lda_k20 = LatentDirichletAllocation(
    n_components=20,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k20.fit(tf)

# computing the coherence of the topic model with K=20
k20_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k20.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k20, open("ext_act_lda_k20.pk", "wb"))

CPU times: total: 1min 9s
Wall time: 4min 46s


In [39]:
%%time
# initializing the LDA model with a k of 25
lda_k25 = LatentDirichletAllocation(
    n_components=25,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k25.fit(tf)

# computing the coherence of the topic model with K=25
k25_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k25.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k25, open("ext_act_lda_k25.pk", "wb"))

CPU times: total: 1min 11s
Wall time: 4min 46s


In [21]:
%%time
# initializing the LDA model with a k of 30
lda_k30 = LatentDirichletAllocation(
    n_components=30,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k30.fit(tf)

# computing the coherence of the topic model with K=30
k30_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k30.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k30, open("ext_act_lda_k30.pk", "wb"))

CPU times: total: 1min 12s
Wall time: 4min 41s


In [22]:
%%time
# initializing the LDA model with a k of 40
lda_k40 = LatentDirichletAllocation(
    n_components=40,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k40.fit(tf)

# computing the coherence of the topic model with K=40
k40_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k40.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k40, open("ext_act_lda_k40.pk", "wb"))

CPU times: total: 1min 14s
Wall time: 4min 34s


In [23]:
%%time
# initializing the LDA model with a k of 60
lda_k60 = LatentDirichletAllocation(
    n_components=60,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k60.fit(tf)

# computing the coherence of the topic model with K=40
k60_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k60.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k60, open("ext_act_lda_k60.pk", "wb"))

CPU times: total: 1min 20s
Wall time: 4min 35s


In [24]:
%%time
# initializing the LDA model with a k of 80
lda_k80 = LatentDirichletAllocation(
    n_components=80,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k80.fit(tf)

# computing the coherence of the topic model with K=40
k80_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k80.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k80, open("ext_act_lda_k80.pk", "wb"))

CPU times: total: 1min 23s
Wall time: 4min 40s


In [25]:
%%time
# initializing the LDA model with a k of 100
lda_k100 = LatentDirichletAllocation(
    n_components=100,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k100.fit(tf)

# computing the coherence of the topic model with K=40
k100_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k100.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k100, open("ext_act_lda_k100.pk", "wb"))

CPU times: total: 1min 27s
Wall time: 4min 41s


In [26]:
coh_perp_dict = {}

In [27]:
# setting an entry 
coh_perp_dict.setdefault("k7", {})

# calculating perplexity
perplexity = lda_k7.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=7:", lda_k7.perplexity(tf))
coh_perp_dict["k7"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=7:", k7_coh)
coh_perp_dict["k7"].setdefault("coherence", k7_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k7, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak7_pyldavis.html")

Perplexity at K=7: 553.9879384302158
Coherence at K=7: [0.20653480355634457, 0.12752371007963395, 0.1986154379199874, 0.2881710744467823, 0.22486014324495257, 0.22495259029695922, 0.2921519127132128]


  default_term_info = default_term_info.sort_values(


In [28]:
# setting an entry 
coh_perp_dict.setdefault("k10", {})

# calculating perplexity
perplexity = lda_k10.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=10:", lda_k10.perplexity(tf))
coh_perp_dict["k10"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=10:", k10_coh)
coh_perp_dict["k10"].setdefault("coherence", k10_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k10, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak10_pyldavis.html")

Perplexity at K=10: 631.4525628682134
Coherence at K=10: [0.22560086568448368, 0.18334230836921006, 0.24091832024912843, 0.30675242199961616, 0.3162184492753771, 0.27923251338410826, 0.30145025512209356, 0.3235284624543667, 0.2497124474033913, 0.29060148190202495]


  default_term_info = default_term_info.sort_values(


In [29]:
# setting an entry 
coh_perp_dict.setdefault("k15", {})

# calculating perplexity
perplexity = lda_k15.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=15:", lda_k15.perplexity(tf))
coh_perp_dict["k15"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=15:", k15_coh)
coh_perp_dict["k15"].setdefault("coherence", k15_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k15, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak15_pyldavis.html")

Perplexity at K=15: 774.108110117537
Coherence at K=15: [0.2914679123993105, 0.24333955944529495, 0.34273984560934306, 0.33959699168407775, 0.34522764274122486, 0.2934061066346772, 0.35367127160678025, 0.3402903521778132, 0.25457348490710585, 0.3707112555717194, 0.28404928401222485, 0.3890754021377609, 0.332298630462361, 0.5209571565819519, 0.26234152239207587]


  default_term_info = default_term_info.sort_values(


In [30]:
# setting an entry 
coh_perp_dict.setdefault("k20", {})

# calculating perplexity
perplexity = lda_k20.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=20:", lda_k20.perplexity(tf))
coh_perp_dict["k20"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=20:", k20_coh)
coh_perp_dict["k20"].setdefault("coherence", k20_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k20, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak20_pyldavis.html")

Perplexity at K=20: 887.5889038789646
Coherence at K=20: [0.3138553519886008, 0.29762388678573914, 0.3418469885338461, 0.34271708744643814, 0.32446392521781336, 0.26594236553847456, 0.32523122157312667, 0.39558836939983444, 0.2670268852794223, 0.3804918666097376, 0.31878985246694136, 0.3496089883127233, 0.29447293158286514, 0.5153701631007344, 0.3022776819234119, 0.2858197512919027, 0.2972009565292545, 0.2905172283334992, 0.34940387627397507, 0.31971041011086454]


  default_term_info = default_term_info.sort_values(


In [40]:
# setting an entry 
coh_perp_dict.setdefault("k25", {})

# calculating perplexity
perplexity = lda_k25.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=25:", lda_k25.perplexity(tf))
coh_perp_dict["k25"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=25:", k25_coh)
coh_perp_dict["k25"].setdefault("coherence", k25_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k25, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak25_pyldavis.html")

Perplexity at K=25: 1038.7925851284174
Coherence at K=25: [0.2785262907575272, 0.28982410198736985, 0.33996974063879304, 0.33342041569801084, 0.300299337025764, 0.27411857213762547, 0.35381736776681616, 0.3612270896800022, 0.26585019089385964, 0.37814104242741037, 0.3528744539380106, 0.3157677459364038, 0.2880150704606676, 0.5134894190134436, 0.2839926516975662, 0.28793949005353003, 0.2972384020940763, 0.2846748086063867, 0.29494187569011865, 0.5052254473470714, 0.3924527203191229, 0.319852370472549, 0.3058684895094318, 0.3184481098929617, 0.3644783479772873]


  default_term_info = default_term_info.sort_values(


In [32]:
# setting an entry 
coh_perp_dict.setdefault("k30", {})

# calculating perplexity
perplexity = lda_k30.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=30:", lda_k30.perplexity(tf))
coh_perp_dict["k30"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=30:", k30_coh)
coh_perp_dict["k30"].setdefault("coherence", k30_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k30, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak30_pyldavis.html")

Perplexity at K=30: 1169.3769557331866
Coherence at K=30: [0.28762658108567285, 0.2736007244332251, 0.35583801804682946, 0.343482539596291, 0.2867441764907145, 0.36806552767996614, 0.3303733588654318, 0.3846862128267877, 0.2781509304883308, 0.32916035128866045, 0.3576554573969238, 0.35514397314733875, 0.3035564362804253, 0.5080461083733276, 0.3120032112490919, 0.2930603716385204, 0.31105858853725793, 0.4922840230507555, 0.29241499762026585, 0.3223286295808255, 0.3996321425749184, 0.3186266533066705, 0.2924194442569161, 0.32378707936906886, 0.3022532095875311, 0.3378545070058089, 0.2779298818717707, 0.32050904176784156, 0.29071037300307523, 0.5137540201936097]


  default_term_info = default_term_info.sort_values(


In [33]:
# setting an entry 
coh_perp_dict.setdefault("k40", {})

# calculating perplexity
perplexity = lda_k40.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=40:", lda_k40.perplexity(tf))
coh_perp_dict["k40"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=40:", k40_coh)
coh_perp_dict["k40"].setdefault("coherence", k40_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k40, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak40_pyldavis.html")

Perplexity at K=40: 1452.6971953649245
Coherence at K=40: [0.31064734238653824, 0.2693815347281546, 0.36459196529715293, 0.2945964833569271, 0.32630769660153863, 0.3830379833765179, 0.3721974750370406, 0.37186948789292407, 0.3094818723648586, 0.33881176608340313, 0.312318863533778, 0.32927364106725165, 0.3463341175227717, 0.5158146313018034, 0.3161849934327386, 0.33851722330157086, 0.2844940726970181, 0.4865695617821072, 0.2378672883812286, 0.3171659558187104, 0.3788855056970505, 0.29032036642405223, 0.31408148528827085, 0.3309349809870069, 0.3206059780511289, 0.28506551266818536, 0.28917112700842496, 0.32658224788541146, 0.3092035962518122, 0.5062843968829742, 0.40348347006156554, 0.3378051761052817, 0.25584966625457334, 0.3559836803741906, 0.33337462151548924, 0.307662630002581, 0.2940237873857193, 0.3767744073696791, 0.28862642479244854, 0.30399633570993817]


  default_term_info = default_term_info.sort_values(


In [34]:
# setting an entry 
coh_perp_dict.setdefault("k60", {})

# calculating perplexity
perplexity = lda_k60.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=60:", lda_k60.perplexity(tf))
coh_perp_dict["k60"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=60:", k60_coh)
coh_perp_dict["k60"].setdefault("coherence", k60_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k60, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak60_pyldavis.html")

Perplexity at K=60: 2062.168158870461
Coherence at K=60: [0.2901505330615127, 0.2704184210573187, 0.33942459740409414, 0.2898899670540163, 0.37211689156236, 0.30142197709459817, 0.3423638161456187, 0.3640792278209606, 0.30035547191243406, 0.3483016287678384, 0.31302752277360735, 0.3250713691859758, 0.32103145128111993, 0.46724465894678113, 0.32053896489702777, 0.3153081587267329, 0.2639841525350422, 0.5061942249829176, 0.24198460192568713, 0.32580126415415944, 0.3090592154521491, 0.289069254459568, 0.32136920538473135, 0.3324135242130718, 0.3228714490807626, 0.2695849514877185, 0.24398278682355912, 0.32183433972676717, 0.27473955371668657, 0.5039784714579956, 0.32415060290511427, 0.3011911649700814, 0.2199913004406288, 0.34030140010986965, 0.29232025299338205, 0.25555369442260173, 0.28498276719690513, 0.3582296977131996, 0.2959072204482188, 0.28474883033136716, 0.25623404216911255, 0.3630239164887933, 0.28597766944434855, 0.2989919304464653, 0.34506680204757073, 0.2765596500281794, 0.3

  default_term_info = default_term_info.sort_values(


In [35]:
# setting an entry 
coh_perp_dict.setdefault("k80", {})

# calculating perplexity
perplexity = lda_k80.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=80:", lda_k80.perplexity(tf))
coh_perp_dict["k80"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=80:", k80_coh)
coh_perp_dict["k80"].setdefault("coherence", k80_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k80, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak80_pyldavis.html")

Perplexity at K=80: 2628.8699205103458
Coherence at K=80: [0.26913440374552994, 0.25238666987641833, 0.30747420018775795, 0.32881330679289256, 0.3064234373094075, 0.3399608189094925, 0.3051055796483738, 0.34228456428717924, 0.2980046701529725, 0.34624242568421015, 0.28432977451988956, 0.3375586709715344, 0.319272023376348, 0.25992258554623543, 0.28229850773115167, 0.3506899952307707, 0.32142276210185117, 0.3076526228700293, 0.27979398745034284, 0.3184640344860349, 0.2739978365113296, 0.3021490058330924, 0.3155889487057616, 0.31620133262420935, 0.31723022302588316, 0.23623670076738112, 0.29269300365492495, 0.31044857617342003, 0.30143277202382424, 0.5154213147397717, 0.3840024975723303, 0.3056040112310871, 0.227813528974158, 0.3061814922629565, 0.2879971785906873, 0.26909236992253777, 0.36015133440962566, 0.31612215363501495, 0.3421247903147676, 0.2549873381769704, 0.2740274920966187, 0.3643242818915188, 0.289911732021557, 0.29355064354359117, 0.29123292208717894, 0.2848636197342388, 0.

  default_term_info = default_term_info.sort_values(


In [36]:
# setting an entry 
coh_perp_dict.setdefault("k100", {})

# calculating perplexity
perplexity = lda_k100.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=100:", lda_k100.perplexity(tf))
coh_perp_dict["k100"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=100:", k100_coh)
coh_perp_dict["k100"].setdefault("coherence", k100_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k100, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak100_pyldavis.html")

Perplexity at K=100: 3143.7455514356902
Coherence at K=100: [0.2602702385075107, 0.26083953428579126, 0.3036704434169414, 0.28122923316056947, 0.3234751673483036, 0.30022983111409135, 0.3650217637420303, 0.35257112837890153, 0.2616749155731667, 0.3662929648063272, 0.27250031273144526, 0.3566852957995512, 0.34626391259013783, 0.2982727112599552, 0.3074438221308992, 0.2999435412820093, 0.3107675987225365, 0.34816781860733037, 0.3077697155979512, 0.29466720600662194, 0.3261429303699236, 0.3269837257223093, 0.3215436462931792, 0.29130626734681925, 0.2517788271657365, 0.2859707960502327, 0.3127776624055154, 0.2815045341303028, 0.269331294292389, 0.4969901292873925, 0.38408395725836736, 0.32960632393985867, 0.24924219410865095, 0.3100693083792788, 0.3369893887813823, 0.2627011205121371, 0.3605613992338307, 0.2889239366870883, 0.2809482230432961, 0.27593009021357345, 0.23257630662181827, 0.37794058664172026, 0.28301911632101284, 0.2932228772542285, 0.2946186340017069, 0.25907909137971663, 0.3

  default_term_info = default_term_info.sort_values(


In [37]:
# serializing the dict
# saving the the test run coherence and perplexity for each of the models
with open('coh_perp.json', 'w') as outfile:
    json.dump(coh_perp_dict, outfile)

In [43]:
end = time.time()

In [44]:
runtime = end - start

In [45]:
print(runtime)

11.622255086898804
