In [1]:
# importing our libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# libraries that will let us do LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# library that will let us go ahead and visualize LDA results
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.sklearn


# helpful library to allow for computing coherence using Gensim
import tmtoolkit

# what we're using to pickle with
import pickle

# importing json to serialize the thingy
import json

# import timing library
import time

In [2]:
start = time.time()

In [3]:
# function to override the pre-proc that occurs within the vectorizer
# just returns the original string -> because I already had it clean
def dummy_func(x):   
    return x

In [4]:
def cust_tokenizer(x):
    return x.split()

In [5]:
%%time
# reading in the data
df = pd.read_json("mod_act_lang_filtered_pre_proc.ndjson", lines=True)

CPU times: total: 2min 8s
Wall time: 2min 10s


In [6]:
# initializing a TFIDF vectorizer with unigram representation
# it's cheaper and it will allow for relative pruning (those terms appearing within fewer than .05% of docs or in more than 99%)
# feeding in our own functions for splitting and cleaning because otherwise it will mess up our plan
tfidf_vectorizer = TfidfVectorizer(
    min_df =.005, max_df = .99,  preprocessor=dummy_func, tokenizer=cust_tokenizer)

In [7]:
%%time
# fitting the vectorizer
tf = tfidf_vectorizer.fit_transform(df["bo"])

CPU times: total: 2min
Wall time: 2min


In [8]:
# values [7, 10, 15, 20, 25, 30, 40, 60, 80, 100]

In [9]:
# making a tokenized representation of the cleaned column -> this is needed for the coherence calculation
df["tokens"] = df["bo"].str.split()

In [10]:
# testing at various levels of K [7, 10, 15, 20, 25, 30, 40, 60, 80, 100]

In [11]:
# 1st time just trying from 10-50 to see how that works

In [12]:
%%time
# initializing the LDA model with a k of 7
lda_k7 = LatentDirichletAllocation(
    n_components=7,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k7.fit(tf)

# computing the coherence of the topic model with K=7
k7_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k7.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k7, open("mod_act_lda_k7.pk", "wb"))

  from scipy.linalg.special_matrices import triu


CPU times: total: 12min 23s
Wall time: 1h 1min 50s


In [13]:
%%time
# initializing the LDA model with a k of 10
lda_k10 = LatentDirichletAllocation(
    n_components=10,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k10.fit(tf)

# computing the coherence of the topic model with K=10
k10_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k10.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k10, open("mod_act_lda_k10.pk", "wb"))

CPU times: total: 13min 16s
Wall time: 59min 46s


In [14]:
%%time
# initializing the LDA model with a k of 15
lda_k15 = LatentDirichletAllocation(
    n_components=15,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k15.fit(tf)

# computing the coherence of the topic model with K=10
k15_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k15.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k15, open("mod_act_lda_k15.pk", "wb"))

CPU times: total: 13min 54s
Wall time: 59min 13s


In [15]:
%%time
# initializing the LDA model with a k of 20
lda_k20 = LatentDirichletAllocation(
    n_components=20,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k20.fit(tf)

# computing the coherence of the topic model with K=20
k20_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k20.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k20, open("mod_act_lda_k20.pk", "wb"))

CPU times: total: 14min 13s
Wall time: 58min 31s


In [16]:
%%time
# initializing the LDA model with a k of 25
lda_k25 = LatentDirichletAllocation(
    n_components=25,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k25.fit(tf)

# computing the coherence of the topic model with K=25
k25_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k25.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k25, open("mod_act_lda_k25.pk", "wb"))

CPU times: total: 14min 34s
Wall time: 58min 49s


In [17]:
%%time
# initializing the LDA model with a k of 30
lda_k30 = LatentDirichletAllocation(
    n_components=30,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k30.fit(tf)

# computing the coherence of the topic model with K=30
k30_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k30.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k30, open("mod_act_lda_k30.pk", "wb"))

CPU times: total: 14min 54s
Wall time: 57min 24s


In [18]:
%%time
# initializing the LDA model with a k of 40
lda_k40 = LatentDirichletAllocation(
    n_components=40,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k40.fit(tf)

# computing the coherence of the topic model with K=40
k40_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k40.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k40, open("mod_act_lda_k40.pk", "wb"))

CPU times: total: 15min 59s
Wall time: 58min 56s


In [19]:
%%time
# initializing the LDA model with a k of 60
lda_k60 = LatentDirichletAllocation(
    n_components=60,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k60.fit(tf)

# computing the coherence of the topic model with K=40
k60_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k60.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k60, open("mod_act_lda_k60.pk", "wb"))

CPU times: total: 17min 22s
Wall time: 1h 4min 44s


In [20]:
%%time
# initializing the LDA model with a k of 80
lda_k80 = LatentDirichletAllocation(
    n_components=80,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k80.fit(tf)

# computing the coherence of the topic model with K=40
k80_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k80.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k80, open("mod_act_lda_k80.pk", "wb"))

CPU times: total: 18min 57s
Wall time: 1h 6min 30s


In [21]:
%%time
# initializing the LDA model with a k of 100
lda_k100 = LatentDirichletAllocation(
    n_components=100,
    n_jobs=5,
    max_iter=5,
    learning_method="batch",
    learning_offset=50.0,
    random_state=0,
)

# fitting the LDA model
lda_k100.fit(tf)

# computing the coherence of the topic model with K=40
k100_coh = tmtoolkit.topicmod.evaluate.metric_coherence_gensim(measure='c_v', 
                        top_n=25, 
                        topic_word_distrib=lda_k100.components_, # the components of the lda count as 
                        dtm=tf, # the term frequency
                        vocab=np.array([x for x in tfidf_vectorizer.vocabulary_.keys()]), # pass in vectorizer
                        texts=df["tokens"].values) # pass in list of tokenized texts -> needs to match vocab

# saving the model
pickle.dump(lda_k100, open("mod_act_lda_k100.pk", "wb"))

CPU times: total: 19min 58s
Wall time: 1h 8min 58s


In [22]:
coh_perp_dict = {}

In [23]:
# setting an entry 
coh_perp_dict.setdefault("k7", {})

# calculating perplexity
perplexity = lda_k7.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=7:", lda_k7.perplexity(tf))
coh_perp_dict["k7"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=7:", k7_coh)
coh_perp_dict["k7"].setdefault("coherence", k7_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k7, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak7_pyldavis.html")

Perplexity at K=7: 135.7024438829358
Coherence at K=7: [0.40507696986574115, 0.3600011178328915, 0.3289423007844948, 0.43165860194707834, 0.49562432397846434, 0.5494227682186844, 0.5423486722820021]


  default_term_info = default_term_info.sort_values(


In [24]:
# setting an entry 
coh_perp_dict.setdefault("k10", {})

# calculating perplexity
perplexity = lda_k10.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=10:", lda_k10.perplexity(tf))
coh_perp_dict["k10"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=10:", k10_coh)
coh_perp_dict["k10"].setdefault("coherence", k10_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k10, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak10_pyldavis.html")

Perplexity at K=10: 148.1169427606087
Coherence at K=10: [0.5280043925379201, 0.40822659178231857, 0.33255119144423506, 0.5349105746644819, 0.5017927208091394, 0.5266198047082257, 0.6235166306994254, 0.18581944134308953, 0.4754375661937975, 0.5596404202901797]


  default_term_info = default_term_info.sort_values(


In [25]:
# setting an entry 
coh_perp_dict.setdefault("k15", {})

# calculating perplexity
perplexity = lda_k15.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=15:", lda_k15.perplexity(tf))
coh_perp_dict["k15"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=15:", k15_coh)
coh_perp_dict["k15"].setdefault("coherence", k15_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k15, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak15_pyldavis.html")

Perplexity at K=15: 166.20026520836439
Coherence at K=15: [0.4946089333809871, 0.4512892590439204, 0.333627343275968, 0.42722552809785136, 0.5045840680046763, 0.47379528145983174, 0.5676078795469202, 0.3500110158838868, 0.5135217377532306, 0.3655792559252003, 0.5053818781353995, 0.43143637444413974, 0.46096649036874693, 0.455529846091672, 0.5273061345301825]


  default_term_info = default_term_info.sort_values(


In [26]:
# setting an entry 
coh_perp_dict.setdefault("k20", {})

# calculating perplexity
perplexity = lda_k20.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=20:", lda_k20.perplexity(tf))
coh_perp_dict["k20"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=20:", k20_coh)
coh_perp_dict["k20"].setdefault("coherence", k20_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k20, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak20_pyldavis.html")

Perplexity at K=20: 183.56217607080498
Coherence at K=20: [0.45110682343754666, 0.445394226724049, 0.33397756396819855, 0.4753079810186617, 0.5049005599078424, 0.41266486715971995, 0.555875206907052, 0.4103850708247216, 0.5189374895439361, 0.42007126165090086, 0.503901040985393, 0.3921524757143219, 0.4662693104547266, 0.4659226396359236, 0.47927492707588487, 0.5591366265710442, 0.4474938104325908, 0.5557979081616331, 0.5667354943258861, 0.49611376519608497]


  default_term_info = default_term_info.sort_values(


In [27]:
# setting an entry 
coh_perp_dict.setdefault("k25", {})

# calculating perplexity
perplexity = lda_k25.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=25:", lda_k25.perplexity(tf))
coh_perp_dict["k25"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=25:", k25_coh)
coh_perp_dict["k25"].setdefault("coherence", k25_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k25, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak25_pyldavis.html")

Perplexity at K=25: 201.25744920464288
Coherence at K=25: [0.32942098999550107, 0.49507105744683155, 0.33408801302794655, 0.43269818173472957, 0.5049989730600365, 0.4537517981633479, 0.5209250801730174, 0.3542723925568348, 0.47104536872015074, 0.3703671820613662, 0.528176128196611, 0.44313312933597837, 0.48400497021117717, 0.329804725813366, 0.4474728717403608, 0.5936918938799847, 0.40196370052275227, 0.5137927384756575, 0.5724939190398851, 0.46092744925226353, 0.6094436472975768, 0.41714263838684557, 0.452190821190629, 0.47395397798195465, 0.5387851423544777]


  default_term_info = default_term_info.sort_values(


In [28]:
# setting an entry 
coh_perp_dict.setdefault("k30", {})

# calculating perplexity
perplexity = lda_k30.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=30:", lda_k30.perplexity(tf))
coh_perp_dict["k30"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=30:", k30_coh)
coh_perp_dict["k30"].setdefault("coherence", k30_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k30, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak30_pyldavis.html")

Perplexity at K=30: 216.63660408979842
Coherence at K=30: [0.559268978368004, 0.44719384663267103, 0.41887711730552885, 0.45450742034818087, 0.47410317370023536, 0.4508255957445673, 0.5200528137928899, 0.47262659245645455, 0.5260323959391069, 0.4583127312981947, 0.5145256169318503, 0.4436664261411088, 0.4732882840142543, 0.5141720540946766, 0.40960423134237545, 0.6095219956833988, 0.4381695323268383, 0.4925513574751352, 0.5525521988144904, 0.46112467247332567, 0.5817989612539411, 0.4528788698622771, 0.4405854065087196, 0.44658194201880275, 0.5277034932546982, 0.40144345591362435, 0.4264533086548449, 0.47725489957665024, 0.33351428775556985, 0.3621101134117234]


  default_term_info = default_term_info.sort_values(


In [29]:
# setting an entry 
coh_perp_dict.setdefault("k40", {})

# calculating perplexity
perplexity = lda_k40.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=40:", lda_k40.perplexity(tf))
coh_perp_dict["k40"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=40:", k40_coh)
coh_perp_dict["k40"].setdefault("coherence", k40_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k40, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak40_pyldavis.html")

Perplexity at K=40: 284.7724078493674
Coherence at K=40: [0.5193774639657722, 0.41423589196292443, 0.41630929684411616, 0.48122624119399215, 0.5009996848828652, 0.42348642610717546, 0.45924167531653043, 0.39746516297911966, 0.52658295277553, 0.4923714155991116, 0.48323970577514463, 0.5308183115668431, 0.5201888296934045, 0.5507993475509195, 0.40652050691531183, 0.548402967053639, 0.4313976650310923, 0.4941034378176643, 0.5546694186607622, 0.4643691443164246, 0.5957020842795788, 0.4625757003923126, 0.4097952242958753, 0.4391359511859167, 0.514849118370331, 0.5035496735528712, 0.43719833090549043, 0.4586808397646111, 0.3335156352402355, 0.43733544502379973, 0.47944969664648623, 0.4653935953090051, 0.39380796825883785, 0.4415891425283302, 0.48892098276951257, 0.47410741172198445, 0.319599897026801, 0.5200229757002013, 0.46338152185660053, 0.467889218665409]


  default_term_info = default_term_info.sort_values(


In [30]:
# setting an entry 
coh_perp_dict.setdefault("k60", {})

# calculating perplexity
perplexity = lda_k60.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=60:", lda_k60.perplexity(tf))
coh_perp_dict["k60"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=60:", k60_coh)
coh_perp_dict["k60"].setdefault("coherence", k60_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k60, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak60_pyldavis.html")

Perplexity at K=60: 661.9802687759513
Coherence at K=60: [0.4721952814108391, 0.42380488612671546, 0.38278934258578673, 0.48878688734298115, 0.4911865918060029, 0.529262913301261, 0.45041861647877945, 0.357610975820984, 0.528124238891286, 0.440000199743944, 0.539094117712214, 0.5564457986819156, 0.48297794350197776, 0.5793444410647087, 0.4328903298668131, 0.5388319659101721, 0.4522497834386968, 0.5378528772669747, 0.46756447095317677, 0.4871898106527988, 0.5912678793904075, 0.45738219970332006, 0.4626943359383794, 0.4325104954179764, 0.5466010003832512, 0.5680272765877329, 0.48027761088947757, 0.5349070242942856, 0.3335014256081601, 0.4278723707323122, 0.5546451221693007, 0.46162899070209296, 0.3987266502851689, 0.496476269414485, 0.5137724644553279, 0.4672346218138066, 0.3786231111372999, 0.5489770494786252, 0.4252682580215703, 0.47294971360022736, 0.5178299493327707, 0.5652258184378616, 0.5295882512000605, 0.5551200021267987, 0.5373290875606204, 0.4911865918060029, 0.5136259694672435

  default_term_info = default_term_info.sort_values(


In [31]:
# setting an entry 
coh_perp_dict.setdefault("k80", {})

# calculating perplexity
perplexity = lda_k80.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=80:", lda_k80.perplexity(tf))
coh_perp_dict["k80"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=80:", k80_coh)
coh_perp_dict["k80"].setdefault("coherence", k80_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k80, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak80_pyldavis.html")

Perplexity at K=80: 774.9120341057561
Coherence at K=80: [0.45705449425202094, 0.4524240006992648, 0.4806564484645336, 0.5357574306633225, 0.5009996848828651, 0.4349488975855886, 0.4720512233500142, 0.4494632159611596, 0.526925506721176, 0.4397312598447446, 0.471021766388933, 0.48237333960967216, 0.5324211115106163, 0.5481149851707022, 0.4987225275136452, 0.5382488489584019, 0.39648772304231134, 0.4374023893196031, 0.529221478802683, 0.46849642331140534, 0.5905237203361986, 0.5140653880823782, 0.4791617641201492, 0.45716613863497724, 0.5473941575805216, 0.5779243848821042, 0.5039387588006498, 0.5179414587587997, 0.32814165438078896, 0.40836586951526177, 0.5273880725657041, 0.48583475630322, 0.2994272040120947, 0.5316758405325525, 0.47064562248654235, 0.44686448007447604, 0.40770033801188016, 0.5735669304954342, 0.4780619499249211, 0.4773498941839914, 0.5510916180793118, 0.5456217354234233, 0.47139720156295734, 0.5559906223033627, 0.5718944794907379, 0.4181205829765684, 0.49974679248191

  default_term_info = default_term_info.sort_values(


In [32]:
# setting an entry 
coh_perp_dict.setdefault("k100", {})

# calculating perplexity
perplexity = lda_k100.perplexity(tf)

# printing perplexity and adding it to the dict
print("Perplexity at K=100:", lda_k100.perplexity(tf))
coh_perp_dict["k100"].setdefault("perplexity", perplexity)

# printing coherence and adding it to the dict
print("Coherence at K=100:", k100_coh)
coh_perp_dict["k100"].setdefault("coherence", k100_coh)

# making the visualization, showing it, and saving it
vis = pyLDAvis.sklearn.prepare(lda_k100, tf, tfidf_vectorizer) # lda_tf, dtm, tf_vectorizzer
vis
pyLDAvis.save_html(vis,"ldak100_pyldavis.html")

Perplexity at K=100: 1768.6989038035117
Coherence at K=100: [0.5196252322342403, 0.48271995450569893, 0.5118980119343173, 0.5507362540225116, 0.500999684882865, 0.434471671596058, 0.46114681795832624, 0.5119901352208421, 0.5252631678500476, 0.4721448262002738, 0.5032852438119451, 0.5051796731570082, 0.4849834677937109, 0.5481178806176282, 0.5509636074470237, 0.53964066418332, 0.27180001172509416, 0.49056565584828327, 0.4689088940241266, 0.5115371486918632, 0.520307056577665, 0.4941198890351763, 0.42914336303188605, 0.46047605455726004, 0.5383216448557283, 0.5438131260530267, 0.4972988073154406, 0.5359265185623557, 0.3536866492227461, 0.4152987908444657, 0.4806609186651503, 0.4730246278927893, 0.4251929692293251, 0.48201656753965133, 0.518233793850928, 0.47349389285043325, 0.4372652661007167, 0.5662877591161458, 0.49657037096829554, 0.4330364072045539, 0.5428908758465689, 0.547977940903347, 0.505893998588336, 0.5442053235090404, 0.5476544668347749, 0.5625181646790376, 0.5640687585846423

  default_term_info = default_term_info.sort_values(


In [33]:
# serializing the dict
# saving the the test run coherence and perplexity for each of the models
with open('coh_perp.json', 'w') as outfile:
    json.dump(coh_perp_dict, outfile)

In [34]:
end = time.time()

In [35]:
runtime = end - start

In [36]:
print(runtime)

63989.53839755058
