In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import pickle
from matplotlib import pyplot as plt
from langdetect import detect
import json

In [None]:
# Read the lemmatised corpus from disk; the parsed JSON is indexed by
# document id further down the notebook.
with open("new_lems.json", encoding="utf8") as lemma_file:
    lemmas_by_id = json.load(lemma_file)

In [None]:
# Join each document's lemma list into one whitespace-separated string.
texts_by_id = {
    doc_id: " ".join(lemmas) for doc_id, lemmas in lemmas_by_id.items()
}
# Reverse lookup from text back to id; if two documents produce the same
# text, the id that comes later in iteration order wins.
id_by_texts = {text: doc_id for doc_id, text in texts_by_id.items()}
# Flat list of the document strings, in the same order as texts_by_id.
texts = [*texts_by_id.values()]

In [None]:
from langdetect import DetectorFactory, detect

# langdetect's detection is randomised; pinning the seed before the first
# call makes the filter keep the same documents on every run.
DetectorFactory.seed = 0


def _is_english(text):
    """Return True when langdetect labels `text` as English ("en").

    detect() can raise for text it cannot classify; such texts are treated
    as non-English so that they are filtered out.
    """
    try:
        return detect(text) == "en"
    except Exception:  # not a bare except, so Ctrl-C still interrupts the cell
        return False


# Build a new list instead of deleting while iterating: `del texts[i]` inside
# `enumerate(texts)` shifts the remaining items left, so the element that
# follows each deletion would never be checked.
texts = [text for text in texts if _is_english(text)]

In [None]:
# Cache the filtered documents so the language-detection pass does not have
# to be repeated.
with open("english_texts.json", "w") as cache_file:
    cache_file.write(json.dumps(texts))

In [None]:
# Reload the cached documents written by the previous cell.
with open("english_texts.json") as cache_file:
    texts = json.load(cache_file)

In [None]:
# Turn the documents into a TF-IDF weighted document-term matrix.
# NOTE(review): this matrix is later fed to LatentDirichletAllocation, which
# is usually fitted on raw term counts (CountVectorizer is imported above but
# unused) -- confirm that TF-IDF weighting is intended here.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=texts)

In [None]:
# Hyper-parameter grid: every combination of topic count and learning decay.
n_topics = [3, 5, 7, 9]
learning_decays = [.7, .9]
search_params = {'n_components': n_topics, 'learning_decay': learning_decays}

# Fixed seed so that re-running the notebook reproduces the same fits.
RANDOM_STATE = 0

# `learning_decay` only influences the online variational update. With
# scikit-learn's current default (learning_method='batch') it is ignored and
# both decay values would train the same model, so request the online method
# explicitly to make the grid meaningful.
lda = LatentDirichletAllocation(
    learning_method="online",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# No `scoring` is given, so GridSearchCV ranks candidates with the
# estimator's own score() method.
model = GridSearchCV(lda, param_grid=search_params, n_jobs=-1)
model.fit(X)

In [None]:
# Summarise the winning configuration from the grid search.
best_lda_model = model.best_estimator_
best_params = model.best_params_

print("Best Model's Params: ", best_params)
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity of the refitted best estimator, evaluated on the same matrix
# that was passed to the grid search.
perplexity = best_lda_model.perplexity(X)
print("Model Perplexity: ", perplexity)

In [None]:
# Plot the mean cross-validated score from the grid search against the
# number of topics, one line per learning-decay value.
from matplotlib import pyplot as plt
import numpy as np

cv_results = model.cv_results_

fig, ax = plt.subplots(figsize=(12, 8))
for decay in learning_decays:
    # Positions of the grid-search candidates that used this decay value.
    indices = np.where(cv_results['param_learning_decay'] == decay)[0]
    # Plotted against n_topics as-is: this assumes the candidates for one
    # decay value are listed in the same order as n_topics.
    ax.plot(n_topics, cv_results['mean_test_score'][indices], label=str(decay))

ax.set_title("Choosing Optimal LDA Model")
ax.set_xlabel("Num Topics")
ax.set_ylabel("Log Likelyhood Scores")
ax.legend(title='Learning decay', loc='best')
fig.savefig("optimal_model.png")

In [None]:
import pickle

# Persist the whole fitted GridSearchCV object (not only the best estimator).
# Only unpickle this file from a trusted source: loading a pickle can execute
# arbitrary code.
with open("model.p", "wb") as model_file:
    pickle.dump(model, model_file)