In [None]:
import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
from gensim import corpora
from gensim.models import LdaMulticore
from sklearn.model_selection import train_test_split

In [None]:
# Load the previously created dataframe; the first CSV column is used as the index.
df = pd.read_csv(
    '../data/df_final.csv',
    index_col=0,
    encoding='utf-8',
)

In [None]:
# Shuffle all rows with a fixed seed, then renumber the index from 0.
shuffled_rows = df.sample(frac=1, random_state=1)
df = shuffled_rows.reset_index(drop=True)

In [None]:
# Create the word mapping (dictionary) and bag-of-words corpus for gensim.
#
# Each 'fragment' cell is parsed as a stringified token list: the first and
# last character are cut off, single quotes are removed, and the remainder is
# split on commas.
# NOTE(review): assumes the column holds str(list)-style text such as
# "['a', 'b']" -- confirm against the notebook that wrote df_final.csv.
#
# Tokens are stripped because a ", " separator would otherwise leave a leading
# space on every token after the first, making " word" and "word" separate
# dictionary entries. Empty tokens (e.g. from "[]") are dropped.
document_list = [
    [
        token.strip()
        for token in fragment[1:-1].replace("'", "").split(',')
        if token.strip()
    ]
    for fragment in df['fragment']
]
id2words = corpora.Dictionary(document_list)

# One bag-of-words vector per document, in the same order as the dataframe rows.
corpus = [id2words.doc2bow(document) for document in document_list]

In [None]:
# Split the corpus into a training part for estimation and a 10% held-out part
# for the perplexity calculation; shuffle=False keeps the existing document order.
train_corpus, test_corpus = train_test_split(
    corpus,
    test_size=0.1,
    shuffle=False,
)

In [None]:
# Grid search: fit one LDA model per combination of topic count, alpha and
# beta (passed to gensim as `eta`), and score each on the held-out corpus.
n_topics = [1,5,10,15,20,30,35,40,50]
alphas = [0.1,0.01,0.001]
betas = [0.01,0.001,0.0001,0.00001]

results = []  # one (n_topic, alpha, beta, perplexity) tuple per fitted model
models = []   # fitted models, index-aligned with `results`
for n_topic in n_topics:
    for alpha in alphas:
        for beta in betas:
            model_params = dict(
                corpus=train_corpus,
                id2word=id2words,
                num_topics=n_topic,
                passes=10,
                alpha=alpha,
                eta=beta,
                random_state=1,
            )
            lda_model = LdaMulticore(**model_params)

            # log_perplexity returns a per-word likelihood bound on the test corpus.
            # NOTE(review): gensim's documentation describes perplexity as
            # 2^(-bound); exp(-bound) is kept here. Both are monotonic in the
            # bound, so the ranking of models is the same either way.
            per_word_bound = lda_model.log_perplexity(test_corpus)
            perp = np.exp(-1. * per_word_bound)

            print(f'model params: alpha:{alpha} beta:{beta} n_topics:{n_topic}')
            print(f'model perplexity: {perp}')

            results.append((n_topic, alpha, beta, perp))
            models.append(lda_model)

In [None]:
# One perplexity series per alpha (ordered like `alphas`), restricted to the
# runs with beta == 0.001; within a series the values follow the order of `results`.
result_data = [
    [
        perplexity
        for _, run_alpha, run_beta, perplexity in results
        if run_alpha == alpha and run_beta == 0.001
    ]
    for alpha in alphas
]

In [None]:
# Plot perplexity against the number of topics, one line per alpha value.
fig = plt.figure(figsize=(12,8))
for alpha_value, perplexities in zip(alphas, result_data):
    plt.plot(n_topics, perplexities, label=f'alpha: {alpha_value}')

plt.xlabel('number of topics')
plt.ylabel('perplexity')
plt.legend(loc="upper left")
plt.show()

In [None]:
# Select the model used for the rest of the analysis by its hyperparameters
# instead of by a bare position in `models`. With the search grid iterated as
# topic counts (outermost), then alphas, then betas, this combination is the
# entry at index 47; looking it up by value keeps the choice correct if the
# grid lists are edited.
selected_params = (15, 0.001, 0.00001)  # (n_topics, alpha, beta)
selected_index = next(
    (i for i, run in enumerate(results) if tuple(run[:3]) == selected_params),
    None,
)
if selected_index is None:
    raise ValueError(f'no fitted model for (n_topics, alpha, beta) = {selected_params}')
lda_model = models[selected_index]

In [None]:
# Enable pyLDAvis rendering inside the notebook, then prepare the visualization
# for the selected model; the prepared object is the cell's displayed output.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2words,
    mds="mmds",
    R=8,
)

In [None]:
# Collect the topic distribution of every document in the full corpus.
# minimum_probability=0.0 asks gensim not to filter out low-probability topics.
doc_topic_dists = [
    lda_model.get_document_topics(bow, minimum_probability=0.0)
    for bow in corpus
]

In [None]:
# Turn the sparse per-document topic distributions into a dense numpy array,
# then transpose the corpus2dense result before it is joined with the metadata.
topics_dense = gensim.matutils.corpus2dense(
    doc_topic_dists,
    num_terms=lda_model.num_topics,
)
doc_topic_matrix = topics_dense.transpose()

In [None]:
# Join the document-topic probabilities with the metadata, column-wise;
# both parts get a fresh 0..n-1 index so rows line up by position.
metadata_part = df.reset_index(drop=True)
topic_part = pd.DataFrame(doc_topic_matrix).reset_index(drop=True)
df_result = pd.concat([metadata_part, topic_part], axis=1)

In [None]:
# Persist the combined metadata + topic-probability table.
result_csv_path = '../data/df_result.csv'
df_result.to_csv(result_csv_path)

In [None]:
# Save every model from the grid search, named by its position in `models`.
# The loop variable `model` must be the one saved: saving `lda_model` here
# would write the single selected model to every file.
for index, model in enumerate(models):
    model.save(f'../models/lda_grid_{index}')

In [None]:
# Write the full bag-of-words corpus to disk via gensim's MmCorpus serializer.
corpus_path = '../models/corpus.mm'
corpora.MmCorpus.serialize(corpus_path, corpus=corpus)