In [None]:
import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
from gensim import corpora
from gensim.models import LdaMulticore
from sklearn.model_selection import train_test_split

In [None]:
# Load the previously created dataframe; the first CSV column is used as the index.
df = pd.read_csv(
    '../data/df_final.csv',
    index_col=0,
    encoding='utf-8',
)

In [None]:

# Build the gensim vocabulary (id2words) and the bag-of-words corpus.
# Each `fragment` value is parsed by dropping its first and last character,
# removing single quotes, and splitting on commas (presumably the column holds
# a stringified token list such as "['a', 'b']" -- verify against the CSV).
# Every piece is stripped: a list repr separates items with ", ", so without
# this each token after the first keeps a leading space and ends up as a
# separate vocabulary entry. Empty pieces (e.g. from "[]") are dropped.
document_list = [
    [
        token.strip()
        for token in fragment[1:-1].replace("'","").split(',')
        if token.strip()
    ]
    for fragment in df['fragment']
]
id2words = corpora.Dictionary(document_list)

# One bag-of-words vector (list of (token_id, count) pairs) per document.
corpus = [id2words.doc2bow(document) for document in document_list]

In [None]:
# Split the corpus into a training part (model estimation) and a held-out part
# (perplexity calculation). shuffle=False keeps document order, so the test
# split is the last 10% of the corpus.
train_corpus, test_corpus = train_test_split(
    corpus,
    test_size=0.1,
    shuffle=False,
)

In [None]:
# Create the LDA model with basic params.
# Fit on the training split only: the held-out documents in test_corpus must
# not be seen during estimation, otherwise the evaluation below is not
# out-of-sample.
lda_model = LdaMulticore(
    corpus=train_corpus,
    id2word=id2words,
    num_topics=15,
    passes=15,
    alpha=0.001,
    eta=0.0001,
    random_state=42
)
# log_perplexity returns the per-word likelihood bound, not the perplexity
# itself; gensim derives the perplexity estimate as 2 ** (-bound).
# `perp` keeps the raw bound so any later use of it is unaffected.
perp = lda_model.log_perplexity(test_corpus)
perplexity = np.exp2(-perp)
print(f'per-word likelihood bound: {perp}')
print(f'model perplexity: {perplexity}')

In [None]:
# Enable inline rendering of pyLDAvis output in the notebook, then build the
# interactive topic visualization; as the last expression of the cell, the
# prepared object is what gets displayed.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2words,
    mds="mmds",
    R=8,
)

In [None]:
# Topic distribution for every document in the corpus, in corpus order.
# minimum_probability=0.0 asks the model not to filter out low-probability topics.
doc_topic_dists = [
    lda_model.get_document_topics(bow, minimum_probability=0.0)
    for bow in corpus
]

In [None]:
# Round-trip check: map one document's bag-of-words ids back to tokens and
# compare them with the tokens parsed directly from the raw `fragment` string.
row = 4
bow_tokens = [id2words[token_id] for token_id, _ in corpus[row]]
# Parse the raw string the same way the corpus was built: drop the first and
# last character before splitting, otherwise the first and last tokens keep
# those characters and can never match the dictionary entries.
raw_tokens = df.iloc[row].fragment[1:-1].replace("'","").split(',')
# A bag-of-words holds each token once, so compare unique tokens on both
# sides; surrounding whitespace and empty pieces are ignored.
test_lyrics = sorted({token.strip() for token in bow_tokens} - {''})
test_row = sorted({token.strip() for token in raw_tokens} - {''})

In [None]:
# Topic assignment for the first document of the corpus. No
# minimum_probability is passed here, so the model's own threshold applies.
test_topics = lda_model.get_document_topics(bow=corpus[0])

In [None]:
# Turn the list of per-document topic distributions into a dense numpy array.
# corpus2dense is called with num_terms = number of topics; its result is
# transposed so that documents run along the first axis.
doc_topic_matrix = gensim.matutils.corpus2dense(
    doc_topic_dists,
    num_terms=lda_model.num_topics,
).T

In [None]:
# Join the document-topic probabilities with the metadata, column-wise.
# Both frames get a fresh 0..n-1 index so rows are aligned by position.
df_result = pd.concat(
    [
        df.reset_index(drop=True),
        pd.DataFrame(doc_topic_matrix).reset_index(drop=True),
    ],
    axis=1,
)

In [None]:
# Persist the metadata + topic-probability table for downstream use.
df_result.to_csv(path_or_buf='../data/df_result.csv')

In [None]:
# Persist the fitted LDA model to disk.
lda_model.save(fname='../models/lda_15')