In [None]:
import pyLDAvis
import pyLDAvis.gensim
import pandas as pd
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models import LdaMulticore

In [None]:
# Read back the result dataframe written by an earlier step; the first CSV
# column is used as the row index.
df_result = pd.read_csv(
    '../data/df_result.csv',
    index_col=0,
)

In [None]:
# Restore the persisted artifacts from ../models:
#   - the LDA model itself,
#   - the term dictionary (id -> word mapping) saved next to it,
#   - the corpus stored in Matrix Market format.
lda_model = LdaMulticore.load(
    '../models/lda_15'
)
id2word = corpora.Dictionary.load(
    '../models/lda_15.id2word'
)
corpus = corpora.MmCorpus(
    '../models/corpus.mm'
)

In [None]:
# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Build the visualisation data from model, corpus and dictionary.
# sort_topics=False asks pyLDAvis not to reorder the topics; the per-topic
# cell further down looks up 'Topic{n+1}' for model topic n and presumably
# relies on that ordering being preserved.
topic_data = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
    mds='mmds',
    sort_topics=False,
)

# Last expression of the cell: shows the interactive visualisation.
pyLDAvis.display(topic_data)


In [None]:
# Inspect the per-term table prepared by pyLDAvis. The per-topic cell below
# reads its Category, Term, loglift and logprob columns.
topic_data.topic_info

In [None]:
# Weight between a term's in-topic log-probability and its log-lift when
# ranking terms: 1.0 ranks purely by logprob, 0.0 purely by loglift.
lambda_value = 0.4
# number of top-ranked terms listed in each plot title
num_terms = 8

# The genre grouping is identical for every topic, so build it once instead
# of once per loop iteration.
df_genres = df_result.groupby('genre')

for n_topic in range(lda_model.num_topics):
    # One list of values per genre, read from the column named after the
    # topic index. The genre names are taken from the index of this very
    # result, so every box is guaranteed to sit above its own label.
    probs_by_genre = df_genres[f'{n_topic}'].apply(list)
    genre_names = probs_by_genre.index.tolist()
    topic_probs = probs_by_genre.tolist()

    # get updated term sorting by LDAvis with given lambda value
    # (topic_info categories are looked up as 'Topic1', 'Topic2', ...,
    # hence the +1 on the zero-based model topic index)
    topic = topic_data.topic_info[topic_data.topic_info.Category == f'Topic{n_topic+1}'].copy()
    topic['relevance'] = topic['loglift']*(1-lambda_value)+topic['logprob']*lambda_value
    topic_words = topic.sort_values(by='relevance', ascending=False).Term[:num_terms].values

    # alternative: use the model's default term ordering instead
    # topic_terms = lda_model.get_topic_terms(n_topic)
    # topic_words = [id2word[term] for term, _ in topic_terms]

    # draw a boxplot of the topic probabilities, one box per genre
    fig, ax = plt.subplots(figsize=(8,6))
    ax.boxplot(topic_probs)
    # Label the boxes explicitly rather than passing the groupby `groups`
    # dict as `labels=`; this also avoids a keyword that newer matplotlib
    # releases deprecate.
    ax.set_xticklabels(genre_names)
    ax.set_xlabel('genre')
    ax.set_ylabel('topic probability')
    ax.set_title(f'topic {n_topic+1}; common words:{list(topic_words)}')
    # Render each figure as soon as it is complete so that figures do not
    # pile up until the end of the cell.
    plt.show()

In [None]:
# Per-genre mean of every topic column, collected as input for the ANOVA test.
topic_columns = [f'{topic_idx}' for topic_idx in range(lda_model.num_topics)]
df_result_genre = (
    df_result
    .groupby('genre')
    .agg(dict.fromkeys(topic_columns, 'mean'))
)