In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
from gensim import corpora
from gensim.models import LdaMulticore
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read back the result dataframe that was created previously; the first CSV
# column is used as the row index.
result_csv_path = '../data/df_result.csv'
df_result = pd.read_csv(result_csv_path, index_col=0)

In [None]:
# Restore the persisted topic-model artefacts from ../models: the
# bag-of-words corpus (Matrix Market file), the term dictionary and the
# trained 15-topic LDA model itself. The three loads are independent.
corpus = corpora.MmCorpus('../models/corpus.mm')
id2word = corpora.Dictionary.load('../models/lda_15.id2word')
lda_model = LdaMulticore.load('../models/lda_15')

In [None]:
# Interactive pyLDAvis view of the LDA model, rendered inline.
# sort_topics=False is passed so that pyLDAvis does not reorder the topics;
# the plotting cell further down looks topics up as 'Topic{n_topic+1}' and
# presumably relies on that numbering matching the model's own topic ids.
pyLDAvis.enable_notebook()
topic_data = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
    mds='mmds',
    sort_topics=False,
)
pyLDAvis.display(topic_data)


In [None]:
# Show the per-term table that pyLDAvis prepared; its Category, Term, loglift
# and logprob columns are what the relevance ranking further down reads.
topic_data.topic_info

In [None]:
# For every LDA topic, draw one figure with a boxplot per genre of the
# documents' probabilities for that topic, titled with the topic's most
# relevant terms.
lambda_value = 0.4  # weight of logprob (vs. loglift) in the relevance score
num_terms = 8       # how many top-ranked terms go into each plot title

# The grouping does not depend on the topic, so build it once outside the loop.
df_genres = df_result.groupby('genre')

for n_topic in range(lda_model.num_topics):
    # One list of probabilities per genre. Data and tick labels are both taken
    # from this single grouped result so they are guaranteed to line up.
    probs_by_genre = df_genres[f'{n_topic}'].apply(list)
    topic_probs = probs_by_genre.values.tolist()
    genre_names = list(probs_by_genre.index)

    # Re-rank the topic's terms the way LDAvis does for the given lambda:
    # relevance = (1 - lambda) * loglift + lambda * logprob.
    # (lda_model.get_topic_terms(n_topic) would give the model's default
    # term order instead.)
    topic = topic_data.topic_info[topic_data.topic_info.Category == f'Topic{n_topic+1}'].copy()
    topic['relevance'] = topic['loglift']*(1-lambda_value)+topic['logprob']*lambda_value
    topic_words = topic.sort_values(by='relevance', ascending=False).Term[:num_terms].values

    # Draw the boxplots of the topic probabilities, one box per genre.
    # Tick labels are set after plotting instead of via the `labels=` keyword,
    # which newer matplotlib releases deprecate.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.boxplot(topic_probs)
    ax.set_xticklabels(genre_names)
    ax.set_xlabel('genre')
    ax.set_ylabel('topic probability')
    ax.set_title(f'topic {n_topic+1}; common words:{list(topic_words)}')
    plt.show()

In [None]:
# Build the feature matrix and target vector, then hold out a test split.
# X: one column per LDA topic (columns are named '0', '1', ... in df_result).
topic_columns = [f'{n_topic}' for n_topic in range(lda_model.num_topics)]
X = df_result[topic_columns].values

# y: select the column as a Series so the target is 1-D. Selecting it with a
# list (df_result[['genre']]) yields an (n, 1) column vector, which makes
# scikit-learn emit a DataConversionWarning on every fit.
y = df_result['genre'].values

# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Standardise the features. The scaler is fitted on the training rows only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Hyper-parameter search for the discriminant-analysis classifier
# (5-fold cross-validation, scored by accuracy, all cores).
#
# Shrinkage is only implemented for the 'lsqr' and 'eigen' solvers; pairing it
# with 'svd' makes the fit raise, so a single crossed grid would waste half of
# its candidates on failing fits and never actually evaluate 'svd'. The grid
# is therefore split: 'svd' without shrinkage, 'eigen' across the shrinkage
# range.
param_grid = [
    {'solver': ['svd']},
    {'solver': ['eigen'], 'shrinkage': np.linspace(0.01, 1, 100)},
]
lda2_model = LinearDiscriminantAnalysis()
search = GridSearchCV(lda2_model, param_grid, scoring='accuracy', cv=5, n_jobs=-1)

# np.ravel guarantees a 1-D target even if y_train arrives as a column vector.
results = search.fit(X_train, np.ravel(y_train))

In [None]:
# Report the outcome of the grid search: the best mean cross-validated
# accuracy and the parameter combination that achieved it.
print(f'Mean Accuracy: {results.best_score_:.3f}')
print(f'Config: {results.best_params_}')

In [None]:
# get the best lda2 model and get predicted test values
# best_estimator_ is read from the fitted search object; it is then used to
# predict a genre label for every row of the held-out features X_test.
best_lda2_model = results.best_estimator_
y_hat = best_lda2_model.predict(X_test)

In [None]:
# Confusion matrix of the held-out predictions.
# The label order is made explicit (sorted union of true and predicted
# labels, which is also what confusion_matrix uses by default) so that the
# rows and columns can be annotated with the genre names.
genre_labels = np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_hat)]))
c_matrix = confusion_matrix(np.ravel(y_test), y_hat, labels=genre_labels)

# Rows are the true genres, columns the predicted ones; ending the cell with
# the DataFrame gives a labelled, rich display instead of a bare array.
pd.DataFrame(
    c_matrix,
    index=pd.Index(genre_labels, name='true genre'),
    columns=pd.Index(genre_labels, name='predicted genre'),
)

In [None]:
# Per-genre precision, recall and F1 on the held-out test split.
# classification_report returns plain text, so it is printed to keep its
# column alignment. (The import lives in the notebook's top import cell.)
c_report = classification_report(y_test, y_hat)
print(c_report)