Adapted from https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/

In [1]:
import numpy as np
import pandas as pd
import re, nltk, gensim
import pickle

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from scipy import sparse
%matplotlib inline

In [2]:
# X_train = pickle.load(open('outputs/X_train_count.pickle', "rb"))

data = np.load('outputs/x_train.npy', allow_pickle = True).astype(str)
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(data)

In [3]:
X_train_vect

<333414x139954 sparse matrix of type '<class 'numpy.int64'>'
	with 3313866 stored elements in Compressed Sparse Row format>

In [4]:
list(data)

['buffalo sabres nhl',
 'diseases lentils culture lentils mentioned many times old testament first time recounting incident jacob purchases birthright esau stewed lentils lcb genesis rcb',
 'railroads like lehigh valley railroad important move raw materials finished goods created many new jobs',
 'example would individual animal learns eat buds seedlings food crop destroying normal supply food would later available mature plants',
 'matanuska susitna rivers major salmon spawning streams',
 'mettingen',
 'analog nature also cause great number problems',
 'cooperation republic kazakhstan islamic republic pakistan kazakhstan ministry foreign affairs kazakhstan emerging market pakistani goods',
 'importance philosophy',
 'extreme attain wind speeds mph kmh stretch two miles km across stay ground dozens miles km',
 'theory exams written papers topic musical theory',
 'finland turkey new zealand part team australasia made first appearance olympic games',
 'first single album made top uk char

LDA Model with Default Params with default params

In [5]:
# lda_model = LatentDirichletAllocation()

# lda_output = lda_model.fit_transform(X_train_vect)

In [6]:
# # Log Likelyhood: Higher the better
# print("Log Likelihood: ", lda_model.score(X_train_vect))

# # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
# print("Perplexity: ", lda_model.perplexity(X_train_vect))

# # See model parameters
# pprint(lda_model.get_params())

In [7]:
# pyLDAvis.enable_notebook()

# vectorizer = CountVectorizer()
# X_train_vect = vectorizer.fit_transform(data)

# panel = pyLDAvis.sklearn.prepare(lda_model, X_train_vect, vectorizer, mds='tsne')
# panel

Take sample of training data to perform Grid Search

In [8]:
import random
from random import sample

random.seed(42)

k = int(len(data)*.025)
sampled_data = random.sample(list(data),k)

# sampled_data = [data[i] for i in indicies]


sampled_data

vectorizer = CountVectorizer()
X_train_vect_s = vectorizer.fit_transform(sampled_data)

search_params = {'n_components': [5, 10, 15, 20], 'learning_decay': [.5, .7, .9]}

lda = LatentDirichletAllocation(n_jobs = -1)

model = GridSearchCV(lda, param_grid = search_params, refit = True)

model.fit(X_train_vect_s)

GridSearchCV(estimator=LatentDirichletAllocation(n_jobs=-1),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 10, 15, 20]})

In [9]:
# search_params = {'n_components': [5, 10, 15, 20], 
#                  'learning_decay': [.5, .7, .9]}

# lda = LatentDirichletAllocation()

# model = GridSearchCV(lda, param_grid = search_params, refit = True)

# model.fit(X_train_vect)

In [10]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(X_train_vect_s))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 5}
Best Log Likelihood Score:  -223871.07622527666
Model Perplexity:  12628.567756397826


In [11]:
model.best_params_['learning_decay']

0.9

In [15]:
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(data)

final_lda = LatentDirichletAllocation(n_components = model.best_params_['n_components'], learning_decay = model.best_params_['learning_decay'], n_jobs = -1)

lda_output = final_lda.fit_transform(X_train_vect)

In [16]:
# Log Likelihood: Higher the better
print("Log Likelihood: ", final_lda.score(X_train_vect))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", final_lda.perplexity(X_train_vect))

# See model parameters
pprint(final_lda.get_params())

Log Likelihood:  -32687798.603710953
Perplexity:  10269.149707010338
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.9,
 'learning_method': 'batch',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 5,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': None,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [17]:
pyLDAvis.enable_notebook()

best_panel = pyLDAvis.sklearn.prepare(final_lda, X_train_vect, vectorizer, mds = 'tsne')
best_panel

  default_term_info = default_term_info.sort_values(


In [19]:
# Create Document - Topic Matrix
# lda_output = best_lda_model.transform(X_train)

# column names
# topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_topics)]
topicnames = ["Topic" + str(i) for i in range(len(final_lda.components_))]

# index names
docnames = ["Doc" + str(i) for i in range(len(data))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

df_document_topic.head()

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.05,0.05,0.05,0.8,0.05,3
Doc1,0.01,0.57,0.01,0.01,0.4,1
Doc2,0.01,0.01,0.09,0.31,0.57,4
Doc3,0.26,0.09,0.01,0.01,0.64,4
Doc4,0.18,0.03,0.03,0.26,0.51,4


In [25]:
# LDAvis_default_filepath = 'outputs/ldavis_default_10.pickle'

# with open(LDAvis_data_filepath, 'wb') as f:
#     pickle.dump(panel, f)

In [21]:
LDAvis_best_filepath = 'outputs/ldavis_best.pickle'

with open(LDAvis_best_filepath, 'wb') as f:
    pickle.dump(best_panel, f)


In [22]:
from pathlib import Path

filepath = Path('outputs/df_document_topic.csv')
filepath.parent.mkdir(parents = True, exist_ok = True)
df_document_topic.to_csv(filepath)

In [26]:
# LDA_default_filepath = 'outputs/lda_default_10.pickle'

# with open(LDA_default_filepath, 'wb') as f:
#     pickle.dump(lda_model, f)

In [24]:
LDA_best_filepath = 'outputs/lda_model_best.pickle'

with open(LDA_best_filepath, 'wb') as f:
    pickle.dump(model, f)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=1bd4d30e-81e1-4e2e-8aa7-19761d96ea9f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>