# Topic Modeling and Document Clustering with LDA

TODO add description

In [1]:
# add scripts/ folder to path
import os, sys

SCRIPTS_PATH = os.environ['DSX_PROJECT_DIR'] + '/scripts'
sys.path.insert(0, SCRIPTS_PATH)

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

import visualization # custom script

In [39]:
DATASET_PATH = "/user-home/libraries/text-analytics/datasets/aclImdb"
TRAIN_PATH = DATASET_PATH + "/train/"
TEST_PATH = DATASET_PATH + "/test/"

## 0. Load files

In [40]:
from sklearn.datasets import load_files

We only load the training data, without labels, and consider it as unlabeled data:

In [41]:
reviews_train = load_files(TRAIN_PATH)
text_train = reviews_train.data
print("type of text_train: {}".format(type(text_train)))
print("length of text_train: {}".format(len(text_train)))

type of text_train: <class 'list'>
length of text_train: 25000


## 1. Preprocessing

Even though the preprocessing is short and straightforward, we probably want to move this to a script at some point.

In [42]:
text_train = [doc.replace(b"<br />", b" ").decode('utf-8') for doc in text_train]

In [43]:
text_train = pd.DataFrame({"review": text_train})

## 2. Feature Engineering

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

We limit the number of features to speed up the topic modeling.

In [45]:
vect = CountVectorizer(max_features=10000, max_df=.15)
X_train = vect.fit_transform(text_train.review)
print("X_train:\n{}".format(repr(X_train)))

X_train:
<25000x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 1948677 stored elements in Compressed Sparse Row format>


In [46]:
feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

Number of features: 10000
First 20 features:
['00', '000', '10', '100', '1000', '101', '11', '12', '13', '13th', '14', '15', '150', '16', '17', '18', '18th', '19', '1920', '1920s']
Features 20010 to 20030:
[]
Every 2000th feature:
['00', 'conroy', 'graphic', 'named', 'sharp']


## 3. Build model

### 3.1 Latent Dirichlet Allocation (LDA), 10 topics

Training LDA on the full data is very slow, and training it on a subset of the data gives "bad" topics: instead, we only perform NMF -> see section 3.2

In [22]:
from sklearn.decomposition import LatentDirichletAllocation

In [53]:
# %%time
# lda = LatentDirichletAllocation(n_components=10, learning_method="batch",
#                                 max_iter=25, random_state=0)
# document_topics = lda.fit_transform(X_train)
# print("lda.components_.shape: {}".format(lda.components_.shape))

In [54]:
# # for each topic (a row in the components_), sort the features (ascending).
# # Invert rows with [:, ::-1] to make sorting descending
# sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
# # get the feature names from the vectorizer:
# feature_names = np.array(vect.get_feature_names())

#### Explore the topics:

In [56]:
# # Print out the 10 topics:
# visualization.print_topics(topics=range(10), feature_names=feature_names,
#                            sorting=sorting, topics_per_chunk=5, n_words=10)

### 3.2 Non-Negative Matrix Factorization (NMF), 10 topics

In [47]:
from sklearn.decomposition import NMF

In [57]:
%%time
nmf = NMF(n_components=10, max_iter=25, random_state=0)
document_topics_nmf = nmf.fit_transform(X_train)
print("nmf.components_.shape: {}".format(nmf.components_.shape))

nmf.components_.shape: (10, 10000)
CPU times: user 4.54 s, sys: 11.4 s, total: 16 s
Wall time: 3.22 s


In [58]:
# for each topic (a row in the components_), sort the features (ascending).
# Invert rows with [:, ::-1] to make sorting descending
sorting_nmf = np.argsort(nmf.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

#### Explore the topics

In [59]:
# Print out the 10 topics:
visualization.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting_nmf, topics_per_chunk=5, n_words=10)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
director      show          re            series        horror        
work          shows         thing         original      house         
role          episode       nothing       episode       gore          
performance   tv            didn          new           blood         
quite         season        going         episodes      zombie        
cast          episodes      guy           tv            effects       
though        television    actually      season        budget        
actors        funny         doesn         years         dead          
however       always        minutes       batman        scary         
both          real          want          star          killer        


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
us  

**TODO find an interesting topic and explore it**

In [20]:
# # sort by weight of "music" topic 45
# music = np.argsort(document_topics100[:, 45])[::-1]
# # print the five documents where the topic is most important
# for i in music[:10]:
#     # show first two sentences
#     print(b".".join(text_train[i].split(b".")[:2]) + b".\n")

In [21]:
# fig, ax = plt.subplots(1, 2, figsize=(10, 10))
# topic_names = ["{:>2} ".format(i) + " ".join(words)
#                for i, words in enumerate(feature_names[sorting[:, :2]])]
# # two column bar chart:
# for col in [0, 1]:
#     start = col * 50
#     end = (col + 1) * 50
#     ax[col].barh(np.arange(50), np.sum(document_topics100, axis=0)[start:end])
#     ax[col].set_yticks(np.arange(50))
#     ax[col].set_yticklabels(topic_names[start:end], ha="left", va="top")
#     ax[col].invert_yaxis()
#     ax[col].set_xlim(0, 2000)
#     yax = ax[col].get_yaxis()
#     yax.set_tick_params(pad=130)
# plt.tight_layout()