In [19]:
import pandas as pd
import numpy as np
import scipy as sp
import sklearn
import sys

import spacy

import en_core_web_sm
nlp = en_core_web_sm.load()

import gensim.corpora
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize

import pickle

In [2]:
# Temporarily saving to CSV
reviews_df = pd.read_csv('data/review_text.csv', index_col=0)

In [25]:
def remove_stopwords_and_punct(text):
    doc = nlp(text)
    clean_text = " ".join([token.text for token in doc if not token.is_stop and not token.is_punct])
    return clean_text

In [26]:
clean_reviews = [remove_stopwords_and_punct(text) for text in reviews_df.text.tolist()]

In [27]:
vectorizer = CountVectorizer(analyzer='word', max_features=5000)
x_counts = vectorizer.fit_transform(clean_reviews)

In [28]:
transformer = TfidfTransformer(smooth_idf=False)
x_tfidf = transformer.fit_transform(x_counts)

In [29]:
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [30]:
xtfidf_norm

<5447x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1771171 stored elements in Compressed Sparse Row format>

In [31]:
#obtain a NMF model.
model = NMF(n_components=5, init='nndsvd');
#fit the model
model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=5, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [32]:
def get_nmf_topics(model, n_top_words):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {}
    for i in range(model.n_components):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-20 - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words
    
    return pd.DataFrame(word_dict)

In [33]:
get_nmf_topics(model, 20)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05
0,book,deadpool,hulk,spider,batman
1,story,cable,banner,man,superman
2,great,funny,red,peter,stories
3,series,great,marvel,iron,dc
4,read,merc,bruce,spidey,story
5,good,wade,8217,parker,robin
6,like,love,david,marvel,joker
7,marvel,comic,34,ultimate,comics
8,men,daniel,planet,venom,book
9,volume,kelly,incredible,mary,bruce
