In this notebook I use latent semantic analysis (LSA) on count vectors to create 3 topics. I also look at the explained variance of each topic and try to identify what each topic is clustered around.

In [None]:
from __future__ import print_function

from collections import defaultdict
import pickle

import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.feature_extraction import text 
from gensim.utils import simple_preprocess

In [None]:
#Reads in a csv, drops the 'Unnamed: 0' column, tokenizes the text, removes reviews that are 10 words
#or less and creates the modeling_text column from the tokens.
df = pd.read_csv('tokenized_text')

df.drop(columns = 'Unnamed: 0', inplace = True)

df['tokens'] = df['text'].apply(simple_preprocess)

#.copy() makes the filtered frame independent of the unfiltered one, so the column assignment
#below writes to a real dataframe instead of triggering pandas' SettingWithCopyWarning.
df = df[df['tokens'].str.len() > 10].copy()

#Join each token list back into a single space-separated string for the vectorizer.
df['modeling_text'] = df['tokens'].apply(' '.join)

In [None]:
#Preview the first rows of the dataframe
df.head()

In [None]:
#Bind the modeling_text column to a shorter name (no type conversion happens here).
#It is the input to the vectorizer and the row label of the tables built from it.
example = df['modeling_text']

In [None]:
#Takes a df of word counts (one row per document, one column per word) and a threshold for the number
#of times a word appears, and returns a dataframe containing the words that appear more often than
#the threshold. This is useful for adding custom stop words.
def common_words(df_word_count, n):
    """Return the words whose total count across all documents is greater than ``n``.

    Parameters
    ----------
    df_word_count : pandas.DataFrame
        Document-term counts: one row per document, one column per word.
    n : int or float
        Threshold; only words with a total strictly greater than ``n`` are kept.

    Returns
    -------
    pandas.DataFrame
        The transposed counts (one row per word, the word itself in the first
        column, then one column per document) plus a ``'Word Total'`` column,
        restricted to the rows whose total exceeds ``n``.
    """
    # Transpose so that each row is a word; reset_index moves the words out of
    # the index into the first column.
    per_word = df_word_count.T.reset_index()
    # Sum every document column (everything after the word column) for every
    # word. The slice is derived from the frame itself instead of a fixed
    # column count, so no document or word is left out whatever the corpus size.
    per_word['Word Total'] = per_word.iloc[:, 1:].sum(axis=1)
    return per_word[per_word['Word Total'] > n]

In [None]:
#Start from NLTK's English stop words, then add the custom stop words for this corpus
stopword = set(stopwords.words('english'))

stopword.update([
    'food', 'this', 'place', 'the', 'of', 'is', 'came', 'was', 'for', 'have', 'had',
    'and', 'get', 'one', 'food', 'guy', '?', '!', 'place', 'good', 'fries', 'burger', 'burgers',
    'got', 'eat', 'great', 'us', 'asked', 'service', 'back', 'time', 'like', 'vegas', 'go',
    'try', 'animal', 'style', 'double', 'good', 'just', 'always', 'location', 'fresh',
    'east', 'coast', 'order', 'ordered', 'fast',
])

In [None]:
#Converts the modeling text to a count of the words using CountVectorizer.
#stop_words is passed as a (sorted) list: recent scikit-learn releases validate this parameter
#and accept only 'english', a list or None, so a set is rejected.
vectorizer = CountVectorizer(min_df = 1, stop_words = sorted(stopword))
dtm = vectorizer.fit_transform(example)  # dtm: Document-Term Matrix
#get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
#use the new method when it exists and fall back to the old one on older installations.
feature_names = (getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names)()
#Dense word-count table: one row per review (labelled by its text), one column per word.
df_word_count = pd.DataFrame(dtm.toarray(), index=example, columns=feature_names)

In [None]:
#Display the vocabulary (feature names) of the fitted vectorizer.
#NOTE(review): get_feature_names() appears to be deprecated/removed in newer scikit-learn
#releases in favour of get_feature_names_out() - confirm against the installed version.
vectorizer.get_feature_names()

In [None]:
# Fit LSA (truncated SVD) on the document-term matrix.
# algorithm = 'randomized' is the solver to use for large datasets.
num_topics = 20
lsa = TruncatedSVD(n_components = num_topics, algorithm = 'randomized')
dtm_lsa = lsa.fit_transform(X = dtm)

In [None]:
#Display the explained variance ratio of each fitted LSA component
lsa.explained_variance_ratio_

In [None]:
#Plotting the explained variance ratio of each component against its component number
explained = lsa.explained_variance_ratio_
plt.figure(figsize=[15,5])
plt.subplot(1,2,1)
#Pass explicit 1-based x values: plt.plot(y) on its own numbers the points from 0, which puts
#the first component at x = 0 on an axis labelled 'Number of Components'.
plt.plot(np.arange(1, len(explained) + 1), explained)
plt.xlabel('Number of Components')
plt.ylabel('Explained Variance Ratio')

In [None]:
#Extracting the highest-weighted words for topic 1
#One row label per fitted component ('1', '2', ...). A fixed list of three labels raises a
#ValueError whenever the model has a different number of components (num_topics is 20 here).
topic_labels = [str(k) for k in range(1, lsa.components_.shape[0] + 1)]
#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when it exists.
feature_names = (getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names)()
pd.DataFrame(lsa.components_.round(5), index = topic_labels, columns = feature_names).T.sort_values(by='1', ascending=False)

In [None]:
#Extracting the highest-weighted words for topic 2
#One row label per fitted component ('1', '2', ...). A fixed list of three labels raises a
#ValueError whenever the model has a different number of components (num_topics is 20 here).
topic_labels = [str(k) for k in range(1, lsa.components_.shape[0] + 1)]
#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when it exists.
feature_names = (getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names)()
pd.DataFrame(lsa.components_.round(5), index = topic_labels, columns = feature_names).T.sort_values(by='2', ascending=False)

In [None]:
#Extracting the highest-weighted words for topic 3
#One row label per fitted component ('1', '2', ...). A fixed list of three labels raises a
#ValueError whenever the model has a different number of components (num_topics is 20 here).
topic_labels = [str(k) for k in range(1, lsa.components_.shape[0] + 1)]
#get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when it exists.
feature_names = (getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names)()
pd.DataFrame(lsa.components_.round(5), index = topic_labels, columns = feature_names).T.sort_values(by='3', ascending=False)

In [None]:
#Saving the topic weights of each review to a CSV.
#(These are LSA component scores; they can be negative or greater than 1, so they are not
#probabilities in the strict sense.)
#One column label per LSA component ('1', '2', ...). A fixed list of three labels raises a
#ValueError whenever dtm_lsa has a different number of columns (num_topics is 20 here).
topic_labels = [str(k) for k in range(1, dtm_lsa.shape[1] + 1)]
topic_probs = pd.DataFrame(dtm_lsa.round(5), index = example, columns = topic_labels)

#reset_index() turns the review text into a regular column in the file that is written;
#topic_probs itself keeps the review text as its index.
topic_probs.reset_index().to_csv('lsi_topic_probs')


In [None]:
#This section extracts the reviews that have the highest weights for each topic.
#I wanted to look at them to see if there was an obvious difference between the 3.
#
#The review text is the index of topic_probs - there is no 'modeling_text' column, so looking
#it up on a row raised a KeyError. It is read from the index here, and boolean masks replace
#the row-by-row iloc loop.
above_1 = (topic_probs['1'] > 8).to_numpy()
#NOTE(review): the original used `elif` for topic 2, so a review already selected for topic 1
#is never added to topic 2, while topic 3 is checked independently. That behaviour is kept
#here - confirm that it is intended.
above_2 = (topic_probs['2'] > 5).to_numpy() & ~above_1
above_3 = (topic_probs['3'] > 2).to_numpy()

text_1 = topic_probs.index[above_1].tolist()
text_2 = topic_probs.index[above_2].tolist()
text_3 = topic_probs.index[above_3].tolist()

In [None]:
#Display the reviews collected for topic 1
text_1

In [None]:
import csv

#Write the topic 1 reviews to the file "text_1", one review per row.
#(The original wrote text_2 into the file named "text_1".)
#Each review is wrapped in a one-element list: csv.writer treats a row as a sequence of
#fields, so a bare string would be split into one column per character.
with open("text_1","w",newline="",encoding="utf-8") as f:
    cw = csv.writer(f)
    cw.writerows([review] for review in text_1)

In [None]:
#Display the topic 1 reviews again
text_1

In [None]:
#Display the per-review topic weights
topic_probs

In [None]:
#Pairwise cosine similarity between the reviews in LSA space, labelled by review text on both
#axes, to see if the reviews are related in an obvious way.
from sklearn.metrics.pairwise import cosine_similarity
pd.DataFrame(
    np.round(cosine_similarity(X = dtm_lsa, Y = dtm_lsa), 6),
    index = example,
    columns = example,
)