In [12]:
import pandas as pd
# Location of the Amazon appliances reviews CSV (a relative path, so it is
# resolved against the notebook's working directory).
datapath = "../data/amazon_reviews/amazon_appliances_reviews.csv"
# Read the whole CSV into a DataFrame.
df = pd.read_csv(datapath)
# Pull the 'review_body' column out as a plain Python list; later cells clean
# and tokenise these entries.
texts = df['review_body'].tolist()

In [13]:
import re
from nltk.corpus import stopwords
# Keep the English stop words in a set: some_cleaning tests every token of
# every review with `in`, and set membership is O(1) instead of a list scan.
stop_words = set(stopwords.words('english'))

def some_cleaning(text):
    """Turn one raw review into a list of lowercase content tokens.

    Steps, in order: remove every character that is neither a word character
    nor whitespace, remove digit runs, split on whitespace, lowercase, then
    keep only tokens longer than three characters that are not in the
    module-level ``stop_words`` collection.

    Args:
        text: The raw review text. Any value that is not a ``str``
            (presumably the NaN pandas yields for a missing cell) is treated
            as an empty review.

    Returns:
        list[str]: The surviving tokens. Always a list, possibly empty, so
        callers such as ``Dictionary.doc2bow`` never receive a bare string.
    """
    if not isinstance(text, str):
        # An empty token list (not "") keeps the return type uniform.
        return []
    text = re.sub(r'[^\w\s]', '', text)  # strip punctuation / symbols
    text = re.sub(r'\d+', '', text)      # strip digit runs
    # Lowercase each word once, lazily, then apply both filters in one pass.
    tokens = (word.lower() for word in text.split())
    # The length test runs first so short tokens skip the stop-word lookup.
    # NOTE(review): apostrophes are stripped above, so "don't" becomes "dont"
    # and only reaches this filter in that form.
    return [token for token in tokens if len(token) > 3 and token not in stop_words]

# Run every raw review through some_cleaning, replacing the raw strings.
# NOTE: this rebinds `texts`. Re-running the cell on already-cleaned data feeds
# non-string entries to some_cleaning, which collapses each one to an empty
# result -- re-run the loading cell first.
texts = list(map(some_cleaning, texts))

In [14]:
# Each entry is what some_cleaning returned for one review. Keep only entries
# with more than three items; this also discards the empty results.
texts = list(filter(lambda cleaned: len(cleaned) > 3, texts))

In [17]:
from gensim.corpora.dictionary import Dictionary

# Build the vocabulary (token <-> integer id) from the cleaned reviews.
gensim_dict = Dictionary(texts)
# Convert each review to its bag-of-words representation using that vocabulary.
corpus = list(map(gensim_dict.doc2bow, texts))

In [19]:
# create an LSI model
from gensim.models import LsiModel
# Number of topics to extract; the LDA cell further down reuses this constant.
N_TOPICS = 3
# Fit LSI on the bag-of-words corpus. Passing id2word lets the model report
# topic terms as words (see the printed topics below).
lsi = LsiModel(corpus=corpus, num_topics=N_TOPICS, id2word=gensim_dict)

In [20]:
# Print the ten highest-weighted words of every LSI topic, one list per topic.
# formatted=False yields (topic_id, [(word, weight), ...]) tuples directly, so
# the words no longer have to be parsed out of the '0.310*"water" + ...'
# display string, and N_TOPICS replaces the hard-coded topic count.
for topic_id, word_weights in lsi.show_topics(num_topics=N_TOPICS, num_words=10, formatted=False):
    w = [word for word, _ in word_weights]
    print(w)
    print("__")

['water', 'machine', 'washer', 'would', 'like', 'clothes', 'time', 'unit', 'cycle', 'dont']
__
['washer', 'water', 'machine', 'clothes', 'fridge', 'unit', 'cycle', 'oven', 'door', 'wash']
__
['water', 'fridge', 'machine', 'dryer', 'freezer', 'service', 'door', 'level', 'warranty', 'oven']
__


In [21]:
# Display each LSI topic as a formatted "weight*word + ..." string. The model
# was built with N_TOPICS topics, so ask for exactly that many instead of a
# larger hard-coded number the model cannot supply.
lsi.show_topics(num_topics=N_TOPICS, num_words=10)

[(0,
  '0.310*"water" + 0.232*"machine" + 0.210*"washer" + 0.197*"would" + 0.164*"like" + 0.162*"clothes" + 0.158*"time" + 0.142*"unit" + 0.126*"cycle" + 0.121*"dont"'),
 (1,
  '0.348*"washer" + 0.319*"water" + 0.315*"machine" + 0.298*"clothes" + -0.260*"fridge" + -0.223*"unit" + 0.159*"cycle" + -0.145*"oven" + -0.145*"door" + 0.144*"wash"'),
 (2,
  '-0.540*"water" + -0.397*"fridge" + 0.367*"machine" + 0.210*"dryer" + -0.138*"freezer" + 0.136*"service" + -0.116*"door" + -0.114*"level" + 0.104*"warranty" + 0.101*"oven"')]

In [22]:
from gensim.models import LdaModel

# LDA training is randomised; a fixed random_state makes the topics printed
# below reproducible across kernel restarts.
LDA_SEED = 42
lda = LdaModel(corpus=corpus, num_topics=N_TOPICS, random_state=LDA_SEED)
# No id2word is passed to the model, so it labels terms with their ids rendered
# as strings; translate each id back to its word through gensim_dict.
for topic_id, term_weights in lda.show_topics(num_topics=N_TOPICS, num_words=10, formatted=False):
    words = [gensim_dict[int(term_id)] for term_id, _ in term_weights]
    print(words)

['machine', 'water', 'washer', 'dishwasher', 'clothes', 'dryer', 'cycle', 'clean', 'wash', 'dishes']
['great', 'fridge', 'unit', 'good', 'well', 'like', 'would', 'wine', 'works', 'price']
['would', 'service', 'unit', 'years', 'oven', 'product', 'months', 'warranty', 'back', 'problem']


In [23]:
# Classify a review the model has not seen: clean it, convert it to
# bag-of-words with the training vocabulary, and ask the LDA model for its
# topic distribution.
some_other_text = some_cleaning(
    "10/10 I absolutely love this thing! It dries my clothes in 2 microseconds!"
)
bow = gensim_dict.doc2bow(some_other_text)
topic_matches = lda.get_document_topics(bow)

# Keep the (topic_id, probability) pair with the largest probability.
best_topic = max(topic_matches, key=lambda pair: pair[1])
print(f"Predicted topic {best_topic[0]} with probability {best_topic[1]}")

# Top ten terms of the winning topic, translated from term ids back to words.
top_terms = lda.show_topic(best_topic[0], topn=10)
matching_words = [gensim_dict[int(term_id)] for term_id, _ in top_terms]
print(matching_words)

Predicted topic 0 with probability 0.879182755947113
['machine', 'water', 'washer', 'dishwasher', 'clothes', 'dryer', 'cycle', 'clean', 'wash', 'dishes']


In [29]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Switch pyLDAvis to notebook (inline) rendering.
pyLDAvis.enable_notebook()
# Build the visualisation data from the LDA model, the bag-of-words corpus and
# the vocabulary. As the cell's last expression, the result is what the
# notebook displays.
gensimvis.prepare(lda, corpus, gensim_dict)

 [ -9.83386612  -8.84326557  -7.71929277]
 [ -6.78911005  -5.94715111  -6.76982636]
 ...
 [-15.09424225 -15.32287929 -13.54295227]
 [-13.95010499 -15.49622936 -15.27564127]
 [ -9.83386612  -8.84326557  -7.71929277]
 [ -6.78911005  -5.94715111  -6.76982636]
 ...
 [-15.09424225 -15.32287929 -13.54295227]
 [-13.95010499 -15.49622936 -15.27564127]
 [-1.30064836 -0.31004781  0.81392499]
 [-0.35029738  0.49166155 -0.3310137 ]
 ...
 [-0.70733721 -0.93597425  0.84395278]
 [ 0.62050305 -0.92562133 -0.70503324]
 [-1.30064836 -0.31004781  0.81392499]
 [-0.35029738  0.49166155 -0.3310137 ]
 ...
 [-0.70733721 -0.93597425  0.84395278]
 [ 0.62050305 -0.92562133 -0.70503324]
 [ -9.83386612  -8.84326557  -7.71929277]
 [ -6.78911005  -5.94715111  -6.76982636]
 ...
 [-15.09424225 -15.32287929 -13.54295227]
 [-13.95010499 -15.49622936 -15.27564127]
 [-1.30064836 -0.31004781  0.81392499]
 [-0.35029738  0.49166155 -0.3310137 ]
 ...
 [-0.70733721 -0.93597425  0.84395278]
 [ 0.62050305 -0.92562133 -0.70503324