In [65]:
# pip install pyLDAvis

In [66]:
import pandas as pd
import re
import numpy as np
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
tdm = pd.read_csv('../datasets/DTM.csv')
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31858,31859,31860,31861,31862,31863,31864,31865,31866,31867
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
tdm.shape

(31705, 31868)

In [60]:
# Making a sparse matrix
from gensim import matutils, models, utils
import scipy.sparse
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [70]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
from collections import Counter
counter= pd.read_csv('../datasets/word_frequencies.csv')
counter.head()


Unnamed: 0,word,count
0,five,905
1,stars,1730
2,good,16944
3,headphone,943
4,than,3473


In [34]:
df.shape

(32031, 2)

In [71]:
counter['count'].astype(int)
counter.dropna(inplace=True)
counter.reset_index(inplace=True)

In [72]:
word_dict = pd.Series(counter.word.values,index=counter['count']).to_dict()
word_dict

{905: 'five',
 1730: 'stars',
 16944: 'good',
 943: 'headphone',
 3473: 'than',
 817: 'galaxy',
 4107: 'headphones',
 5791: 'at',
 155: 'together',
 5278: 'price',
 41419: 'and',
 121: 'pod',
 15169: 'with',
 4516: 'all',
 15: 'pitch',
 851: 'having',
 526: 'years',
 786: 'warranty',
 42: 'peel',
 11885: 'product',
 67: 'tracks',
 1218: 'issues',
 2247: 'first',
 38: 'suggestion',
 3813: 'dont',
 3401: 'buy',
 2: 'bcos',
 29585: 'is',
 4820: 'no',
 4844: 'bass',
 16242: 'in',
 4: 'looser',
 5084: 'battery',
 3195: 'life',
 3341: 'also',
 51: 'skipping',
 2906: 'awesome',
 323: 'sweat',
 44080: 'i',
 4648: 'like',
 13219: 'this',
 1460: 'customer',
 1613: 'long',
 2176: 'used',
 11567: 'these',
 4073: 'buds',
 22683: 'for',
 545: 'several',
 1503: 'months',
 16348: 'sound',
 2326: 'excellent',
 53: 'adapter',
 68861: 'the',
 2377: 'which',
 10005: 'was',
 2024: 'what',
 1023: 'looking',
 833: 'getting',
 1: 'tiems',
 1596: 'did',
 11157: 'have',
 2830: 'an',
 1390: 'issue',
 1561: 'clea

### Preprocessing for an LDA Model

In [105]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# After first run, adding stop words that reference ear buds, headphones, etc
stop_words.extend(['earbuds','buds', 'headphones', 'wireless', 'wire', 'apple', 'samsung', 
                   'screen', 'protector', 'bose', 'jbl', 'sony', 'wire', 'ipad', 'ua',
                   'trelab', 'iqbuds', 'headphone'])

In [106]:
# Loading the data
df = pd.read_csv('../datasets/df_w_tokens.csv')

In [107]:
df.head()

Unnamed: 0,body,rating,product,tokens
0,five stars good headphone than iphone headphon...,5.0,Sennheiser CX 6.0BT,"['five', 'stars', 'good', 'headphone', 'than',..."
1,duplicate product selling amazon selling first...,1.0,JBL T110BT,"['duplicate', 'product', 'selling', 'amazon', ..."
2,awesome battery durability i like this product,5.0,JBL T110BT,"['awesome', 'battery', 'durability', 'i', 'lik..."
3,awesome customer service i used these buds for...,5.0,Tozo10 Bluetooth Wireless Earbuds,"['awesome', 'customer', 'service', 'i', 'used'..."
4,best earphonelove sennheiser sincei had tried ...,5.0,Sennheiser CX 6.0BT,"['best', 'earphonelove', 'sennheiser', 'sincei..."


In [108]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(df['body']))

print(data_words[:1])

[['five', 'stars', 'good', 'headphone', 'than', 'iphone', 'headphones', 'at', 'lower', 'price', 'and', 'compatible', 'with', 'all', 'mobiles', 'and', 'having', 'years', 'warranty']]


In [109]:
# Building bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['five_stars', 'good', 'headphone', 'than', 'iphone', 'headphones', 'at', 'lower', 'price', 'and', 'compatible', 'with', 'all', 'mobiles', 'and', 'having', 'years', 'warranty']


In [110]:
# Making functions for stopwords, bigrams, trigrams and lemmatization - from https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [111]:
# Calling the functions in order
# Remove Stop Words
# spacy model for NLP and visualization

data_words_nostops = remove_stopwords(data_words)

data_words_bigrams = make_bigrams(data_words_nostops)

nlp = spacy.load('en')

# Lemmatizing and keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['good', 'iphone', 'low', 'price', 'compatible', 'mobile', 'year', 'warranty']]


In [112]:
# Creating a dictionary, corpus, TDF, and viewing
id2word = corpora.Dictionary(data_lemmatized)

texts = data_lemmatized

corpus = [id2word.doc2bow(text) for text in texts]

print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]]


In [113]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('compatible', 1),
  ('good', 1),
  ('iphone', 1),
  ('low', 1),
  ('mobile', 1),
  ('price', 1),
  ('warranty', 1),
  ('year', 1)]]

### Instantiating the LDA

In [122]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=7, # After some tweaking, settling on max of 7 aspects 
                                           random_state=42,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [123]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.174*"love" + 0.104*"amazing" + 0.043*"enough" + 0.036*"tozo" + '
  '0.029*"satisfied" + 0.028*"properly" + 0.023*"service" + 0.017*"original" + '
  '0.015*"material" + 0.013*"impressed"'),
 (1,
  '0.027*"easy" + 0.023*"get" + 0.017*"would" + 0.017*"go" + 0.016*"make" + '
  '0.015*"first" + 0.013*"put" + 0.013*"review" + 0.013*"problem" + '
  '0.012*"still"'),
 (2,
  '0.116*"voice" + 0.043*"switch" + 0.042*"simply" + 0.031*"lover" + '
  '0.021*"disappointing" + 0.021*"miss" + 0.017*"complain" + 0.016*"con" + '
  '0.016*"rock" + 0.015*"regular"'),
 (3,
  '0.139*"work" + 0.038*"day" + 0.038*"month" + 0.037*"right" + 0.036*"bad" + '
  '0.027*"leave" + 0.025*"stop" + 0.025*"expect" + 0.024*"better" + '
  '0.024*"airpod"'),
 (4,
  '0.073*"good" + 0.064*"sound" + 0.053*"quality" + 0.050*"product" + '
  '0.046*"great" + 0.024*"buy" + 0.023*"use" + 0.020*"price" + 0.020*"battery" '
  '+ 0.015*"bass"'),
 (5,
  '0.076*"excellent" + 0.075*"purchase" + 0.066*"money" + 0.039*"receive" + '


### Scoring the Model

In [124]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.953678345642307

Coherence Score:  0.4873915485536265


### Visualizing the Results

In [125]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [129]:
pyLDAvis.save_html(vis, 'LDA7.html')

### Modeling with 5 Topics

In [130]:
# Build LDA model with 5 topics
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=42,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [131]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.052*"be" + 0.032*"make" + 0.029*"can" + 0.027*"say" + 0.025*"hear" + '
  '0.025*"review" + 0.017*"return" + 0.016*"back" + 0.013*"know" + '
  '0.013*"drop"'),
 (1,
  '0.041*"easy" + 0.018*"bubble" + 0.016*"stop" + 0.015*"work" + '
  '0.015*"instruction" + 0.014*"come" + 0.014*"install" + 0.014*"receive" + '
  '0.013*"clear" + 0.013*"new"'),
 (2,
  '0.023*"cut" + 0.023*"lose" + 0.020*"sometimes" + 0.019*"change" + '
  '0.015*"play" + 0.015*"pocket" + 0.013*"minute" + 0.012*"charger" + '
  '0.012*"next" + 0.012*"fantastic"'),
 (3,
  '0.043*"use" + 0.037*"ear" + 0.035*"work" + 0.023*"would" + 0.022*"charge" + '
  '0.021*"pair" + 0.020*"phone" + 0.019*"case" + 0.018*"get" + 0.017*"time"'),
 (4,
  '0.077*"good" + 0.067*"sound" + 0.056*"quality" + 0.053*"product" + '
  '0.047*"great" + 0.025*"buy" + 0.021*"price" + 0.021*"battery" + '
  '0.015*"bass" + 0.015*"love"')]


In [132]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.86477907629249

Coherence Score:  0.5485265285123982


In [133]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, 'LDA5.html')
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### LDA Model with 3 Topics

In [134]:
# Build LDA model with 3 topics
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3, 
                                           random_state=42,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [135]:
# Print the Keyword in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.091*"good" + 0.066*"quality" + 0.065*"sound" + 0.051*"product" + '
  '0.029*"great" + 0.025*"price" + 0.025*"battery" + 0.023*"buy" + '
  '0.018*"bass" + 0.017*"earphone"'),
 (1,
  '0.030*"easy" + 0.028*"work" + 0.018*"month" + 0.017*"product" + '
  '0.013*"bubble" + 0.012*"stop" + 0.011*"instruction" + 0.010*"install" + '
  '0.010*"buy" + 0.010*"receive"'),
 (2,
  '0.023*"ear" + 0.023*"use" + 0.015*"work" + 0.013*"charge" + 0.013*"great" + '
  '0.013*"pair" + 0.013*"phone" + 0.012*"be" + 0.012*"would" + 0.012*"case"')]


In [136]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.763972961377351

Coherence Score:  0.49996614574931186


In [137]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(vis, 'LDA3.html')
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
