In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline


import gensim #the library for Topic modelling
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim_models #LDA visualization library

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

In [2]:
data = pd.read_csv('product_review.csv')
df=pd.DataFrame(data)
df.head(5)

Unnamed: 0,id,asins,brand,categories,colors,dateAdded,dateUpdated,dimension,ean,keys,...,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sizes,upc,weight
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",,,Cristina M,,,205 grams
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,Allow me to preface this with a little history...,One Simply Could Not Ask For More,,,Ricky,,,205 grams
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,4.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I am enjoying it so far. Great for reading. Ha...,Great for those that just want an e-reader,,,Tedd Gardiner,,,205 grams
3,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I bought one of the first Paperwhites and have...,Love / Hate relationship,,,Dougal,,,205 grams
4,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I have to say upfront - I don't like coroporat...,I LOVE IT,,,Miljan David Tanic,,,205 grams


In [3]:
df.shape

(1597, 27)

In [4]:
df.isna().sum()

id                         0
asins                      0
brand                      0
categories                 0
colors                   823
dateAdded                  0
dateUpdated                0
dimension               1032
ean                      699
keys                       0
manufacturer             632
manufacturerNumber       695
name                       0
prices                     0
reviews.date             380
reviews.doRecommend     1058
reviews.numHelpful       697
reviews.rating           420
reviews.sourceURLs         0
reviews.text               0
reviews.title             17
reviews.userCity        1597
reviews.userProvince    1597
reviews.username          17
sizes                   1597
upc                      699
weight                   911
dtype: int64

In [5]:
df_title= df['reviews.title']

In [6]:
df_title.shape

(1597,)

In [7]:
df_title.isna().sum()

17

In [8]:
df_title.dropna(inplace=True)

In [9]:
df_title.isna().sum()

0

In [10]:
print('Head\n',df_title.head(5))

print('-----------------------------------------------------------------------')

print('Tail\n',df_title.tail(5))

Head
 0                Paperwhite voyage, no regrets!
1             One Simply Could Not Ask For More
2    Great for those that just want an e-reader
3                      Love / Hate relationship
4                                     I LOVE IT
Name: reviews.title, dtype: object
-----------------------------------------------------------------------
Tail
 1592    I would be disappointed with myself if i produ...
1593                          Battery draining remote!!!!
1594        replacing an even worse remote. Waste of time
1595                                           Overpriced
1596    I am sending all of this crap back to amazon a...
Name: reviews.title, dtype: object


In [11]:
df_title= df_title.reset_index(drop=True)

In [12]:
print('HEAD\n',df_title.head(5))
print('------------------------------------------------------------------')
print('TAIL\n',df_title.tail(5))

HEAD
 0                Paperwhite voyage, no regrets!
1             One Simply Could Not Ask For More
2    Great for those that just want an e-reader
3                      Love / Hate relationship
4                                     I LOVE IT
Name: reviews.title, dtype: object
------------------------------------------------------------------
TAIL
 1575    I would be disappointed with myself if i produ...
1576                          Battery draining remote!!!!
1577        replacing an even worse remote. Waste of time
1578                                           Overpriced
1579    I am sending all of this crap back to amazon a...
Name: reviews.title, dtype: object


In [13]:
stop= set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [14]:
df_title = df_title.apply(clean)

In [15]:
df_title.head(2)

0    [paperwhite, voyage, regret]
1       [one, simply, could, ask]
Name: reviews.title, dtype: object

In [16]:
#Gensim Model for Topics

In [17]:
# #create dictionary
# dictionary = corpora.Dictionary(df['reviews.title'])
# #Total number of non-zeroes in the BOW matrix (sum of the number of unique words per document over the entire corpus).
# print(dictionary.num_nnz)

In [18]:
# doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['Feedback_clean'] ]
# print(len(doc_term_matrix))

In [19]:
lda = gensim.models.ldamodel.LdaModel

In [20]:
# num_topics=5
# %time 
# ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

In [21]:
# Topics=ldamodel.print_topics(num_topics=num_topics)

In [22]:
df_title=df_title.astype(str)

In [23]:
df_title.head(2)

0    ['paperwhite', 'voyage', 'regret']
1     ['one', 'simply', 'could', 'ask']
Name: reviews.title, dtype: object

In [24]:
len(df_title)

1580

In [25]:
vectorizer = CountVectorizer(analyzer='word',                  # minimum reqd occurences of a word 
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z]{3,}',     # num chars > 3
                             max_features=50000)               # max number of uniq words
                            

In [26]:
data_vectorized = vectorizer.fit_transform(df_title)

In [27]:
print(data_vectorized)

  (0, 530)	1
  (0, 826)	1
  (0, 620)	1
  (1, 684)	1
  (1, 44)	1
  (2, 326)	1
  (2, 828)	1
  (2, 239)	1
  (3, 447)	1
  (3, 336)	1
  (3, 623)	1
  (4, 447)	1
  (5, 326)	1
  (5, 193)	1
  (5, 608)	1
  (5, 538)	1
  (5, 342)	2
  (5, 638)	1
  (5, 858)	1
  (6, 684)	1
  (6, 44)	1
  (6, 538)	1
  (6, 342)	2
  (6, 638)	1
  (6, 858)	1
  :	:
  (1573, 153)	1
  (1573, 309)	1
  (1573, 630)	1
  (1573, 508)	1
  (1574, 157)	1
  (1574, 626)	1
  (1574, 824)	1
  (1574, 561)	1
  (1575, 626)	1
  (1575, 200)	1
  (1575, 581)	1
  (1576, 60)	1
  (1576, 626)	1
  (1576, 214)	1
  (1577, 773)	1
  (1577, 626)	1
  (1577, 631)	1
  (1577, 832)	1
  (1577, 849)	1
  (1578, 524)	1
  (1579, 26)	1
  (1579, 663)	1
  (1579, 168)	1
  (1579, 100)	1
  (1579, 738)	1


In [28]:
data_vectorized.getnnz(axis=None)

7378

In [29]:
data_dense = data_vectorized.todense()

# Compute Sparsicity = Percentage of Non-Zero cells
print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

Sparsicity:  0.5423484614592982 %


In [30]:
lda_model = LatentDirichletAllocation(n_components=5,               # Number of topics
                                      max_iter=13,               # Max learning iterations
                                      learning_method='online',   
                                      random_state=100,          # Random state
                                      batch_size=3,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

LatentDirichletAllocation(batch_size=3, learning_method='online', max_iter=13,
                          n_components=5, n_jobs=-1, random_state=100)


In [31]:
len(lda_output)

1580

In [32]:
lda_output

array([[0.05000189, 0.05000361, 0.05000448, 0.79998677, 0.05000325],
       [0.06667138, 0.06667542, 0.40054693, 0.39943184, 0.06667442],
       [0.05000124, 0.55051075, 0.05000282, 0.05000249, 0.2994827 ],
       ...,
       [0.03333445, 0.19880916, 0.70118545, 0.03333561, 0.03333534],
       [0.59996193, 0.10001262, 0.10000988, 0.10000826, 0.10000731],
       [0.03333425, 0.03333512, 0.69981959, 0.03333518, 0.20017586]])

In [33]:
print("Log Likelihood: ", lda_model.score(data_vectorized))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(data_vectorized))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -40369.63165255857
Perplexity:  166.54407117079913
{'batch_size': 3,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 13,
 'mean_change_tol': 0.001,
 'n_components': 5,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [34]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

In [35]:
print(lda_model.n_components)

5


In [36]:
# topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

In [37]:
topicnames = ['Helpfull','Great','Use','Awesome','Nice']

In [38]:
docnames = ["Doc" + str(i) for i in range(len(df_title))]

In [39]:
topicnames

['Helpfull', 'Great', 'Use', 'Awesome', 'Nice']

In [40]:
len(docnames)

1580

In [41]:
len(lda_output)

1580

In [42]:
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic_index'] = dominant_topic
df_document_topic['Topic']=df_document_topic['dominant_topic_index']
df_document_topic.replace({'Topic' : { 0 : 'Helpfull', 1 : 'Great', 2 : 'Use', 3 : 'Awesome', 4 : 'Nice'}},inplace=True)


# Styling
# def color_red(val):
#     color = 'red' if val > .1 else 'black'
#     return 'color: {col}'.format(col=color)

# def make_bold(val):
#     weight = 700 if val > .1 else 400
#     return 'font-weight: {weight}'.format(weight=weight)

# Apply Style
df_document_topics = df_document_topic.head(15)#.style.applymap(color_red).applymap(make_bold)
df_document_topics


Unnamed: 0,Helpfull,Great,Use,Awesome,Nice,dominant_topic_index,Topic
Doc0,0.05,0.05,0.05,0.8,0.05,3,Awesome
Doc1,0.07,0.07,0.4,0.4,0.07,2,Use
Doc2,0.05,0.55,0.05,0.05,0.3,1,Great
Doc3,0.3,0.05,0.05,0.05,0.55,4,Nice
Doc4,0.6,0.1,0.1,0.1,0.1,0,Helpfull
Doc5,0.58,0.24,0.02,0.02,0.13,0,Helpfull
Doc6,0.65,0.03,0.15,0.15,0.03,0,Helpfull
Doc7,0.92,0.02,0.02,0.02,0.02,0,Helpfull
Doc8,0.58,0.02,0.02,0.35,0.02,0,Helpfull
Doc9,0.58,0.24,0.02,0.02,0.13,0,Helpfull


In [43]:
# #Word cloud

# from matplotlib import pyplot as plt
# from wordcloud import WordCloud, STOPWORDS
# import matplotlib.colors as mcolors

# cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# cloud = WordCloud(stopwords=stop,
#                   background_color='white',
#                   width=2500,
#                   height=1800,
#                   max_words=10,
#                   colormap='tab10',
#                   color_func=lambda *args, **kwargs: cols[i],
#                   prefer_horizontal=1.0)

# topics = lda_model.show_topics(formatted=False)

# fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

# for i, ax in enumerate(axes.flatten()):
#     fig.add_subplot(ax)
#     topic_words = dict(topics[i][1])
#     cloud.generate_from_frequencies(topic_words, max_font_size=300)
#     plt.gca().imshow(cloud)
#     plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
#     plt.gca().axis('off')


# plt.subplots_adjust(wspace=0, hspace=0)
# plt.axis('off')
# plt.margins(x=0, y=0)
# plt.tight_layout()
# plt.show()