# Unlocking E-Commerce Growth through NLP-Driven Customer Review Analysis Introduction -- LDA Topic Modeling

#### Libraries

In [1]:
# import libraries  
import numpy as np
import pandas as pd
import seaborn as sns
import string
import pprint

import matplotlib.pyplot as plt
%matplotlib inline

# For lemmatisation
import spacy          
import nltk

# gensim for LDA 
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools pyLDA visualization
import pyLDAvis
import pyLDAvis.gensim  
#from pyLDAvis import gensim_models as pg

# Ignore warning
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  """


In [4]:
df = pd.read_csv('Product Review Large Data.csv')
df.sample(30)

Unnamed: 0,id,asins,brand,categories,colors,dateAdded,dateUpdated,dimension,ean,keys,...,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sizes,upc,weight
4715,ACCFKYE2ARGG67WC,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,3.0,,There should be more bass like JBL headphones....,Nice,,,,,,
3701,ACCFR3Q77R6RRGAC,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,4.0,,If u r bass lover then dont choose it otherwis...,Worth the money,,,,,,
10359,ACCEVQZABYWJHRHF,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,Sound is good and enough for normal listener B...,Terrific purchase,,,,,,
2602,ACCFHGZFS7GB9CVM,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,I m writing this review after 2days of extensi...,Just wow!,,,,,,
8914,ACCFSKBJYWZKXGCP,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,Good,Wonderful,,,,,,
10302,ACCEVQZABYWJHRHF,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,"Recieved same as in the description, Sound Qua...",Terrific purchase,,,,,,
7221,ACCFVWN4PGNTEFGY,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,Box is very slippery. Else everything is good.,Perfect product!,,,,,,
4832,ACCFKYE2ARGG67WC,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,4.0,,It definitely satisfy your expectations.. bass...,Delightful,,,,,,
9746,ACCFSDGXX3S6DVBG,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,4.0,,Good,Good quality product,,,,,,
1809,ACCFZGAQJGYCYDCM,,Flipkart,"Flipkart Headphone, Devices & Accessories, Blu...",,2019-07-02T14:40:43Z,2020-08-33T08:28:46Z,,,,...,5.0,,Best Product In Best Price.,Best in the market!,,,,,,


### Preprocessing

1. Tokenize each review (using gensim)
2. Remove stop words (including punctuations)
3. Lemmatize (using spacy)

In [6]:
# tokenize using gensim simple_preprocess
def sent_to_words(sentences, deacc=True): # deacc=True removes punctuations
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence)))  

# to list
data = df['reviews.text'].values.tolist()
data_words = list(sent_to_words(data))

In [7]:
print(data_words[3])

['bought', 'one', 'of', 'the', 'first', 'paperwhites', 'and', 'have', 'been', 'very', 'pleased', 'with', 'it', 'its', 'been', 'constant', 'companion', 'and', 'suppose', 'ive', 'read', 'on', 'average', 'book', 'every', 'three', 'days', 'for', 'the', 'past', 'however', 'many', 'years', 'on', 'it', 'wouldnt', 'give', 'it', 'up', 'youd', 'have', 'to', 'pry', 'it', 'from', 'my', 'cold', 'dead', 'fingers', 'for', 'sundry', 'logistical', 'reasons', 'ive', 'also', 'made', 'good', 'use', 'of', 'amazons', 'kindle', 'app', 'on', 'my', 'iphone', 'no', 'paperwhite', 'screen', 'naturally', 'and', 'all', 'the', 'cool', 'usability', 'that', 'delivers', 'but', 'it', 'works', 'well', 'and', 'has', 'its', 'own', 'attractions', 'as', 'companion', 'to', 'the', 'kindle', 'of', 'course', 'there', 'are', 'aspects', 'of', 'the', 'paperwhite', 'which', 'would', 'like', 'to', 'critique', 'ah', 'you', 'knew', 'that', 'was', 'coming', 'somewhere', 'didnt', 'you', 'as', 'member', 'of', 'bookbub', 'get', 'daily', 'l

In [8]:
# create list of stop words

from nltk.corpus import stopwords
stop_words = stopwords.words('english') + list(string.punctuation)


In [9]:
# Removing stopwords and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# remove stop words
data_words_nostops = remove_stopwords(data_words)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])

In [11]:
print(data_lemmatized[3])

['first', 'paperwhite', 'constant', 'companion', 'average', 'book', 'day', 'many', 'year', 'd', 'cold', 'dead', 'finger', 'sundry', 'logistical', 'reason', 'good', 'use', 'amazon', 'kindle', 'iphone', 'paperwhite', 'screen', 'cool', 'usability', 'deliver', 'well', 'attraction', 'course', 'paperwhite', 'critique', 'member', 'bookbub', 'daily', 'list', 'alert', 'book', 'deal', 'genre', 'many', 'good', 'world', 'day', 'book', 'good', 'stuff', 'accumulative', 'effect', 'number', 'book', 'upward', 'time', 'mind', 'page', 'turning', 'action', 'kindle', 'glacial', 'slow', 'slow', 'general', 'consensus', 'many', 'book', 'kindle', 'begin', 'mad', 'amazon', 'state', 'thousand', 'book', 'figure', 'second', 'paperwhite', 'read', 'action', 'first', 'read']


In [12]:
print(' '.join(data_words[3]), '\n')

bought one of the first paperwhites and have been very pleased with it its been constant companion and suppose ive read on average book every three days for the past however many years on it wouldnt give it up youd have to pry it from my cold dead fingers for sundry logistical reasons ive also made good use of amazons kindle app on my iphone no paperwhite screen naturally and all the cool usability that delivers but it works well and has its own attractions as companion to the kindle of course there are aspects of the paperwhite which would like to critique ah you knew that was coming somewhere didnt you as member of bookbub get daily list of alerts and book deals in my chosen genres take on many of them however ive found that even with the best will in the world cant keep up some days it seems that for every book read ive bought two theres just so much good stuff out there the accumulative effect of this is that the number of books actually on my paperwhite has been creeping ever upwa

In [13]:
# After lemmatization
print(' '.join(data_lemmatized[3]))

first paperwhite constant companion average book day many year d cold dead finger sundry logistical reason good use amazon kindle iphone paperwhite screen cool usability deliver well attraction course paperwhite critique member bookbub daily list alert book deal genre many good world day book good stuff accumulative effect number book upward time mind page turning action kindle glacial slow slow general consensus many book kindle begin mad amazon state thousand book figure second paperwhite read action first read


### Creating Dictionary and Corpus

In [14]:
# create dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create corpus
corpus = [id2word.doc2bow(text) for text in data_lemmatized]

In [15]:
print(corpus[3])

[(1, 2), (8, 2), (25, 1), (26, 4), (35, 1), (43, 1), (46, 1), (53, 6), (76, 3), (80, 3), (99, 1), (100, 1), (104, 1), (105, 2), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1), (111, 1), (112, 1), (113, 1), (114, 1), (115, 1), (116, 1), (117, 1), (118, 1), (119, 1), (120, 1), (121, 1), (122, 1), (123, 1), (124, 1), (125, 1), (126, 2), (127, 1), (128, 1), (129, 1), (130, 3), (131, 1), (132, 1), (133, 1), (134, 1), (135, 1), (136, 1), (137, 1), (138, 2), (139, 1), (140, 1), (141, 2), (142, 1), (143, 1), (144, 1), (145, 1), (146, 1), (147, 1), (148, 1), (149, 1)]


In [16]:
# Format of corpus (term,frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('adjustment', 1),
  ('amazon', 1),
  ('auto', 1),
  ('basis', 1),
  ('case', 1),
  ('certain', 1),
  ('change', 1),
  ('custom', 1),
  ('day', 2),
  ('delivery', 1),
  ('easy', 1),
  ('expense', 1),
  ('extra', 1),
  ('fine', 1),
  ('friend', 1),
  ('glad', 1),
  ('great', 1),
  ('hard', 1),
  ('international', 1),
  ('level', 1),
  ('light', 3),
  ('model', 1),
  ('money', 1),
  ('need', 1),
  ('option', 1),
  ('page', 1),
  ('paperwhite', 3),
  ('party', 1),
  ('press', 1),
  ('pricey', 1),
  ('reading', 1),
  ('receptive', 1),
  ('regardless', 1),
  ('regret', 1),
  ('review', 1),
  ('screen', 1),
  ('service', 1),
  ('setting', 2),
  ('shipping', 2),
  ('specific', 2),
  ('spending', 1),
  ('thing', 1),
  ('third', 1),
  ('time', 3),
  ('tracking', 1),
  ('trouble', 1),
  ('use', 1),
  ('voyage', 3),
  ('week', 1),
  ('worry', 1)]]

### Building Topic Model 

Define 30 topics to start with. 


In [21]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=30, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=1000,
                                           passes=100,
                                           alpha=0.1,
                                           per_word_topics=True)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [22]:
# print the 30 topics
pprint.pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(22,
  '0.451*"nice" + 0.328*"headphone" + 0.023*"iphone" + 0.022*"waste" + '
  '0.014*"chance" + 0.013*"complete" + 0.011*"gift" + 0.010*"fun" + '
  '0.009*"android" + 0.008*"regular"'),
 (12,
  '0.203*"bad" + 0.141*"product" + 0.096*"review" + 0.049*"star" + '
  '0.048*"quality" + 0.048*"item" + 0.041*"big" + 0.037*"cheap" + 0.032*"pair" '
  '+ 0.028*"end"'),
 (28,
  '0.080*"connectivity" + 0.078*"build" + 0.069*"light" + 0.051*"weight" + '
  '0.044*"feature" + 0.037*"premium" + 0.036*"magnetic" + 0.030*"look" + '
  '0.025*"soft" + 0.024*"switch"'),
 (27,
  '0.183*"happy" + 0.158*"super" + 0.106*"purchase" + 0.065*"music" + '
  '0.033*"mind" + 0.026*"neck" + 0.026*"single" + 0.026*"amazing" + '
  '0.025*"band" + 0.022*"listening"'),
 (8,
  '0.055*"screen" + 0.047*"support" + 0.046*"large" + 0.042*"main" + '
  '0.040*"camera" + 0.034*"version" + 0.031*"thing" + 0.029*"gig" + '
  '0.029*"dual" + 0.025*"tablet"'),
 (7,
  '0.209*"worth" + 0.105*"bit" + 0.096*"year" + 0.090*"high" + 0.08

## Evaluation

In [23]:
# evaluation
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4877212671869651


## Visualization
**pyLDAvis** library help to see excellent interactive visualization.

In [24]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
