In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from pprint import pprint


In [None]:
# gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  from collections import Iterable


1. Read the .csv file using Pandas. Take a look at the top few records

In [None]:
# Load the K8 review export and discard the `sentiment` label column; only the review text is kept.
ReviewData = pd.read_csv('K8 Reviews v0.2.csv').drop(columns='sentiment')
ReviewData.head()

Unnamed: 0,review
0,Good but need updates and improvements
1,"Worst mobile i have bought ever, Battery is dr..."
2,when I will get my 10% cash back.... its alrea...
3,Good
4,The worst phone everThey have changed the last...


In [None]:
# (rows, columns) of the loaded frame after the `sentiment` column was dropped.
ReviewData.shape

(14675, 1)

In [None]:
# Count missing values per column to confirm no review text is null.
ReviewData.isnull().sum()

review    0
dtype: int64

In [None]:
# Preview the raw reviews once more before the cleaning steps start.
ReviewData.head()

Unnamed: 0,review
0,Good but need updates and improvements
1,"Worst mobile i have bought ever, Battery is dr..."
2,when I will get my 10% cash back.... its alrea...
3,Good
4,The worst phone everThey have changed the last...


In [None]:
# Start the cleaned text as a lower-cased copy; the original text stays untouched in `review`.
ReviewData['clean_review'] = ReviewData['review'].str.lower()
ReviewData.head()

Unnamed: 0,review,clean_review
0,Good but need updates and improvements,good but need updates and improvements
1,"Worst mobile i have bought ever, Battery is dr...","worst mobile i have bought ever, battery is dr..."
2,when I will get my 10% cash back.... its alrea...,when i will get my 10% cash back.... its alrea...
3,Good,good
4,The worst phone everThey have changed the last...,the worst phone everthey have changed the last...


In [None]:
# Replace every character that is neither an ASCII letter nor whitespace
# (digits, punctuation, symbols) with a single space.
ReviewData['clean_review'] = ReviewData['clean_review'].str.replace(
    pat=r'[^a-zA-Z\s]', repl=' ', regex=True
)
ReviewData.head()

Unnamed: 0,review,clean_review
0,Good but need updates and improvements,good but need updates and improvements
1,"Worst mobile i have bought ever, Battery is dr...",worst mobile i have bought ever battery is dr...
2,when I will get my 10% cash back.... its alrea...,when i will get my cash back its alrea...
3,Good,good
4,The worst phone everThey have changed the last...,the worst phone everthey have changed the last...


In [None]:
# Collapse each run of two or more whitespace characters into one space.
ReviewData['clean_review'] = ReviewData['clean_review'].str.replace(
    pat=r'\s{2,}', repl=' ', regex=True
)
ReviewData.head()

Unnamed: 0,review,clean_review
0,Good but need updates and improvements,good but need updates and improvements
1,"Worst mobile i have bought ever, Battery is dr...",worst mobile i have bought ever battery is dra...
2,when I will get my 10% cash back.... its alrea...,when i will get my cash back its already january
3,Good,good
4,The worst phone everThey have changed the last...,the worst phone everthey have changed the last...


In [None]:
# Fetch the Punkt tokenizer data needed before calling word_tokenize.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Split each cleaned review string into a list of word tokens.
ReviewData['clean_review'] = ReviewData['clean_review'].apply(word_tokenize)
ReviewData.head()

Unnamed: 0,review,clean_review
0,Good but need updates and improvements,"[good, but, need, updates, and, improvements]"
1,"Worst mobile i have bought ever, Battery is dr...","[worst, mobile, i, have, bought, ever, battery..."
2,when I will get my 10% cash back.... its alrea...,"[when, i, will, get, my, cash, back, its, alre..."
3,Good,[good]
4,The worst phone everThey have changed the last...,"[the, worst, phone, everthey, have, changed, t..."


In [None]:
# Remove unnecessary words: fetch NLTK's stop-word lists so common English words can be filtered out.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Build the stop-word lookup once. Calling stopwords.words('english') inside the comprehension
# rebuilds the whole list and scans it linearly for every single token.
english_stopwords = set(stopwords.words('english'))

# Keep purely alphabetic tokens longer than three characters that are not stop words.
ReviewData['clean_review'] = ReviewData['clean_review'].apply(
    lambda tokens: [
        word for word in tokens
        if word not in english_stopwords and len(word) > 3 and word.isalpha()
    ]
)
ReviewData.head()

Unnamed: 0,review,clean_review
0,Good but need updates and improvements,"[good, need, updates, improvements]"
1,"Worst mobile i have bought ever, Battery is dr...","[worst, mobile, bought, ever, battery, drainin..."
2,when I will get my 10% cash back.... its alrea...,"[cash, back, already, january]"
3,Good,[good]
4,The worst phone everThey have changed the last...,"[worst, phone, everthey, changed, last, phone,..."


In [None]:
# Keep only the reviews that still contain more than one token, then renumber the rows from 0.
ReviewData = ReviewData.loc[ReviewData['clean_review'].map(len) > 1].reset_index(drop=True)

In [None]:
# Lemmatization: fetch the WordNet corpus and the 'omw-1.4' package before using WordNetLemmatizer.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Create the lemmatizer once instead of constructing a new WordNetLemmatizer for every token.
lemmatizer = WordNetLemmatizer()

# Reduce each token to its lemma. lemmatize() is called without a POS argument, so NLTK's
# default part of speech is used for every word.
ReviewData['clean_review'] = ReviewData['clean_review'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)
ReviewData.head()

Unnamed: 0,review,clean_review
0,Good but need updates and improvements,"[good, need, update, improvement]"
1,"Worst mobile i have bought ever, Battery is dr...","[worst, mobile, bought, ever, battery, drainin..."
2,when I will get my 10% cash back.... its alrea...,"[cash, back, already, january]"
3,The worst phone everThey have changed the last...,"[worst, phone, everthey, changed, last, phone,..."
4,Only I'm telling don't buyI'm totally disappoi...,"[telling, buyi, totally, disappointedpoor, bat..."


In [None]:
# Extracting only nouns: fetch the averaged-perceptron POS tagger model before calling nltk.pos_tag.
# NOTE(review): recent NLTK versions may require 'averaged_perceptron_tagger_eng' — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
def is_singular_noun(word):
  """Return True when `word`, tagged on its own, receives the POS tag 'NN'."""
  # NOTE(review): each word is tagged in isolation, so the tagger sees no sentence context.
  return nltk.pos_tag([word])[0][1] == 'NN'


# Keep only the tokens tagged 'NN'; every other tag (verbs, adjectives, 'NNS', ...) is dropped.
ReviewData['clean_review'] = ReviewData['clean_review'].apply(
    lambda tokens: list(filter(is_singular_noun, tokens))
)
ReviewData.head()

Unnamed: 0,review,clean_review
0,Good but need updates and improvements,"[need, update, improvement]"
1,"Worst mobile i have bought ever, Battery is dr...","[mobile, bought, battery, hell, backup, hour, ..."
2,when I will get my 10% cash back.... its alrea...,"[cash, january]"
3,The worst phone everThey have changed the last...,"[phone, everthey, phone, problem, amazon, phon..."
4,Only I'm telling don't buyI'm totally disappoi...,"[buyi, disappointedpoor, batterypoor, camerawa..."


In [None]:
# The noun filter can leave reviews with one or zero tokens; drop those and renumber the rows.
ReviewData = ReviewData.loc[ReviewData['clean_review'].map(len) > 1].reset_index(drop=True)
ReviewData.head()

Unnamed: 0,review,clean_review
0,Good but need updates and improvements,"[need, update, improvement]"
1,"Worst mobile i have bought ever, Battery is dr...","[mobile, bought, battery, hell, backup, hour, ..."
2,when I will get my 10% cash back.... its alrea...,"[cash, january]"
3,The worst phone everThey have changed the last...,"[phone, everthey, phone, problem, amazon, phon..."
4,Only I'm telling don't buyI'm totally disappoi...,"[buyi, disappointedpoor, batterypoor, camerawa..."


In [None]:
# Build the gensim Dictionary (token <-> integer id mapping) from the tokenized reviews;
# it is the vocabulary used to create the document-term matrix.
dictionary = corpora.Dictionary(ReviewData['clean_review'])
print(dictionary)

Dictionary(6724 unique tokens: ['improvement', 'need', 'update', 'amazon', 'backup']...)


In [None]:
# Convert every review to bag-of-words form: a list of (token id, count within that review) pairs.
doc_term_matrix = ReviewData['clean_review'].apply(dictionary.doc2bow)
doc_term_matrix[:10]

0                             [(0, 1), (1, 1), (2, 1)]
1    [(3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1...
2                                   [(19, 1), (20, 1)]
3                  [(3, 2), (21, 1), (22, 3), (23, 1)]
4        [(24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]
5    [(14, 1), (22, 1), (29, 1), (30, 1), (31, 1), ...
6                           [(5, 1), (36, 1), (37, 1)]
7    [(14, 2), (22, 2), (23, 2), (34, 1), (38, 1), ...
8                 [(44, 1), (45, 1), (46, 1), (47, 1)]
9                  [(8, 1), (22, 1), (48, 1), (49, 1)]
Name: clean_review, dtype: object

In [None]:
from IPython.display import clear_output

In [None]:
# Train a first LDA model with 12 topics.
#   corpus       - the document-term matrix (one bag-of-words per review)
#   id2word      - the Dictionary mapping token ids back to words
#   num_topics   - how many topics to extract from the corpus
#   passes       - number of passes over the corpus during training
#   random_state - fixed seed so the run is repeatable
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=12,
    passes=10,
    random_state=1,
)
clear_output()  # hide whatever the training call printed

In [None]:
# Show the topics of the 12-topic model as (topic id, weighted keyword string) pairs.
ldamodel.print_topics()
# we have printed all 12 topics and their keywords generated by LDA

[(0,
  '0.067*"device" + 0.051*"lenovo" + 0.042*"note" + 0.027*"feature" + 0.023*"call" + 0.020*"android" + 0.019*"killer" + 0.017*"video" + 0.017*"user" + 0.017*"volta"'),
 (1,
  '0.173*"camera" + 0.124*"quality" + 0.090*"phone" + 0.035*"processor" + 0.034*"performance" + 0.029*"sound" + 0.022*"battery" + 0.021*"mark" + 0.017*"superb" + 0.015*"speed"'),
 (2,
  '0.113*"camera" + 0.070*"phone" + 0.050*"awesome" + 0.042*"battery" + 0.031*"mode" + 0.030*"front" + 0.022*"depth" + 0.020*"rear" + 0.019*"performance" + 0.016*"super"'),
 (3,
  '0.180*"battery" + 0.120*"problem" + 0.102*"heating" + 0.075*"issue" + 0.047*"fast" + 0.041*"backup" + 0.037*"drain" + 0.035*"network" + 0.031*"mobile" + 0.018*"camera"'),
 (4,
  '0.214*"phone" + 0.040*"battery" + 0.037*"issue" + 0.033*"month" + 0.029*"heat" + 0.024*"time" + 0.020*"usage" + 0.019*"day" + 0.016*"bought" + 0.015*"heating"'),
 (5,
  '0.120*"price" + 0.116*"phone" + 0.067*"feature" + 0.048*"range" + 0.040*"delivery" + 0.018*"amazon" + 0.016*

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# Score the current model with the 'c_v' coherence measure, using the tokenized reviews as reference texts.
coherence_model_lda = CoherenceModel(
    model=ldamodel,
    texts=ReviewData['clean_review'],
    dictionary=dictionary,
    coherence='c_v',
)
print('\nCoherence score: ', coherence_model_lda.get_coherence())


Coherence score:  0.5375844571364063


In [None]:
# computing coherence score for different numbers of topics
def calculate_topic_cv(ldamodel, texts, dictionary, topic_range, passes=10, random_state=1):
  """Train one LDA model per topic count and report each model's c_v coherence.

  Parameters
  ----------
  ldamodel : unused; kept only so existing positional calls keep working.
  texts : iterable of token lists (one list per document).
  dictionary : gensim Dictionary used to encode `texts` and as `id2word`.
  topic_range : exclusive upper bound; topic counts 2 .. topic_range - 1 are tried.
  passes : number of training passes per model (default 10).
  random_state : seed passed to every model (default 1).

  Returns
  -------
  (topic_num, cv_score) : two parallel lists with the topic counts tried and
  the coherence score obtained for each.
  """
  # Encode the corpus from the arguments rather than reading a notebook-global
  # document-term matrix, so the function works for any texts/dictionary pair.
  corpus = [dictionary.doc2bow(tokens) for tokens in texts]
  lda = gensim.models.ldamodel.LdaModel

  topic_num = []
  cv_score = []
  for num_topics in range(2, topic_range):
    candidate = lda(corpus, num_topics=num_topics, id2word=dictionary,
                    passes=passes, random_state=random_state)
    coherence = CoherenceModel(model=candidate, texts=texts,
                               dictionary=dictionary, coherence='c_v').get_coherence()
    topic_num.append(num_topics)
    cv_score.append(coherence)

    clear_output()  # keep the cell output from piling up between iterations
  return topic_num, cv_score

In [None]:
# Sweep topic counts 2..14 (the upper bound 15 is exclusive) over the cleaned reviews.
topic_num,cv_score = calculate_topic_cv(ldamodel,ReviewData['clean_review'],dictionary,15)

In [None]:
# Tabulate coherence per topic count, best score first.
(
    pd.DataFrame({'Topic': topic_num, 'Coherence_score': cv_score})
    .set_index('Topic')
    .sort_values('Coherence_score', ascending=False)
)

Unnamed: 0_level_0,Coherence_score
Topic,Unnamed: 1_level_1
7,0.606436
6,0.597779
5,0.587942
8,0.581643
9,0.575069
10,0.558955
13,0.544947
11,0.539907
3,0.539523
12,0.537584


In [None]:
# Go ahead with 7 topics: it scored the highest coherence in the sweep.
# NOTE(review): the original remark also cited topic overlap at other topic counts — confirm.
lda = gensim.models.ldamodel.LdaModel
ldamodel = lda(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=7,
    passes=10,
    random_state=1,
)
clear_output()
final_coherence = CoherenceModel(
    model=ldamodel,
    texts=ReviewData['clean_review'],
    dictionary=dictionary,
    coherence='c_v',
)
print(final_coherence.get_coherence())

0.606435640898136


In [None]:
# Show the keyword mix of each topic in the 7-topic model.
ldamodel.print_topics()

[(0,
  '0.096*"lenovo" + 0.074*"note" + 0.064*"phone" + 0.020*"device" + 0.017*"mobile" + 0.017*"service" + 0.016*"update" + 0.015*"call" + 0.014*"feature" + 0.011*"android"'),
 (1,
  '0.101*"camera" + 0.079*"quality" + 0.058*"phone" + 0.032*"waste" + 0.029*"money" + 0.023*"processor" + 0.022*"network" + 0.020*"sound" + 0.019*"signal" + 0.016*"worth"'),
 (2,
  '0.101*"phone" + 0.099*"camera" + 0.045*"price" + 0.034*"awesome" + 0.034*"mobile" + 0.031*"performance" + 0.027*"battery" + 0.027*"feature" + 0.024*"quality" + 0.020*"excellent"'),
 (3,
  '0.162*"problem" + 0.130*"mobile" + 0.087*"heating" + 0.063*"issue" + 0.039*"battery" + 0.031*"network" + 0.013*"update" + 0.012*"heat" + 0.010*"software" + 0.008*"star"'),
 (4,
  '0.113*"battery" + 0.108*"phone" + 0.036*"issue" + 0.032*"fast" + 0.027*"charge" + 0.026*"hour" + 0.024*"heating" + 0.022*"backup" + 0.022*"time" + 0.022*"drain"'),
 (5,
  '0.187*"product" + 0.090*"amazon" + 0.039*"return" + 0.025*"delivery" + 0.020*"lenovo" + 0.020*"

Creating a lookup table for topics

In [None]:
# One row per topic: the (topic id, keyword string) pairs returned by print_topics().
topic_lookup_data = pd.DataFrame(ldamodel.print_topics(), columns=['Topic_Number', 'Topic_Keywords'])
# Hand-written labels, positionally matched to topic ids 0..6; the list length must equal the number of topics.
# NOTE(review): several labels do not appear to match the keywords displayed for their topic
# (e.g. topic 6 is labelled 'Battery' but shows screen/display/glass, and topic 5 'Connectivity'
# shows product/amazon/return/delivery) — confirm the labels against the current model.
topic_lookup_data['Topic_Name'] = ['Camera issue', 'Sound issue', 'Mixed issues', 'Heating issue', 'turbo charger', 'Connectivity', 'Battery']
# Reorder the columns so the label sits next to the topic id.
topic_lookup_data = topic_lookup_data[['Topic_Number', 'Topic_Name', 'Topic_Keywords']]
# Strip weights, quotes and operators from the keyword string, leaving a plain list of lower-case words.
topic_lookup_data['Topic_Keywords'] = topic_lookup_data.Topic_Keywords.str.replace(r'[^a-z]',' ',regex=True).apply(lambda x: x.split())
# Render with a wider keyword column so the lists are readable.
topic_lookup_data.style.set_properties(subset=['Topic_Keywords'], **{'width': '300px'})


Unnamed: 0,Topic_Number,Topic_Name,Topic_Keywords
0,0,Camera issue,"['lenovo', 'note', 'phone', 'device', 'mobile', 'service', 'update', 'call', 'feature', 'android']"
1,1,Sound issue,"['camera', 'quality', 'phone', 'waste', 'money', 'processor', 'network', 'sound', 'signal', 'worth']"
2,2,Mixed issues,"['phone', 'camera', 'price', 'awesome', 'mobile', 'performance', 'battery', 'feature', 'quality', 'excellent']"
3,3,Heating issue,"['problem', 'mobile', 'heating', 'issue', 'battery', 'network', 'update', 'heat', 'software', 'star']"
4,4,turbo charger,"['battery', 'phone', 'issue', 'fast', 'charge', 'hour', 'heating', 'backup', 'time', 'drain']"
5,5,Connectivity,"['product', 'amazon', 'return', 'delivery', 'lenovo', 'service', 'replacement', 'customer', 'time', 'phone']"
6,6,Battery,"['screen', 'phone', 'option', 'work', 'display', 'glass', 'cast', 'call', 'time', 'light']"


Create new columns holding each review's topic number(s) and topic name(s)

In [None]:
# Tag every review with its dominant topic, plus the runner-up when that one is strong too.
# The ids are collected in a list and written as one column, so the result does not depend on
# the frame's index labels and no notebook-level variable such as `topic_num` is overwritten.
SECOND_TOPIC_MIN_PROB = 0.35  # the runner-up topic is kept only above this probability

assigned_topic_numbers = []
for topic_probs in ldamodel[doc_term_matrix]:
  # two most probable topics for this review, most probable first
  ranked = sorted(topic_probs, key=lambda pair: pair[1], reverse=True)[:2]
  chosen = [ranked[0][0]]
  if len(ranked) > 1 and ranked[1][1] > SECOND_TOPIC_MIN_PROB:
    chosen.append(ranked[1][0])
  # stored as a comma-separated string of ascending topic ids, e.g. '3' or '3,5'
  assigned_topic_numbers.append(','.join(str(topic_id) for topic_id in sorted(chosen)))

ReviewData['Topic_Number'] = assigned_topic_numbers

In [None]:
# Translate each review's comma-separated topic ids into readable names via the lookup table.
# The write must happen once per review: with the assignment dedented out of the loop, only
# the last row ever received a Topic_Name and every other row stayed empty.
topic_name_by_number = topic_lookup_data.set_index('Topic_Number')['Topic_Name']

ReviewData['Topic_Name'] = ReviewData['Topic_Number'].map(
    lambda numbers: ' & '.join(
        topic_name_by_number.loc[int(single_topic_num)]
        for single_topic_num in numbers.split(',')
    )
)

In [None]:
# Preview the reviews together with their assigned topic number(s) and name(s).
ReviewData.head()

Unnamed: 0,review,clean_review,Topic_Number,Topic_Name
0,Good but need updates and improvements,"[need, update, improvement]",3,
1,"Worst mobile i have bought ever, Battery is dr...","[mobile, bought, battery, hell, backup, hour, ...",4,
2,when I will get my 10% cash back.... its alrea...,"[cash, january]",35,
3,The worst phone everThey have changed the last...,"[phone, everthey, phone, problem, amazon, phon...",4,
4,Only I'm telling don't buyI'm totally disappoi...,"[buyi, disappointedpoor, batterypoor, camerawa...",1,


In [None]:
# Show ten sample reviews assigned to topic 5. Topic_Number is a comma-separated string of ids,
# so whole ids are compared; a substring test for '5' would also match ids such as 15 or 25
# if the model were retrained with more topics.
ReviewData.loc[
    ReviewData.Topic_Number.str.split(',').map(lambda topic_ids: '5' in topic_ids),
    ['review', 'Topic_Name'],
].head(10).style.set_properties(subset=['review'], **{'width': '300px'})

Unnamed: 0,review,Topic_Name
2,when I will get my 10% cash back.... its already 15 January..,
14,"Best product according to their prize range and it's specification comparison to another mobile(under range) and I had bought this phone during amazon seal so I had 2 paid only 10,999 but I have also hdpf credit card in which I was to paid only Rs 9998.. .. So for me... It was 1 of the best deal ever which I hv done on Amazon..",
18,It is not a very good product camera are very poor ...Os is not good...Battery draining very quickly...Like a odinary phone..It was fully unexpected product from Lenovo..,
25,Not a good one. Dual SIM are not working. Poor customer service.,
31,"By purchasing this we have loosed by, Rs.3000 within Span of 30 days. No trust worthy deals from Amazon.",
55,Very very good product and its really fast,
57,Best mobile.Great specifications .Camera is too cleare focus is too sharp and fast thanks Amazon and Lenovo for such mobile.Must buy guys,
58,Verigood Verigood.best,
60,Excellent Product,
61,Bought this product recently from amazon. Thoroughly dissatisfied with the phone. A thumbs down for this one.,
