<a href="https://colab.research.google.com/github/karthik-k27/Topic-Analysis-of-Review-Data/blob/main/Topic_Analysis_of_Review_Data_Ver1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DESCRIPTION

### Help a leading mobile brand understand the voice of the customer by analyzing the reviews of their product on Amazon and the topics that customers are talking about. You will perform topic modeling on specific parts of speech. You’ll finally interpret the emerging topics.

# Problem Statement: 

###  A popular mobile phone brand, Lenovo has launched their budget smartphone in the Indian market. The client wants to understand the VOC (voice of the customer) on the product. This will be useful to not just evaluate the current product, but to also get some direction for developing the product pipeline. The client is particularly interested in the different aspects that customers care about. Product reviews by customers on a leading e-commerce site should provide a good view.

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import nltk

In [None]:
# importing datasets
ds = pd.read_csv("K8 Reviews v0.2.csv")
ds.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [None]:
ds.shape

(14675, 2)

In [None]:
# Splitting the dataset into training and testing
from sklearn.model_selection import train_test_split
ds_train,ds_test=train_test_split(ds,test_size=0.2,random_state=45)


In [None]:
#Normalizing the review texts
reviews = list(ds.iloc[:,1])
sentiments = list(ds.iloc[:,0])
reviews[0:5]

['Good but need updates and improvements',
 "Worst mobile i have bought ever, Battery is draining like hell, backup is only 6 to 7 hours with internet uses, even if I put mobile idle its getting discharged.This is biggest lie from Amazon & Lenove which is not at all expected, they are making full by saying that battery is 4000MAH & booster charger is fake, it takes at least 4 to 5 hours to be fully charged.Don't know how Lenovo will survive by making full of us.Please don;t go for this else you will regret like me.",
 'when I will get my 10% cash back.... its already 15 January..',
 'Good',
 'The worst phone everThey have changed the last phone but the problem is still same and the amazon is not returning the phone .Highly disappointing of amazon']

In [None]:
'''
#Punctuation removal
import re, string
nopunc_reviews=[]
for i in reviews:
    result = re.sub('[%s]' % re.escape(string.punctuation),' ', i)
    nopunc_reviews.append(result)

    
print(len(nopunc_reviews))
print(nopunc_reviews[0:4])
print(result[0:4])
'''

"\n#Punctuation removal\nimport re, string\nnopunc_reviews=[]\nfor i in reviews:\n    result = re.sub('[%s]' % re.escape(string.punctuation),' ', i)\n    nopunc_reviews.append(result)\n\n    \nprint(len(nopunc_reviews))\nprint(nopunc_reviews[0:4])\nprint(result[0:4])\n"

In [None]:
#Tokenization
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
token1 = word_tokenize(reviews[0])
token1

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Good', 'but', 'need', 'updates', 'and', 'improvements']

In [None]:
tokens=[]

for token in reviews:
    tokens.append(word_tokenize(token))

print(tokens[0:4])

[['Good', 'but', 'need', 'updates', 'and', 'improvements'], ['Worst', 'mobile', 'i', 'have', 'bought', 'ever', ',', 'Battery', 'is', 'draining', 'like', 'hell', ',', 'backup', 'is', 'only', '6', 'to', '7', 'hours', 'with', 'internet', 'uses', ',', 'even', 'if', 'I', 'put', 'mobile', 'idle', 'its', 'getting', 'discharged.This', 'is', 'biggest', 'lie', 'from', 'Amazon', '&', 'Lenove', 'which', 'is', 'not', 'at', 'all', 'expected', ',', 'they', 'are', 'making', 'full', 'by', 'saying', 'that', 'battery', 'is', '4000MAH', '&', 'booster', 'charger', 'is', 'fake', ',', 'it', 'takes', 'at', 'least', '4', 'to', '5', 'hours', 'to', 'be', 'fully', 'charged.Do', "n't", 'know', 'how', 'Lenovo', 'will', 'survive', 'by', 'making', 'full', 'of', 'us.Please', 'don', ';', 't', 'go', 'for', 'this', 'else', 'you', 'will', 'regret', 'like', 'me', '.'], ['when', 'I', 'will', 'get', 'my', '10', '%', 'cash', 'back', '...', '.', 'its', 'already', '15', 'January..'], ['Good']]


In [None]:
#POS tagging
import nltk
nltk.download('averaged_perceptron_tagger')
pos_tokens=[]

for tag in tokens:
    pos_tokens.append(nltk.pos_tag(tag))

print(pos_tokens[0:4])


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[[('Good', 'JJ'), ('but', 'CC'), ('need', 'VBP'), ('updates', 'NNS'), ('and', 'CC'), ('improvements', 'NNS')], [('Worst', 'NNP'), ('mobile', 'NN'), ('i', 'NN'), ('have', 'VBP'), ('bought', 'VBN'), ('ever', 'RB'), (',', ','), ('Battery', 'NNP'), ('is', 'VBZ'), ('draining', 'VBG'), ('like', 'IN'), ('hell', 'NN'), (',', ','), ('backup', 'NN'), ('is', 'VBZ'), ('only', 'RB'), ('6', 'CD'), ('to', 'TO'), ('7', 'CD'), ('hours', 'NNS'), ('with', 'IN'), ('internet', 'JJ'), ('uses', 'NNS'), (',', ','), ('even', 'RB'), ('if', 'IN'), ('I', 'PRP'), ('put', 'VBP'), ('mobile', 'JJ'), ('idle', 'NN'), ('its', 'PRP$'), ('getting', 'VBG'), ('discharged.This', 'NN'), ('is', 'VBZ'), ('biggest', 'JJS'), ('lie', 'NN'), ('from', 'IN'), ('Amazon', 'NNP'), ('&', 'CC'), ('Lenove', 'NNP'), ('which', 'WDT'), ('is', 'VBZ'), ('not', 'RB'), ('at', 'IN'), ('all

## Topic Modeling

In [None]:
for word,pos in pos_tokens[0]:
    if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
        print(word,pos)

updates NNS
improvements NNS


In [None]:
test=[]
for word,pos in pos_tokens[1]:
    test.append(word)

print(test)

['Worst', 'mobile', 'i', 'have', 'bought', 'ever', ',', 'Battery', 'is', 'draining', 'like', 'hell', ',', 'backup', 'is', 'only', '6', 'to', '7', 'hours', 'with', 'internet', 'uses', ',', 'even', 'if', 'I', 'put', 'mobile', 'idle', 'its', 'getting', 'discharged.This', 'is', 'biggest', 'lie', 'from', 'Amazon', '&', 'Lenove', 'which', 'is', 'not', 'at', 'all', 'expected', ',', 'they', 'are', 'making', 'full', 'by', 'saying', 'that', 'battery', 'is', '4000MAH', '&', 'booster', 'charger', 'is', 'fake', ',', 'it', 'takes', 'at', 'least', '4', 'to', '5', 'hours', 'to', 'be', 'fully', 'charged.Do', "n't", 'know', 'how', 'Lenovo', 'will', 'survive', 'by', 'making', 'full', 'of', 'us.Please', 'don', ';', 't', 'go', 'for', 'this', 'else', 'you', 'will', 'regret', 'like', 'me', '.']


In [None]:
for word,pos in pos_tokens[0]:
    print(word,pos)

Good JJ
but CC
need VBP
updates NNS
and CC
improvements NNS


In [None]:
#Extracting Nouns
nouns =[]
noun_tokens=[]

index=[]
temp =0
count=0
i=0
while i<len(pos_tokens):
    
    for word,pos in pos_tokens[i]:
    
        if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
            nouns.append(word)
            count+=1
    
    noun_tokens.append(nouns[temp:count])
    temp=count
    
    i+=1
    

      
nouns[0:4] 

['updates', 'improvements', 'Worst', 'mobile']

In [None]:
print(len(pos_tokens))

print(nouns[0:20])

print(reviews[0:5])

14675
['updates', 'improvements', 'Worst', 'mobile', 'i', 'Battery', 'hell', 'backup', 'hours', 'uses', 'idle', 'discharged.This', 'lie', 'Amazon', 'Lenove', 'battery', 'charger', 'hours', 'Lenovo', 'don']
['Good but need updates and improvements', "Worst mobile i have bought ever, Battery is draining like hell, backup is only 6 to 7 hours with internet uses, even if I put mobile idle its getting discharged.This is biggest lie from Amazon & Lenove which is not at all expected, they are making full by saying that battery is 4000MAH & booster charger is fake, it takes at least 4 to 5 hours to be fully charged.Don't know how Lenovo will survive by making full of us.Please don;t go for this else you will regret like me.", 'when I will get my 10% cash back.... its already 15 January..', 'Good', 'The worst phone everThey have changed the last phone but the problem is still same and the amazon is not returning the phone .Highly disappointing of amazon']


In [None]:
print(len(noun_tokens), len(pos_tokens))
noun_tokens[0:3]

14675 14675


[['updates', 'improvements'],
 ['Worst',
  'mobile',
  'i',
  'Battery',
  'hell',
  'backup',
  'hours',
  'uses',
  'idle',
  'discharged.This',
  'lie',
  'Amazon',
  'Lenove',
  'battery',
  'charger',
  'hours',
  'Lenovo',
  'don'],
 ['%', 'cash', 'January..']]

## Lemmatize

In [None]:
pos_tokens[0]

[('Good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

In [None]:

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
lemmatizer=WordNetLemmatizer()
stemmer=SnowballStemmer("english")

lemma_stem = []
lemst_tokens=[]
temp=0
count=0

i=0
while i < (len(noun_tokens)):
    for j in noun_tokens[i]:
        lemma_stem.append(stemmer.stem(lemmatizer.lemmatize(j,pos="n")))
        count+=1
        
    lemst_tokens.append(lemma_stem[temp:count])
    temp=count
    i+=1


print(len(lemma_stem))
print(lemma_stem[0:20])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
96760
['updat', 'improv', 'worst', 'mobil', 'i', 'batteri', 'hell', 'backup', 'hour', 'us', 'idl', 'discharged.thi', 'lie', 'amazon', 'lenov', 'batteri', 'charger', 'hour', 'lenovo', 'don']


In [None]:
print(len(lemst_tokens))
print(lemst_tokens[0:4])

14675
[['updat', 'improv'], ['worst', 'mobil', 'i', 'batteri', 'hell', 'backup', 'hour', 'us', 'idl', 'discharged.thi', 'lie', 'amazon', 'lenov', 'batteri', 'charger', 'hour', 'lenovo', 'don'], ['%', 'cash', 'january..'], []]


## Stopwords

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))


filtered_tokens=[]

result=[]
for i in lemst_tokens:
    output = [j for j in i if not j in stop_words]
    filtered_tokens.append(output)
    

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
output = [j for j in lemst_tokens[0] if not j in stop_words]

In [None]:
output

['updat', 'improv']

In [None]:
print(filtered_tokens[0:4])

[['updat', 'improv'], ['worst', 'mobil', 'batteri', 'hell', 'backup', 'hour', 'us', 'idl', 'discharged.thi', 'lie', 'amazon', 'lenov', 'batteri', 'charger', 'hour', 'lenovo'], ['%', 'cash', 'january..'], []]


In [None]:
len(filtered_tokens)

14675

## LDA

In [None]:
import gensim
dictionary = gensim.corpora.Dictionary(filtered_tokens)

In [None]:
count = 0
for k, v in dictionary.iteritems():
    print(k,v)
    count +=1
    if count>10:
        break

0 improv
1 updat
2 amazon
3 backup
4 batteri
5 charger
6 discharged.thi
7 hell
8 hour
9 idl
10 lenov


In [None]:
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in filtered_tokens]

document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0],
                                                    dictionary[bow_doc_x[i][0]],
                                                    bow_doc_x[i][1]))

In [None]:
lda_model = gensim.models.LdaModel(bow_corpus,
                                  num_topics=12,
                                  id2word=dictionary,
                                  passes=10)

In [None]:
lda_model1 = gensim.models.LdaMulticore(bow_corpus,
                                  num_topics=12,
                                  id2word=dictionary,
                                  passes=10,
                                  workers=1)

In [None]:
lda_model2 = gensim.models.LdaMulticore(bow_corpus,
                                  num_topics=12,
                                  id2word=dictionary,
                                  passes=10,
                                  workers=2)

In [None]:
for idx, topic in lda_model2.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.125*"money" + 0.098*"servic" + 0.083*"worst" + 0.058*"wast" + 0.049*"valu" + 0.048*"lenovo" + 0.030*"center" + 0.028*"custom" + 0.027*"handset" + 0.025*"pleas"


Topic: 1 
Words: 0.065*"speaker" + 0.055*"call" + 0.055*"app" + 0.037*"sound" + 0.031*"record" + 0.025*"ram" + 0.024*"bad" + 0.023*"excel" + 0.021*"featur" + 0.021*"set"


Topic: 2 
Words: 0.090*"perform" + 0.040*"updat" + 0.038*"mode" + 0.030*"processor" + 0.028*"game" + 0.027*"android" + 0.025*"softwar" + 0.022*"depth" + 0.019*"super" + 0.018*"stock"


Topic: 3 
Words: 0.211*"lenovo" + 0.196*"note" + 0.126*"k8" + 0.020*"k4" + 0.020*"featur" + 0.020*"mi" + 0.015*"k5" + 0.015*"dolbi" + 0.013*"redmi" + 0.013*"atmo"


Topic: 4 
Words: 0.066*"great" + 0.054*"look" + 0.044*"worth" + 0.043*"power" + 0.039*"process" + 0.039*"touch" + 0.035*"hr" + 0.033*"photo" + 0.032*"galleri" + 0.026*"bill"


Topic: 5 
Words: 0.110*"issu" + 0.087*"network" + 0.051*"sim" + 0.033*"%" + 0.030*"call" + 0.026*"problem" + 0.025*"jio" 

## Accuracy check

In [None]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_model2,texts=filtered_tokens,dictionary=dictionary,coherence='c_v')

coherence_lda=coherence_model_lda.get_coherence()

print('\nCoherence Score:', coherence_lda)


Coherence Score: 0.5567012076335874


In [None]:

unseen_document = list(ds_test.iloc[:,1])
print(unseen_document[0:5])
print(len(unseen_document))

['Really killer experience...', 'Heating a lot and want to return', 'Nice phone', 'Awesome product.a) good battery backupb) optimised performance along with 4 GB ram and octacore processorc) Dolby Atmos speakerd) overall a worthy phonee) awesome cameraHighly recommended to all who are comparing with xi red mi note 5 pro', 'Vary Nice product']
2935


In [None]:
def preprocess(text):
    #Punctuation removal
    import re, string
    nopunc_reviews=[]
    for i in text:
        result = re.sub('[%s]' % re.escape(string.punctuation),
                        ' ', i)
        nopunc_reviews.append(result)
    #Tokenization
    from nltk.tokenize import word_tokenize
    tokens=[]

    for token in nopunc_reviews:
        tokens.append(word_tokenize(token))
        
    #POS tagging
    pos_tokens=[]

    for tag in tokens:
        pos_tokens.append(nltk.pos_tag(tag))
        
    # Topic Modeling
      ##Extracting Nouns
        nouns =[]
        noun_tokens=[]
        index=[]
        temp =0
        count=0
        i=0
    while i<len(pos_tokens):
        for word,pos in pos_tokens[i]:
            if (pos == 'NN' or pos == 'NNP' or 
                pos == 'NNS' or pos == 'NNPS'):
                nouns.append(word)
                count+=1
            
        noun_tokens.append(nouns[temp:count])
        temp=count
        
        i+=1
        
        ##Lemmatize and stemming
    from nltk.stem import WordNetLemmatizer, SnowballStemmer
    lemmatizer=WordNetLemmatizer()
    stemmer=SnowballStemmer("english")

    lemma_stem = []
    lemst_tokens=[]
    temp=0
    count=0

    i=0
    while i < (len(noun_tokens)):
        for j in noun_tokens[i]:
            lemma_stem.append(stemmer.stem(
                lemmatizer.lemmatize(j,pos="n")))
            count+=1
            
        lemst_tokens.append(lemma_stem[temp:count])
        temp=count
        
        i+=1
        
        
        ##Stopwords
        
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))


    filtered_tokens=[]

    result=[]
    for i in lemst_tokens:
        output = [j for j in i if not j in stop_words]
        filtered_tokens.append(output)
        
        
    return filtered_tokens
    
        
        


In [None]:
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

TypeError: ignored

In [None]:
def preprocess(text):
    #Punctuation removal
    import re, string
    nopunc_reviews=[]
    for i in reviews:
        result = re.sub('[%s]' % re.escape(string.punctuation),
                        ' ', i)
        nopunc_reviews.append(result)
    #Tokenization
    from nltk.tokenize import word_tokenize
    tokens=[]

    for token in nopunc_reviews:
        tokens.append(word_tokenize(token))
        
    #POS tagging
    pos_tokens=[]

    for tag in tokens:
        pos_tokens.append(nltk.pos_tag(tag))
        
    # Topic Modeling
      ##Extracting Nouns
        nouns =[]
        noun_tokens=[]
        index=[]
        temp =0
        count=0
        i=0
    while i<len(pos_tokens):
        for word,pos in pos_tokens[i]:
            if (pos == 'NN' or pos == 'NNP' or 
                pos == 'NNS' or pos == 'NNPS'):
                nouns.append(word)
                count+=1
            
        noun_tokens.append(nouns[temp:count])
        temp=count
        
        i+=1
        
        ##Lemmatize and stemming
    from nltk.stem import WordNetLemmatizer, SnowballStemmer
    lemmatizer=WordNetLemmatizer()
    stemmer=SnowballStemmer("english")

    lemma_stem = []
    lemst_tokens=[]
    temp=0
    count=0

    i=0
    while i < (len(noun_tokens)):
        for j in noun_tokens[i]:
            lemma_stem.append(stemmer.stem(
                lemmatizer.lemmatize(j,pos="n")))
            count+=1
            
        lemst_tokens.append(lemma_stem[temp:count])
        temp=count
        
        i+=1
        
        
        ##Stopwords
        
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words("english"))


    filtered_tokens=[]

    result=[]
    for i in lemst_tokens:
        output = [j for j in i if not j in stop_words]
        filtered_tokens.append(output)
        
        
    return filtered_tokens
    
        
        


In [None]:
for index, score in sorted(lda_model[bow_vector], key=lambda)