# DESCRIPTION

### Help a leading mobile brand understand the voice of the customer by analyzing the reviews of their product on Amazon and the topics that customers are talking about. You will perform topic modeling on specific parts of speech. You’ll finally interpret the emerging topics.

# Problem Statement: 

###  A popular mobile phone brand, Lenovo has launched their budget smartphone in the Indian market. The client wants to understand the VOC (voice of the customer) on the product. This will be useful to not just evaluate the current product, but to also get some direction for developing the product pipeline. The client is particularly interested in the different aspects that customers care about. Product reviews by customers on a leading e-commerce site should provide a good view.

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import nltk

In [2]:
# importing datasets
ds = pd.read_csv("K8 Reviews v0.2.csv")
ds.head()

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


In [3]:
ds.shape

(14675, 2)

In [4]:
#Normalizing the review texts
reviews = list(ds.iloc[:,1])
reviews[0:5]

['Good but need updates and improvements',
 "Worst mobile i have bought ever, Battery is draining like hell, backup is only 6 to 7 hours with internet uses, even if I put mobile idle its getting discharged.This is biggest lie from Amazon & Lenove which is not at all expected, they are making full by saying that battery is 4000MAH & booster charger is fake, it takes at least 4 to 5 hours to be fully charged.Don't know how Lenovo will survive by making full of us.Please don;t go for this else you will regret like me.",
 'when I will get my 10% cash back.... its already 15 January..',
 'Good',
 'The worst phone everThey have changed the last phone but the problem is still same and the amazon is not returning the phone .Highly disappointing of amazon']

In [5]:
#Tokenization
from nltk.tokenize import word_tokenize
token1 = word_tokenize(reviews[0])
token1

['Good', 'but', 'need', 'updates', 'and', 'improvements']

In [6]:
type(token1)

list

In [7]:
len(token1)

6

In [8]:
tokens=[]
def reviews_tokens(text):
    token = word_tokenize(text)
    return token

for i in range(len(reviews)):
    token=reviews_tokens(reviews[i])
    tokens.append(token)

print(tokens[0:4])

[['Good', 'but', 'need', 'updates', 'and', 'improvements'], ['Worst', 'mobile', 'i', 'have', 'bought', 'ever', ',', 'Battery', 'is', 'draining', 'like', 'hell', ',', 'backup', 'is', 'only', '6', 'to', '7', 'hours', 'with', 'internet', 'uses', ',', 'even', 'if', 'I', 'put', 'mobile', 'idle', 'its', 'getting', 'discharged.This', 'is', 'biggest', 'lie', 'from', 'Amazon', '&', 'Lenove', 'which', 'is', 'not', 'at', 'all', 'expected', ',', 'they', 'are', 'making', 'full', 'by', 'saying', 'that', 'battery', 'is', '4000MAH', '&', 'booster', 'charger', 'is', 'fake', ',', 'it', 'takes', 'at', 'least', '4', 'to', '5', 'hours', 'to', 'be', 'fully', 'charged.Do', "n't", 'know', 'how', 'Lenovo', 'will', 'survive', 'by', 'making', 'full', 'of', 'us.Please', 'don', ';', 't', 'go', 'for', 'this', 'else', 'you', 'will', 'regret', 'like', 'me', '.'], ['when', 'I', 'will', 'get', 'my', '10', '%', 'cash', 'back', '....', 'its', 'already', '15', 'January', '..'], ['Good']]


In [9]:
len(tokens)

14675

In [10]:
#POS tagging

pos_tokens=[]
def pos_tag(text):
    tag = nltk.pos_tag(text)
    return tag

for i in range(len(tokens)):
    tag=pos_tag(tokens[i])
    pos_tokens.append(tag)

print(pos_tokens[0:4])



[[('Good', 'JJ'), ('but', 'CC'), ('need', 'VBP'), ('updates', 'NNS'), ('and', 'CC'), ('improvements', 'NNS')], [('Worst', 'NNP'), ('mobile', 'NN'), ('i', 'NN'), ('have', 'VBP'), ('bought', 'VBN'), ('ever', 'RB'), (',', ','), ('Battery', 'NNP'), ('is', 'VBZ'), ('draining', 'VBG'), ('like', 'IN'), ('hell', 'NN'), (',', ','), ('backup', 'NN'), ('is', 'VBZ'), ('only', 'RB'), ('6', 'CD'), ('to', 'TO'), ('7', 'CD'), ('hours', 'NNS'), ('with', 'IN'), ('internet', 'JJ'), ('uses', 'NNS'), (',', ','), ('even', 'RB'), ('if', 'IN'), ('I', 'PRP'), ('put', 'VBP'), ('mobile', 'JJ'), ('idle', 'NN'), ('its', 'PRP$'), ('getting', 'VBG'), ('discharged.This', 'NN'), ('is', 'VBZ'), ('biggest', 'JJS'), ('lie', 'NN'), ('from', 'IN'), ('Amazon', 'NNP'), ('&', 'CC'), ('Lenove', 'NNP'), ('which', 'WDT'), ('is', 'VBZ'), ('not', 'RB'), ('at', 'IN'), ('all', 'DT'), ('expected', 'VBN'), (',', ','), ('they', 'PRP'), ('are', 'VBP'), ('making', 'VBG'), ('full', 'JJ'), ('by', 'IN'), ('saying', 'VBG'), ('that', 'DT'), (

## Topic Modeling

In [11]:
#Extracting Nouns
nouns =[]
noun_tokens=[]
#sort_review=[]
index=[]

def noun(word,pos,i):
    if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
        wd=word
        if i not in index:
            index.append(i)
    else:
        wd=[]
    return wd

for i in range(len(pos_tokens)):
    for word,pos in pos_tokens[i]:
        nn=noun(word,pos,i)
        nouns.append(nn)
        
        
sorted_review=[reviews[i] for i in index]         
sorted_pos_tokens = [pos_tokens[i] for i in index]
sorted_tokens = [tokens[i] for i in index]
    
def rm():
    return nouns.remove([])

for i in range(len(nouns)):
    if [] in nouns:
        rm()    
    

In [12]:
nouns[0:4]

['updates', 'improvements', 'Worst', 'mobile']

In [13]:
sorted_tokens = [tokens[i] for i in index]

In [14]:
print(sorted_tokens[0][0])

Good


## Lemmatize

In [15]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

lm_sr_tk = []
def lm(text):
    return lemmatizer.lemmatize(text,pos="n")


for i in sorted_tokens:
    for j in i:
        lemma=lm(j)
        lm_sr_tk.append(lemma)


print(lm_sr_tk)



## Stopwords and punctuation removal

In [16]:
from nltk.corpus import stopwords
stop_word = set(stopwords.words("english"))
output = [i for i in lm_sr_tk if not i in stop_word]

import re, string
result=[]
for i in output:
    res = re.sub('[%s]' % re.escape(string.punctuation),'', i)
    result.append(res)

In [17]:
print(result)



In [18]:
def rm():
    return result.remove('')

for i in range(len(result)):
    if '' in result:
        rm()   

## LDA

In [30]:
import gensim
dictionary = gensim.corpora.Dictionary(result)

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

## Text Summarization

In [20]:
from gensim.summarization import summarize



text_to_summarize = result

summary = summarize(text_to_summarize)
print(summary)

TypeError: expected string or bytes-like object