## Topic Analysis of Review Data
### Bhupinder Singh Jolly

In [110]:
import pandas as pd
import numpy as np

In [111]:
# Load the raw review dataset; the path is relative to the notebook's working directory.
reviews = pd.read_csv(
    "K8 Reviews v0.2.csv",
)
# Preview the first ten rows (the bare trailing expression is what the cell displays).
reviews.head(n=10)

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...
5,0,Only I'm telling don't buyI'm totally disappoi...
6,1,"Phone is awesome. But while charging, it heats..."
7,0,The battery level has worn down
8,0,It's over hitting problems...and phone hanging...
9,0,A lot of glitches dont buy this thing better g...


In [112]:
# Lower-case every review text before tokenising.
# NOTE(review): assumes every entry in the `review` column is a string (no missing values) - TODO confirm.
reviews_lower = [text.lower() for text in reviews["review"].values]
reviews_lower[0]

'good but need updates and improvements'

In [113]:
import nltk
from nltk import word_tokenize

In [114]:
# Split each lower-cased review into a list of word tokens.
reviews_token = list(map(word_tokenize, reviews_lower))
reviews_token[0]

['good', 'but', 'need', 'updates', 'and', 'improvements']

In [115]:
from nltk import pos_tag

In [116]:
# Part-of-speech tag every tokenised review.
# pos_tag_sents tags the whole batch in a single call instead of invoking
# nltk.pos_tag once per review; the result has the same shape:
# one list of (token, tag) tuples per review.
reviews_pos_tag = nltk.pos_tag_sents(reviews_token)
reviews_pos_tag[0]

[('good', 'JJ'),
 ('but', 'CC'),
 ('need', 'VBP'),
 ('updates', 'NNS'),
 ('and', 'CC'),
 ('improvements', 'NNS')]

In [117]:
# Print the reference list of POS tags so the tags produced above
# (e.g. JJ, CC, VBP, NNS) can be looked up.
# NOTE(review): this prints the whole tag list; consider clearing the output before sharing.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [118]:
# Sanity check: tag a single isolated word and look at the resulting (token, tag) pair.
tagged_tupple = pos_tag(["great"])
tagged_tupple[0]

('great', 'JJ')

In [119]:
import re

In [120]:
# Keep only the tokens whose POS tag matches the noun pattern (e.g. the NNS
# tags shown earlier); every review keeps its own list, which may end up empty.
reviews_noun = [
    [(word, tag) for word, tag in tagged_review if re.search("NN.*", tag)]
    for tagged_review in reviews_pos_tag
]
reviews_noun[0]

[('updates', 'NNS'), ('improvements', 'NNS')]

In [121]:
from nltk.stem import WordNetLemmatizer

In [122]:
# Single lemmatizer instance, reused for every noun in the following cell.
lemm = WordNetLemmatizer()

In [123]:
# Reduce each retained noun to its lemma (e.g. 'updates' -> 'update'),
# dropping the POS tag so each review becomes a plain list of strings.
reviews_lemm = [
    [lemm.lemmatize(word) for word, _tag in tagged_nouns]
    for tagged_nouns in reviews_noun
]

reviews_lemm[0]

['update', 'improvement']

In [124]:
from string import punctuation
from nltk.corpus import stopwords

In [125]:
# English stop words, extended with every single punctuation character
# plus the two- and three-dot ellipsis tokens.
stop_word = stopwords.words("english")
stop_update = [*stop_word, *punctuation, "...", ".."]

In [126]:
# Drop stop words and punctuation tokens from every lemmatised review.
# The stop list is converted to a set once so each membership test is a
# hash lookup rather than a linear scan of the list; the filtered output
# is identical to filtering against the list itself.
# NOTE(review): only exact matches are removed, so longer dot runs such as
# '....' still pass through (they show up in the topic terms further down).
_stop_lookup = set(stop_update)
reviews_sw_removed = [
    [term for term in lemmas if term not in _stop_lookup]
    for lemmas in reviews_lemm
]

In [127]:
# Spot-check one cleaned review.
# NOTE(review): fused tokens such as 'discharged.this' survive cleaning -
# presumably the raw text has no space after the full stop; confirm against the data.
reviews_sw_removed[1]

['mobile',
 'battery',
 'hell',
 'backup',
 'hour',
 'us',
 'idle',
 'discharged.this',
 'lie',
 'amazon',
 'lenove',
 'battery',
 'charger',
 'hour']

In [128]:
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models import ldamodel

In [129]:
# `texts` is simply another name for the cleaned token lists; the gensim
# dictionary (token <-> integer id mapping) is built from those same lists.
texts = reviews_sw_removed
id2word = corpora.Dictionary(texts)

In [130]:
# Convert every review into its bag-of-words form: a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))
# Print one arbitrary document as a sanity check.
print(corpus[200])

[(36, 1), (143, 1), (314, 1), (415, 1), (416, 1)]


In [131]:
# First LDA model: 12 topics, with a fixed random_state so the run is repeatable.
lda_model = ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=12,
    random_state=42,
    passes=10,
    per_word_topics=True,
)

In [132]:
# Dump the weighted terms of each topic of the 12-topic model for manual inspection.
print(lda_model.print_topics())

[(0, '0.138*"mobile" + 0.040*"call" + 0.036*"screen" + 0.031*"feature" + 0.030*"option" + 0.020*"music" + 0.017*"software" + 0.016*"app" + 0.015*"video" + 0.015*"card"'), (1, '0.151*"money" + 0.128*"...." + 0.071*"waste" + 0.056*"value" + 0.046*"glass" + 0.038*"speaker" + 0.024*"gorilla" + 0.022*"set" + 0.022*"ok" + 0.020*"piece"'), (2, '0.216*"note" + 0.113*"k8" + 0.090*"lenovo" + 0.030*"sound" + 0.023*"dolby" + 0.020*"killer" + 0.018*"gallery" + 0.018*"system" + 0.018*"atmos" + 0.018*"excellent"'), (3, '0.078*"phone" + 0.040*"day" + 0.038*"amazon" + 0.035*"service" + 0.034*"issue" + 0.027*"time" + 0.027*"lenovo" + 0.026*"battery" + 0.024*"month" + 0.023*"device"'), (4, '0.280*"product" + 0.176*"problem" + 0.080*"network" + 0.075*"issue" + 0.066*"heating" + 0.021*"jio" + 0.021*"sim" + 0.019*"volta" + 0.010*"connection" + 0.009*"signal"'), (5, '0.093*"heat" + 0.070*"....." + 0.052*"processor" + 0.038*"everything" + 0.038*"budget" + 0.031*"...." + 0.030*"core" + 0.025*"display" + 0.017*

In [133]:
# Score the 12-topic model with the 'c_v' coherence measure, computed
# against the same cleaned token lists the dictionary was built from.
coh_model_lda = CoherenceModel(
    model=lda_model,
    texts=reviews_sw_removed,
    dictionary=id2word,
    coherence='c_v',
)
coh_lda = coh_model_lda.get_coherence()

print("\n Coherence Score : ", coh_lda)


 Coherence Score :  0.475339388396195


## Looking at the topics and their terms, the following can be combined –
-	Topics 1, 6 and 8 possibly talk about 'pricing'
-	Topics 3, 7 and 9 closely talk about 'battery related issues'
-	Topics 0, 2 and 11 vaguely talk about 'performance'



### 10 topics seem to be the right number of topics for this data.
### We’ll create a topic model with 10 topics.


In [134]:
# Second LDA model with the reduced topic count.
# NOTE(review): despite the "8" in the variable name, this model is trained
# with num_topics=10; the name is kept because later cells refer to it.
lda_model8 = ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=42,
    passes=10,
    per_word_topics=True,
)

In [135]:
# Score the 10-topic model with the same 'c_v' coherence setup used for the
# 12-topic model, so the two scores are directly comparable.
coh_model_lda8 = CoherenceModel(
    model=lda_model8,
    texts=reviews_sw_removed,
    dictionary=id2word,
    coherence='c_v',
)
coh_lda8 = coh_model_lda8.get_coherence()

print("\n Coherence Score : ", coh_lda8)


 Coherence Score :  0.5442702792781539


### The coherence is now 0.54, a significant increase from the previous 0.48.

In [136]:
# Fetch the topics in unformatted form: (topic_id, [(term, weight), ...]) pairs.
x = lda_model8.show_topics(formatted=False)
# Keep only the terms of each topic, discarding their weights.
topic_words = [
    (topic_id, [term for term, _weight in term_weights])
    for topic_id, term_weights in x
]



In [137]:
# One line per topic in the form "<id>::<list of terms>", then a trailing blank line.
for topic, words in topic_words:
    print(topic, words, sep="::")
print()

0::['mobile', 'call', 'screen', 'feature', 'option', 'app', 'card', 'cast', 'image', 'music']
1::['money', 'waste', 'value', 'handset', 'plz', 'headset', 'offer', 'signal', 'facility', 'drainage']
2::['note', 'k8', 'lenovo', 'dolby', 'feature', 'sound', 'system', 'killer', 'atmos', 'excellent']
3::['phone', 'issue', 'day', 'problem', 'lenovo', 'service', 'time', 'month', 'amazon', 'update']
4::['product', 'problem', 'network', 'issue', 'heating', 'amazon', 'sim', 'h', 'return', 'jio']
5::['....', 'battery', 'phone', 'heat', 'hour', 'charge', 'time', '.....', 'charging', 'hr']
6::['price', 'phone', 'feature', 'range', 'budget', 'specification', 'buy', 'smartphone', 'super', 'speaker']
7::['charger', 'hai', 'ho', 'turbo', 'box', 'piece', 'cable', 'bill', 'bhi', 'hi']
8::['delivery', 'superb', 'device', 'headphone', 'thanks', 'earphone', 'set', 'ok', 'pls', 'expectation']
9::['camera', 'battery', 'phone', 'quality', 'performance', 'backup', 'mode', 'issue', 'life', 'day']



In [138]:
# Hand-assigned business labels for the ten topics of the 10-topic model.
# This is a list rather than a set literal: a set has no defined order, and
# current pandas refuses to build a DataFrame from one
# ("Set type is unordered"). A list keeps the rows in the intended order.
data = [("Topic 01 --> ","Product Accessories"),
        ("Topic 02 --> ","Review & Issues"),
        ("Topic 03 --> ","Product Features"),
        ("Topic 04 --> ","Service Issues"),
        ("Topic 05 --> ","Networking Issues"),
        ("Topic 06 --> ","Battery Related Issues"),
        ("Topic 07 --> ","Overall General Phone Features"),
        ("Topic 08 --> ","Mobile Accessories"),
        ("Topic 09 --> ","Delivery Services"),
        ("Topic 10 --> ","Camera Quality")]
# Column headers for the summary table built from `data`.
columns = ("Topic","Business Name")

In [145]:
# Build the topic -> business-label table, ordered by topic number.
# `data` is passed through list() first because pandas rejects a set as
# DataFrame input; sort_values then makes the row order deterministic
# whatever order the rows arrived in.
review_data = pd.DataFrame(list(data), columns=columns).sort_values("Topic")


In [146]:
# Render the topic/label table as plain text, omitting the DataFrame index column.
print(review_data.to_string(index=False))

         Topic                   Business Name
 Topic 01 -->              Product Accessories
 Topic 02 -->                  Review & Issues
 Topic 03 -->                 Product Features
 Topic 04 -->                   Service Issues
 Topic 05 -->                Networking Issues
 Topic 06 -->           Battery Related Issues
 Topic 07 -->   Overall General Phone Features
 Topic 08 -->               Mobile Accessories
 Topic 09 -->                Delivery Services
 Topic 10 -->                   Camera Quality
