# TripAdvisor Customer Review Analysis

## Import Libraries and Packages used for the analysis

In [1]:
import sys
print(sys.version)

3.7.3 (default, Mar 27 2019, 16:54:48) 
[Clang 4.0.1 (tags/RELEASE_401/final)]


In [2]:
#Plot
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Data Packages
import math
import pandas
import numpy as np

#Progress bar
from tqdm import tqdm

#Counter
from collections import Counter

#Operation
import operator

#Natural Language Processing Packages
import re
import nltk

## Download Resources
nltk.download("vader_lexicon")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tag import PerceptronTagger
from nltk.data import find

## Machine Learning
import sklearn
import sklearn.metrics as metrics

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zhuojun/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhuojun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zhuojun/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/zhuojun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Data Prepreation
We first crawl the customer review data for hotels in Markham from TripAdvisor and parse the data into a csv file for easier manipulation.

-crawler used: https://github.com/aesuli/trip-advisor-crawler

-crawling commend: $python3 trip−advisor−crawler .py −o data ca:181720

-parsing commend: $python3 trip−advisor−parser.py −d data −o markham.csv

We then load the data into a dataframe.


In [3]:
#Read data from pandas
hotelDF = pandas.read_csv('markham.csv')
hotelDF.columns=['idNum','filePath','hotelName','reviewColumn','ratingScore','groundTruth']

In [4]:
hotelDF.head()

Unnamed: 0,idNum,filePath,hotelName,reviewColumn,ratingScore,groundTruth
0,86280330,markham\ca\181720\181948\107186787.html,Staybridge Suites Toronto,my husband and i stayed there for 9 nights as ...,4,positive
1,85071982,markham\ca\181720\181948\107186787.html,Staybridge Suites Toronto,Our stay was not the worst I've ever stayed in...,3,negative
2,79574338,markham\ca\181720\181948\107186787.html,Staybridge Suites Toronto,I stayed here during a weeknight in July with ...,3,negative
3,78931757,markham\ca\181720\181948\107186787.html,Staybridge Suites Toronto,Spent 5 weeks here while my home was prepped -...,5,positive
4,78277874,markham\ca\181720\181948\107186787.html,Staybridge Suites Toronto,My family booked this hotel and at first we we...,5,positive


## Sentiment Analysis
After loading the data, we calculate the sentiment value for each review using the VADER.<br>
We then perform analysis on hotel ranking.

In [5]:
# Fist define the function used for sentiment calculation
# Use vader to evaluated sentiment of reviews
def evalSentences(reviews, to_df=False, columns=[]):
    #Instantiate an instance to access SentimentIntensityAnalyzer class
    sid = SentimentIntensityAnalyzer()
    pdlist = []
    if to_df:
        for review in tqdm(reviews):
            ss = sid.polarity_scores(review)
            pdlist.append([review]+[ss['compound']])
        reviewDf = pandas.DataFrame(pdlist)
        reviewDf.columns = columns
        return reviewDf
    
    else:
        for review in tqdm(reviews):
            print(review)
            ss = sid.polarity_scores(review)
            for k in sorted(ss):
                print('{0}: {1}, '.format(k, ss[k]), end='')
            print()

In [6]:
# Calculate sentiment value for the reviews
reviews = hotelDF['reviewColumn'].values
reviewDF = evalSentences(reviews, to_df=True, columns=['reviewCol','vaderScore'])

100%|██████████| 3894/3894 [00:06<00:00, 574.23it/s]


In [7]:
reviewDF.head()

Unnamed: 0,reviewCol,vaderScore
0,my husband and i stayed there for 9 nights as ...,0.9731
1,Our stay was not the worst I've ever stayed in...,-0.2973
2,I stayed here during a weeknight in July with ...,-0.6042
3,Spent 5 weeks here while my home was prepped -...,0.9537
4,My family booked this hotel and at first we we...,0.9985


After calculating the sentiment values, we rank the hotel by average rating score and sentiment score

In [8]:
analyzeDF = hotelDF.drop(['filePath'], axis=1,)
vaderlist = reviewDF['vaderScore']
analyzeDF['vaderScore'] = vaderlist

In [9]:
analyzeDF.head()

Unnamed: 0,idNum,hotelName,reviewColumn,ratingScore,groundTruth,vaderScore
0,86280330,Staybridge Suites Toronto,my husband and i stayed there for 9 nights as ...,4,positive,0.9731
1,85071982,Staybridge Suites Toronto,Our stay was not the worst I've ever stayed in...,3,negative,-0.2973
2,79574338,Staybridge Suites Toronto,I stayed here during a weeknight in July with ...,3,negative,-0.6042
3,78931757,Staybridge Suites Toronto,Spent 5 weeks here while my home was prepped -...,5,positive,0.9537
4,78277874,Staybridge Suites Toronto,My family booked this hotel and at first we we...,5,positive,0.9985


### Top Hotels
We first take a look the top hotels based on both average rating score and sentiment value.

Top 5 hotels by average rating score

In [10]:
# create vader DF and group by hotelName and mean of vader values.
ratingDF = analyzeDF.drop(['idNum','reviewColumn','groundTruth','vaderScore'],axis=1)
ratingDF = ratingDF.groupby('hotelName').mean()
ratingDF = ratingDF.sort_values(by = 'ratingScore', ascending=False)
ratingDF.head()

Unnamed: 0_level_0,ratingScore
hotelName,Unnamed: 1_level_1
Hampton Inn &amp; Suites by Hilton Toronto Markham,4.447458
TownePlace Suites Toronto Northeast/Markham,4.351351
Hilton Garden Inn Toronto/Markham,4.305927
Homewood Suites by Hilton Toronto-Markham,4.303398
Courtyard Toronto Northeast/Markham,4.300813


Top 5 hotels by average vader score

In [11]:
# create vader DF and group by hotelName and mean of vader values.
vaderDF = analyzeDF.drop(['idNum','reviewColumn','ratingScore','groundTruth'],axis=1)
vaderDF = vaderDF.groupby('hotelName').mean()
vaderDF = vaderDF.sort_values(by = 'vaderScore', ascending=False)
vaderDF.head()

Unnamed: 0_level_0,vaderScore
hotelName,Unnamed: 1_level_1
TownePlace Suites Toronto Northeast/Markham,0.835615
Hampton Inn &amp; Suites by Hilton Toronto Markham,0.835335
Monte Carlo Inn &amp; Suites Downtown Markham,0.83192
Courtyard Toronto Northeast/Markham,0.80744
Edward Village Markham,0.803865


Compare the two lists of top hotels

In [12]:
# Get the overlapping hotels in both top5 lists 
overlapDF= pandas.concat([ratingDF[:5],vaderDF[:5]], axis=1, join='outer',sort=False)
overlapDF

Unnamed: 0,ratingScore,vaderScore
Hampton Inn &amp; Suites by Hilton Toronto Markham,4.447458,0.835335
TownePlace Suites Toronto Northeast/Markham,4.351351,0.835615
Hilton Garden Inn Toronto/Markham,4.305927,
Homewood Suites by Hilton Toronto-Markham,4.303398,
Courtyard Toronto Northeast/Markham,4.300813,0.80744
Monte Carlo Inn &amp; Suites Downtown Markham,,0.83192
Edward Village Markham,,0.803865


We can see that the top hotels in two lists are mostly overlapped, which means that the sentiment value we calculated from the reviews have a high correlation to the rating score provided by the user.

### Bottom Hotels
We then take a look the bottom hotels ranked by average rating score and sentiment value.

Bottom 5 hotels by average rating score

In [13]:
ratingBottomDF = ratingDF.sort_values(by = 'ratingScore', ascending=True)
ratingBottomDF.head()

Unnamed: 0_level_0,ratingScore
hotelName,Unnamed: 1_level_1
Park Inn By Radisson Toronto-Markham,3.605863
Monte Carlo Inns Markham,3.657143
Comfort Inn Toronto Northeast,3.756477
Courtyard Toronto Markham,3.828877
Delta Markham,3.853211


Bottom 5 hotels by average vader score

In [14]:
vaderBottomDF = vaderDF.sort_values(by = 'vaderScore', ascending=True)
vaderBottomDF.head()

Unnamed: 0_level_0,vaderScore
hotelName,Unnamed: 1_level_1
Monte Carlo Inns Markham,0.643021
Park Inn By Radisson Toronto-Markham,0.662436
Hilton Toronto / Markham Suites Conference Centre &amp; Spa,0.685487
Comfort Inn Toronto Northeast,0.686889
Residence Inn Toronto Markham,0.703197


Compare the two lists of bottom hotels

In [15]:
# Get the overlapping hotels in both bottom5 lists 
overlapBottomDF= pandas.concat([ratingBottomDF[:5],vaderBottomDF[:5]], axis=1, join='outer',sort=False)
overlapBottomDF

Unnamed: 0,ratingScore,vaderScore
Park Inn By Radisson Toronto-Markham,3.605863,0.662436
Monte Carlo Inns Markham,3.657143,0.643021
Comfort Inn Toronto Northeast,3.756477,0.686889
Courtyard Toronto Markham,3.828877,
Delta Markham,3.853211,
Hilton Toronto / Markham Suites Conference Centre &amp; Spa,,0.685487
Residence Inn Toronto Markham,,0.703197


We can see that the two bottom lists also have overlapping hotels, which reinforces the correlation between rating score and sentiment value.

## Frequency Analysis - words
We rank the top-50 most frequent non-stop words in reviews for both positive and negative with ground truth sentiment and sentiment values.

In [16]:
# First transform the hotelDF dataframe to include top 500 frequent words as features
def dataFrameTransformation(hotelDf, reviewDF, k=500):
    reviews = reviewDF['reviewCol'].values
    
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()
    
    stop = set(stopwords.words('english'))
    
    # Top-k frequent terms
    counter = Counter()
    for review in reviews:
        review = [lemmatizer.lemmatize(words.lower(),'v') for words in tokenizer.tokenize(review)]
        review = [stemmer.stem(words) for words in review]
        counter.update([word for word in review if word not in stop and len(word) > 2])

    topk = counter.most_common(k)   
    
    #Find out if a particular review has the word from topk list
    freqReview = []
    for i in range(len(reviews)):
        tempCounter = Counter([word.lower() for word in re.findall(r'\w+',reviews[i])])
        topkinReview = [1 if tempCounter[word] > 0 else 0 for (word,wordCount) in topk]
        freqReview.append(topkinReview)
        
        
    #Prepare freqReviewDf
    freqReviewDf = pandas.DataFrame(freqReview)
    dfName = []
    for c in topk:
        dfName.append(c[0])
    freqReviewDf.columns = dfName #assign the column names to be each of the top 500 most frequent words
    finalreviewDf = reviewDF.join(freqReviewDf)
    finaldf = hotelDf[['hotelName','ratingScore','groundTruth']].join(finalreviewDf)
    return topk, finaldf

In [17]:
topk, finaldf = dataFrameTransformation(hotelDF, reviewDF, k=500)
finaldf.head()

Unnamed: 0,hotelName,ratingScore,groundTruth,reviewCol,vaderScore,room,hotel,stay,veri,thi,...,els,compar,screen,zoo,entir,condit,okay,soft,typic,tournament
0,Staybridge Suites Toronto,4,positive,my husband and i stayed there for 9 nights as ...,0.9731,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Staybridge Suites Toronto,3,negative,Our stay was not the worst I've ever stayed in...,-0.2973,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Staybridge Suites Toronto,3,negative,I stayed here during a weeknight in July with ...,-0.6042,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,Staybridge Suites Toronto,5,positive,Spent 5 weeks here while my home was prepped -...,0.9537,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Staybridge Suites Toronto,5,positive,My family booked this hotel and at first we we...,0.9985,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We now get the top 50 most frequent word for both positve and negative reviews

In [18]:
# We are only intereseted in this three column for overall analysis
itemAnalysisDf = finaldf[['reviewCol','groundTruth','vaderScore']]

In [19]:
# Define the function to get top k most frequent words
def getTopK(df, k, label_value, label_column, operation, value_column='reviewCol'):
    stop = set(stopwords.words('english'))
    #Add possible Stop Words for Hotel Reviews
    stop.add('hotel')
    stop.add('room')
    stop.add('rooms')
    stop.add('stay')
    stop.add('staff')
    
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()
    
    counter = Counter()
    for review in df.loc[operation(df[label_column],label_value)][value_column]:
        review = [lemmatizer.lemmatize(words.lower(),'v') for words in tokenizer.tokenize(review)]
        review = [stemmer.stem(words) for words in review]
        counter.update([word for word in review if word not in stop and len(word) > 2])
    
    topk = [word for (word,freq) in counter.most_common(k)]
    return topk

Top 50 postive and negative words sorted using groud truth

In [20]:
top50GroundPos = getTopK(df = itemAnalysisDf, k = 50, 
                         label_value = 'positive', label_column = 'groundTruth', 
                         operation = operator.eq)
top50GroundNeg = getTopK(df = itemAnalysisDf, k = 50, 
                         label_value = 'negative', label_column = 'groundTruth',
                        operation = operator.eq)

Top 50 postive and negative words sorted using vader value

In [21]:
top50VaderPos = getTopK(df = itemAnalysisDf, k = 50, 
                         label_value = 0, label_column = 'vaderScore', 
                         operation = operator.gt)
top50VaderNeg = getTopK(df = itemAnalysisDf, k = 50, 
                         label_value = 0, label_column = 'vaderScore',
                        operation = operator.lt)

Compare the lists

In [22]:
print(set(top50GroundPos)&set(top50VaderPos))

{'time', 'thi', 'veri', 'suit', 'check', 'lot', 'get', 'friendli', 'free', 'also', 'pool', 'make', 'great', 'good', 'close', 'area', 'tip', 'help', 'front', 'even', 'famili', 'realli', 'desk', 'food', 'price', 'well', 'would', 'place', 'onli', 'nice', 'restaur', 'day', 'need', 'toronto', 'bed', 'night', 'breakfast', 'park', 'see', 'comfort', 'locat', 'recommend', 'like', 'clean', 'markham', 'one', 'servic', 'busi', 'use'}


In [23]:
print(set(top50GroundNeg)&set(top50VaderNeg))

{'come', 'time', 'could', 'thi', 'veri', 'suit', 'check', 'get', 'also', 'tell', 'make', 'good', 'area', 'floor', 'tip', 'front', 'even', 'take', 'desk', 'say', 'find', 'ask', 'work', 'would', 'onli', 'day', 'nice', 'back', 'need', 'call', 'bed', 'night', 'breakfast', 'see', 'like', 'clean', 'one', 'servic', 'book', 'look', 'give', 'use'}


There are a decent amount of locale-specific words appearing as top words in both positive and negatives lists since customers usually provide their feedback on those specific facilities or services. <br>
We can also see that some "postive" words appear frequently in the negative reviews. This indicates that it would be beneficial to perform the frequency analysis on phrases instead of words to gain more insights.

## Frequency Analysis - phrases
Now we perform the frequency analysis on phrases

In [24]:
# Evaluating n-gram based evaluation metrics for automatic keyphrase extraction.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""

In [25]:
# Noun Phrase Extraction Support Functions
from nltk.corpus import stopwords
stop = stopwords.words('english')
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# generator, generate leaves one by one
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP' or t.label()=='JJ' or t.label()=='RB'):
        yield subtree.leaves()

# stemming, lematizing, lower case... 
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word

# stop-words and length control
def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stop)
    return accepted

# generator, create item once a time
def get_terms(tree):
    for leaf in leaves(tree):
        term = [normalise(w) for w,t in leaf if acceptable_word(w) ]
        # Phrase only
        if len(term)>1:
            yield term

In [26]:
# Flatten phrase lists to get tokens for analysis
def flatten(npTokenList):
    finalList =[]
    for phrase in npTokenList:
        token = ''
        for word in phrase:
            token += word + ' '
        finalList.append(token.rstrip())
    return finalList

In [27]:
# Revise the previous dataframe transform function...
def newDataFrameTransformation(hotelDf, reviewDF, k=50):
    reviews = reviewDF['reviewCol'].values
    
    tokenizer = RegexpTokenizer(r'\w+')
    tagger = PerceptronTagger()
    pos_tag = tagger.tag
    chunker = nltk.RegexpParser(grammar)
    
    # Top-k frequent terms
    counter = Counter()
    for review in reviews:
        counter.update(flatten([word for word in get_terms(chunker.parse(pos_tag(tokenizer.tokenize(review))))]))
    
    topk = counter.most_common(k)     
    
    #Find out if a particular review has the word from topk list
    freqReview = []
    for i in range(len(reviews)):
        tempCounter = Counter(flatten([word for word in get_terms(chunker.parse(pos_tag(tokenizer.tokenize(reviews[i]))))]))
        topkinReview = [1 if tempCounter[word] > 0 else 0 for (word,wordCount) in topk]
        freqReview.append(topkinReview)
        
        
    #Prepare freqReviewDf
    freqReviewDf = pandas.DataFrame(freqReview)
    dfName = []
    for c in topk:
        dfName.append(c[0])
    freqReviewDf.columns = dfName
    finalreviewDf = reviewDF.join(freqReviewDf)
    finaldf = hotelDf[['hotelName','ratingScore','groundTruth']].join(finalreviewDf)
    return topk, finaldf

In [28]:
topk_phrase, finaldf_phrase = newDataFrameTransformation(hotelDF, reviewDF, k=50)
finaldf_phrase.head()

Unnamed: 0,hotelName,ratingScore,groundTruth,reviewCol,vaderScore,room tip,front desk,front desk staff,free breakfast,bedroom suit,...,breakfast area,downtown toronto,night stay,free wifi,comfort inn,hampton inn,full kitchen,great stay,mani restaur,great locat
0,Staybridge Suites Toronto,4,positive,my husband and i stayed there for 9 nights as ...,0.9731,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,Staybridge Suites Toronto,3,negative,Our stay was not the worst I've ever stayed in...,-0.2973,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Staybridge Suites Toronto,3,negative,I stayed here during a weeknight in July with ...,-0.6042,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Staybridge Suites Toronto,5,positive,Spent 5 weeks here while my home was prepped -...,0.9537,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Staybridge Suites Toronto,5,positive,My family booked this hotel and at first we we...,0.9985,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# We are only intereseted in this three column for overall analysis
itemAnalysisDf_phrase = finaldf_phrase[['reviewCol','groundTruth','vaderScore']]

In [30]:
# Define a function for getting top k phrases
def getTopKPhrase(df, k, label_value, label_column, operation, value_column='reviewCol'):
    
    tokenizer = RegexpTokenizer(r'\w+')
    tagger = PerceptronTagger()
    pos_tag = tagger.tag
    # Create phrase tree
    chunker = nltk.RegexpParser(grammar)
    
     # Top-k frequent phrases
    counter = Counter()
         
    for review in df.loc[operation(df[label_column],label_value)][value_column]:
        counter.update(flatten([word for word in get_terms(chunker.parse(pos_tag(tokenizer.tokenize(review))))]))
    
    topk = [word for (word, freq) in counter.most_common(k)]
    return topk

Top 50 positive and negative phrases sorted using ground truth

In [31]:
top50PhraseGroundPos = getTopKPhrase(df = itemAnalysisDf_phrase, k = 50, 
                         label_value = 'positive', label_column = 'groundTruth',
                        operation = operator.eq)
top50PhraseGroundNeg = getTopKPhrase(df = itemAnalysisDf_phrase, k = 50, 
                         label_value = 'negative', label_column = 'groundTruth',
                        operation = operator.eq)

Top 50 positive and negative phrases sorted using vader value

In [32]:
top50PhraseVaderPos = getTopKPhrase(df = itemAnalysisDf_phrase, k = 50, 
                         label_value = 0, label_column = 'vaderScore',
                        operation = operator.gt)
top50PhraseVaderNeg = getTopKPhrase(df = itemAnalysisDf_phrase, k = 50, 
                         label_value = 0, label_column = 'vaderScore',
                        operation = operator.le)

Compare the two lists

In [33]:
print(set(top50PhraseGroundPos)&set(top50PhraseVaderPos))

{'front desk staff', 'great place', 'hilton garden inn', 'downtown toronto', 'hotel staff', 'live room', 'bedroom suit', 'clean staff', 'homewood suit', 'friendli staff', 'hot breakfast', 'park lot', 'nice hotel', 'hotel room', 'room tip', 'great stay', 'full kitchen', 'major highway', 'front desk', 'continent breakfast', 'doubl bed', 'great locat', 'easi access', 'next time', 'free breakfast', 'great hotel', 'great experi', 'hampton inn', 'first time', 'hot tub', 'good locat', 'complimentari breakfast', 'resid inn', 'good valu', 'breakfast buffet', 'free wifi', 'room servic', 'next morn', 'pool area', 'queen bed', 'busi travel', 'mani restaur', 'busi trip', 'comfort bed', 'breakfast area'}


In [34]:
print(set(top50PhraseGroundNeg)&set(top50PhraseVaderNeg))

{'front desk staff', 'hotel staff', 'next door', 'live room', 'bedroom suit', 'comfort inn', 'park lot', 'star hotel', 'hotel room', 'room tip', 'long time', 'custom servic', 'night stay', 'front desk', 'credit card', 'continent breakfast', 'hotel manag', 'new room', 'hot tub', 'resid inn', 'breakfast buffet', 'staff member', 'next day', 'recept area stink inconsist internet', 'next morn', 'room servic', 'sofa bed', 'slow internet wifi connect recept area stink', 'pool area', 'second night', 'queen bed', 'busi travel', 'smoke room'}


We can see that the phrase frequency analysis returns a set of noun phrases for positive and negative reviews. Most of these phrases are locale-specific and they can be used as key words to describe the hotel. Together with the sentiment attached to the phrases, potential customers can get a sense of the hotel withou reading thoroughly through the reviews.

## Mutual Information Analysis - words
We then use mutual information with ground truth sentiment to rank the top-50 most sentiment-bearing non-stopwords in the review.

In [35]:
# get Top K mutual information terms from the dataframe
def getMI(topk, df, label_column='groundTruth'):
    miScore = []
    for word in topk:
        miScore.append([word]+[metrics.mutual_info_score(df[label_column], df[word])])
    miScoredf = pandas.DataFrame(miScore).sort_values(1,ascending=False)
    miScoredf.columns = ['Word','MI Score']
    return miScoredf

In [36]:
# get separate dataframe for positive and negative reviews
finalposdf = finaldf.loc[finaldf['groundTruth']=='positive']
finalnegdf = finaldf.loc[finaldf['groundTruth']=='negative']

Calculate MI score for top50 positive non-stopwords 

In [37]:
miScoredf_pos = getMI(top50GroundPos, finalposdf)
miScoredf_pos.head()

Unnamed: 0,Word,MI Score
22,help,1.748601e-15
4,good,1.44329e-15
2,clean,1.332268e-15
17,get,9.15934e-16
43,close,8.881784e-16


Calculate MI score for top50 negative non-stopwords

In [38]:
miScoredf_neg = getMI(top50GroundNeg, finalnegdf)
miScoredf_neg.head()

Unnamed: 0,Word,MI Score
45,park,8.881784e-16
31,look,8.326673e-16
29,need,7.771561e-16
48,well,1.665335e-16
18,day,1.665335e-16


We can see that some "positive" words still ranked pretty high in the MI score list for negative reviews.<br>
Therefore, we need to perform tha analysis on noun phrases to get further results.

## Mutual Information Analysis - phrases
We now peform the analysis for noun phrases.

Calculate top50 positive noun phrases

In [39]:
# get postive and negative dataframe separately
def getDF(topk, reviewDF, hotelDF, label_value):
    reviews = reviewDF['reviewCol'].values
    
    tokenizer = RegexpTokenizer(r'\w+')
    tagger = PerceptronTagger()
    pos_tag = tagger.tag
    chunker = nltk.RegexpParser(grammar)
    
    #Find out if a particular review has the word from topk list
    freqReview = []
    for i in range(len(reviews)):
        tempCounter = Counter(flatten([word for word in get_terms(chunker.parse(pos_tag(tokenizer.tokenize(reviews[i]))))]))
        topkinReview = [1 if tempCounter[word] > 0 else 0 for word in topk]
        freqReview.append(topkinReview)
        
    #Prepare freqReviewDf
    freqReviewDf = pandas.DataFrame(freqReview)
    dfName = []
    for c in topk:
        dfName.append(c)
    freqReviewDf.columns = dfName
    finalreviewDf = reviewDF.join(freqReviewDf)
    finaldf = hotelDF[['hotelName','ratingScore','groundTruth']].join(finalreviewDf)
    finaldf = finaldf.loc[finaldf['groundTruth']==label_value]
    
    return finaldf

In [40]:
# get separate dataframe for positive and negative reviews
finalposdf_phrase = getDF(top50PhraseGroundPos, reviewDF, hotelDF, 'positive')
finalnegdf_phrase = getDF(top50PhraseGroundNeg, reviewDF, hotelDF, 'negative')

In [41]:
miScoredf_posphrase = getMI(top50PhraseGroundPos, finalposdf_phrase)
miScoredf_posphrase.head()

Unnamed: 0,Word,MI Score
30,doubl bed,1.771153e-15
43,great locat,1.771153e-15
32,live room,1.771153e-15
18,resid inn,1.771153e-15
40,toronto area,1.771153e-15


In [42]:
miScoredf_negphrase = getMI(top50PhraseGroundNeg, finalnegdf_phrase)
miScoredf_negphrase.head()

Unnamed: 0,Word,MI Score
4,next morn,8.881784e-16
25,smoke room,8.430756e-16
32,good valu,8.430756e-16
29,busi travel,8.430756e-16
27,staff member,8.430756e-16
