In [41]:
#Plot
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Data Packages
import math
import pandas
import numpy as np
import pandas as pd
#Progress bar
from tqdm import tqdm

#Counter
from collections import Counter

#Operation
import operator

#Natural Language Processing Packages
import re
import nltk
import string

## Download Resources
nltk.download("vader_lexicon")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tag import PerceptronTagger
from nltk.data import find

## Machine Learning
import sklearn
import sklearn.metrics as metrics

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Litos\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Litos\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Litos\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Litos\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Use vader to evaluated sentiment of reviews (vader score for each entire review)
def evalSentences(sentences, to_df=False, columns=[]):
    #Instantiate an instance to access SentimentIntensityAnalyzer class
    sid = SentimentIntensityAnalyzer()
    pdlist = []
    if to_df:
        for sentence in tqdm(sentences):
            ss = sid.polarity_scores(sentence)
            pdlist.append([sentence]+[ss['compound']])
        reviewDf = pandas.DataFrame(pdlist)
        reviewDf.columns = columns
        return reviewDf
    
    else:
        for sentence in tqdm(sentences):
            print(sentence)
            ss = sid.polarity_scores(sentence)
            for k in sorted(ss):
                print('{0}: {1}, '.format(k, ss[k]), end='')
            print()

In [3]:
sentence = 'quick service'
sid = SentimentIntensityAnalyzer()
ss = sid.polarity_scores(sentence)
ss

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

# Load Data

In [74]:
cur_df = pd.read_csv('../../data/yelp/Data.csv', index_col=0,  encoding='latin-1')

In [5]:
reviews = cur_df['review_text'].values

In [50]:
reviewDF = evalSentences(reviews, to_df=True, columns=['reviewCol','vader'])

100%|█████████████████████████████████████████████████████████████████████████| 157038/157038 [04:47<00:00, 546.65it/s]


In [52]:
cur_df['Binary_vader'] = (reviewDF['vader']> 0.05)*[1]

In [51]:
#Save
reviewDF.to_csv('../../data/yelp/reviewDF.csv')

In [7]:
# Load reviewDF
reviewDF = pd.read_csv('../../data/yelp/reviewDF.csv', index_col=0,  encoding='latin-1')

In [9]:
reviewDF.head()

Unnamed: 0,reviewCol,vader
0,I ordered the lemon mango slush and the lemon ...,0.9758
1,"Came here on a Sunday afternoon, it wasn't bus...",0.9763
2,{Grapefruit & Yakult Green Tea with Aloe Jelly...,0.917
3,Saw this newly opened bubble tea shop and want...,0.9898
4,Happy Lemon has become my new favourite place ...,0.9922


In [53]:
cur_df['vader'] = reviewDF['vader']
# Get vader binary values
cur_df['Binary_vader'] = (reviewDF['vader'] > 0.05)*[1]

# Find most sentiment words by Mutual Information

## Data preprocessing

In [37]:
from nltk.stem import WordNetLemmatizer
def preprocess(text):

    text = text.replace('.',' ').replace('/t',' ').replace('\t',' ').replace('/',' ').replace('-',' ')
    
    # Tokenize
    text = nltk.word_tokenize(text)
    
    # Lowercase
    text = [w.lower() for w in text]

    # Remove Punctuation
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in text]

    # Remove tokens that are not alphabetic
    text = [w for w in text if w.isalpha()]

    # Remove Stopwords
    # Get english stopwords
    en_stopwords = set(stopwords.words('english'))
    en_stopwords.remove('off')
    text = [w for w in text if w not in en_stopwords]
    
    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(w) for w in text]

    text = " " + " ".join(str(x) for x in text) + " "

    text = text.replace('whitish', 'white')
    text = text.replace('bisquity', ' biscuit ')
    text = text.replace('carmel', ' caramel ')
    text = text.replace('flower', ' floral ')
    text = text.replace('piny', ' pine ')
    text = text.replace('off white', 'offwhite')
    text = text.replace('goden', 'gold')
    text = text.replace('yello', 'yellow')
    text = text.replace('reddish', ' red ') 
    text = text.replace('favorite', 'favourite ') 
    

    # Reset to token
    text = nltk.word_tokenize(text)
    table = str.maketrans('', '', string.punctuation)
    text = [w.translate(table) for w in text]
    text = [w for w in text if w.isalpha()]
#     en_stopwords = set(stopwords.words('english'))
    text = [w for w in text if w not in en_stopwords]
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(w) for w in text]
    
    return text

def preprocess_sentence(sentence):
    return ' '.join(str(x) for x in preprocess(sentence))

# Compute Mutual Info

In [12]:
# get Top K mutual information terms from the dataframe
def getMI(topk, df, label_column='groundTruth'):
    miScore = []
    for word in topk:
        miScore.append([word[0]]+[metrics.mutual_info_score(finaldf[label_column], finaldf[word[0]])])
    miScoredf = pandas.DataFrame(miScore).sort_values(1,ascending=0)
    miScoredf.columns = ['Word','MI Score']
    return miScoredf

# Noun Phrase

In [13]:
# Sample text
text = """The Buddha, the Godhead, resides quite as comfortably in the circuits of a digital
computer or the gears of a cycle transmission as he does at the top of a mountain
or in the petals of a flower. To think otherwise is to demean the Buddha...which is
to demean oneself."""

In [14]:
tagger = PerceptronTagger()
# Part of Speech Tagging
# Google: https://en.wikipedia.org/wiki/Part-of-speech_tagging
pos_tag = tagger.tag
taggedToks = pos_tag(re.findall(r'\w+', text))
taggedToks

[('The', 'DT'),
 ('Buddha', 'NNP'),
 ('the', 'DT'),
 ('Godhead', 'NNP'),
 ('resides', 'VBZ'),
 ('quite', 'RB'),
 ('as', 'IN'),
 ('comfortably', 'RB'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('circuits', 'NNS'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('digital', 'JJ'),
 ('computer', 'NN'),
 ('or', 'CC'),
 ('the', 'DT'),
 ('gears', 'NNS'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('cycle', 'NN'),
 ('transmission', 'NN'),
 ('as', 'IN'),
 ('he', 'PRP'),
 ('does', 'VBZ'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('top', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('mountain', 'NN'),
 ('or', 'CC'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('petals', 'NNS'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('flower', 'NN'),
 ('To', 'TO'),
 ('think', 'VB'),
 ('otherwise', 'RB'),
 ('is', 'VBZ'),
 ('to', 'TO'),
 ('demean', 'VB'),
 ('the', 'DT'),
 ('Buddha', 'NNP'),
 ('which', 'WDT'),
 ('is', 'VBZ'),
 ('to', 'TO'),
 ('demean', 'VB'),
 ('oneself', 'PRP')]

In [15]:
# This grammar is described in the paper by S. N. Kim,
# T. Baldwin, and M.-Y. Kan.
# Evaluating n-gram based evaluation metrics for automatic
# keyphrase extraction.
# Technical report, University of Melbourne, Melbourne 2010.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""

In [16]:
# Create phrase tree
chunker = nltk.RegexpParser(grammar)
tree= chunker.parse(taggedToks)

In [17]:
# Noun Phrase Extraction Support Functions
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# generator, generate leaves one by one
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP' or t.label()=='JJ' or t.label()=='RB'):
        yield subtree.leaves()

# stemming, lematizing, lower case... 
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word

# stop-words and length control
def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted

# generator, create item once a time
def get_terms(tree):
    for leaf in leaves(tree):
#         term = [normalise(w) for w,t in leaf if acceptable_word(w) ]
        term = [w for w,t in leaf if acceptable_word(w) ] # without normalise
        # Phrase only
        if len(term)>1:
            yield term

In [18]:
# Traverse tree and get noun phrases
npTokenList = [word for word in get_terms(tree)]

npTokenList

[['digital', 'computer'], ['cycle', 'transmission']]

In [19]:
# Flatten phrase lists to get tokens for analysis
def flatten(npTokenList):
    finalList =[]
    for phrase in npTokenList:
        token = ''
        for word in phrase:
            token += word + ' '
#         finalList.append(token.rstrip())
        finalList.append(token)
    return finalList

In [20]:
finalList = flatten(npTokenList)

In [21]:
finalList

['digital computer ', 'cycle transmission ']

# Noun Phrases

In [54]:
from ast import literal_eval

# Revise the previous dataframe transform function...
def newDataFrameTransformation(hotelDf, reviewDF, reviewcol_name = 'conca_review',k= 1000):
#     reviews = reviewDF[reviewcol_name].values
    reviews = hotelDf[reviewcol_name].values
    
    # Top-k frequent terms
    counter = Counter()
    for review in tqdm(reviews):
        counter.update(flatten([word
                            for word 
                            in get_terms(chunker.parse(pos_tag(re.findall(r'\w+', review)))) 
                            ]))
    topk = counter.most_common(k)        
    
    #Find out if a particular review has the word from topk list
    freqReview = []
    for i in tqdm(range(len(reviews))):
        tempCounter = Counter(flatten([word 
                                       for word 
                                       in get_terms(chunker.parse(pos_tag(re.findall(r'\w+',reviews[i]))))]))
        topkinReview = [1 if tempCounter[word] > 0 else 0 for (word,wordCount) in topk]
        freqReview.append(topkinReview)
         
    #Prepare freqReviewDf
    freqReviewDf = pandas.DataFrame(freqReview)
    dfName = []
    for c in topk:
        dfName.append(c[0])
    freqReviewDf.columns = dfName
    finalreviewDf = reviewDF.join(freqReviewDf)
    finaldf = hotelDf[['business_id','rating','Binary','review','conca_review','UserIndex','ItemIndex']].join(finalreviewDf)
    return topk, finaldf

In [56]:
# Compute PMI for all terms and all possible labels
def pmiForAllCal(df, label_column='Binary', topk=topk_phrase):
    #Try calculate all the pmi for top k and store them into one pmidf dataframe
    pmilist = []
    pmiposlist = []
    pmineglist = []
    for word in tqdm(topk):
#         pmilist.append([word[0]]+[pmiCal(df,word[0])])
        pmiposlist.append([word[0]]+[pmiIndivCal(df,word[0],1,label_column)])
        pmineglist.append([word[0]]+[pmiIndivCal(df,word[0],0,label_column)])
    pmidf = pandas.DataFrame(pmilist)
    pmiposlist = pandas.DataFrame(pmiposlist)
    pmineglist = pandas.DataFrame(pmineglist)
    pmiposlist.columns = ['word','pmi']
    pmineglist.columns = ['word','pmi']
#     pmidf.columns = ['word','pmi']
#     return pmiposlist, pmineglist, pmidf
    return pmiposlist, pmineglist

def pmiIndivCal(df,x,gt, label_column='Binary'):
    px = sum(df[label_column]==gt)/len(df)
    py = sum(df[x]==1)/len(df)
    pxy = len(df[(df[label_column]==gt) & (df[x]==1)])/len(df)
    if pxy==0:#Log 0 cannot happen
        pmi = math.log((pxy+0.0001)/(px*py))
    else:
        pmi = math.log(pxy/(px*py))
    return pmi

# Simple example of getting pairwise mutual information of a term
def pmiCal(df, x):
    pmilist=[]
    for i in [1,0]:
        for j in [0,1]:
            px = sum(df['Binary']==i)/len(df)
            py = sum(df[x]==j)/len(df)
            pxy = len(df[(df['Binary']==i) & (df[x]==j)])/len(df)
            if pxy==0:#Log 0 cannot happen
                pmi = math.log((pxy+0.0001)/(px*py))
            else:
                pmi = math.log(pxy/(px*py))
            pmilist.append([i]+[j]+[px]+[py]+[pxy]+[pmi])
    pmidf = pandas.DataFrame(pmilist)
    pmidf.columns = ['x','y','px','py','pxy','pmi']
    return pmidf

NameError: name 'topk_phrase' is not defined

In [None]:
topk_phrase, finaldf_phrase = newDataFrameTransformation(cur_df, reviewDF, k = 2000)

topk_phrase

## Train 

In [None]:
# PMI based on ground-truth rating
pmiposlist, pmineglist = pmiForAllCal(finaldf_phrase, topk=topk_phrase)

In [None]:
# PMI based on vader sentiment analysis on review texts. (entire review)
pmiposlist_vader, pmineglist_vader = pmiForAllCal(finaldf_phrase, topk=topk_phrase,label_column='Binary_vader')

In [201]:
# Save 
import pickle as pkl
with open("pmineglist", "wb") as output_file:
    pkl.dump(pmineglist, output_file)
with open("pmiposlist", "wb") as output_file:
    pkl.dump(pmiposlist, output_file)
with open("pmineglist_vader", "wb") as output_file:
    pkl.dump(pmineglist_vader, output_file)
with open("pmiposlist_vader", "wb") as output_file:
    pkl.dump(pmiposlist_vader, output_file)

In [69]:
# topk_phrase, finaldf_phrase
with open("finaldf_phrase", "wb") as output_file:
    pkl.dump(finaldf_phrase, output_file)

## Top positive/negative keyphrases

In [87]:
neg_rating = pmineglist.sort_values('pmi',ascending=0)
neg_rating = neg_rating['word'][:50].tolist()

In [91]:
neg_rating

['bad place',
 'okay noth',
 'decent place',
 'ok noth',
 'decent food',
 'second chanc',
 'terribl servic',
 'mediocr food',
 'decent servic',
 'eye contact',
 'sub par',
 'slow servic',
 'high hope',
 'dri side',
 'bit bland',
 'separ bill',
 'high price',
 'empti tabl',
 'poor servic',
 'room temperatur',
 'littl bland',
 'good dish',
 'bad tast',
 'averag price',
 'asian legend',
 'quick meal',
 'good overal',
 'bad servic',
 'salti side',
 'high side',
 'swiss chalet',
 'non exist',
 'plu side',
 'extra star',
 'wow factor',
 'long wait time',
 'bad day',
 'dim sum place',
 'bit pricey',
 'instant noodl',
 'chicken piec',
 'good locat',
 'small portion',
 'beef noodl',
 'good place',
 'much sauc',
 'decent portion',
 'good noth',
 'deer garden',
 'la carnita']

In [89]:
pos_rating = pmiposlist.sort_values('pmi',ascending=0)
pos_rating = pos_rating['word'][:50].tolist()

In [92]:
pos_rating

['amaz experi',
 'favourit restaur',
 'amaz servic',
 'wonder experi',
 'amaz food',
 'great recommend',
 'favorit place',
 'hidden gem',
 'hair cut',
 'perfect balanc',
 'favourit place',
 'littl gem',
 'much fun',
 'favourit spot',
 'escap room',
 'super help',
 'great coffe',
 'real deal',
 'great custom servic',
 'person favourit',
 'great qualiti',
 'top notch',
 'tast menu',
 'delici food',
 'great job',
 'super nice',
 'awesom place',
 'great experi',
 'delici meal',
 'excel servic',
 'perfect amount',
 'singl time',
 'mani flavour',
 'great staff',
 'nail salon',
 'farmer market',
 'bang bang',
 'sure everyth',
 'perfect place',
 'sushi bar',
 'favourit thing',
 'great servic',
 'vietnames coffe',
 'special occas',
 'great price',
 'good reason',
 'foie gra',
 'friendli staff',
 'board game',
 'filet mignon']

# Aspect-Oriented Table for each item

In [25]:
# Where does the keyword rank in the pos/neg keyphrase list
temp1 = np.where(pos_rating['word'].str.contains('fast'))
pos_rating.loc[temp1]

NameError: name 'pos_rating' is not defined

In [26]:
# First find the reviews that contains the word 
key_word = 'comfortable'
target_df = cur_df['review_text'].loc[np.where(cur_df['review_text'].str.contains(key_word))]
target_df_index = np.where(cur_df['review_text'].str.contains(key_word))[0]
cur_df.loc[target_df_index].head()

Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,business_id,friend_count,ghost,img_dsc,img_url,nr,photo_count,rating,...,Day,Binary,review,conca_review,keyVector,keyphrases_indices_length,UserIndex,ItemIndex,Binary_vader,vader
147,637,637,gISrxk4A5dfrjDivkC-L-Q,50,False,[],[],False,293.0,3.0,...,11,0,"['douce', 'france', 'nice', 'french', 'place',...",douce france nice french place decoration bea...,"[22, 38, 77, 150, 167, 172, 179]",7,102,5150,1,0.8706
163,693,693,gISrxk4A5dfrjDivkC-L-Q,53,False,"['Photo of Douce France - Toronto, ON, Canada'...",['https://s3-media2.fl.yelpcdn.com/bphoto/yPXm...,False,602.0,4.0,...,3,0,"['went', 'brunch', 'sunday', 'pre', 'set', 'sn...",went brunch sunday pre set snack came croissa...,"[31, 38, 47, 49, 53, 67, 148, 172]",8,9,5150,1,0.9818
189,773,773,5C57zUQdzvNrCus8JBawmQ,120,False,"['Photo of Magic Noodle - North York, ON, Cana...",['https://s3-media4.fl.yelpcdn.com/bphoto/QJdO...,False,471.0,3.0,...,1,0,"['build', 'noodle', 'availablecons', 'broth', ...",build noodle availablecons broth way saltyif ...,"[0, 4, 9, 52, 54, 77, 111, 172, 175, 206]",10,650,716,1,0.9742
303,1578,1578,PxH02Eu2Z4MUycBLU80D8g,175,False,[],[],False,22.0,5.0,...,5,1,"['ate', 'due', 'grupon', 'coupon', 'purchased'...",ate due grupon coupon purchased bit far west ...,"[5, 19, 21, 56, 83, 94, 105, 111, 153, 172, 17...",13,1745,3105,1,0.986
304,1608,1608,PxH02Eu2Z4MUycBLU80D8g,1,False,[],[],False,24.0,5.0,...,2,1,"['went', 'restaurant', 'winterlicious', 'pleas...",went restaurant winterlicious pleasantly surp...,"[17, 19, 56, 59, 112, 158, 166, 167, 172, 187]",10,261,3105,1,0.9769


In [27]:
# Check one of the review text from above
cur_df['review_text'][164]

"This bakery cafe definitely gets an ooh la la! Simply the best croissants in the city. So what if they're not made locally? They're made in France with French flour and butter. You can't get better than that! Baked fresh every day, so delicious! Their coffee is excellent \xa0and their hot chocolate Angelina is to die for. Also tried their Parisian sandwich made with a real demi-baguette, ham, and brie; simple and good.Inside the cafe you feel transported to France because of the decor and because the staff speaks French. So many goodies to try! I will have to come back. Merci!!"

In [28]:
# How many reviews contains this keyword
len(target_df_index[0])

TypeError: object of type 'numpy.int64' has no len()

## Draft

In [203]:
# First find the original text
cur_df.loc[1790]['review_text']

"It took 12 min after sitting down at the table in a normal not so busy restaurant before the server came through to ask for H2O or order. 12 min. The server was quite good after but 12min! C'mon. Great ambiance. the Baileys Horchata quite nice, the Old Fashioned wasn't an old fashioned. Bartender mistake. Gave me a Manhattan instead. Old fashioned made again. Decent. But why the small ice cubes!! \xa0Sacrilege diluting it all. The breakfast burger was not bad but the Shashuka was quite quite nice. Overall. Not a bad experience once the initial hiccup was done."

In [249]:
# Get the sentences of the orignal text
from nltk.tokenize import sent_tokenize
text = cur_df.loc[1790]['review_text']
sent_tokenize(text)

['It took 12 min after sitting down at the table in a normal not so busy restaurant before the server came through to ask for H2O or order.',
 '12 min.',
 'The server was quite good after but 12min!',
 "C'mon.",
 'Great ambiance.',
 "the Baileys Horchata quite nice, the Old Fashioned wasn't an old fashioned.",
 'Bartender mistake.',
 'Gave me a Manhattan instead.',
 'Old fashioned made again.',
 'Decent.',
 'But why the small ice cubes!!',
 'Sacrilege diluting it all.',
 'The breakfast burger was not bad but the Shashuka was quite quite nice.',
 'Overall.',
 'Not a bad experience once the initial hiccup was done.']

In [64]:
sentences = sent_tokenize(text)

In [61]:
# Process the sentence for the keyword to match
preprocess_sentence(sentences[0])

'suppose admirable chain restaurant channeling craft beer goodness using lure horde relatively clueless tourist white shirt friend know better come along anyway unfortunately beer underwhelming rather hit miss usually representative style implied name occasionally downright bad'

In [62]:
# Find the sentence contains the keyphrase
key_word = 'experience'
sentences = sent_tokenize(text)
# ind = np.where([key_word in sentence.lower() for sentence in sentences]) # we can do a preprocess here for each sentence
ind = np.where([key_word in preprocess_sentence(sentence) for sentence in sentences])
target_sentences = [preprocess_sentence(sentences[ind[0][i]]) for i in range(len(ind[0]))]
target_sentences

['order next find inevitably despite better judgement finally service typically attentive flavourless people kinda job well minimum personality injected experience']

In [289]:
# find relative positive or negative
sentence = target_sentences
sid = SentimentIntensityAnalyzer()
vader_scores = [sid.polarity_scores(sentence)['compound'] for sentence in sentence]
vader_scores

[0.4754, -0.0332]

In [66]:
# find relative positive or negative
sentence = target_sentences
sid = SentimentIntensityAnalyzer()
vader_scores = [sid.polarity_scores(sentence)['compound'] for sentence in sentence]
vader_scores

[-0.1492]

# Final function for generating the table

In [68]:
from nltk.tokenize import sent_tokenize
def get_vader_scores_for_sentences_containing_the_keyword(key_word,cur_df,original_review = 'review_text',processed_review = 'conca_review'):
    # First find the reviews that contains the word 
    sid = SentimentIntensityAnalyzer()
    target_df_index = np.where(cur_df[processed_review].str.contains(key_word))[0]
    # for each review, get the vader scores 
    vader_scores = []
    for i in target_df_index:
        
        # For each review first find the original text
#         text = cur_df.loc[i][original_review]
        text = cur_df[original_review][i]
        
        # Get the sentences of the orignal text
        sentences = sent_tokenize(text)
        
        # Find the sentence containing the key word
        ind = np.where([key_word in preprocess_sentence(sentence) for sentence in sentences])
        target_sentences = [preprocess_sentence(sentences[ind[0][i]]) for i in range(len(ind[0]))]
        
        # find relative positive or negative vader scores
        sentence = target_sentences
        vader_score = [sid.polarity_scores(sentence)['compound'] for sentence in sentence]
        vader_scores.append(vader_score)
        
    return vader_scores

def find_dataframe_for_item(cur_df, ItemIndex, ItemIndex_col = 'ItemIndex'):
    """
    Return the dataframe that only contains the reviews 
    """
    new_df = cur_df.loc[np.where(cur_df[ItemIndex_col] == ItemIndex)]
    new_df = new_df.reset_index()
    return new_df

def count_pos_neg(vader_result, threshold = 0):
    """
    From the vader score lists, get the positive/negative list  
    """
    pos_count = 0
    neg_count = 0
    if len(vader_result) == 0:
        return [0,0]
    for i in range(len(vader_result)):
        for j in range(len(vader_result[i])):
            if vader_result[i][j] > threshold:
                pos_count += 1
            else:
                neg_count += 1
    return [pos_count,neg_count]

def get_aspect_ratio(cur_df, ItemIndex, aspect_keywords, ItemIndex_col = 'ItemIndex', threshold = 0, original_review = 'review_text',processed_review = 'conca_review'):
    target_df = find_dataframe_for_item(cur_df,ItemIndex)
    vader_scores = get_vader_scores_for_sentences_containing_the_keyword(aspect_keywords,target_df)
    ratio = count_pos_neg(vader_scores,threshold=threshold)
    return ratio

def get_category_count(cur_df, ItemIndex, keywords):
    res = [0, 0]
    for keyword in keywords:
        a,b = get_aspect_ratio(cur_df, ItemIndex, keyword,
                ItemIndex_col = 'ItemIndex', threshold = 0, original_review = 'review_text',processed_review = 'conca_review')
        res[0]+= a
        res[1]+= b
    return res

def get_item_aspect_table(cur_df, ItemIndex, categories):
    
    for category in categories:
        keywords = categories[category]
        count = get_category_count(cur_df, ItemIndex, keywords)
        print ('Category ',category,' has the count: ', count, ' and ratio is ', count[0]/count[1])

In [70]:
get_aspect_ratio(cur_df, 8010, 'experience',
                ItemIndex_col = 'ItemIndex', threshold = 0, original_review = 'review_text',processed_review = 'conca_review')

[0, 0]

In [76]:
# Mannually selected key word for each category 
food_quality = ['eating','taste','quality','food','dessert',
                'texture','fresh', 'meat','vegetable','seafood','dish']
drink = ['coffee','tea','beer','drink']
service = ['service','care','friendly','customer','custom']
price = ['cheap','price','pricy','expensive']
ambiance = ['vibe','ambiance','atmosphere','room']
location = ['location','parking','street','shop','subway']
other = ['bathroom','washroom','table','seat','place','experience']

In [78]:
# expanded key words for each category 
food_key = ['taco', 'curry', 'potato', 'crispy', 'shrimp', 'bread',  'ramen', 'pizza',  'sandwich', 
            'sushi', 'egg', 'fish',  'burger', 'cheese', 'salad', 'pork', 'beef', 'noodle',
           'meat', 'chicken', 'dim sum', 'squid','tempura','tapioca','olive',
            'octopus','croissant','honey','scallop','congee',
           'skewer','miso','lettuce','avocado','calamari','kimchi','patty',
           'sesame','tart','four','crepe','tuna','wrap','vegan','coconut','corn','poutine','toast','belly',
           'oyster','sausage','duck','tofu','sashimi', 'lamb','mango','bacon','tomato'
            ,'lobster','rib','waffle','bun','wing','dumpling','bean','steak','salmon',
           'pasta','fried chicken','pork belly','spring roll','fried rice',
            'pork bone soup']
drink_key = ['beer','coffee','cocktail', 'tea', 'espresso','pop','juice','bubble tea','latte','wine','milk','milk tea','green tea']
dessert_key = ['chocolate','cake','ice cream','donut','cookie','cone','cheesecake','matcha','pancake']
friut_key = ['apple','lemon','strawberry','banana','fruit','grape','mango','watermelon','peach','pear']
seasoning=['sugar','oil','soy','leaf','spice','butter','ginger','pepper','peanut','garlic']
infrastruture_key = [ 'parking', 'store','shopping','nail','theatre','movie','washroom','bathroom',
                    'window','station','chair','table','seat','plaza','market', 'mall', 'booth']
service_key = ['quick', 'clean', 'busy',  'friendly','convenient','refill','soggy','greeted','bright','crowded','overpriced',
              'cheaper','immediately','dog','quiet','efficient','spacious','pleasant','fair','complaint','disappointing','fancy',
             'comfortable', 'dark','cozy','helpful','tax','nicely','honestly', 'pricey','yummy','music','chip','attentive',
              'reasonable','wait']
taste_key = ['traditional', 'fresh','spicy','flavorful','fluffy','smooth','frozen','sweetness','mayo','gravy','healthy','rare',
            'refreshing','crunchy','chili','crust','stick','steamed','greasy','dip','gelato','salt','stuffed','topped','smoked',
            'roasted','seasoned','chewy','pot','solid','sour', 'baked', 'juicy','creamy','deep fried']
category_key = ['chinese', 'fast', 'thai', 'bar', 'fry', 'fried', 'dessert', 'dinner', 'lunch', 'soup', 
                'mexico', 'italian','mexican','vietnamese','buffet','takeout','casual','pub','bakery','indian','classic',
               'modern','french','asian','birthday', 'vegetarian', 'downtown', 'bbq','japanese','breakfast','seafood',
               'brunch'] 


food_quality = ['eating','taste','quality','food',
                'texture', 'dish','cuisine']+food_key + taste_key + seasoning
dessert_quality = ['dessert']+ dessert_key
drink = drink_key
friut_quality = friut_key 
service = ['service','care','customer','custom'] + service_key
price = ['cheap','price','pricy','expensive']
ambiance = ['vibe','ambiance','atmosphere','room']
location = ['location','parking','street','shop','subway']
infrastructure = ['insfrastructure'] + infrastruture_key
recommend_to_others = ['girlfriend','boyfriend','wife','husband','child','friend','recommend','parent','grandparent','kid',
                       'family','brother','sister']

In [91]:
print ('[%s]' % ', '.join(map(str, recommend_to_others)))

[girlfriend, boyfriend, wife, husband, child, friend, recommend, parent, grandparent, kid, family, brother, sister]


In [81]:
categories = {'food_quality':food_quality,
              'dessert':dessert_quality,
              'drink': drink,
              'friut':friut_quality,
              'service': service,
              'price': price,
              'ambiance': ambiance,
              'location': location,
              'infrastructure':infrastructure,
              'recommend_to_others': recommend_to_others}

In [82]:
get_item_aspect_table(cur_df, 7312, categories)

Category  ambiance  has the count:  [12, 8]  and ratio is  1.5
Category  recommend_to_others  has the count:  [45, 17]  and ratio is  2.6470588235294117
Category  infrastructure  has the count:  [21, 16]  and ratio is  1.3125
Category  location  has the count:  [12, 6]  and ratio is  2.0
Category  drink  has the count:  [12, 5]  and ratio is  2.4
Category  food_quality  has the count:  [378, 212]  and ratio is  1.7830188679245282
Category  dessert  has the count:  [19, 10]  and ratio is  1.9
Category  price  has the count:  [7, 5]  and ratio is  1.4
Category  service  has the count:  [105, 38]  and ratio is  2.763157894736842
Category  friut  has the count:  [6, 1]  and ratio is  6.0


In [92]:
get_item_aspect_table(cur_df, 8517, categories)

Category  ambiance  has the count:  [31, 10]  and ratio is  3.1
Category  recommend_to_others  has the count:  [59, 14]  and ratio is  4.214285714285714
Category  infrastructure  has the count:  [57, 30]  and ratio is  1.9
Category  location  has the count:  [15, 5]  and ratio is  3.0
Category  drink  has the count:  [46, 24]  and ratio is  1.9166666666666667
Category  food_quality  has the count:  [666, 238]  and ratio is  2.7983193277310923
Category  dessert  has the count:  [18, 7]  and ratio is  2.5714285714285716
Category  price  has the count:  [54, 36]  and ratio is  1.5
Category  service  has the count:  [158, 73]  and ratio is  2.164383561643836
Category  friut  has the count:  [10, 1]  and ratio is  10.0


### Get examples

In [106]:
def get_vader_scores_for_sentences_containing_the_keyword(key_word,cur_df,original_review = 'review_text',processed_review = 'conca_review'):
    # First find the reviews that contains the word 
    sid = SentimentIntensityAnalyzer()
    target_df_index = np.where(cur_df[processed_review].str.contains(key_word))[0]
    # for each review, get the vader scores 
    vader_scores = []
    for i in target_df_index:
        
        # For each review first find the original text
#         text = cur_df.loc[i][original_review]
        text = cur_df[original_review][i]
        
        # Get the sentences of the orignal text
        sentences = sent_tokenize(text)
        
        # Find the sentence containing the key word
        ind = np.where([key_word in preprocess_sentence(sentence) for sentence in sentences])
        target_sentences = [preprocess_sentence(sentences[ind[0][i]]) for i in range(len(ind[0]))]
        
        # find relative positive or negative vader scores
        sentence = target_sentences
        vader_score = [sid.polarity_scores(sentence)['compound'] for sentence in sentence]
        vader_scores.append(vader_score)
        
    return vader_scores

def print_examples_for_sentences_containing_the_keyword(key_word,cur_df,original_review = 'review_text',processed_review = 'conca_review'):
    # First find the reviews that contains the word 
    sid = SentimentIntensityAnalyzer()
    target_df_index = np.where(cur_df[processed_review].str.contains(key_word))[0]
    # for each review, get the vader scores 
    vader_scores = []
    all_sentences = []
    res = {}
    for i in target_df_index:
        
        # For each review first find the original text
#         text = cur_df.loc[i][original_review]
        text = cur_df[original_review][i]
        
        # Get the sentences of the orignal text
        sentences = sent_tokenize(text)
        
        # Find the sentence containing the key word
        ind = np.where([key_word in preprocess_sentence(sentence) for sentence in sentences])
        target_sentences = [preprocess_sentence(sentences[ind[0][i]]) for i in range(len(ind[0]))]

        # find relative positive or negative vader scores
        sentence = target_sentences
        vader_score = [sid.polarity_scores(sentence)['compound'] for sentence in sentence]
        for sentence in sentence:
            ind_vader_score = sid.polarity_scores(sentence)['compound']
            res[sentence] = ind_vader_score
        
#         print (vader_score,sentence)
#         res[sentence[0]] = vader_score 
#         vader_scores.append(vader_score)
#         all_sentences.append(sentence)
        
#     return all_sentences, vader_scores
    return res


In [125]:
key_word = 'recommend'
ItemIndex = 7312
example_df = find_dataframe_for_item(cur_df, ItemIndex, ItemIndex_col = 'ItemIndex')
res = print_examples_for_sentences_containing_the_keyword(key_word,example_df,original_review = 'review_text',processed_review = 'conca_review')

In [112]:
res

{'always great service meal filling delicious': 0.8316,
 'best service bar none red lobster go ask patrick': 0.6369,
 'coming location year service food always fantastic came night craving crab leg': 0.5574,
 'droolservice good food came quick server attentive bringing u refill drink even empty': 0.2732,
 'enjoyed every last bite overall decent meal great attentive service everyone left stuffed satisfied': 0.8807,
 'establishment appears clean service good': 0.6808,
 'food average though service excellent price relatively cheap especially lunch around average': 0.5719,
 'food service good gave tip best scrumptious shrimp ever': 0.8807,
 'great customer service always': 0.6249,
 'never disappointed service red lobster': 0.3724,
 'often disappointed terrible service restaurant': -0.7351,
 'ordered alfredo shrimp pasta came clam chowder dessert place close service excellent even though nt many customer staff pretty hardworking liked': 0.8658,
 'please rave impeccable service': 0.3182,
 'r

In [126]:
res

{'definitely recommend restaurant casual date night family celebration craving seafood': 0.7184,
 'highly recommended': 0.2716,
 'per recommendation shrimp linguine last glad': 0.4588,
 'prior u ordering friendly gave u recommendation since first time': 0.4939,
 'really enjoyed maple bacon shrimp highly recommend trying new menu item': 0.7496,
 'something waitress recommended would thought would made better': 0.5719,
 'stuffed maine lobster great highly recommendthe food size huge great big stomach': 0.8885,
 'waiter joseph really pleasant gave u recommendation appetizer choose appetizer orderedred lobster signature pizza crisp thin crust pizza topped maritime lobster meat melted cheese fresh tomato sweet basil': 0.8357,
 'waitress kind gave u recommendation based told': 0.5267,
 'would nt really recommend average place': 0.4201,
 'would recommend coming food portion great': 0.765}

In [98]:
sentences

[['start off service excellent'],
 ['service good cheese biscuit delicious'],
 ['service decent waiter nt really check u often liked'],
 ['term food service definitely experienced better'],
 ['service like chain restaurant', 'great customer service always'],
 ['enjoyed every last bite overall decent meal great attentive service everyone left stuffed satisfied'],
 ['always great service meal filling delicious'],
 ['droolservice good food came quick server attentive bringing u refill drink even empty'],
 ['time last couple month say service amazing'],
 ['service great'],
 ['coming location year service food always fantastic came night craving crab leg',
  'service quick friendly'],
 ['service'],
 ['establishment appears clean service good'],
 ['service decent',
  'server friendly service bit slow side especially lunch hour'],
 ['please rave impeccable service',
  'often disappointed terrible service restaurant',
  'never disappointed service red lobster',
  'server trained provide good c

In [129]:
key_word = 'food'
ItemIndex = 8517
example_df = find_dataframe_for_item(cur_df, ItemIndex, ItemIndex_col = 'ItemIndex')
res = print_examples_for_sentences_containing_the_keyword(key_word,example_df,original_review = 'review_text',processed_review = 'conca_review')

In [130]:
res

{'also loved sticky meat dumpling deep fried shrimp roll mayo pineapple sticky rice good flaky taro pastry good bbq pork pastry delicious every time come since though food get slightly le le good': 0.9493,
 'although nt make trip sure could nt tell nt looking around room clientele well good crown princess feel much cleaner chinese restaurant downtown even serious style important food darn good': 0.886,
 'anyway back food': 0.0,
 'anyways pleasantly surprised quality food crown princess': 0.6124,
 'bill came tax tip food fed hungry female like think average appetite': 0.3612,
 'bit pricey compared place worth food always fresh hot since order piece paper restaurant always packed make reservation': 0.4939,
 'ca nt really go wrong thisordered baked bbq pork puff pastry personally nt like dish find dry one sister favourite dishordered pan fried snowpea leaf ordered deep fried wonton sour sauce dish big joke thought would real wonton filling basically bowl sweet sour sauce deep fried dumpli