In [12]:
import numpy as np
import pandas as pd
import nltk,string
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import math
import os
from nltk import FreqDist
from nltk.corpus import stopwords
import random

import en_core_web_sm
from nltk.util import ngrams

nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
data_path = 'data/reviewSelected100.json'
os.path.exists(data_path)

True

In [14]:
random.seed(69)
print(print(random.random()))

0.6842409524120733
None


In [15]:
review_df = pd.read_json(data_path, lines=True,encoding = "ISO-8859-1")

In [16]:
#Randomly generate 1 index
review_random_idx = np.array(np.random.rand(1)*len(review_df), dtype=np.int32)
#print out the business id selected
print(review_df.iloc[review_random_idx]['business_id'].iloc[0])

rN3gHTjqx5sOnoUpJ-6jyg


In [17]:
#set the selected business id as the id to be investigated
selected_id = review_df.iloc[review_random_idx]['business_id'].iloc[0]

In [18]:
#fix this id for subsequent usage
selected_id_str = str(selected_id)
print(selected_id_str)

rN3gHTjqx5sOnoUpJ-6jyg


In [19]:
#Extract out the sentences related to the random id
review_random_df = review_df.iloc[review_random_idx]
review_random_df = review_random_df.reset_index()

In [20]:
#Tokenize the text
review_random_df['tokenize'] = review_random_df['text'].apply(nltk.word_tokenize)
#Tag those individual tokens respectively
review_random_df['pos_tag'] = review_random_df['tokenize'].apply(nltk.pos_tag)

In [21]:
review_random_df.to_json(r'output/specific_business_review_3_final.json', orient='records', lines=True)

In [22]:
tokenized_path = 'output/specific_business_review_3_final.json'
reviews_adjectives_df = pd.read_json(tokenized_path, lines=True,encoding = "ISO-8859-1")

In [23]:
#checking some basic information
print(len(reviews_adjectives_df))
print(len(review_df))

1
15300


# Find all reviews that are related to a certain store

In [24]:
#extract all the reviews which are related to one store id 
def extract_reviews(store_id):
  if type(store_id) == str:
    selected_id = store_id
  else:
    selected_id = str(store_id)
  selected_store_reviews = []
  for index, element in review_df.iterrows():
    if element['business_id'] == selected_id:
        selected_store_reviews.append(element['text'])
  return selected_store_reviews

In [25]:
#test the function
extracted_review = extract_reviews(selected_id_str)
#print out the reviews for the store id selected 
print(extracted_review)

["First of all, this place is a gem! Mark is an amazing server, ask for him. What did we eat? I think everything. All was fabulous. Let me recap....\n\nRib eye, the king of steaks. Perfectly cooked and seasoned with a salt and pepper rub. I also ordered a side of Mac and cheese. If you like cheese, do it! It has a veal reduction in it which makes it good. Filet mignon 7 ounce, Perfectly bloody, and yummy....moo!!!. Lobster risotto, yummy but they used a bit of tarragon, not my favorite but good aside form hat personal preference. We did not eat dessert but we peeked at other tables and we're pleased with the views.", 'Beautiful dining room. Delicious petite filet. I ordered mine charred medium rare and it was cooked to perfection, The runner, Felix could not have been more attentive. When I spoke with the manager to extend my compliments he said that Felix has been in the building since the Elvis days. If you love a great steak, give Edge a try.', "It was pricey but the staff was aweso

# Use spacy nlp to tokenize and pos tag each text

In [26]:
#use spacy library to process the text extracted
def spacy_process(input_list: list):
    final_result = []
    #use nlp which is a spacy in-built function which will return the text and doc which is in token datatype
    doc = nlp(str(input_list))
    #token consists of text and its pos tags
    for token in doc:
        result = []
        result.append(token.text)
        result.append(token.pos_)
        final_result.append(result)
    return final_result,doc

In [27]:
#test the function
processed_review,result_doc = spacy_process(extracted_review)

In [28]:
#print out the processed output
print(processed_review)

[['[', 'PUNCT'], ['"', 'PUNCT'], ['First', 'ADV'], ['of', 'ADP'], ['all', 'DET'], [',', 'PUNCT'], ['this', 'DET'], ['place', 'NOUN'], ['is', 'AUX'], ['a', 'DET'], ['gem', 'NOUN'], ['!', 'PUNCT'], ['Mark', 'PROPN'], ['is', 'AUX'], ['an', 'DET'], ['amazing', 'ADJ'], ['server', 'NOUN'], [',', 'PUNCT'], ['ask', 'VERB'], ['for', 'ADP'], ['him', 'PRON'], ['.', 'PUNCT'], ['What', 'PRON'], ['did', 'AUX'], ['we', 'PRON'], ['eat', 'VERB'], ['?', 'PUNCT'], ['I', 'PRON'], ['think', 'VERB'], ['everything', 'PRON'], ['.', 'PUNCT'], ['All', 'DET'], ['was', 'AUX'], ['fabulous', 'ADJ'], ['.', 'PUNCT'], ['Let', 'VERB'], ['me', 'PRON'], ['recap', 'VERB'], ['....', 'PUNCT'], ['\\n\\nRib', 'NOUN'], ['eye', 'NOUN'], [',', 'PUNCT'], ['the', 'DET'], ['king', 'NOUN'], ['of', 'ADP'], ['steaks', 'NOUN'], ['.', 'PUNCT'], ['Perfectly', 'ADV'], ['cooked', 'VERB'], ['and', 'CCONJ'], ['seasoned', 'VERB'], ['with', 'ADP'], ['a', 'DET'], ['salt', 'NOUN'], ['and', 'CCONJ'], ['pepper', 'NOUN'], ['rub', 'NOUN'], ['.', 'PU

In [29]:
print(result_doc)

["First of all, this place is a gem! Mark is an amazing server, ask for him. What did we eat? I think everything. All was fabulous. Let me recap....\n\nRib eye, the king of steaks. Perfectly cooked and seasoned with a salt and pepper rub. I also ordered a side of Mac and cheese. If you like cheese, do it! It has a veal reduction in it which makes it good. Filet mignon 7 ounce, Perfectly bloody, and yummy....moo!!!. Lobster risotto, yummy but they used a bit of tarragon, not my favorite but good aside form hat personal preference. We did not eat dessert but we peeked at other tables and we're pleased with the views.", 'Beautiful dining room. Delicious petite filet. I ordered mine charred medium rare and it was cooked to perfection, The runner, Felix could not have been more attentive. When I spoke with the manager to extend my compliments he said that Felix has been in the building since the Elvis days. If you love a great steak, give Edge a try.', "It was pricey but the staff was aweso

# Find out the relevant pairs

In [30]:
# extact all the phrases that has adverb + adjectives
def adjective_pair_extraction_1(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_1 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADV'and (element.pos_ == 'ADJ')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_1.append(temp_list_tuple)
        selected_combination_1 = list(filter(lambda x: x, selected_combination_1))
    return selected_combination_1,count

In [31]:
# extact all the phrases that has adjectives + adverbs
def adjective_pair_extraction_2(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_2 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADJ'and (element.pos_ == 'ADV')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_2.append(temp_list_tuple)
        selected_combination_2 = list(filter(lambda x: x, selected_combination_2))
    return selected_combination_2,count

In [32]:
# extact all the phrases that has adjectives + adjectives
def adjective_pair_extraction_3(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_3 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADJ'and (element.pos_ == 'ADJ')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_3.append(temp_list_tuple)
        selected_combination_3 = list(filter(lambda x: x, selected_combination_3))
    return selected_combination_3,count

In [33]:
# extact all the phrases that has adverbs + adverbs  + adjectives
def adjective_pair_extraction_4(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_4 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADV'and (element.pos_ == 'ADV') and (element_after.pos_ == 'ADJ')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
            temp_list.append(element_after.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_4.append(temp_list_tuple)
        selected_combination_4 = list(filter(lambda x: x, selected_combination_4))
    return selected_combination_4,count

In [34]:
# extact all the phrases that has adjectives + prep  + noun
def adjective_pair_extraction_5(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_5 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element.pos_) == 'ADJ' and (element.pos_ == 'ADP') and (element_after.pos_ == 'NOUN')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
            temp_list.append(element_after.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_5.append(temp_list_tuple)
        selected_combination_5 = list(filter(lambda x: x, selected_combination_5))
    return selected_combination_5,count

In [35]:
# extact all the phrases that has adjectives + prep  + prep + noun
def adjective_pair_extraction_6(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_6 = []
    count = 0
    for index in range(len(doc_2)-2):
        temp_list = []
        element_1 = doc_2[index-1]
        element_2 = doc_2[index]
        element_3 = doc_2[index+1]
        element_4 = doc_2[index+2]
        if((element_1.pos_) == 'ADJ' and (element_2.pos_ == 'ADP') and (element_3.pos_ == 'ADP') and element_4.pos_ == 'NOUN'):
            count += 1
            temp_list.append(element_1.text)
            temp_list.append(element_2.text)
            temp_list.append(element_3.text)
            temp_list.append(element_4.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_6.append(temp_list_tuple)
        selected_combination_6 = list(filter(lambda x: x, selected_combination_6))
    return selected_combination_6,count

In [36]:
adjective_phrase_result_1,count_1 = adjective_pair_extraction_1(result_doc)

In [37]:
print(count_1)
print(adjective_phrase_result_1)

196
[('Perfectly', 'bloody'), ('more', 'attentive'), ('too', 'aldente'), ('most', 'tender'), ('too', 'stuffy'), ('very', 'friendly'), ('as', 'good'), ('really', 'good'), ('really', 'bright'), ('distractingly', 'bright'), ('very', 'professional'), ('very', 'knowledgeable'), ('very', 'clean'), ('very', 'nice'), ('pleasantly', 'surprised'), ('so', 'good'), ('so', 'good'), ('super', 'nice'), ('\\nToo', 'bad'), ('too', 'bright'), ('very', 'good'), ('too', 'much'), ('so', 'good'), ('very', 'hot'), ('indeed', 'hot'), ('so', 'good'), ('very', 'attentive'), ('So', 'tasty'), ('very', 'impressed'), ('very', 'accommodating'), ('very', 'upscale'), ('so', 'delicious'), ('very', 'good'), ('amazingly', 'attentive'), ('so', 'amazing'), ('methodically', 'slow'), ('pretty', 'nice'), ('so', 'many'), ('just', 'perfect.\\n\\nI'), ('mostly', 'choice'), ('so', 'tender'), ('amazingly', 'juicy'), ('very', 'nice'), ('pretty', 'nice'), ('very', 'attentive'), ('most', 'recent'), ('amazingly', 'fresh'), ('how', 'de

In [38]:
adjective_phrase_result_2,count_2 = adjective_pair_extraction_2(result_doc)

In [39]:
print(count_2)
print(adjective_phrase_result_2)

21
[('good', 'aside'), ('undercooked', 'somewhat'), ('famous', 'someday'), ('great', 'however'), ('delicious', 'as'), ('good', 'however'), ('busy', 'literally'), ('ecstatic', 'when'), ('good', 'at'), ('good', 'especially'), ('good', 'as'), ('hungry', 'again'), ('much', 'anymore'), ('little', 'too'), ('Overall', 'very'), ('fresh', 'here'), ('amazing', 'too'), ('excellent', 'too'), ('sure', 'why'), ('sure', 'where'), ('pricey', 'so')]


In [40]:
adjective_phrase_result_3,count_3 = adjective_pair_extraction_3(result_doc)

In [41]:
print(count_3)
print(adjective_phrase_result_3)

52
[('Delicious', 'petite'), ('several', 'other'), ('roasted', 'garlic'), ('brûlée', '\\n\\nspecial'), ('more', 'often.\\nNice'), ('often.\\nNice', 'romantic'), ('White', 'light'), ('light', 'flaky'), ('flaky', 'mild'), ('many', 'positive'), ('medium', 'rare'), ('few', 'more'), ('\\n\\nThe', 'real'), ('yummy', 'cheesy'), ('delicious', 'aged'), ('ultra', '-'), ('-', 'high'), ('oz', 'prime'), ('Japanese', 'wagu'), ('Super', 'awesome'), ('other', 'good'), ('few', 'other'), ('other', 'large'), ('few', 'more'), ('MANY', 'good'), ('few', 'good'), ('little', 'cold'), ('enough', 'great'), ('own', 'little'), ('masterful', 'culinary'), ('Real', 'blue'), ('flat', 'bread.\\n\\nI'), ('capacious', 'contemporary'), ('classic', 'ceramic'), ('complimentary', 'baked'), ('buttery', 'medium'), ('few', 'different'), ('five.\\n\\nOur', 'first'), ('buffalo', 'short'), ('minute).\\n\\nMost', 'beloved'), ('nice', 'different'), ('original', 'classic'), ('dry', 'aged'), ('Great', 'overall'), ('good', 'old'), ('q

In [42]:
adjective_phrase_result_4,count_4 = adjective_pair_extraction_4(result_doc)

In [87]:
print(adjective_phrase_result_4)

[('somewhat', 'too', 'aldente'), ('also', 'super', 'nice'), ('just', 'amazingly', 'juicy'), ('as', 'amazingly', 'fresh'), ('even', 'more', 'amazing'), ("isn\\'t", 'much', 'more'), ('very', 'nicely', 'spaced'), ('so', 'incredibly', 'kind'), ('Just', 'there', 'last'), ('also', 'incredibly', 'nice'), ('slightly', 'over', 'cooked'), ('very', 'very', 'reasonable'), ('so', 'incredibly', 'kind')]


In [44]:
adjective_phrase_result_5,count_5 = adjective_pair_extraction_5(result_doc)

In [45]:
print(count_5)
print(adjective_phrase_result_5)

0
[]


In [46]:
adjective_phrase_result_6,count_6 = adjective_pair_extraction_6(result_doc)

In [47]:
print(count_6)
print(adjective_phrase_result_6)

0
[]


# Adjective numbers extraction

In [48]:
#how many two-word bigram reviews are related to this one store
total_frequency = len(extracted_review)
def evaluative_indicativeness_bigram (adjective_phrase_list: list):
    bigram=[]
    bigram.extend(adjective_phrase_list)  
    freq_bi = nltk.FreqDist(bigram)
    result_frequency = freq_bi.most_common(5)
    result_frequency_dict = {}
    for element in freq_bi.most_common(5):
      individual_frequency = element[1]/total_frequency
      result_frequency_dict[element[0]] = individual_frequency
    return result_frequency_dict

In [49]:
#combining all the bigram and trigrams
bigram_list = adjective_phrase_result_1 + adjective_phrase_result_2 + adjective_phrase_result_3
trigram_list = adjective_phrase_result_4 + adjective_phrase_result_5

In [50]:
#how many three-word trigram reviews are related to this one store
total_frequency = len(extracted_review)
def evaluative_indicativeness_bigram (adjective_phrase_list: list):
    bigram=[]
    bigram.extend(adjective_phrase_list)  
    freq_bi = nltk.FreqDist(bigram)
    result_frequency = freq_bi.most_common(5)
    result_frequency_dict = {}
    for element in freq_bi.most_common(5):
      individual_frequency = element[1]/total_frequency
      result_frequency_dict[element[0]] = individual_frequency
    return result_frequency_dict

In [51]:
#how many reviews are related to this one store
total_frequency = len(extracted_review)
def evaluative_indicativeness_trigram (adjective_phrase_list: list):
    trigram=[]
    trigram.extend(adjective_phrase_list)  
    freq_tri = nltk.FreqDist(trigram)
    result_frequency = freq_tri.most_common(5)
    result_frequency_dict = {}
    for element in freq_tri.most_common(5):
      individual_frequency = element[1]/total_frequency
      result_frequency_dict[element[0]] = individual_frequency
    return result_frequency_dict

In [52]:
#carry out the processing for bigram and trigram respectively
frequency_result_bi = evaluative_indicativeness_bigram(bigram_list)
frequency_result_tri = evaluative_indicativeness_trigram(trigram_list)

In [53]:
print(frequency_result_bi)

{('very', 'good'): 0.08, ('very', 'nice'): 0.05, ('so', 'good'): 0.05, ('as', 'good'): 0.03, ('really', 'good'): 0.03}


In [54]:
print(frequency_result_tri)

{('so', 'incredibly', 'kind'): 0.02, ('somewhat', 'too', 'aldente'): 0.01, ('also', 'super', 'nice'): 0.01, ('just', 'amazingly', 'juicy'): 0.01, ('as', 'amazingly', 'fresh'): 0.01}


In [55]:
#merge two bigram/trigram output list 
def merge(dict1, dict2):
    res = {**dict1, **dict2}
    return res

In [56]:
final_dict = merge(frequency_result_bi,frequency_result_tri)

In [57]:
print(final_dict)

{('very', 'good'): 0.08, ('very', 'nice'): 0.05, ('so', 'good'): 0.05, ('as', 'good'): 0.03, ('really', 'good'): 0.03, ('so', 'incredibly', 'kind'): 0.02, ('somewhat', 'too', 'aldente'): 0.01, ('also', 'super', 'nice'): 0.01, ('just', 'amazingly', 'juicy'): 0.01, ('as', 'amazingly', 'fresh'): 0.01}


In [58]:
from operator import itemgetter

# Initialize N 
N = 5
# Using sorted() + itemgetter() + items()
frequency_result_top_5 = dict(sorted(final_dict.items(), key = itemgetter(1), reverse = True)[:N])

In [59]:
print(frequency_result_top_5)

{('very', 'good'): 0.08, ('very', 'nice'): 0.05, ('so', 'good'): 0.05, ('as', 'good'): 0.03, ('really', 'good'): 0.03}


In [60]:
#obtain the top 5 phrases and their respective probs
index = 1
index_result = {}

for key in frequency_result_top_5:
    phrase_id = "phrase_{}".format(index)
    prob_id = "prob_{}".format(index)
    index_result[phrase_id] = frequency_result_top_5[key]
    index += 1

In [61]:
print(index_result)

{'phrase_1': 0.08, 'phrase_2': 0.05, 'phrase_3': 0.05, 'phrase_4': 0.03, 'phrase_5': 0.03}


# Trying out on the entire dataset

In [62]:
review_length = len(review_df)
print(review_length)

15300


In [63]:
#finding all the business ids
business_ids = set()
for i in range(0, review_length):
    business = review_df.iloc[i]
    business_ids.add(business['business_id'])

In [64]:
print(len(business_ids))

153


In [65]:
#construct a dictionary to get all the stores and their respective processed reviews
selected = {}
for business in business_ids:
    #extract reviews for each individual id
    individual_reviews = extract_reviews(business)
    #append the extracted review to each business first
    selected[business] = individual_reviews

In [66]:
#use spacy to tokenize and process each review
#and extract relevant pairs
for key in selected:
    processed_review_selected,result_doc_selected = spacy_process(selected[key])
    #adjective extraction for each combination
    final_adjective_phrase_result_1,final_count_1 = adjective_pair_extraction_1(result_doc_selected)
    final_adjective_phrase_result_2,final_count_2 = adjective_pair_extraction_2(result_doc_selected)
    final_adjective_phrase_result_3,final_count_3 = adjective_pair_extraction_3(result_doc_selected)
    final_adjective_phrase_result_4,final_count_4 = adjective_pair_extraction_4(result_doc_selected)
    #adding more considerations
    final_adjective_phrase_result_5,final_count_5 = adjective_pair_extraction_5(result_doc_selected)
    final_adjective_phrase_result_6,final_count_6 = adjective_pair_extraction_6(result_doc_selected)


    #combining the list extracted
    final_bigram_list = final_adjective_phrase_result_1 + final_adjective_phrase_result_2 + final_adjective_phrase_result_3
    final_trigram_list = final_adjective_phrase_result_4 + final_adjective_phrase_result_5 


    #evaluate the respective data obtained 
    final_frequency_result_bi = evaluative_indicativeness_bigram(final_bigram_list)
    final_frequency_result_tri = evaluative_indicativeness_bigram(final_trigram_list)
    
    #merge the data obtained
    final_frequency_result = merge(final_frequency_result_bi,final_frequency_result_tri)
    
    #append the analysis to each of the result
    selected[key] = final_frequency_result

In [67]:
#print out the result to check 
for key in selected:
    print(key)
    print(selected[key])

3kUqNxO1rkDDb89GAfyNgw
{('very', 'relaxing'): 0.06, ('very', 'professional'): 0.04, ('very', 'good'): 0.04, ('very', 'friendly'): 0.04, ('super', 'friendly'): 0.03, ('personally', 'very', 'pleasant'): 0.01, ('significantly', 'less', 'clinical'): 0.01, ('much', 'more', 'reasonable'): 0.01, ('technically', 'very', 'good'): 0.01, ("wasn\\'t", 'very', 'relaxing'): 0.01}
0kPm1zEpeXFRg8D2phqgCQ
{('very', 'clean'): 0.03, ('too', 'much'): 0.03, ('very', 'friendly'): 0.03, ('too', 'sweet'): 0.03, ('large', 'iced'): 0.03, ('then', 'once', 'more'): 0.01, ('usually', 'very', 'sweet'): 0.01, ('way', 'too', 'sweet'): 0.01, ('little', 'less', 'expensive'): 0.01, ('kind', 'of', 'old'): 0.01}
sWh-N7K3ebRHZKhhH01mJQ
{('very', 'friendly'): 0.03, ('very', 'nice'): 0.03, ('non', '-'): 0.03, ('-', 'ER'): 0.03, ('much', 'better'): 0.02, ('so', 'much', 'better'): 0.02, ('very', 'very', 'careful'): 0.02, ('just', 'as', 'rude'): 0.01, ('Just', 'as', 'bad'): 0.01, ('so', 'so', 'much'): 0.01}
NdpvGGF4cLrdnA6ydSZz

# Analysis

In [79]:
#getting the selected store's result for analysis
frequency_result_top_5_key_list = list(frequency_result_top_5)
frequency_result_top_5_value_list = list(frequency_result_top_5.values())


In [80]:
#check the selected store's 1st review and compare its probs with the rest of the stores
print("For business ",selected_id)
phrase_1 = frequency_result_top_5_key_list[0]
prob_1 = frequency_result_top_5_value_list[0]
print("The first adjective phrase is:",phrase_1)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_1):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_1) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rN3gHTjqx5sOnoUpJ-6jyg
The first adjective phrase is: ('very', 'good')

Its comparison with the rest of the stores is as follows: 

Store: 3kUqNxO1rkDDb89GAfyNgw| Probability:  0.04
******* There is a big difference *******
Store: NdpvGGF4cLrdnA6ydSZz3g| Probability:  0.06
Store: sj9osyqLyOy7b_kDZb1txA| Probability:  0.05
******* There is a big difference *******
Store: 7e3PZzUpG5FYOTGt3O3ePA| Probability:  0.13
Store: ZhACHVw9k438Z3i-Gw9BHA| Probability:  0.04
******* There is a big difference *******
Store: k99YNCx5KcQPR7oeTgAOzg| Probability:  0.06
Store: KjjNv4YFzBFnsBh2ncH1Iw| Probability:  0.04
******* There is a big difference *******
Store: UDiMCb4LKowlozI5mhtO_A| Probability:  0.04
******* There is a big difference *******
Store: mF2EW3twSrFPmT_RVV1-Qg| Probability:  0.08
Store: ukAUNyzUcBA4Su8UKjaJKg| Probability:  0.08
Store: 7vvliv5R0MD7BQ2CsquhcQ| Probability:  0.04
******* There is a big difference *******
Store: caq9CTtWB-8K0tdFUhTfAQ| Probability:  0.12
St

In [81]:
#check the selected store's 1st review and compare its probs with the rest of the stores
print("For business ",selected_id)
phrase_1 = frequency_result_top_5_key_list[0]
prob_1 = frequency_result_top_5_value_list[0]
print("The first adjective phrase is:",phrase_1)
print()
print("Its comparison with the rest of the stores is as follows: \n")

count_1 = 0
for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_1):
            count_1 += (selected[key][value]*100)
overall_prob_1 = count_1/15300
print("The phrase's local probability is:", prob_1)
print("The phrase's global probability is:",overall_prob_1)

For business  rN3gHTjqx5sOnoUpJ-6jyg
The first adjective phrase is: ('very', 'good')

Its comparison with the rest of the stores is as follows: 

The phrase's local probability is: 0.08
The phrase's global probability is: 0.031176470588235295


In [82]:
#check the selected store's 2nd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_2 = frequency_result_top_5_key_list[1]
prob_2 = frequency_result_top_5_value_list[1]
print("The second adjective phrase is:",phrase_2)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    if key == selected_id:
        continue
    for value in values:
        if(value == phrase_2):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_2) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rN3gHTjqx5sOnoUpJ-6jyg
The second adjective phrase is: ('very', 'nice')

Its comparison with the rest of the stores is as follows: 

Store: sWh-N7K3ebRHZKhhH01mJQ| Probability:  0.03
Store: oLb3-eXUFtCFJl2DuBhcvA| Probability:  0.08
Store: IzcL0KZPKqMbjoIaCmClog| Probability:  0.03
Store: vuHzLZ7nAeT-EiecOkS5Og| Probability:  0.03
Store: UDiMCb4LKowlozI5mhtO_A| Probability:  0.06
Store: j7HO1YeMQGYo3KibMXZ5vg| Probability:  0.05
Store: mF2EW3twSrFPmT_RVV1-Qg| Probability:  0.05
Store: ukAUNyzUcBA4Su8UKjaJKg| Probability:  0.04
Store: R43QH4FqHOOJapm5TaCEVg| Probability:  0.04
Store: TbkyrVi1b90cZS-rUtnvHA| Probability:  0.04
Store: rv9T1qGHbh9eXqFpKEPy_A| Probability:  0.04
Store: xJ_L2sJN1zk3VDpZELrV_Q| Probability:  0.03
Store: ih_bb1SaYvsMYfdykhYh2g| Probability:  0.04
Store: wPgp3oddFUzpGrQc__HYhg| Probability:  0.03
Store: AktuBx1W7c3ZdzwuaOp8xg| Probability:  0.03
Store: JqBtQ1bSynPHE9gbyuSSvA| Probability:  0.04
Store: AT-og6IROn0hJt-HR5Njlw| Probability:  0.06
Sto

In [83]:
#check the selected store's 2nd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_2 = frequency_result_top_5_key_list[1]
prob_2 = frequency_result_top_5_value_list[1]
print("The second adjective phrase is:",phrase_2)
print()
print("Its comparison with the rest of the stores is as follows: \n")

count_2 = 0
for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_2):
            count_2 += (selected[key][value]*100)

overall_prob_2 = count_2/15300
print("The phrase's local probability is:", prob_2)
print("The phrase's overall probability is:",overall_prob_2)

For business  rN3gHTjqx5sOnoUpJ-6jyg
The second adjective phrase is: ('very', 'nice')

Its comparison with the rest of the stores is as follows: 

The phrase's local probability is: 0.05
The phrase's overall probability is: 0.015228758169934641


In [84]:
#check the selected store's 3rd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_3 = frequency_result_top_5_key_list[2]
prob_3 = frequency_result_top_5_value_list[2]
print("The third adjective phrase is:",phrase_3)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    if key == selected_id:
        continue
    for value in values:
        if(value == phrase_3):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_3) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rN3gHTjqx5sOnoUpJ-6jyg
The third adjective phrase is: ('so', 'good')

Its comparison with the rest of the stores is as follows: 

Store: ZhACHVw9k438Z3i-Gw9BHA| Probability:  0.04
Store: mF2EW3twSrFPmT_RVV1-Qg| Probability:  0.03
Store: NV5Q8ZhDSh_oI8ZOnn72Lw| Probability:  0.03
Store: yEZn1XpLsEC9uBa-X4xAZw| Probability:  0.04
Store: ZBE-H_aUlicix_9vUGQPIQ| Probability:  0.02
******* There is a big difference *******
Store: w5LUtmw4G1KD7qAntqGd9w| Probability:  0.04
Store: ih_bb1SaYvsMYfdykhYh2g| Probability:  0.05
Store: Rii85bzYKGC9P0zOyAem6A| Probability:  0.05
Store: hXzoNgpkC86K_Jfg_zMHvA| Probability:  0.03
Store: xVpE01l6ZXdEtVf5PkRpDg| Probability:  0.05
Store: 9Xm2GfG8Rnbb1_CmXyrm3g| Probability:  0.03
Store: shIPnFoXrL3dFo5HLH1_HA| Probability:  0.03
Store: Jcyu0ml7rxizEA8giSH-8A| Probability:  0.03
Store: Jol7cXrDmKKmu_V6qp4wMA| Probability:  0.04
Store: -7XWJYkutqhIxLen7Grg1g| Probability:  0.06
Store: JTo0oS3iiMl6UPWA6aI8SQ| Probability:  0.02
******* There 

In [85]:
#check the selected store's 3rd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_3 = frequency_result_top_5_key_list[2]
prob_3 = frequency_result_top_5_value_list[2]
print("The third adjective phrase is:",phrase_3)
print()
print("Its comparison with the rest of the stores is as follows: \n")

count_3 = 0
for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_3):
            count_3 += (selected[key][value]*100)

overall_prob_3 =  count_3/15300
print("The phrase's local probability is:", prob_3)
print("The phrase's global probability is:",overall_prob_3)

For business  rN3gHTjqx5sOnoUpJ-6jyg
The third adjective phrase is: ('so', 'good')

Its comparison with the rest of the stores is as follows: 

The phrase's local probability is: 0.05
The phrase's global probability is: 0.008169934640522876


In [86]:
#check the selected store's 4th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_4 = frequency_result_top_5_key_list[3]
prob_4 = frequency_result_top_5_value_list[3]
print("The fourth adjective phrase is:",phrase_4)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    if key == selected_id:
        continue
    for value in values:
        if(value == phrase_4):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_4) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rN3gHTjqx5sOnoUpJ-6jyg
The fourth adjective phrase is: ('as', 'good')

Its comparison with the rest of the stores is as follows: 

Store: sj9osyqLyOy7b_kDZb1txA| Probability:  0.05
Store: KjjNv4YFzBFnsBh2ncH1Iw| Probability:  0.04
Store: yEZn1XpLsEC9uBa-X4xAZw| Probability:  0.05
Store: QqGMtc24VdCzYAajw1g4bA| Probability:  0.03
Store: IUMyUYOIR9UQ7XGIEQKOuA| Probability:  0.03
Store: XVDR44P_74FmA0ANanm4CQ| Probability:  0.09
Store: aDHD7nASfqiQBB6YXy2aGA| Probability:  0.03
Store: y5fwt6pAKnvYOSn-xOUepA| Probability:  0.04
Store: 0Rni7ocMC_Lg2UH0lDeKMQ| Probability:  0.03
Store: ZP7U4qUCMjIvasWV7a_mgg| Probability:  0.03


In [76]:
#check the selected store's 4th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_4 = frequency_result_top_5_key_list[3]
prob_4 = frequency_result_top_5_value_list[3]
print("The fourth adjective phrase is:",phrase_4)
print()
print("Its comparison with the rest of the stores is as follows: \n")

count_4 = 0
for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_4):
            count_4 += (selected[key][value]*100)
overall_prob_4 =  count_4/15300
print("The phrase's local probability is:", prob_4)
print("The phrase's overall probability is:",overall_prob_4)

For business  rN3gHTjqx5sOnoUpJ-6jyg
The fourth adjective phrase is: ('as', 'good')

Its comparison with the rest of the stores is as follows: 

The phrase's local probability is: 0.03
The phrase's overall probability is: 0.0029411764705882353


In [77]:
#check the selected store's 5th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_5 = frequency_result_top_5_key_list[4]
prob_5 = frequency_result_top_5_value_list[4]
print("The fifth adjective phrase is:",phrase_5)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    if key == selected_id:
        continue
    for value in values:
        if(value == phrase_5):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_5) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rN3gHTjqx5sOnoUpJ-6jyg
The fifth adjective phrase is: ('really', 'good')

Its comparison with the rest of the stores is as follows: 

Store: sj9osyqLyOy7b_kDZb1txA| Probability:  0.05
Store: 7e3PZzUpG5FYOTGt3O3ePA| Probability:  0.04
Store: ZhACHVw9k438Z3i-Gw9BHA| Probability:  0.07
Store: XA_m9daZl2VFDA6alnkBvg| Probability:  0.03
Store: k99YNCx5KcQPR7oeTgAOzg| Probability:  0.03
Store: KjjNv4YFzBFnsBh2ncH1Iw| Probability:  0.06
Store: oICXzFAaUMrYGzjRWmkw4Q| Probability:  0.04
Store: j7HO1YeMQGYo3KibMXZ5vg| Probability:  0.07
Store: 7vvliv5R0MD7BQ2CsquhcQ| Probability:  0.04
Store: caq9CTtWB-8K0tdFUhTfAQ| Probability:  0.04
Store: WO3L0pmtAO8ozspmaVdHIg| Probability:  0.03
Store: NV5Q8ZhDSh_oI8ZOnn72Lw| Probability:  0.07
Store: a6mYyhGgxWhnhrGDHzOTPA| Probability:  0.04
Store: S6apFS5ghsQg69rcBvm2Qg| Probability:  0.06
Store: xJ_L2sJN1zk3VDpZELrV_Q| Probability:  0.05
Store: QqGMtc24VdCzYAajw1g4bA| Probability:  0.04
Store: JqBtQ1bSynPHE9gbyuSSvA| Probability:  0.06
St

In [78]:
#check the selected store's 5th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_5 = frequency_result_top_5_key_list[4]
prob_5 = frequency_result_top_5_value_list[4]
print("The fifth adjective phrase is:",phrase_5)
print()
print("Its comparison with the rest of the stores is as follows: \n")

count_5 = 0

for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_5):
            count_5 += (selected[key][value]*100)
overall_prob_5 = count_5/15300
print("The phrase's local probability is:", prob_5)
print("The phrase's overall probability is:",overall_prob_5)

For business  rN3gHTjqx5sOnoUpJ-6jyg
The fifth adjective phrase is: ('really', 'good')

Its comparison with the rest of the stores is as follows: 

The phrase's local probability is: 0.03
The phrase's overall probability is: 0.021111111111111112
