In [1]:
import numpy as np
import pandas as pd
import nltk,string
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import math
import os
from nltk import FreqDist
from nltk.corpus import stopwords
import random

import en_core_web_sm
from nltk.util import ngrams

nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
data_path = 'data/reviewSelected100.json'
os.path.exists(data_path)

True

In [3]:
random.seed(69)

In [4]:
review_df = pd.read_json(data_path, lines=True,encoding = "ISO-8859-1")

In [5]:
#Randomly generate 1 index
review_random_idx = np.array(np.random.rand(1)*len(review_df), dtype=np.int32)
#print out the business id selected
print(review_df.iloc[review_random_idx]['business_id'].iloc[0])

Jcyu0ml7rxizEA8giSH-8A


In [6]:
#set the selected business id as the id to be investigated
selected_id = review_df.iloc[review_random_idx]['business_id'].iloc[0]

In [7]:
#fix this id for subsequent usage
selected_id_str = str(selected_id)
print(selected_id_str)

Jcyu0ml7rxizEA8giSH-8A


In [8]:
#Extract out the sentences related to the random id
review_random_df = review_df.iloc[review_random_idx]
review_random_df = review_random_df.reset_index()

In [9]:
#Tokenize the text
review_random_df['tokenize'] = review_random_df['text'].apply(nltk.word_tokenize)
#Tag those individual tokens respectively
review_random_df['pos_tag'] = review_random_df['tokenize'].apply(nltk.pos_tag)

In [10]:
review_random_df.to_json(r'output/specific_business_review_3_final.json', orient='records', lines=True)

In [11]:
tokenized_path = 'output/specific_business_review_3_final.json'
reviews_adjectives_df = pd.read_json(tokenized_path, lines=True,encoding = "ISO-8859-1")

In [12]:
#checking some basic information
print(len(reviews_adjectives_df))
print(len(review_df))

1
15300


# Find all reviews that are related to a certain store

In [13]:
#extract all the reviews which are related to one store id 
def extract_reviews(store_id):
  if type(store_id) == str:
    selected_id = store_id
  else:
    selected_id = str(store_id)
  selected_store_reviews = []
  for index, element in review_df.iterrows():
    if element['business_id'] == selected_id:
        selected_store_reviews.append(element['text'])
  return selected_store_reviews

In [14]:
#test the function
extracted_review = extract_reviews(selected_id_str)
#print out the reviews for the store id selected 
print(extracted_review)

['I was utterly disappointed by their service. The tables were not cleaned. There was rice and water on the chairs.\n\nWe found ourself a half cleaned table and when I was halfway done eating (10:40pm), I was told by the waiter that we are closing in a few minutes. I gave her a look basically stating do you want me to stop eating.. She again came back in 5 mins stating we have to turn off the lights. At that point I just got up and left.\n\nI told her while leaving that they should have just told me at 10:20pm (when I arrived) that they are not taking orders. \n\nI am surprised this place is still open with this kind of service. \n\nBetter go somewhere else.', 'Went here yesterday and ordered a barg kabab and chicken breast and a bolani. The bolani was undercooked and had absolutely no filling in it like none whatsoever.  Was told that "This is how we make it." It was literally the worst bolani I\'ve seen.  The barg kabab was also not cooked all the way through that we only found out w

# Use spacy nlp to tokenize and pos tag each text

In [15]:
#use spacy library to process the text extracted
def spacy_process(input_list: list):
    final_result = []
    #use nlp which is a spacy in-built function which will return the text and doc which is in token datatype
    doc = nlp(str(input_list))
    #token consists of text and its pos tags
    for token in doc:
        result = []
        result.append(token.text)
        result.append(token.pos_)
        final_result.append(result)
    return final_result,doc

In [16]:
#test the function
processed_review,result_doc = spacy_process(extracted_review)

In [17]:
#print out the processed output
print(processed_review)

[['[', 'PUNCT'], ["'", 'PUNCT'], ['I', 'PRON'], ['was', 'AUX'], ['utterly', 'ADV'], ['disappointed', 'ADJ'], ['by', 'ADP'], ['their', 'PRON'], ['service', 'NOUN'], ['.', 'PUNCT'], ['The', 'DET'], ['tables', 'NOUN'], ['were', 'AUX'], ['not', 'PART'], ['cleaned', 'VERB'], ['.', 'PUNCT'], ['There', 'PRON'], ['was', 'AUX'], ['rice', 'NOUN'], ['and', 'CCONJ'], ['water', 'NOUN'], ['on', 'ADP'], ['the', 'DET'], ['chairs.\\n\\nWe', 'PROPN'], ['found', 'VERB'], ['ourself', 'PRON'], ['a', 'DET'], ['half', 'ADV'], ['cleaned', 'VERB'], ['table', 'NOUN'], ['and', 'CCONJ'], ['when', 'ADV'], ['I', 'PRON'], ['was', 'AUX'], ['halfway', 'ADV'], ['done', 'VERB'], ['eating', 'VERB'], ['(', 'PUNCT'], ['10:40pm', 'PROPN'], [')', 'PUNCT'], [',', 'PUNCT'], ['I', 'PRON'], ['was', 'AUX'], ['told', 'VERB'], ['by', 'ADP'], ['the', 'DET'], ['waiter', 'NOUN'], ['that', 'DET'], ['we', 'PRON'], ['are', 'AUX'], ['closing', 'VERB'], ['in', 'ADP'], ['a', 'DET'], ['few', 'ADJ'], ['minutes', 'NOUN'], ['.', 'PUNCT'], ['I',

In [18]:
print(result_doc)

['I was utterly disappointed by their service. The tables were not cleaned. There was rice and water on the chairs.\n\nWe found ourself a half cleaned table and when I was halfway done eating (10:40pm), I was told by the waiter that we are closing in a few minutes. I gave her a look basically stating do you want me to stop eating.. She again came back in 5 mins stating we have to turn off the lights. At that point I just got up and left.\n\nI told her while leaving that they should have just told me at 10:20pm (when I arrived) that they are not taking orders. \n\nI am surprised this place is still open with this kind of service. \n\nBetter go somewhere else.', 'Went here yesterday and ordered a barg kabab and chicken breast and a bolani. The bolani was undercooked and had absolutely no filling in it like none whatsoever.  Was told that "This is how we make it." It was literally the worst bolani I\'ve seen.  The barg kabab was also not cooked all the way through that we only found out w

# Find out the relevant pairs

In [19]:
# extact all the phrases that has adverb + adjectives
def adjective_pair_extraction_1(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_1 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADV'and (element.pos_ == 'ADJ')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_1.append(temp_list_tuple)
        selected_combination_1 = list(filter(lambda x: x, selected_combination_1))
    return selected_combination_1,count

In [20]:
# extact all the phrases that has adjectives + adverbs
def adjective_pair_extraction_2(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_2 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADJ'and (element.pos_ == 'ADV')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_2.append(temp_list_tuple)
        selected_combination_2 = list(filter(lambda x: x, selected_combination_2))
    return selected_combination_2,count

In [21]:
# extact all the phrases that has adjectives + adjectives
def adjective_pair_extraction_3(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_3 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADJ'and (element.pos_ == 'ADJ')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_3.append(temp_list_tuple)
        selected_combination_3 = list(filter(lambda x: x, selected_combination_3))
    return selected_combination_3,count

In [22]:
# extact all the phrases that has adverbs + adverbs  + adjectives
def adjective_pair_extraction_4(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_4 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADV'and (element.pos_ == 'ADV') and (element_after.pos_ == 'ADJ')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
            temp_list.append(element_after.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_4.append(temp_list_tuple)
        selected_combination_4 = list(filter(lambda x: x, selected_combination_4))
    return selected_combination_4,count

In [23]:
# extact all the phrases that has adjectives + prep  + noun
def adjective_pair_extraction_5(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_5 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element.pos_) == 'ADJ' and (element.pos_ == 'ADP') and (element_after.pos_ == 'NOUN')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
            temp_list.append(element_after.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_5.append(temp_list_tuple)
        selected_combination_5 = list(filter(lambda x: x, selected_combination_5))
    return selected_combination_5,count

In [24]:
# extact all the phrases that has adjectives + prep  + prep + noun
def adjective_pair_extraction_6(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_6 = []
    count = 0
    for index in range(len(doc_2)-2):
        temp_list = []
        element_1 = doc_2[index-1]
        element_2 = doc_2[index]
        element_3 = doc_2[index+1]
        element_4 = doc_2[index+2]
        if((element_1.pos_) == 'ADJ' and (element_2.pos_ == 'ADP') and (element_3.pos_ == 'ADP') and element_4.pos_ == 'NOUN'):
            count += 1
            temp_list.append(element_1.text)
            temp_list.append(element_2.text)
            temp_list.append(element_3.text)
            temp_list.append(element_4.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_6.append(temp_list_tuple)
        selected_combination_6 = list(filter(lambda x: x, selected_combination_6))
    return selected_combination_6,count

In [25]:
adjective_phrase_result_1,count_1 = adjective_pair_extraction_1(result_doc)

In [26]:
print(count_1)
print(adjective_phrase_result_1)

188
[('utterly', 'disappointed'), ('still', 'open'), ('completely', 'undigestsble'), ('super', 'spicy'), ('too', 'hard'), ('really', 'tasty'), ('soon', 'busy'), ('so', 'worth'), ('much', 'better'), ('very', 'similar'), ('absolutely', 'delicious'), ('Very', 'flavourful'), ('very', 'similar'), ('really', 'tasty'), ('so', 'much'), ('actually', 'spicy'), ('also', 'good'), ('very', 'good'), ('too', 'much'), ('very', 'noisy'), ('so', 'many'), ('pretty', 'similar'), ('really', 'busy'), ('also', 'similar'), ('also', 'good'), ('so', 'disgusting'), ('too', 'good'), ('even', 'decent'), ('very', 'little'), ('so', 'many'), ('pretty', 'good'), ('completely', 'clueless'), ('indeed', 'superior'), ('definitely', 'worth'), ('only', 'able'), ('at', 'best'), ('too', 'salty'), ('that', 'friendly'), ('totally', 'worth'), ('well', 'cooked.\\nI'), ('too', 'slow'), ('remotely', 'busy.\\n3'), ('very', 'busy'), ('just', 'iceberg'), ('super', 'juicy'), ('well', 'marinated'), ('really', 'good'), ('Very', 'busy'), 

In [27]:
adjective_phrase_result_2,count_2 = adjective_pair_extraction_2(result_doc)

In [28]:
print(count_2)
print(adjective_phrase_result_2)

31
[('medium', 'well'), ('mild', 'anymore'), ('hot', 'inside'), ('sure', 'how'), ('good', 'too'), ('okay', 'at'), ('little', 'too'), ('excellent', 'here'), ('cooked.\\nI', 'highly'), ('difficult', 'just'), ('disgusting.\\n\\nI', 'only'), ('big', 'so'), ('professional', 'either'), ('big', 'enough'), ('rude', 'when'), ('reasonable', 'especially'), ('usual', 'especially'), ('busy', 'so'), ('special', 'though'), ('busy', 'so'), ('good', 'overall'), ('good', 'overall'), ('good', 'too'), ('clean', 'inside'), ('airy', 'so'), ('little', 'too'), ('nice', 'enough'), ('friendly', 'sometimes'), ('reasonable', 'as'), ('superior.\\n\\nI', 'always'), ('good', 'enough')]


In [29]:
adjective_phrase_result_3,count_3 = adjective_pair_extraction_3(result_doc)

In [30]:
print(count_3)
print(adjective_phrase_result_3)

56
[('little', 'orange'), ('good', 'Afghan'), ('best', 'kabob'), ('Brown', 'long'), ('few', 'Indian'), ('little', 'disproportionate'), ('only', 'other'), ('other', 'Afghan'), ('little', 'expensive'), ('other', 'afghan'), ('many', 'good'), ('beautiful', 'Afghan'), ('awesome', 'hot'), ('Best', 'afghan'), ('extra', 'garlic'), ('best', 'Afghan'), ('little', 'harder'), ('little', 'slow'), ('popular', 'Scarborough'), ('Nice', 'juicy'), ('such', 'great'), ('little', 'busy'), ('good', 'enough'), ('best', 'Afghan'), ('exact', 'same'), ('\\n\\nThe', 'other'), ('decent', 'sized'), ('true', 'afghani'), ('regular', 'Afghan'), ('fancy', 'Afghan'), ('Rude', 'front'), ('little', 'dry'), ('delicious', 'Afghan'), ('super', 'spicy'), ('only', 'main'), ('little', 'reluctant'), ('Best', 'afghan'), ('best', 'afghan'), ('typical', 'afghani'), ('extra', 'white'), ('best', 'Afghani'), ('garlic', 'white'), ('red', 'hot'), ('tender', 'veal'), ('best', 'naan'), ('polite', 'fellow'), ('special', 'Islamic'), ('simi

In [31]:
adjective_phrase_result_4,count_4 = adjective_pair_extraction_4(result_doc)

In [32]:
print(count_4)
print(adjective_phrase_result_4)

10
[('really', 'really', 'tasty'), ('also', 'really', 'tasty'), ('really', 'that', 'friendly'), ('even', 'remotely', 'busy.\\n3'), ('nonetheless', 'really', 'good'), ('also', 'pretty', 'good'), ('Usually', 'very', 'busy'), ('already', 'very', 'cold'), ('Still', 'very', 'good'), ('usually', 'very', 'busy')]


In [33]:
adjective_phrase_result_5,count_5 = adjective_pair_extraction_5(result_doc)

In [34]:
print(count_5)
print(adjective_phrase_result_5)

0
[]


In [35]:
adjective_phrase_result_6,count_6 = adjective_pair_extraction_6(result_doc)

In [36]:
print(count_6)
print(adjective_phrase_result_6)

0
[]


# Adjective numbers extraction

In [37]:
#how many two-word bigram reviews are related to this one store
total_frequency = len(extracted_review)
def evaluative_indicativeness_bigram (adjective_phrase_list: list):
    bigram=[]
    bigram.extend(adjective_phrase_list)  
    freq_bi = nltk.FreqDist(bigram)
    result_frequency = freq_bi.most_common(5)
    result_frequency_dict = {}
    for element in freq_bi.most_common(5):
      individual_frequency = element[1]/total_frequency
      result_frequency_dict[element[0]] = individual_frequency
    return result_frequency_dict

In [38]:
#combining all the bigram and trigrams
bigram_list = adjective_phrase_result_1 + adjective_phrase_result_2 + adjective_phrase_result_3
trigram_list = adjective_phrase_result_4 + adjective_phrase_result_5

In [39]:
#how many three-word trigram reviews are related to this one store
total_frequency = len(extracted_review)
def evaluative_indicativeness_bigram (adjective_phrase_list: list):
    bigram=[]
    bigram.extend(adjective_phrase_list)  
    freq_bi = nltk.FreqDist(bigram)
    result_frequency = freq_bi.most_common(5)
    result_frequency_dict = {}
    for element in freq_bi.most_common(5):
      individual_frequency = element[1]/total_frequency
      result_frequency_dict[element[0]] = individual_frequency
    return result_frequency_dict

In [40]:
#how many reviews are related to this one store
total_frequency = len(extracted_review)
def evaluative_indicativeness_trigram (adjective_phrase_list: list):
    trigram=[]
    trigram.extend(adjective_phrase_list)  
    freq_tri = nltk.FreqDist(trigram)
    result_frequency = freq_tri.most_common(5)
    result_frequency_dict = {}
    for element in freq_tri.most_common(5):
      individual_frequency = element[1]/total_frequency
      result_frequency_dict[element[0]] = individual_frequency
    return result_frequency_dict

In [41]:
#carry out the processing for bigram and trigram respectively
frequency_result_bi = evaluative_indicativeness_bigram(bigram_list)
frequency_result_tri = evaluative_indicativeness_trigram(trigram_list)

In [42]:
print(frequency_result_bi)

{('very', 'good'): 0.08, ('very', 'busy'): 0.06, ('pretty', 'good'): 0.03, ('so', 'good'): 0.03, ('very', 'big'): 0.03}


In [43]:
print(frequency_result_tri)

{('really', 'really', 'tasty'): 0.01, ('also', 'really', 'tasty'): 0.01, ('really', 'that', 'friendly'): 0.01, ('even', 'remotely', 'busy.\\n3'): 0.01, ('nonetheless', 'really', 'good'): 0.01}


In [44]:
#merge two bigram/trigram output list 
def merge(dict1, dict2):
    res = {**dict1, **dict2}
    return res

In [45]:
final_dict = merge(frequency_result_bi,frequency_result_tri)

In [46]:
print(final_dict)

{('very', 'good'): 0.08, ('very', 'busy'): 0.06, ('pretty', 'good'): 0.03, ('so', 'good'): 0.03, ('very', 'big'): 0.03, ('really', 'really', 'tasty'): 0.01, ('also', 'really', 'tasty'): 0.01, ('really', 'that', 'friendly'): 0.01, ('even', 'remotely', 'busy.\\n3'): 0.01, ('nonetheless', 'really', 'good'): 0.01}


In [47]:
from operator import itemgetter

# Initialize N 
N = 5
# Using sorted() + itemgetter() + items()
frequency_result_top_5 = dict(sorted(final_dict.items(), key = itemgetter(1), reverse = True)[:N])

In [48]:
print(frequency_result_top_5)

{('very', 'good'): 0.08, ('very', 'busy'): 0.06, ('pretty', 'good'): 0.03, ('so', 'good'): 0.03, ('very', 'big'): 0.03}


In [49]:
#obtain the top 5 phrases and their respective probs
index = 1
index_result = {}

for key in frequency_result_top_5:
    phrase_id = "phrase_{}".format(index)
    prob_id = "prob_{}".format(index)
    index_result[phrase_id] = frequency_result_top_5[key]
    index += 1

In [50]:
print(index_result)

{'phrase_1': 0.08, 'phrase_2': 0.06, 'phrase_3': 0.03, 'phrase_4': 0.03, 'phrase_5': 0.03}


# Trying out on the entire dataset

In [51]:
review_length = len(review_df)
print(review_length)

15300


In [52]:
#finding all the business ids
business_ids = set()
for i in range(0, review_length):
    business = review_df.iloc[i]
    business_ids.add(business['business_id'])

In [53]:
print(len(business_ids))

153


In [54]:
#construct a dictionary to get all the stores and their respective processed reviews
selected = {}
for business in business_ids:
    #extract reviews for each individual id
    individual_reviews = extract_reviews(business)
    #append the extracted review to each business first
    selected[business] = individual_reviews

In [55]:
#use spacy to tokenize and process each review
#and extract relevant pairs
for key in selected:
    processed_review_selected,result_doc_selected = spacy_process(selected[key])
    #adjective extraction for each combination
    final_adjective_phrase_result_1,final_count_1 = adjective_pair_extraction_1(result_doc_selected)
    final_adjective_phrase_result_2,final_count_2 = adjective_pair_extraction_2(result_doc_selected)
    final_adjective_phrase_result_3,final_count_3 = adjective_pair_extraction_3(result_doc_selected)
    final_adjective_phrase_result_4,final_count_4 = adjective_pair_extraction_4(result_doc_selected)
    #adding more considerations
    final_adjective_phrase_result_5,final_count_5 = adjective_pair_extraction_5(result_doc_selected)
    final_adjective_phrase_result_6,final_count_6 = adjective_pair_extraction_6(result_doc_selected)


    #combining the list extracted
    final_bigram_list = final_adjective_phrase_result_1 + final_adjective_phrase_result_2 + final_adjective_phrase_result_3
    final_trigram_list = final_adjective_phrase_result_4 + final_adjective_phrase_result_5 


    #evaluate the respective data obtained 
    final_frequency_result_bi = evaluative_indicativeness_bigram(final_bigram_list)
    final_frequency_result_tri = evaluative_indicativeness_bigram(final_trigram_list)
    
    #merge the data obtained
    final_frequency_result = merge(final_frequency_result_bi,final_frequency_result_tri)
    
    #append the analysis to each of the result
    selected[key] = final_frequency_result

In [56]:
#print out the result to check 
for key in selected:
    print(key)
    print(selected[key])

Gr_TkW3iFdgahixONGBsww
{('very', 'nice'): 0.11, ('very', 'comfortable'): 0.05, ('very', 'clean'): 0.03, ('very', 'friendly'): 0.03, ('very', 'good'): 0.03, ('Sometimes', 'too', 'edgy'): 0.01, ('much', 'more', 'expensive'): 0.01, ('most', 'importantly', 'FAST'): 0.01, ('just', 'as', 'nice'): 0.01, ('kind', 'of', 'surprised'): 0.01}
XuZ9pksVQBIpmVQm6UO2YQ
{('pretty', 'good'): 0.06, ('very', 'good'): 0.06, ('so', 'good'): 0.05, ('quite', 'impressed'): 0.04, ('very', 'tasty'): 0.04, ('just', 'too', 'weird'): 0.02, ('much', 'more', 'varied'): 0.01, ('so', 'much', 'better'): 0.01, ('very', 'much', 'worth'): 0.01, ('maybe', 'too', 'light'): 0.01}
k99YNCx5KcQPR7oeTgAOzg
{('very', 'good'): 0.06, ('much', 'better'): 0.04, ('really', 'good'): 0.03, ('always', 'good'): 0.03, ('over', 'cooked'): 0.03, ('quite', 'as', 'bad'): 0.01, ('there', 'much', 'longer'): 0.01, ('so', 'much', 'better'): 0.01, ('just', 'plain', 'mediocre'): 0.01, ('so', 'at', 'best'): 0.01}
ih_bb1SaYvsMYfdykhYh2g
{('very', 'good

# Analysis

In [57]:
#getting the selected store's result for analysis
frequency_result_top_5_key_list = list(frequency_result_top_5)
frequency_result_top_5_value_list = list(frequency_result_top_5.values())


In [63]:
#check the selected store's 1st review and compare its probs with the rest of the stores
print("For business ",selected_id)
phrase_1 = frequency_result_top_5_key_list[0]
prob_1 = frequency_result_top_5_value_list[0]
print("The first adjective phrase is:",phrase_1)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_1):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_1) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  Jcyu0ml7rxizEA8giSH-8A
The first adjective phrase is: ('very', 'good')

Its comparison with the rest of the stores is as follows: 

Store: Gr_TkW3iFdgahixONGBsww| Probability:  0.03
******* There is a big difference *******
Store: XuZ9pksVQBIpmVQm6UO2YQ| Probability:  0.06
Store: k99YNCx5KcQPR7oeTgAOzg| Probability:  0.06
Store: ih_bb1SaYvsMYfdykhYh2g| Probability:  0.11
Store: y3otpml08iLWdX5RCrWnbA| Probability:  0.04
******* There is a big difference *******
Store: QeEQXdto_4wFRaNKyIygRA| Probability:  0.05
******* There is a big difference *******
Store: cOY6ipigtTXdcmmmiFiniA| Probability:  0.06
Store: pJnN5PxqFPGZW2pJWkQtzA| Probability:  0.04
******* There is a big difference *******
Store: Cr066pnTj0ioEMZRTHgMOw| Probability:  0.07
Store: yHHVKa9joZAKiBDUp2SkKw| Probability:  0.05
******* There is a big difference *******
Store: DcfkRb2bS2c8z21WH-aS6A| Probability:  0.1
Store: R4R7ttLXfKKWM0VEMoaW4w| Probability:  0.04
******* There is a big difference *******
Sto

In [64]:
#check the selected store's 2nd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_2 = frequency_result_top_5_key_list[1]
prob_2 = frequency_result_top_5_value_list[1]
print("The first adjective phrase is:",phrase_2)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_2):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_2) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  Jcyu0ml7rxizEA8giSH-8A
The first adjective phrase is: ('very', 'busy')

Its comparison with the rest of the stores is as follows: 

Store: c0t81YxNqZQuTJrTfn3QYA| Probability:  0.04
Store: etzDsNjkCyQBoJcU2a3U-g| Probability:  0.04
Store: Jcyu0ml7rxizEA8giSH-8A| Probability:  0.06


In [65]:
#check the selected store's 3rd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_3 = frequency_result_top_5_key_list[2]
prob_3 = frequency_result_top_5_value_list[2]
print("The first adjective phrase is:",phrase_3)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_3):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_3) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  Jcyu0ml7rxizEA8giSH-8A
The first adjective phrase is: ('pretty', 'good')

Its comparison with the rest of the stores is as follows: 

Store: XuZ9pksVQBIpmVQm6UO2YQ| Probability:  0.06
Store: y3otpml08iLWdX5RCrWnbA| Probability:  0.06
Store: QeEQXdto_4wFRaNKyIygRA| Probability:  0.06
Store: XA_m9daZl2VFDA6alnkBvg| Probability:  0.06
Store: zZ7KDK3GAkBUZzsaqB1A4Q| Probability:  0.03
Store: pYJtM8BaRUwNg3hkO3v2sQ| Probability:  0.03
Store: IeK0nzbMPbWnUBIPB83v6A| Probability:  0.04
Store: Mx8jCUdhaNbMgx_04DWHdA| Probability:  0.04
Store: daqYMX3Y4QR8xl-BUlYBPw| Probability:  0.03
Store: XVDR44P_74FmA0ANanm4CQ| Probability:  0.08
Store: AktuBx1W7c3ZdzwuaOp8xg| Probability:  0.04
Store: 84DjKzaR26vphu9fNI9nKg| Probability:  0.1
Store: l07ctcrDMV--TYwe3uzebQ| Probability:  0.08
Store: hM48axj9twnf7A6a5f8cjA| Probability:  0.06
Store: w5LUtmw4G1KD7qAntqGd9w| Probability:  0.02
Store: --I7YYLada0tSLkORTHb5Q| Probability:  0.09
Store: XGaa9NDCwOJ9v0Cj55p28w| Probability:  0.03
Sto

In [66]:
#check the selected store's 4th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_4 = frequency_result_top_5_key_list[3]
prob_4 = frequency_result_top_5_value_list[3]
print("The first adjective phrase is:",phrase_4)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_4):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_4) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  Jcyu0ml7rxizEA8giSH-8A
The first adjective phrase is: ('so', 'good')

Its comparison with the rest of the stores is as follows: 

Store: XuZ9pksVQBIpmVQm6UO2YQ| Probability:  0.05
Store: ih_bb1SaYvsMYfdykhYh2g| Probability:  0.05
Store: pJnN5PxqFPGZW2pJWkQtzA| Probability:  0.05
Store: Cr066pnTj0ioEMZRTHgMOw| Probability:  0.03
Store: shIPnFoXrL3dFo5HLH1_HA| Probability:  0.03
Store: DcfkRb2bS2c8z21WH-aS6A| Probability:  0.05
Store: R4R7ttLXfKKWM0VEMoaW4w| Probability:  0.06
Store: w5LUtmw4G1KD7qAntqGd9w| Probability:  0.04
Store: 9Xm2GfG8Rnbb1_CmXyrm3g| Probability:  0.03
Store: ZhACHVw9k438Z3i-Gw9BHA| Probability:  0.04
Store: ZBE-H_aUlicix_9vUGQPIQ| Probability:  0.02
Store: JTo0oS3iiMl6UPWA6aI8SQ| Probability:  0.02
Store: hXzoNgpkC86K_Jfg_zMHvA| Probability:  0.03
Store: Rii85bzYKGC9P0zOyAem6A| Probability:  0.05
Store: aDHD7nASfqiQBB6YXy2aGA| Probability:  0.06
Store: bmm5F5smQrHCEkFYjwrQ9w| Probability:  0.04
Store: 793LPDn8axywoOANbDeAZw| Probability:  0.06
Store:

In [67]:
#check the selected store's 5th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_5 = frequency_result_top_5_key_list[4]
prob_5 = frequency_result_top_5_value_list[4]
print("The first adjective phrase is:",phrase_5)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_5):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_5) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  Jcyu0ml7rxizEA8giSH-8A
The first adjective phrase is: ('very', 'big')

Its comparison with the rest of the stores is as follows: 

Store: Jcyu0ml7rxizEA8giSH-8A| Probability:  0.03
