In [1]:
import numpy as np
import pandas as pd
import nltk,string
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import math
import os
from nltk import FreqDist
from nltk.corpus import stopwords
import random

import en_core_web_sm
from nltk.util import ngrams

nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/guangxushen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
data_path = 'data/reviewSelected100.json'
os.path.exists(data_path)

True

In [3]:
random.seed(69)

In [4]:
review_df = pd.read_json(data_path, lines=True,encoding = "ISO-8859-1")

In [5]:
#Randomly generate 1 index
review_random_idx = np.array(np.random.rand(1)*len(review_df), dtype=np.int32)
#print out the business id selected
print(review_df.iloc[review_random_idx]['business_id'].iloc[0])

rubyBEGXLBLU_JAuEtKEuw


In [6]:
#set the selected business id as the id to be investigated
selected_id = review_df.iloc[review_random_idx]['business_id'].iloc[0]

In [7]:
#fix this id for subsequent usage
selected_id_str = str(selected_id)
print(selected_id_str)

rubyBEGXLBLU_JAuEtKEuw


In [8]:
#Extract out the sentences related to the random id
review_random_df = review_df.iloc[review_random_idx]
review_random_df = review_random_df.reset_index()

In [9]:
#Tokenize the text
review_random_df['tokenize'] = review_random_df['text'].apply(nltk.word_tokenize)
#Tag those individual tokens respectively
review_random_df['pos_tag'] = review_random_df['tokenize'].apply(nltk.pos_tag)

In [10]:
review_random_df.to_json(r'output/specific_business_review_3_final.json', orient='records', lines=True)

In [11]:
tokenized_path = 'output/specific_business_review_3_final.json'
reviews_adjectives_df = pd.read_json(tokenized_path, lines=True,encoding = "ISO-8859-1")

In [12]:
#checking some basic information
print(len(reviews_adjectives_df))
print(len(review_df))

1
15300


# Find all reviews that are related to a certain store

In [13]:
#extract all the reviews which are related to one store id 
def extract_reviews(store_id):
  if type(store_id) == str:
    selected_id = store_id
  else:
    selected_id = str(store_id)
  selected_store_reviews = []
  for index, element in review_df.iterrows():
    if element['business_id'] == selected_id:
        selected_store_reviews.append(element['text'])
  return selected_store_reviews

In [14]:
#test the function
extracted_review = extract_reviews(selected_id_str)
#print out the reviews for the store id selected 
print(extracted_review)

['This place has very high quality flowers. All florists are pretty dang expensive, so you have to expect that going in. But the $180 bouquet lasted at least 10 days looking amazing and another 5 looking decent.\n\nThe owner was very pleasant and answered my questions.\n\nI will definitely use this florist every time in the future.', "Thank you!!! Greg. For making up for the last batch of bad roses. These look 10 times healthier. I really appreciate it. I'll definitely be a regular customer.", "I went on to Yelp to find a local florist in Las Vegas.  I live in Northern California and wanted to send flowers to a friend that just had surgery.  I did not realize that my friend was in ICU and couldn't receive the flowers.  Greg, who delivered the flowers was great and helped me get the flowers to my friends husband.  I highly recommend using this flower shop!!!  GREAT CUSTOMER SERVICE!!!!", 'My wife and I celebrated our renewal of vows in Las Vegas and hosted guests in a Bellagio suite. We

# Use spacy nlp to tokenize and pos tag each text

In [15]:
#use spacy library to process the text extracted
def spacy_process(input_list: list):
    final_result = []
    #use nlp which is a spacy in-built function which will return the text and doc which is in token datatype
    doc = nlp(str(input_list))
    #token consists of text and its pos tags
    for token in doc:
        result = []
        result.append(token.text)
        result.append(token.pos_)
        final_result.append(result)
    return final_result,doc

In [16]:
#test the function
processed_review,result_doc = spacy_process(extracted_review)

In [17]:
#print out the processed output
print(processed_review)

[['[', 'PUNCT'], ["'", 'PUNCT'], ['This', 'DET'], ['place', 'NOUN'], ['has', 'VERB'], ['very', 'ADV'], ['high', 'ADJ'], ['quality', 'NOUN'], ['flowers', 'NOUN'], ['.', 'PUNCT'], ['All', 'DET'], ['florists', 'NOUN'], ['are', 'AUX'], ['pretty', 'ADV'], ['dang', 'ADJ'], ['expensive', 'ADJ'], [',', 'PUNCT'], ['so', 'CCONJ'], ['you', 'PRON'], ['have', 'VERB'], ['to', 'PART'], ['expect', 'VERB'], ['that', 'SCONJ'], ['going', 'VERB'], ['in', 'ADP'], ['.', 'PUNCT'], ['But', 'CCONJ'], ['the', 'DET'], ['$', 'SYM'], ['180', 'NUM'], ['bouquet', 'NOUN'], ['lasted', 'VERB'], ['at', 'ADV'], ['least', 'ADV'], ['10', 'NUM'], ['days', 'NOUN'], ['looking', 'VERB'], ['amazing', 'ADJ'], ['and', 'CCONJ'], ['another', 'DET'], ['5', 'NUM'], ['looking', 'VERB'], ['decent.\\n\\nThe', 'NOUN'], ['owner', 'NOUN'], ['was', 'VERB'], ['very', 'ADV'], ['pleasant', 'ADJ'], ['and', 'CCONJ'], ['answered', 'VERB'], ['my', 'PRON'], ['questions.\\n\\nI', 'NOUN'], ['will', 'AUX'], ['definitely', 'ADV'], ['use', 'VERB'], ['th

In [18]:
print(result_doc)

['This place has very high quality flowers. All florists are pretty dang expensive, so you have to expect that going in. But the $180 bouquet lasted at least 10 days looking amazing and another 5 looking decent.\n\nThe owner was very pleasant and answered my questions.\n\nI will definitely use this florist every time in the future.', "Thank you!!! Greg. For making up for the last batch of bad roses. These look 10 times healthier. I really appreciate it. I'll definitely be a regular customer.", "I went on to Yelp to find a local florist in Las Vegas.  I live in Northern California and wanted to send flowers to a friend that just had surgery.  I did not realize that my friend was in ICU and couldn't receive the flowers.  Greg, who delivered the flowers was great and helped me get the flowers to my friends husband.  I highly recommend using this flower shop!!!  GREAT CUSTOMER SERVICE!!!!", 'My wife and I celebrated our renewal of vows in Las Vegas and hosted guests in a Bellagio suite. We

# Find out the relevant pairs

In [19]:
# extact all the phrases that has adverb + adjectives
def adjective_pair_extraction_1(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_1 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADV'and (element.pos_ == 'ADJ')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_1.append(temp_list_tuple)
        selected_combination_1 = list(filter(lambda x: x, selected_combination_1))
    return selected_combination_1,count

In [20]:
# extact all the phrases that has adjectives + adverbs
def adjective_pair_extraction_2(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_2 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADJ'and (element.pos_ == 'ADV')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_2.append(temp_list_tuple)
        selected_combination_2 = list(filter(lambda x: x, selected_combination_2))
    return selected_combination_2,count

In [21]:
# extact all the phrases that has adjectives + adjectives
def adjective_pair_extraction_3(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_3 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADJ'and (element.pos_ == 'ADJ')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_3.append(temp_list_tuple)
        selected_combination_3 = list(filter(lambda x: x, selected_combination_3))
    return selected_combination_3,count

In [22]:
# extact all the phrases that has adverbs + adverbs  + adjectives
def adjective_pair_extraction_4(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_4 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element_before.pos_) == 'ADV'and (element.pos_ == 'ADV') and (element_after.pos_ == 'ADJ')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
            temp_list.append(element_after.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_4.append(temp_list_tuple)
        selected_combination_4 = list(filter(lambda x: x, selected_combination_4))
    return selected_combination_4,count

In [23]:
# extact all the phrases that has adjectives + prep  + noun
def adjective_pair_extraction_5(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_5 = []
    count = 0
    for index in range(len(doc_2)-1):
        temp_list = []
        element_before = doc_2[index-1]
        element = doc_2[index]
        element_after = doc_2[index+1]
        if((element.pos_) == 'ADJ' and (element.pos_ == 'ADP') and (element_after.pos_ == 'NOUN')):
            count += 1
            temp_list.append(element_before.text)
            temp_list.append(element.text)
            temp_list.append(element_after.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_5.append(temp_list_tuple)
        selected_combination_5 = list(filter(lambda x: x, selected_combination_5))
    return selected_combination_5,count

In [24]:
# extact all the phrases that has adjectives + prep  + prep + noun
def adjective_pair_extraction_6(doc_2):
    #finding all the adjective phrase combinations of the selected store
    selected_combination_6 = []
    count = 0
    for index in range(len(doc_2)-2):
        temp_list = []
        element_1 = doc_2[index-1]
        element_2 = doc_2[index]
        element_3 = doc_2[index+1]
        element_4 = doc_2[index+2]
        if((element_1.pos_) == 'ADJ' and (element_2.pos_ == 'ADP') and (element_3.pos_ == 'ADP') and element_4.pos_ == 'NOUN'):
            count += 1
            temp_list.append(element_1.text)
            temp_list.append(element_2.text)
            temp_list.append(element_3.text)
            temp_list.append(element_4.text)
        temp_list_tuple = tuple(temp_list)
        selected_combination_6.append(temp_list_tuple)
        selected_combination_6 = list(filter(lambda x: x, selected_combination_6))
    return selected_combination_6,count

In [25]:
adjective_phrase_result_1,count_1 = adjective_pair_extraction_1(result_doc)

In [26]:
print(count_1)
print(adjective_phrase_result_1)

111
[('very', 'high'), ('pretty', 'dang'), ('very', 'pleasant'), ('super', 'easy'), ('so', 'beautiful'), ('very', 'pleased'), ('how', 'beautiful'), ('most', 'wonderful'), ('Very', 'bad'), ('extremely', 'rude'), ('most', 'beautiful'), ('more', 'impressive'), ('so', 'friendly'), ('always', 'great'), ('so', 'rare'), ('Very', 'nice'), ('here', 'several'), ('extremely', 'friendly'), ('very', 'personable'), ('so', 'appreciative'), ('so', 'happy'), ('especially', 'nice'), ('just', 'stunning'), ('increasingly', 'frustrated'), ('very', 'small'), ('very', 'few'), ('Truly', 'spectacular'), ('very', 'impressed'), ('genuinely', 'helpful'), ('easily', 'able'), ('very', 'strong'), ('very', 'calm'), ('totally', 'empty'), ('super', 'friendly'), ('very', 'classy'), ('so', 'gracious'), ('very', 'helpful'), ('really', 'helpful'), ('so', 'sweet'), ('Far', 'better'), ('so', 'beautiful'), ('so', 'happy'), ('absolutely', 'gorgeous'), ('pretty', 'sure'), ('very', 'friendly'), ('very', 'tempting'), ('super', 'h

In [27]:
adjective_phrase_result_2,count_2 = adjective_pair_extraction_2(result_doc)

In [28]:
print(count_2)
print(adjective_phrase_result_2)

13
[('open', 'late'), ('bad', 'now'), ('few', 'anyway'), ('simple', 'yet'), ('pink', 'lily'), ('purple', 'so'), ('open', 'longer'), ('best', 'so'), ('special', 'together'), ('sure', 'how'), ('least', 'once'), ('ready', 'ahead'), ('resonable', 'very')]


In [29]:
adjective_phrase_result_3,count_3 = adjective_pair_extraction_3(result_doc)

In [30]:
print(count_3)
print(adjective_phrase_result_3)

32
[('dang', 'expensive'), ('creative', 'local'), ('friendly', 'local'), ('beautiful', 'floral'), ('bright', 'vibrant'), ('friendly', 'helpful'), ('little', 'limited'), ('next', 'closest'), ('past', 'few'), ('such', 'beautiful'), ('Super', 'awesome'), ('great', 'local'), ('many', 'other'), ('quiet', 'polite'), ('friendly', 'local'), ('same', 'stellar'), ('cute', 'little'), ('best', 'floral'), ('ready', 'first'), ('bi', '-'), ('-', 'monthly'), ('monthly', 'single'), ('nearby', 'florist'), ('specific', 'tartan'), ('enough', 'nice'), ('Fabulous', 'awesome'), ('few', 'florist'), ('bright', 'yellow'), ('cheap', 'little'), ('fresh', 'lavender'), ('few', 'prior'), ('available', 'cheap')]


In [31]:
adjective_phrase_result_4,count_4 = adjective_pair_extraction_4(result_doc)

In [32]:
print(count_4)
print(adjective_phrase_result_4)

9
[('even', 'more', 'impressive'), ('also', 'very', 'tempting'), ('so', 'incredibly', 'kind'), ('very', 'very', 'impressed'), ('always', 'so', 'nice'), ('enough', 'how', 'amazing'), ('once', 'again', 'beautiful'), ('just', 'too', 'cute'), ('so', 'much', 'easier')]


In [33]:
adjective_phrase_result_5,count_5 = adjective_pair_extraction_5(result_doc)

In [34]:
print(count_5)
print(adjective_phrase_result_5)

0
[]


In [35]:
adjective_phrase_result_6,count_6 = adjective_pair_extraction_6(result_doc)

In [36]:
print(count_6)
print(adjective_phrase_result_6)

1
[('pink', 'due', 'to', 'valentine')]


# Adjective numbers extraction

In [37]:
#how many two-word bigram reviews are related to this one store
total_frequency = len(extracted_review)
def evaluative_indicativeness_bigram (adjective_phrase_list: list):
    bigram=[]
    bigram.extend(adjective_phrase_list)  
    freq_bi = nltk.FreqDist(bigram)
    result_frequency = freq_bi.most_common(5)
    result_frequency_dict = {}
    for element in freq_bi.most_common(5):
      individual_frequency = element[1]/total_frequency
      result_frequency_dict[element[0]] = individual_frequency
    return result_frequency_dict

In [38]:
#combining all the bigram and trigrams
bigram_list = adjective_phrase_result_1 + adjective_phrase_result_2 + adjective_phrase_result_3
trigram_list = adjective_phrase_result_4 + adjective_phrase_result_5

In [39]:
#how many three-word trigram reviews are related to this one store
total_frequency = len(extracted_review)
def evaluative_indicativeness_bigram (adjective_phrase_list: list):
    bigram=[]
    bigram.extend(adjective_phrase_list)  
    freq_bi = nltk.FreqDist(bigram)
    result_frequency = freq_bi.most_common(5)
    result_frequency_dict = {}
    for element in freq_bi.most_common(5):
      individual_frequency = element[1]/total_frequency
      result_frequency_dict[element[0]] = individual_frequency
    return result_frequency_dict

In [40]:
#how many reviews are related to this one store
total_frequency = len(extracted_review)
def evaluative_indicativeness_trigram (adjective_phrase_list: list):
    trigram=[]
    trigram.extend(adjective_phrase_list)  
    freq_tri = nltk.FreqDist(trigram)
    result_frequency = freq_tri.most_common(5)
    result_frequency_dict = {}
    for element in freq_tri.most_common(5):
      individual_frequency = element[1]/total_frequency
      result_frequency_dict[element[0]] = individual_frequency
    return result_frequency_dict

In [41]:
#carry out the processing for bigram and trigram respectively
frequency_result_bi = evaluative_indicativeness_bigram(bigram_list)
frequency_result_tri = evaluative_indicativeness_trigram(trigram_list)

In [42]:
print(frequency_result_bi)

{('so', 'happy'): 0.04, ('so', 'beautiful'): 0.03, ('how', 'beautiful'): 0.03, ('most', 'beautiful'): 0.03, ('very', 'helpful'): 0.03}


In [43]:
print(frequency_result_tri)

{('even', 'more', 'impressive'): 0.01, ('also', 'very', 'tempting'): 0.01, ('so', 'incredibly', 'kind'): 0.01, ('very', 'very', 'impressed'): 0.01, ('always', 'so', 'nice'): 0.01}


In [44]:
#merge two bigram/trigram output list 
def merge(dict1, dict2):
    res = {**dict1, **dict2}
    return res

In [45]:
final_dict = merge(frequency_result_bi,frequency_result_tri)

In [46]:
print(final_dict)

{('so', 'happy'): 0.04, ('so', 'beautiful'): 0.03, ('how', 'beautiful'): 0.03, ('most', 'beautiful'): 0.03, ('very', 'helpful'): 0.03, ('even', 'more', 'impressive'): 0.01, ('also', 'very', 'tempting'): 0.01, ('so', 'incredibly', 'kind'): 0.01, ('very', 'very', 'impressed'): 0.01, ('always', 'so', 'nice'): 0.01}


In [47]:
from operator import itemgetter

# Initialize N 
N = 5
# Using sorted() + itemgetter() + items()
frequency_result_top_5 = dict(sorted(final_dict.items(), key = itemgetter(1), reverse = True)[:N])

In [48]:
print(frequency_result_top_5)

{('so', 'happy'): 0.04, ('so', 'beautiful'): 0.03, ('how', 'beautiful'): 0.03, ('most', 'beautiful'): 0.03, ('very', 'helpful'): 0.03}


In [49]:
#obtain the top 5 phrases and their respective probs
index = 1
index_result = {}

for key in frequency_result_top_5:
    phrase_id = "phrase_{}".format(index)
    prob_id = "prob_{}".format(index)
    index_result[phrase_id] = frequency_result_top_5[key]
    index += 1

In [50]:
print(index_result)

{'phrase_1': 0.04, 'phrase_2': 0.03, 'phrase_3': 0.03, 'phrase_4': 0.03, 'phrase_5': 0.03}


# Trying out on the entire dataset

In [51]:
review_length = len(review_df)
print(review_length)

15300


In [52]:
#finding all the business ids
business_ids = set()
for i in range(0, review_length):
    business = review_df.iloc[i]
    business_ids.add(business['business_id'])

In [53]:
print(len(business_ids))

153


In [54]:
#construct a dictionary to get all the stores and their respective processed reviews
selected = {}
for business in business_ids:
    #extract reviews for each individual id
    individual_reviews = extract_reviews(business)
    #append the extracted review to each business first
    selected[business] = individual_reviews

In [55]:
#use spacy to tokenize and process each review
#and extract relevant pairs
for key in selected:
    processed_review_selected,result_doc_selected = spacy_process(selected[key])
    #adjective extraction for each combination
    final_adjective_phrase_result_1,final_count_1 = adjective_pair_extraction_1(result_doc_selected)
    final_adjective_phrase_result_2,final_count_2 = adjective_pair_extraction_2(result_doc_selected)
    final_adjective_phrase_result_3,final_count_3 = adjective_pair_extraction_3(result_doc_selected)
    final_adjective_phrase_result_4,final_count_4 = adjective_pair_extraction_4(result_doc_selected)
    #adding more considerations
    final_adjective_phrase_result_5,final_count_5 = adjective_pair_extraction_5(result_doc_selected)
    final_adjective_phrase_result_6,final_count_6 = adjective_pair_extraction_6(result_doc_selected)


    #combining the list extracted
    final_bigram_list = final_adjective_phrase_result_1 + final_adjective_phrase_result_2 + final_adjective_phrase_result_3
    final_trigram_list = final_adjective_phrase_result_4 + final_adjective_phrase_result_5 


    #evaluate the respective data obtained 
    final_frequency_result_bi = evaluative_indicativeness_bigram(final_bigram_list)
    final_frequency_result_tri = evaluative_indicativeness_bigram(final_trigram_list)
    
    #merge the data obtained
    final_frequency_result = merge(final_frequency_result_bi,final_frequency_result_tri)
    
    #append the analysis to each of the result
    selected[key] = final_frequency_result

In [56]:
#print out the result to check 
for key in selected:
    print(key)
    print(selected[key])

z_RTK0MaxaRHRQ99hDL2QA
{('as', 'big'): 0.07, ('so', 'good'): 0.05, ('very', 'good'): 0.05, ('really', 'good'): 0.04, ('very', 'nice'): 0.03, ('just', 'as', 'big'): 0.02, ('So', 'so', 'good'): 0.01, ('Definitely', 'filling.\\n\\nAlso', 'convenient'): 0.01, ('actually', 'pretty', 'close'): 0.01, ('so', 'terribly', 'inconvenient'): 0.01}
YgWGWza3sXzI0brcOogR5A
{('very', 'nice'): 0.07, ('so', 'nice'): 0.06, ('so', 'glad'): 0.05, ('so', 'gentle'): 0.04, ('very', 'gentle'): 0.03, ('Very', 'well', 'pleased'): 0.01, ('so', 'very', 'nice'): 0.01, ('all', 'very', 'nice'): 0.01, ('So', 'incredibly', 'happy'): 0.01, ('no', 'longer', 'afraid'): 0.01}
mOzOYgXYOmkdNfZr453PgQ
{('too', 'much'): 0.04, ('very', 'rude'): 0.03, ('so', 'many'): 0.03, ('many', 'other'): 0.03, ('other', 'Vietnamese'): 0.03, ('However', 'very', 'tight'): 0.01, ('Slightly', 'more', 'pricey'): 0.01, ('Now', 'how', 'many'): 0.01, ('here', 'so', 'delicious'): 0.01, ('always', 'very', 'tasty'): 0.01}
IeK0nzbMPbWnUBIPB83v6A
{('very'

# Analysis

In [57]:
#getting the selected store's result for analysis
frequency_result_top_5_key_list = list(frequency_result_top_5)
frequency_result_top_5_value_list = list(frequency_result_top_5.values())


In [73]:
#check the selected store's 1st review and compare its probs with the rest of the stores
print("For business ",selected_id)
phrase_1 = frequency_result_top_5_key_list[0]
prob_1 = frequency_result_top_5_value_list[0]
print("The first adjective phrase is:",phrase_1)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_1):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_1) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rubyBEGXLBLU_JAuEtKEuw
The first adjective phrase is: ('so', 'happy')

Its comparison with the rest of the stores is as follows: 

Store: a4GRh1TlOVhPD401mSPLZg| Probability:  0.06
Store: rubyBEGXLBLU_JAuEtKEuw| Probability:  0.04
Store: QaFO4S6HFUu2NIbeu4OwCg| Probability:  0.02


In [102]:
#check the selected store's 1st review and compare its probs with the rest of the stores
print("For business ",selected_id)
phrase_1 = frequency_result_top_5_key_list[0]
prob_1 = frequency_result_top_5_value_list[0]
print("The first adjective phrase is:",phrase_1)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

count_1 = 0
for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_1):
            count_1 += (selected[key][value]*100)
print(count_1)
overall_prob_1 = count_1/15300
print("The store's probability is:", prob_1)
print("The phrase's overall probability is:",overall_prob_1)

For business  rubyBEGXLBLU_JAuEtKEuw
The first adjective phrase is: ('so', 'happy')

Its comparison with the rest of the stores is as follows: 

12.0
The store's probability is: 0.04
The phrase's overall probability is: 0.000784313725490196


In [103]:
#check the selected store's 2nd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_2 = frequency_result_top_5_key_list[1]
prob_2 = frequency_result_top_5_value_list[1]
print("The second adjective phrase is:",phrase_2)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    if key == selected_id:
        continue
    for value in values:
        if(value == phrase_2):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_2) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rubyBEGXLBLU_JAuEtKEuw
The second adjective phrase is: ('so', 'beautiful')

Its comparison with the rest of the stores is as follows: 



In [84]:
#check the selected store's 2nd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_2 = frequency_result_top_5_key_list[1]
prob_2 = frequency_result_top_5_value_list[1]
print("The second adjective phrase is:",phrase_2)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

count_2 = 0
for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_2):
            count_2 += (selected[key][value]*100)

print(count_2)
overall_prob_2 = count_2/15300
print("The store's probability is:", prob_2)
print("The phrase's overall probability is:",overall_prob_2)

For business  rubyBEGXLBLU_JAuEtKEuw
The second adjective phrase is: ('so', 'beautiful')

Its comparison with the rest of the stores is as follows: 

3.0
The store's probability is: 0.03
The phrase's overall probability is: 0.000392156862745098


In [104]:
#check the selected store's 3rd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_3 = frequency_result_top_5_key_list[2]
prob_3 = frequency_result_top_5_value_list[2]
print("The third adjective phrase is:",phrase_3)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    if key == selected_id:
        continue
    for value in values:
        if(value == phrase_3):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_3) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rubyBEGXLBLU_JAuEtKEuw
The third adjective phrase is: ('how', 'beautiful')

Its comparison with the rest of the stores is as follows: 



In [99]:
#check the selected store's 3rd review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_3 = frequency_result_top_5_key_list[2]
prob_3 = frequency_result_top_5_value_list[2]
print("The third adjective phrase is:",phrase_3)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

count_3 = 0
for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_3):
            count_3 += (selected[key][value]*100)

print(count_3)
overall_prob_3 =  count_3/15300
print("The store's probability is:", prob_3)
print("The phrase's overall probability is:",overall_prob_3)

For business  rubyBEGXLBLU_JAuEtKEuw
The third adjective phrase is: ('how', 'beautiful')

Its comparison with the rest of the stores is as follows: 

3.0
The store's probability is: 0.03
The phrase's overall probability is: 0.000196078431372549


In [105]:
#check the selected store's 4th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_4 = frequency_result_top_5_key_list[3]
prob_4 = frequency_result_top_5_value_list[3]
print("The fourth adjective phrase is:",phrase_4)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    if key == selected_id:
        continue
    for value in values:
        if(value == phrase_4):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_4) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rubyBEGXLBLU_JAuEtKEuw
The fourth adjective phrase is: ('most', 'beautiful')

Its comparison with the rest of the stores is as follows: 



In [100]:
#check the selected store's 4th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_4 = frequency_result_top_5_key_list[3]
prob_4 = frequency_result_top_5_value_list[3]
print("The fourth adjective phrase is:",phrase_4)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

count_4 = 0
for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_4):
            count_4 += (selected[key][value]*100)
print(count_4)
overall_prob_4 =  count_4/15300
print("The store's probability is:", prob_4)
print("The phrase's overall probability is:",overall_prob_4)

For business  rubyBEGXLBLU_JAuEtKEuw
The fourth adjective phrase is: ('most', 'beautiful')

Its comparison with the rest of the stores is as follows: 

3.0
The store's probability is: 0.03
The phrase's overall probability is: 0.000196078431372549


In [106]:
#check the selected store's 5th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_5 = frequency_result_top_5_key_list[4]
prob_5 = frequency_result_top_5_value_list[4]
print("The fifth adjective phrase is:",phrase_5)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

for key in selected:
    values = selected[key]
    if key == selected_id:
        continue
    for value in values:
        if(value == phrase_5):
            print("Store: "+ key + "| Probability: ", selected[key][value])
            if(float(prob_5) - selected[key][value]) >= 0.03:
                print("******* There is a big difference *******")

For business  rubyBEGXLBLU_JAuEtKEuw
The fifth adjective phrase is: ('very', 'helpful')

Its comparison with the rest of the stores is as follows: 

Store: c0t81YxNqZQuTJrTfn3QYA| Probability:  0.03
Store: QbdADbSI908Or0ybjuQLxw| Probability:  0.04
Store: qMyYY8zpHusOXUd6wmqX2w| Probability:  0.03
Store: UuO28w986H0CO_HJluQmew| Probability:  0.04
Store: GwKq3kjkFXhbBMAwe4H3rg| Probability:  0.04
Store: R43QH4FqHOOJapm5TaCEVg| Probability:  0.04
Store: NnxwkiBvSSyKeFiez0DDcQ| Probability:  0.05
Store: QaFO4S6HFUu2NIbeu4OwCg| Probability:  0.04
Store: uhyDNWYRSsom3VrFgOgP_w| Probability:  0.03
Store: NRAmrbIGgXgH5uYv8GGJ5A| Probability:  0.03


In [101]:
#check the selected store's 5th review and compare its probs with the rest of the stores

print("For business ",selected_id)
phrase_5 = frequency_result_top_5_key_list[4]
prob_5 = frequency_result_top_5_value_list[4]
print("The fifth adjective phrase is:",phrase_5)
print()
print("Its comparison with the rest of the stores is as follows: \n")
print("============== Other stores ==============")

count_5 = 0

for key in selected:
    values = selected[key]
    for value in values:
        if(value == phrase_5):
            count_5 += (selected[key][value]*100)
print(count_5)
overall_prob_5 = count_5/15300
print("The store's probability is:", prob_5)
print("The phrase's overall probability is:",overall_prob_5)

For business  rubyBEGXLBLU_JAuEtKEuw
The fifth adjective phrase is: ('very', 'helpful')

Its comparison with the rest of the stores is as follows: 

40.0
The store's probability is: 0.03
The phrase's overall probability is: 0.00261437908496732
