In [12]:
# https://www.kaggle.com/iamaureen/exercise-intro-to-nlp/edit
import pandas as pd

# Load in the data from JSON file
data = pd.read_json('restaurant.json')
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
109,lDJIaF4eYRF4F7g6Zb9euw,lb0QUR5bc4O-Am4hNq9ZGg,r5PLDU-4mSbde5XekTXSCA,4,2,0,0,I used to work food service and my manager at ...,2013-01-27 17:54:54
1013,vvIzf3pr8lTqE_AOsxmgaA,MAmijW4ooUzujkufYYLMeQ,r5PLDU-4mSbde5XekTXSCA,4,0,0,0,We have been trying Eggplant sandwiches all ov...,2015-04-15 04:50:56
1204,UF-JqzMczZ8vvp_4tPK3bQ,slfi6gf_qEYTXy90Sw93sg,r5PLDU-4mSbde5XekTXSCA,5,1,0,0,Amazing Steak and Cheese... Better than any Ph...,2011-03-20 00:57:45
1251,geUJGrKhXynxDC2uvERsLw,N_-UepOzAsuDQwOUtfRFGw,r5PLDU-4mSbde5XekTXSCA,1,0,0,0,Although I have been going to DeFalco's for ye...,2018-07-17 01:48:23
1354,aPctXPeZW3kDq36TRm-CqA,139hD7gkZVzSvSzDPwhNNw,r5PLDU-4mSbde5XekTXSCA,2,0,0,0,"Highs: Ambience, value, pizza and deserts. Thi...",2018-01-21 10:52:58


In [13]:
menu = ["Cheese Steak", "Cheesesteak", "Steak and Cheese", "Italian Combo", "Tiramisu", "Cannoli",
        "Chicken Salad", "Chicken Spinach Salad", "Meatball", "Pizza", "Pizzas", "Spaghetti",
        "Bruchetta", "Eggplant", "Italian Beef", "Purista", "Pasta", "Calzones",  "Calzone",
        "Italian Sausage", "Chicken Cutlet", "Chicken Parm", "Chicken Parmesan", "Gnocchi",
        "Chicken Pesto", "Turkey Sandwich", "Turkey Breast", "Ziti", "Portobello", "Reuben",
        "Mozzarella Caprese",  "Corned Beef", "Garlic Bread", "Pastrami", "Roast Beef",
        "Tuna Salad", "Lasagna", "Artichoke Salad", "Fettuccini Alfredo", "Chicken Parmigiana",
        "Grilled Veggie", "Grilled Veggies", "Grilled Vegetable", "Mac and Cheese", "Macaroni",  
         "Prosciutto", "Salami"]

In [24]:
# Given the data from Yelp and the list of menu items, how would you find which menu items have disappointed diners?

# solution: You could group reviews by what menu items they mention, 
# and then calculate the average rating for reviews that mentioned each item. 
# You can tell which foods are mentioned in reviews with low scores, 
# so the restaurant can fix the recipe or remove those foods from the menu.

# first step: write code to extract the foods mentioned in a single review.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')

index_of_review_to_test_on = 14 #for a specicif line - single review
text_to_test_on = data.text.iloc[index_of_review_to_test_on]

review_doc = nlp(text_to_test_on) #tokenized version of text_to_test_on

matcher = PhraseMatcher(nlp.vocab, attr='LOWER') #create the phrasematcher
menu_tokens_list = [nlp(item) for item in menu]  #create a list of tokens for each item in the menu
matcher.add("MENU", None, *menu_tokens_list)     #add the list to the matcher
matches = matcher(review_doc)                    #find matches in the review doc

#print(matches)

for match in matches:
    print(f"Token number {match[1]}: {review_doc[match[1]:match[2]]}")

Token number 2: Purista
Token number 16: prosciutto
Token number 58: meatball


In [29]:
# Matching on the whole dataset
# We will run this matcher over the whole dataset and collect ratings for each menu item
# For each item that appears in the review text (review.text), append the review's rating to a list of ratings for 
# that item. The lists are kept in a dictionary item_ratings.

from collections import defaultdict
# item_ratings is a dictionary of lists. If a key doesn't exist in item_ratings,
# the key is added with an empty list as the value.
item_ratings = defaultdict(list)

for idx, review in data.iterrows():
    doc = nlp(review.text)    #tokenize each row
    matches = matcher(doc)    #find matches in the doc

    found_items = set([doc[match[1]:match[2]] for match in matches])

    for item in found_items:
        item_ratings[str(item).lower()].append(review.stars)
        
#print (item_ratings)
# sample output: defaultdict(<class 'list'>, {'chicken parmigiana': [4, 5, 4, 4, 5, 5, 5, 5, 5, 4, 4, 4, 3, 4, 5, 5, 4, 5], 
#'eggplant': [4, 4, 4, 4, 3, 1, 1, 1, 1, 5, 4, 4, 4, 3, 3, 3, 3, 3, 4, 4, 3, 4, 5, 5, 4, 5, 5, 5, 3, 3, 5, 5, 5, 4, 4, 5, 4, 5, 4, 4, 4, 5, 5, 5, 2, 5, 5, 5, 5, 4, 4, 3, 3, 5, 5, 5, 5, 5, 5, 4, 2, 4, 3, 5, 5, 5, 5, 3, 4, 4, 4, 5, 5, 2, 4, 4, 5, 5, 2, 5, 2, 5, 4, 4, 4, 4, 4, 3, 5, 1, 1, 5, 5, 5, 5] ....

In [44]:
# Using these item ratings, we will find the menu item with the worst average rating

#creates a dict in the following form --> item: mean
mean_ratings = {item: sum(ratings)/len(ratings) for item, ratings in item_ratings.items()} 
# print((mean_ratings))
worst_item = sorted(mean_ratings, key=mean_ratings.get)[0]
print(worst_item)
print(mean_ratings[worst_item])

chicken cutlet
3.5454545454545454


In [None]:
# count the total number of reviews for each item

counts = {item: len(ratings) for item, ratings in item_ratings.items()}