Prepare Restaurant Data for Analysis

In [1]:
import pandas as pd

# Load JSON file
restaurants = pd.read_json('restaurants.json')

# Define a function to extract the 'alias' values from a list of categories
def extract_aliases(categories):
    return [category['alias'] for category in categories]

# Apply the function to the 'categories' column and store the result in a new column called 'category_aliases'
restaurants['categories'] = restaurants['categories'].apply(lambda x: extract_aliases(x))

# Print the first five rows of the DataFrame
print(restaurants.head())


                       id                                              alias  \
0  M1cIV-JrVOxMjG_K6bUeiw                         craft-and-common-orlando-2   
1  WulVBxLRw4mwn4yjG4JkyQ                           kres-chophouse-orlando-2   
2  wD_LRs35rEldm95MtTdKJw                    tin-and-taco-downtown-orlando-3   
3  7HDwsoFVZwj9llu5QOwtEw  super-rico-colombian-restaurant-and-bar-orlando-2   
4  BAle9XGF4_x-uHAQi59qCw                      the-greenery-creamery-orlando   

                                    name  \
0                         Craft & Common   
1                         Kres Chophouse   
2                  Tin & Taco - Downtown   
3  Super Rico Colombian Restaurant & Bar   
4                  The Greenery Creamery   

                                           image_url  is_closed  \
0  https://s3-media3.fl.yelpcdn.com/bphoto/oVW03A...      False   
1  https://s3-media2.fl.yelpcdn.com/bphoto/HhDNvu...      False   
2  https://s3-media4.fl.yelpcdn.com/bphoto/wXve51... 

In [2]:
restaurants.drop('alias', axis=1, inplace=True)
restaurants.drop('name', axis=1, inplace=True)
restaurants.drop('image_url', axis=1, inplace=True)
restaurants.drop('url', axis=1, inplace=True)
restaurants.drop('phone', axis=1, inplace=True)
restaurants.drop('location', axis=1, inplace=True)
restaurants.drop('display_phone', axis=1, inplace=True)
restaurants.drop('distance', axis=1, inplace=True)
restaurants.drop('is_closed', axis=1, inplace=True)




print(restaurants.head())
print(restaurants.dtypes)

                       id  review_count                         categories  \
0  M1cIV-JrVOxMjG_K6bUeiw           568  [coffee, breakfast_brunch, cafes]   
1  WulVBxLRw4mwn4yjG4JkyQ           897     [steak, cocktailbars, seafood]   
2  wD_LRs35rEldm95MtTdKJw           757       [tacos, beerbar, newmexican]   
3  7HDwsoFVZwj9llu5QOwtEw           517    [colombian, burgers, juicebars]   
4  BAle9XGF4_x-uHAQi59qCw           494        [desserts, icecream, vegan]   

   rating                                        coordinates  \
0     4.5     {'latitude': 28.54596, 'longitude': -81.37797}   
1     4.5  {'latitude': 28.5406819, 'longitude': -81.3794...   
2     4.5  {'latitude': 28.54345886545256, 'longitude': -...   
3     4.5  {'latitude': 28.5422483, 'longitude': -81.3802...   
4     4.5     {'latitude': 28.54012, 'longitude': -81.37198}   

         transactions price  
0  [delivery, pickup]    $$  
1          [delivery]   $$$  
2          [delivery]    $$  
3          [delivery]    $

Prepare Reviews for Data Analysis

In [3]:
import json

# Load the JSON data into a Python dictionary
with open('reviews.json', 'r') as f:
    data = json.load(f)

# Initialize an empty list to store the extracted data
rev = []

# Iterate through each restaurant ID and its reviews
for rest_id, rest_reviews in data.items():
    # Iterate through each review for the restaurant
    for review in rest_reviews:
        # Extract the relevant fields from the review
        review_data = {
            'restaurant_id': rest_id,
            'review_rating': review['rating'],
            'user_id': review['user']['id'],
        }
        # Append the review data to the list
        rev.append(review_data)

# Convert the list of reviews to a pandas DataFrame
reviews = pd.DataFrame(rev)

# Print the first five rows of the DataFrame
print(reviews.head())

print(reviews.dtypes)
print(reviews.shape)

            restaurant_id  review_rating                 user_id
0  M1cIV-JrVOxMjG_K6bUeiw              5  _PrAKxHQY3BsIE_vGnLOdw
1  M1cIV-JrVOxMjG_K6bUeiw              4  6iJroP8frO-EEjjA9p9rjQ
2  M1cIV-JrVOxMjG_K6bUeiw              4  9lkKGcEQavs2sXS0upwhLg
3  M1cIV-JrVOxMjG_K6bUeiw              5  _PrAKxHQY3BsIE_vGnLOdw
4  M1cIV-JrVOxMjG_K6bUeiw              4  6iJroP8frO-EEjjA9p9rjQ
restaurant_id    object
review_rating     int64
user_id          object
dtype: object
(8798, 3)


In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np


# Define a custom tokenizer function that joins the individual categories within each list
def tokenize_categories(categories):
    return ','.join(categories)


# Define a custom function to transform the price column into a numeric scale
def transform_price(price):
    if price is None:
        return np.nan
    elif price == '$':
        return 1
    elif price == '$$':
        return 2
    elif price == '$$$':
        return 3
    elif price == '$$$$':
        return 4
    else:
        return np.nan


review_pivot = pd.pivot_table(reviews, index='user_id', columns='restaurant_id', values='review_rating')

# Create a new DataFrame that includes the transformed price and rating columns
restaurants_transformed = restaurants.copy()
restaurants_transformed['price_numeric'] = restaurants_transformed['price'].apply(transform_price)
restaurants_transformed = restaurants_transformed[['id', 'categories', 'rating', 'price_numeric']]
restaurants_transformed = restaurants_transformed.dropna()

# Scale the rating and price columns so that they have the same weight as the category column
scaler = StandardScaler()
scaler.fit(restaurants_transformed[['rating', 'price_numeric']])
categories_matrix = vectorizer.fit_transform(restaurants_transformed['categories'].apply(tokenize_categories))
cosine_similarities = cosine_similarity(categories_matrix)

def get_recommendations(restaurants_list, n=10):
    # Calculate the cosine similarity between the selected restaurants and all other restaurants
    selected_restaurants = restaurants_transformed.loc[restaurants_transformed['id'].isin(restaurants_list)]
    selected_categories = selected_restaurants['categories'].apply(tokenize_categories)
    selected_categories_matrix = vectorizer.transform(selected_categories)
    selected_restaurants_scaled = scaler.transform(selected_restaurants[['rating', 'price_numeric']])
    selected_restaurants_full = np.hstack([selected_categories_matrix.toarray(), selected_restaurants_scaled])
    cosine_similarities = cosine_similarity(
        selected_restaurants_full,
        np.hstack([categories_matrix.toarray(), price_scaled, rating_scaled]))

    # Find the restaurants with the highest cosine similarity to the selected restaurants
    similar_restaurants = pd.Series(cosine_similarities[-1], index=restaurants_transformed.index).sort_values(ascending=False)
    top_similar_restaurants = similar_restaurants.head(n+1)[1:]

    # Return the full information for the top similar restaurants
    return restaurants.loc[top_similar_restaurants.index]



# Example usage
recommendations = get_recommendations(['gfM4BLPhZNXDkeWx8jOkvw', 'a__umvCwQmGXsHrLB9Q9FQ'], n=10)
print(recommendations)


                          id  review_count  \
1303  jQPML07vnQIxxEs29LG7qg          1084   
358   ULdHiY51w7QjVea-mi4mJQ            71   
332   tlb4-J-FHUgbRJusYLPfSA            33   
860   E-In3GkNpToXr9ISVmUswQ           210   
57    96RSYhKtJXU70XmKuhtHeQ           393   
1097  GAE73k8sKR9Tm_7CfVKf0Q            43   
299   vv9DS3NcbIt-_j78cMN5ug            36   
43    WdTLUw0Y-lnGSv-CRnC8Ag           707   
1354  fgQaq4AMWZAgzvbWQDOVGA            39   
211   wq9TmD0S5eqBmJmA7uDvbw           419   

                              categories  rating  \
1303          [seafood, southern, cajun]     3.5   
358                     [seafood, cajun]     4.0   
332        [southern, seafood, soulfood]     4.0   
860        [seafood, cocktailbars, soup]     4.0   
57          [seafood, cajun, sandwiches]     4.0   
1097  [seafood, foodtrucks, puertorican]     4.0   
299     [colombian, hotdogs, foodtrucks]     5.0   
43          [cajun, seafood, sandwiches]     4.0   
1354   [hotdogs, chicken_



In [38]:
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split
from collections import defaultdict

# Convert the user-restaurant matrix to a surprise-compatible format
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(reviews[['user_id', 'restaurant_id', 'review_rating']], reader)

# Split the data into training and testing sets
trainset = data.build_full_trainset()

# Define the similarity metric and algorithm for the model
sim_options = {'name': 'cosine', 'user_based': True}
algo = KNNWithMeans(sim_options=sim_options)

# Train the model on the training data
algo.fit(trainset)

# Create a new user with the user's reviews as the values
new_user = {'GFVxe4gtWZwlILWp1qPTLg': 4}

# Get the top-n recommended restaurants for the new user
top_n = defaultdict(float)
for restaurant_id, rating in new_user.items():
    neighbors = algo.get_neighbors(trainset.to_inner_iid(restaurant_id), k=10)
    for neighbor in neighbors:
        neighbor_id = trainset.to_raw_iid(neighbor)
        if neighbor_id not in new_user:
            prediction = algo.predict(uid='dummy', iid=neighbor_id)
            top_n[neighbor_id] += prediction.est

# Sort the recommended restaurants by score
top_n = sorted(top_n.items(), key=lambda x: x[1], reverse=True)

# Extract the restaurant IDs from the recommendations
recommended_restaurants = [restaurant_id for restaurant_id, score in top_n]

print(recommended_restaurants)


Computing the cosine similarity matrix...
Done computing similarity matrix.


ValueError: Item GFVxe4gtWZwlILWp1qPTLg is not part of the trainset.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

data = pd.merge(restaurants, reviews, left_on='id', right_on='restaurant_id')

# Merge the restaurant and review data on the restaurant ID and review ID fields
data = pd.merge(restaurants, reviews, left_on='id', right_on='restaurant_id')

# Select the relevant fields for the model
model_data = data[['restaurant_id', 'categories', 'rating']]

# Create a new DataFrame with one row for each restaurant-category combination
model_data = data.explode('categories')

# Check the data type of the 'rating' column
print(model_data['rating'].dtype)

# Create the user-item matrix
user_item_matrix = pd.pivot_table(model_data, index='restaurant_id', columns='categories', values='rating', fill_value=0)

# Calculate the cosine similarity between the rows of the matrix
item_similarities = cosine_similarity(user_item_matrix)

# Select a target user
target_user = user_item_matrix.index[0]

# Calculate the similarity between the target user and all other users
user_similarities = item_similarities_df[target_user].sort_values(ascending=False)

# Select the top-k similar users
k = 10
top_k_users = user_similarities[1:k+1].index

# Generate restaurant recommendations for the target user
recommendations = user_item_matrix.loc[top_k_users].mean()

# Get the top-k recommended restaurants
top_k_restaurant_ratings = recommendations.nlargest(k)
top_k_restaurants = top_k_restaurant_ratings.index.tolist()

# Return the restaurant IDs instead of categories
top_k_restaurants_ids = [int(restaurant.split('_')[1]) for restaurant in top_k_restaurants]
print(top_k_restaurants_ids)

