In [1]:
import pandas as pd
import json

rest_df = pd.read_json('restaurants.json')

rest_df['categories'] = rest_df['categories'].apply(lambda x: [cat['alias'] for cat in x])

# extract the latitude and longitude from the 'location' dictionary and create new columns for each
rest_df['latitude'] = rest_df['coordinates'].apply(lambda x: x['latitude'])
rest_df['longitude'] = rest_df['coordinates'].apply(lambda x: x['longitude'])


# convert the 'price' column to a string and then to a numerical value between 0 and 3
rest_df['price'] = rest_df['price'].astype(str).apply(lambda x: len(x) - 1)

rest_df = rest_df.drop(columns=['alias', 'name', 'image_url', 'is_closed', 'url', 'location', 'phone', 'display_phone', 'distance', 'coordinates'])

print(rest_df.head())

                       id  review_count                         categories  \
0  M1cIV-JrVOxMjG_K6bUeiw           568  [coffee, breakfast_brunch, cafes]   
1  WulVBxLRw4mwn4yjG4JkyQ           897     [steak, cocktailbars, seafood]   
2  wD_LRs35rEldm95MtTdKJw           757       [tacos, beerbar, newmexican]   
3  7HDwsoFVZwj9llu5QOwtEw           517    [colombian, burgers, juicebars]   
4  BAle9XGF4_x-uHAQi59qCw           494        [desserts, icecream, vegan]   

   rating        transactions  price   latitude  longitude  
0     4.5  [delivery, pickup]      1  28.545960 -81.377970  
1     4.5          [delivery]      2  28.540682 -81.379423  
2     4.5          [delivery]      1  28.543459 -81.380053  
3     4.5          [delivery]      1  28.542248 -81.380262  
4     4.5          [delivery]      0  28.540120 -81.371980  


In [2]:
num_missing = rest_df.isna().sum()

# print the results
print(num_missing)
rest_df = rest_df.dropna()

print(rest_df.describe())

id              0
review_count    0
categories      0
rating          0
transactions    0
price           0
latitude        2
longitude       2
dtype: int64
       review_count       rating        price     latitude    longitude
count   1542.000000  1542.000000  1542.000000  1542.000000  1542.000000
mean     170.151102     3.778859     0.981842    28.544812   -81.369740
std      282.842354     0.861834     0.775807     0.051857     0.060231
min        1.000000     1.000000     0.000000    28.262680   -82.508204
25%       17.000000     3.500000     0.000000    28.514519   -81.397269
50%       71.000000     4.000000     1.000000    28.547999   -81.369202
75%      203.750000     4.500000     2.000000    28.582480   -81.336286
max     3071.000000     5.000000     3.000000    28.891080   -80.845600


In [3]:
with open('reviews.json', 'r') as f:
    data = json.load(f)

reviews = []

# Iterate through each restaurant ID and its reviews
for rest_id, rest_reviews in data.items():
    # Iterate through each review for the restaurant
    for review in rest_reviews:
        # Extract the relevant fields from the review
        review_data = {
            'restaurant_id': rest_id,
            'review_rating': review['rating'],
            'user_id': review['user']['id'],
        }
        # Append the review data to the list
        reviews.append(review_data)

review_df = pd.DataFrame(reviews)

print(review_df.head())

print(review_df.dtypes)
print(review_df.shape)

            restaurant_id  review_rating                 user_id
0  M1cIV-JrVOxMjG_K6bUeiw              5  _PrAKxHQY3BsIE_vGnLOdw
1  M1cIV-JrVOxMjG_K6bUeiw              4  6iJroP8frO-EEjjA9p9rjQ
2  M1cIV-JrVOxMjG_K6bUeiw              4  9lkKGcEQavs2sXS0upwhLg
3  M1cIV-JrVOxMjG_K6bUeiw              5  _PrAKxHQY3BsIE_vGnLOdw
4  M1cIV-JrVOxMjG_K6bUeiw              4  6iJroP8frO-EEjjA9p9rjQ
restaurant_id    object
review_rating     int64
user_id          object
dtype: object
(8798, 3)


In [4]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Normalize 'price' 'review_count' and 'rating'

rest_df['price'] = scaler.fit_transform(rest_df[['price']])
rest_df['review_count'] = scaler.fit_transform(rest_df[['review_count']])
rest_df['rating'] = scaler.fit_transform(rest_df[['rating']])
print(rest_df.head())

                       id  review_count                         categories  \
0  M1cIV-JrVOxMjG_K6bUeiw      0.184691  [coffee, breakfast_brunch, cafes]   
1  WulVBxLRw4mwn4yjG4JkyQ      0.291857     [steak, cocktailbars, seafood]   
2  wD_LRs35rEldm95MtTdKJw      0.246254       [tacos, beerbar, newmexican]   
3  7HDwsoFVZwj9llu5QOwtEw      0.168078    [colombian, burgers, juicebars]   
4  BAle9XGF4_x-uHAQi59qCw      0.160586        [desserts, icecream, vegan]   

   rating        transactions     price   latitude  longitude  
0   0.875  [delivery, pickup]  0.333333  28.545960 -81.377970  
1   0.875          [delivery]  0.666667  28.540682 -81.379423  
2   0.875          [delivery]  0.333333  28.543459 -81.380053  
3   0.875          [delivery]  0.333333  28.542248 -81.380262  
4   0.875          [delivery]  0.000000  28.540120 -81.371980  


In [5]:
# extract the categories and create a new dataframe
categories_df = pd.DataFrame(rest_df['categories'].values.tolist(), index=rest_df.index)

# perform one-hot encoding
categories_one_hot = pd.get_dummies(categories_df.apply(pd.Series).stack()).sum(level=0)

category_counts = categories_one_hot.sum().sort_values(ascending=False)
print(category_counts)

# concatenate the one-hot encoded categories with the original dataframe
rest_df = pd.concat([rest_df, categories_one_hot], axis=1)
rest_df = rest_df.drop('categories', axis=1)


# extract the transactions and create a new dataframe
transactions_df = pd.DataFrame(rest_df['transactions'].values.tolist(), index=rest_df.index)

# perform one-hot encoding
transactions_one_hot = pd.get_dummies(transactions_df.apply(pd.Series).stack()).sum(level=0)

# fill missing values with 0
transactions_one_hot = transactions_one_hot.fillna(0)

transactions_counts = transactions_one_hot.sum().sort_values(ascending=False)
print(transactions_counts)

# concatenate the one-hot encoded transactions with the original dataframe
rest_df = pd.concat([rest_df, transactions_one_hot], axis=1)

rest_df = rest_df.drop('transactions', axis=1)

print(rest_df.head())


sandwiches          199
tradamerican        152
hotdogs             145
pizza               143
breakfast_brunch    141
                   ... 
shanghainese          1
markets               1
singaporean           1
chocolate             1
shoppingcenters       1
Length: 172, dtype: int64
delivery                  1125
pickup                     689
restaurant_reservation      14
dtype: int64
                       id  review_count  rating     price   latitude  \
0  M1cIV-JrVOxMjG_K6bUeiw      0.184691   0.875  0.333333  28.545960   
1  WulVBxLRw4mwn4yjG4JkyQ      0.291857   0.875  0.666667  28.540682   
2  wD_LRs35rEldm95MtTdKJw      0.246254   0.875  0.333333  28.543459   
3  7HDwsoFVZwj9llu5QOwtEw      0.168078   0.875  0.333333  28.542248   
4  BAle9XGF4_x-uHAQi59qCw      0.160586   0.875  0.000000  28.540120   

   longitude  acaibowls  african  argentine  armenian  ...  venues  \
0 -81.377970          0        0          0         0  ...       0   
1 -81.379423          0        

  categories_one_hot = pd.get_dummies(categories_df.apply(pd.Series).stack()).sum(level=0)
  transactions_one_hot = pd.get_dummies(transactions_df.apply(pd.Series).stack()).sum(level=0)


In [6]:
# TODO none of these rows should be missing

# select rows with missing values
missing_rows = rest_df.loc[rest_df.isna().any(axis=1)]

# print the results
print(missing_rows.head())

# drop missing rows
rest_df = rest_df.dropna()

print(rest_df.head())

                        id  review_count  rating     price  latitude  \
15  f-dvot7GwmnObBdClhqidg      0.102932   0.875  0.333333  28.54142   
54  YrxAFQg9uollouZg6yRzHQ      0.042997   0.875  0.333333  28.54247   
67  Cj4Lf7sz2KitWQPGhVj5tA      0.063518   0.750  0.333333  28.53600   
70  CrJSlhxhgpGjO8HYFLFQwQ      0.078176   0.875  0.666667  28.55324   
74  GfXflrmYwZ9olGGMR1TNyA      0.147557   0.750  0.333333  28.55296   

    longitude  acaibowls  african  argentine  armenian  ...  venues  \
15 -81.377770          0        0          0         0  ...       0   
54 -81.369150          0        0          0         0  ...       0   
67 -81.375358          0        0          0         0  ...       0   
70 -81.389120          0        0          0         0  ...       0   
74 -81.358720          0        0          0         0  ...       0   

    vietnamese  waffles  whiskeybars  wine_bars  winetasteclasses  wraps  \
15           0        0            0          0                 

In [7]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors

# TODO test for overfitting

# set the ID column as the index
rest_df = rest_df.set_index('id')

# fit a nearest neighbors model on the preprocessed data
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(rest_df)

def get_recommendations(ids, n=5):
    """
    Given a list of restaurant IDs, return n recommendations based on similar restaurants.
    """
    # get the preprocessed data for the input IDs
    input_data = rest_df.loc[ids]

    # find the n closest restaurants
    distances, indices = knn_model.kneighbors(input_data, n_neighbors=n+1)

    # exclude the input IDs from the recommendations
    indices = indices[:, 1:]

    # get the IDs of the recommended restaurants
    rec_ids = rest_df.iloc[indices.ravel()].index
    
    # return the recommended restaurants
    return rec_ids.tolist()

# example usage: recommend 5 restaurants similar to restaurants with IDs 'id1' and 'id2'
recs = get_recommendations(['WulVBxLRw4mwn4yjG4JkyQ', '7HDwsoFVZwj9llu5QOwtEw'], n=5)
print(recs)


['sacDLoEaaV1-R3xh0EsKEw', 'u1Fvw7GfLF0HGwM2gNpW3w', 'B6kJqpVPLNIDId87ueUhFQ', 'PVejouuFNCjSDqGv_YwAFA', 'HNPahPWST4I9EH_gs8L7GA', 'GDs0ymtRPWWHlUMBfNT5yg', 'CnJP5mdCU6Ml9u48bhLzuQ', '9pqtIAm33b40yAlsCHEd-A', 'qBilWGILaDI2KHQ-SFsLJA', 'ZRXkJWgGP54MRpbUZabCnw']


In [8]:
import joblib
joblib.dump(knn_model, 'model.joblib')

['model.joblib']

In [9]:
def print_json_for_ids(ids):
    with open('restaurants.json', 'r') as f:
        data = json.load(f)
        
        for obj in data:
            if obj['id'] in ids:
                print(json.dumps(obj, indent=4))


recs = get_recommendations(['WulVBxLRw4mwn4yjG4JkyQ', '7HDwsoFVZwj9llu5QOwtEw'], n=5)
print_json_for_ids(recs)

recs = get_recommendations(['7HDwsoFVZwj9llu5QOwtEw'], n=5)
print_json_for_ids(recs)


{
    "id": "HNPahPWST4I9EH_gs8L7GA",
    "alias": "firebirds-wood-fired-grill-orlando",
    "name": "Firebirds Wood Fired Grill",
    "image_url": "https://s3-media1.fl.yelpcdn.com/bphoto/XgA0hqFbwhwz-sK1JqzNeA/o.jpg",
    "is_closed": false,
    "url": "https://www.yelp.com/biz/firebirds-wood-fired-grill-orlando?adjust_creative=aO5yZNW_8NJRqRdup8fiEA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=aO5yZNW_8NJRqRdup8fiEA",
    "review_count": 693,
    "categories": [
        {
            "alias": "seafood",
            "title": "Seafood"
        },
        {
            "alias": "steak",
            "title": "Steakhouses"
        },
        {
            "alias": "wine_bars",
            "title": "Wine Bars"
        }
    ],
    "rating": 4.0,
    "coordinates": {
        "latitude": 28.5652994624947,
        "longitude": -81.364627625795
    },
    "transactions": [
        "delivery"
    ],
    "price": "$$",
    "location": {
        "address1": "1562 N Mills

{
    "id": "ZRXkJWgGP54MRpbUZabCnw",
    "alias": "skyebird-orlando-2",
    "name": "Skyebird",
    "image_url": "https://s3-media2.fl.yelpcdn.com/bphoto/zvnR6S2phhQ6LCg_90SQVQ/o.jpg",
    "is_closed": false,
    "url": "https://www.yelp.com/biz/skyebird-orlando-2?adjust_creative=aO5yZNW_8NJRqRdup8fiEA&utm_campaign=yelp_api_v3&utm_medium=api_v3_business_search&utm_source=aO5yZNW_8NJRqRdup8fiEA",
    "review_count": 138,
    "categories": [
        {
            "alias": "juicebars",
            "title": "Juice Bars & Smoothies"
        },
        {
            "alias": "raw_food",
            "title": "Live/Raw Food"
        }
    ],
    "rating": 4.0,
    "coordinates": {
        "latitude": 28.56832,
        "longitude": -81.34366
    },
    "transactions": [
        "delivery"
    ],
    "price": "$",
    "location": {
        "address1": "3201 Corrine Dr",
        "address2": "",
        "address3": "East End Market",
        "city": "Orlando",
        "zip_code": "32803",
       

In [10]:
# collaborative filtering approach.