In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

# Raise pandas' display limit so frames of up to 500 rows are shown without truncation.
pd.options.display.max_rows = 500

In [2]:
# Locations of the input files, all relative to the notebook's working directory.
data_path = Path("../data")
businesses_path = data_path / "yelp_dataset" / "yelp_academic_dataset_business.json"
reviews_path = data_path / "yelp_dataset" / "yelp_academic_dataset_review.json"
photos_path = data_path / "yelp_photos" / "photos.json"

# Output directory for files derived in this notebook.
# `parents=True` keeps this cell from raising FileNotFoundError when `../data` itself
# does not exist yet; `exist_ok=True` makes re-running the cell harmless.
restaurants_path = data_path / "restaurants"
restaurants_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Build `restaurants`: one row per business whose category string mentions
# "restaurant" or "food". The input is read line by line, one JSON object per line.
restaurants_data = []
# Explicit UTF-8: without `encoding`, open() falls back to the platform's locale
# encoding, which can mis-decode or reject non-ASCII text on some systems.
with open(businesses_path, encoding="utf-8") as f:
    for line_raw in f:
        if not line_raw.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        line = json.loads(line_raw)
        business = {
            "business_id": line.get("business_id"),
            "categories_raw": line.get("categories"),
            "review_count": line.get("review_count"),
            "city": line.get("city"),
            "state": line.get("state"),
        }

        # Skip records where any of the required fields is absent or null.
        if None in business.values():
            continue

        # `categories_raw` is known to be non-null here, so no fallback is needed.
        categories_lower = business["categories_raw"].lower()
        if "restaurant" in categories_lower or "food" in categories_lower:
            restaurants_data.append(business)
restaurants = pd.DataFrame(data=restaurants_data)

In [4]:
# Build `reviews`: one row per review with the reviewer, the business and the star
# rating (the "stars" field is stored under the column name "rating").
reviews_data = []
# Explicit UTF-8: without `encoding`, open() falls back to the platform's locale
# encoding, which can mis-decode or reject non-ASCII text on some systems.
with open(reviews_path, encoding="utf-8") as f:
    for line_raw in f:
        if not line_raw.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        line = json.loads(line_raw)
        review = {
            "user_id": line.get("user_id"),
            "business_id": line.get("business_id"),
            "rating": line.get("stars"),
        }

        # Skip records where any of the required fields is absent or null.
        if None in review.values():
            continue

        reviews_data.append(review)
reviews = pd.DataFrame(data=reviews_data)

In [5]:
# Build `photos`: one row per photo, linking a photo id to the business it belongs to.
photos_data = []
# Explicit UTF-8: without `encoding`, open() falls back to the platform's locale
# encoding, which can mis-decode or reject non-ASCII text on some systems.
with open(photos_path, encoding="utf-8") as f:
    for line_raw in f:
        if not line_raw.strip():
            continue  # tolerate blank lines instead of crashing in json.loads
        line = json.loads(line_raw)
        photo = {
            "photo_id": line.get("photo_id"),
            "business_id": line.get("business_id"),
        }

        # Skip records where any of the required fields is absent or null.
        if None in photo.values():
            continue

        photos_data.append(photo)
photos = pd.DataFrame(data=photos_data)

In [6]:
# Cuisine / dish keys. Each entry is matched as a *substring* of a restaurant's
# lower-cased, space-stripped category string, which is why several entries are
# stems ("mexic", "vietnam", "mediterran") or run-together words ("icecream", "hotdog").
categories = [
    "mexic", "sandwich", "vietnam", "pizza", "burger",
    "taco", "seafood", "chinese", "wing", "japan",
    "asia", "salad", "sushi", "barbecue", "bbq",
    "noodle", "italian", "india", "soup", "dessert",
    "thai", "chicken", "hotdog", "mediterran", "korea",
    "hawaii", "greek", "steak", "poke", "cajun",
    "creole", "bagel", "hotpot", "icecream", "filipino",
    "ramen", "donut", "cheesesteak", "dimsum", "pasta",
]

def _matching_category_keys(normalized):
    """Return the set of entries of `categories` that occur as substrings of `normalized`."""
    return {key for key in categories if key in normalized}


# Lower-case the raw category string and strip spaces before matching, so that
# multi-word categories line up with run-together keys such as "icecream".
normalized_categories = restaurants["categories_raw"].str.lower().str.replace(" ", "")
restaurants["categories"] = normalized_categories.apply(_matching_category_keys)

# Expose the frame's row label as a regular column so it can be carried through merges.
restaurants["restaurant_index"] = restaurants.index

# Number of restaurants per matched key; restaurants matching no key contribute nothing.
category_counts = restaurants["categories"].explode().dropna().value_counts()

In [7]:
# Reviews written per user, counted over all loaded reviews.
user_review_counts = reviews["user_id"].value_counts()

# Give every user with more than 10 reviews a dense 0-based `user_index`.
user_indices = (
    user_review_counts[user_review_counts > 10]
    .index
    .to_series(name="user_id")
    .reset_index(drop=True)   # discard the user-id labels, keep positions 0..k-1
    .reset_index()            # turn those positions into a column named "index"
    .rename(columns={"index": "user_index"})
)

# Attach `restaurant_index` and `user_index` to each review. Both merges are inner
# joins, so reviews of businesses missing from `restaurants`, or written by users
# without a `user_index`, are dropped.
# NOTE: this rebinds `reviews`; re-running the cell without first re-running the cell
# that loads `reviews` would merge onto the already-merged frame.
reviews = (
    reviews
    .merge(restaurants[["business_id", "restaurant_index"]], on="business_id")
    .merge(user_indices, on="user_id", how="inner")
)

In [8]:
# Photos per business. `.size()` counts the rows of each group directly; it replaces
# `groupby(...).apply(len)`, which calls Python once per group and triggers pandas'
# deprecation warning about applying a function to the grouping columns.
photo_counts = photos.groupby("business_id").size().rename("photo_count").reset_index()

# Inner join: only restaurants that have at least one photo remain.
restaurant_counts = restaurants.merge(photo_counts, on="business_id")

# Restaurants per category key, restricted to restaurants with at least 10 photos.
# NOTE: this rebinds `category_counts`, replacing the all-restaurants counts computed
# in the category-matching cell.
category_counts = (
    restaurant_counts.loc[restaurant_counts["photo_count"] >= 10, "categories"]
    .explode()
    .dropna()
    .value_counts()
)

# Imports kept inside this cell so that it remains runnable on its own.
from scipy.sparse import csr_matrix, identity
import cvxpy as cp

# The decision vector `x` has one entry per restaurant followed by one entry per user:
# positions [0, m) are restaurants (by `restaurant_index`), positions [m, m + n) are
# users (by `user_index`, shifted by m).
n = reviews.user_index.max() + 1  # users
# Restaurants. `restaurant_index` is the row label of `restaurants`, and the rows of `A`
# below are built from the *whole* `restaurants` frame, so the restaurant dimension has
# to be len(restaurants). Deriving it from reviews.restaurant_index.max() + 1 is too
# small whenever the last restaurants have no remaining reviews, which makes `A @ x`
# fail on mismatched shapes.
m = len(restaurants)

# Symmetric coupling matrix: -1 for every review, placed at both (restaurant, user) and
# (user, restaurant); entries that repeat are summed by the csr_matrix constructor.
# (n + m) is then added on the diagonal.
restaurant_nodes = reviews.restaurant_index.to_numpy()
user_nodes = (reviews.user_index + m).to_numpy()
P = csr_matrix(
    (
        -np.ones(2 * len(reviews)),
        (
            np.concatenate([restaurant_nodes, user_nodes]),
            np.concatenate([user_nodes, restaurant_nodes]),
        ),
    ),
    shape=(n + m, n + m),
) + identity(n + m) * (n + m)

# One row per category in `category_counts`: 1 for restaurants tagged with that
# category, 0 elsewhere (including all user positions).
A = np.concatenate([
    np.concatenate([
        np.where(restaurants["categories"].apply(lambda x: category in x), 1, 0),
        np.zeros(n)
    ]).reshape(1, -1)
    for category in category_counts.index
])

# NOTE(review): `b` is not referenced by the problem defined below; kept in case other
# cells rely on it.
b = np.concatenate([(n+m) * np.ones(m), (n+m+10) * np.ones(n)])

x = cp.Variable(m+n)

# NOTE(review): assume_PSD=True tells cvxpy to skip its own check that P is positive
# semidefinite -- confirm that this holds for the data at hand.
objective = cp.quad_form(x, P, assume_PSD=True) - (n+m) * np.ones(m+n) @ x
constraints = [
    0 <= x,                        # every entry of x lies in [0, 1]
    x <= 1,
    np.ones(m+n) @ x >= 1000,      # the entries of x sum to between 1000 and 3000
    np.ones(m+n) @ x <= 3000,
    A @ x >= 5                     # per category, tagged restaurants sum to at least 5
]
prob = cp.Problem(cp.Minimize(objective), constraints)
prob.solve(solver=cp.OSQP, verbose=True, time_limit=10)

In [9]:
# Number of restaurants in each (city, state) pair.
city_counts = (
    restaurants[["city", "state"]]
    .value_counts()
    .rename("restaurant_count")
    .reset_index()
)

# Restaurants located in cities with more than 3000 restaurants, further restricted
# to restaurants that have more than 5 photos.
restaurants_by_city = (
    city_counts[city_counts["restaurant_count"] > 3000]
    .merge(restaurants, on=["city", "state"])
    .merge(
        photo_counts[photo_counts["photo_count"] > 5]["business_id"],
        on="business_id",
        how="inner",
    )
)

# Per city: how many users have more than 10 reviews among these restaurants.
users_by_city = (
    restaurants_by_city
    .merge(reviews, on="business_id")
    .groupby(["city", "state"])
    .apply(lambda group: (group["user_id"].value_counts() > 10).sum())
)
users_by_city

city          state
Indianapolis  IN       2082
Nashville     TN       1614
Philadelphia  PA       5282
Tampa         FL       2177
Tucson        AZ       1402
dtype: int64

In [12]:
# One row per (restaurant, category key), annotated with that key's `category_count`.
restaurants_by_category_count = restaurants_by_city.explode("categories").merge(
    category_counts.rename("category_count"),
    left_on="categories",
    right_index=True,
)
# NOTE(review): the merge above uses the default inner join, so rows without a matching
# category are dropped rather than left with a missing count; this fillna therefore
# appears to have nothing to fill. Confirm whether `how="left"` was intended.
restaurants_by_category_count["category_count"] = (
    restaurants_by_category_count["category_count"].fillna(1000)
)

# Per restaurant, take the minimum of each column: with several category keys, the
# smallest `category_count` (the rarest key) is the one that is kept.
restaurant_weights = (
    restaurants_by_category_count
    .groupby("business_id")[["review_count", "category_count"]]
    .min()
    .reset_index()
)

# Sampling weight: grows with the review count, shrinks with the category count.
restaurant_weights["weight"] = (
    restaurant_weights["review_count"] / restaurant_weights["category_count"]
)
restaurant_weights

Unnamed: 0,business_id,review_count,category_count,weight
0,--hF_3v1JmU9nlu4zfXJ8Q,15,414,0.036232
1,-0TffRSXXIlBYVbb5AwfTg,1097,63,17.412698
2,-1B9pP_CrRBJYPICE5WbRA,822,86,9.558140
3,-1oygVebK81K8JEPI6H6Lw,72,25,2.880000
4,-2wh7NTLkWEgsrLJvilnFQ,96,171,0.561404
...,...,...,...,...
2571,zuZ8mK0hoEB2Q3r7j-_ZqA,55,414,0.132850
2572,zujdPV3HT-Y-CKE1GgkMHQ,346,91,3.802198
2573,zwGzwkVeYXE-tRisb8if7A,544,131,4.152672
2574,zwrgCMuZyFX46mL3piDyjg,862,455,1.894505


In [13]:
# Draw 2000 restaurants with probability proportional to `weight`.
# `random_state` is fixed so that re-running the notebook reproduces the same sample
# (and with it every statistic and export derived from it); without a seed each run
# draws a different set of restaurants.
restaurants_sample = (
    restaurants_by_city
    .merge(restaurant_weights[["business_id", "weight"]], on="business_id")
    .sample(n=2000, weights="weight", random_state=42)
)

# Reviews of the sampled restaurants, grouped by city.
reviews_by_city = restaurants_sample.merge(reviews, on="business_id").groupby(["city", "state"])

# Per city: number of users with more than 10 / more than 20 reviews in the sample.
users_gt_10 = reviews_by_city.apply(lambda group: (group["user_id"].value_counts() > 10).sum()).rename(">10_total").to_frame()
users_gt_20 = reviews_by_city.apply(lambda group: (group["user_id"].value_counts() > 20).sum()).rename(">20_total").to_frame()

# Totals over all cities; a user is counted once for each city in which they exceed
# the threshold, because the counts are taken per (city, state, user).
print("total >10:", (reviews_by_city["user_id"].value_counts() > 10).sum())
print("total > 20:", (reviews_by_city["user_id"].value_counts() > 20).sum())

users_by_city = users_gt_10.merge(users_gt_20, left_index=True, right_index=True)
users_by_city

total >10: 7574
total > 20: 2758


Unnamed: 0_level_0,Unnamed: 1_level_0,>10_total,>20_total
city,state,Unnamed: 2_level_1,Unnamed: 3_level_1
Indianapolis,IN,1240,437
Nashville,TN,830,242
Philadelphia,PA,3223,1307
Tampa,FL,1349,466
Tucson,AZ,932,306


In [14]:
# Reviews of the sampled restaurants, one row per (review, category key), grouped by key.
reviews_by_category = (
    restaurants_sample
    .merge(reviews, on="business_id")
    .explode("categories")
    .groupby("categories")
)

# "total" is the size of each group, i.e. the number of review rows per category
# (not a number of distinct users).
category_users = reviews_by_category.size().rename("total").to_frame()

# Per category: number of users with more than 5 / more than 10 reviews in it.
category_users_gt_5 = (
    reviews_by_category
    .apply(lambda group: (group["user_id"].value_counts() > 5).sum())
    .rename(">5")
    .to_frame()
)
category_users_gt_10 = (
    reviews_by_category
    .apply(lambda group: (group["user_id"].value_counts() > 10).sum())
    .rename(">10")
    .to_frame()
)

# Combine the three statistics side by side, aligned on the category key.
users_by_category = (
    category_users_gt_5
    .merge(category_users_gt_10, left_index=True, right_index=True)
    .merge(category_users, left_index=True, right_index=True)
)
users_by_category

Unnamed: 0_level_0,>5,>10,total
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
asia,497,95,27914
bagel,3,0,6732
burger,796,125,43880
cajun,2,0,4504
cheesesteak,140,12,12158
chicken,102,11,17718
chinese,546,122,25059
creole,2,0,4504
dessert,785,147,42168
dimsum,56,7,6815


In [16]:
# Export the (business_id, photo_id) pairs of every photo that belongs to a sampled
# restaurant. The merge key is spelled out, the output goes through `restaurants_path`
# (the same directory as data_path / "restaurants"), and `index=False` is the documented
# way to omit the row index (the previous `index=None` only worked by being falsy).
(
    photos
    .merge(restaurants_sample["business_id"], on="business_id", how="inner")
    [["business_id", "photo_id"]]
    .to_csv(restaurants_path / "photos_sample.csv", index=False)
)