In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from geopy.distance import distance

In [2]:
# Root of the local data checkout, resolved relative to the kernel's working directory.
data_path = Path("../data")

# The business and review dumps share one sub-directory; photos live in their own.
yelp_dataset_dir = data_path.joinpath("yelp_dataset")
businesses_path = yelp_dataset_dir.joinpath("yelp_academic_dataset_business.json")
reviews_path = yelp_dataset_dir.joinpath("yelp_academic_dataset_review.json")
photos_path = data_path.joinpath("yelp_photos", "photos.json")

In [3]:
# Build `restaurants`: one row per business whose category string mentions
# "restaurant" or "food", keeping only the id, raw categories and review count.
restaurants_data = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale encoding (the file is JSON Lines; assumed UTF-8 -- TODO confirm).
with open(businesses_path, encoding="utf-8") as f:
    for line_raw in f:
        line = json.loads(line_raw)
        business = {
            "business_id": line.get("business_id"),
            "categories_raw": line.get("categories"),
            "review_count": line.get("review_count"),
        }

        # Drop records where any of the three fields is absent or null.
        if None in business.values():
            continue

        # Case-insensitive substring test against the raw category string.
        categories_lower = (business["categories_raw"] or "").lower()
        if any(keyword in categories_lower for keyword in ("restaurant", "food")):
            restaurants_data.append(business)
restaurants = pd.DataFrame(data=restaurants_data)

In [4]:
# Build `reviews`: one row per review with the reviewer, the reviewed business
# and the star rating (stored under the "rating" column).
reviews_data = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale encoding (the file is JSON Lines; assumed UTF-8 -- TODO confirm).
with open(reviews_path, encoding="utf-8") as f:
    for line_raw in f:
        line = json.loads(line_raw)
        review = {
            "user_id": line.get("user_id"),
            "business_id": line.get("business_id"),
            "rating": line.get("stars"),
        }

        # Keep only complete records; a missing or null field discards the review.
        if None in review.values():
            continue

        reviews_data.append(review)
reviews = pd.DataFrame(data=reviews_data)

In [5]:
# Build `photos`: one row per photo, linking a photo id to its business id.
photos_data = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale encoding (the file is JSON Lines; assumed UTF-8 -- TODO confirm).
with open(photos_path, encoding="utf-8") as f:
    for line_raw in f:
        line = json.loads(line_raw)
        photo = {
            "photo_id": line.get("photo_id"),
            "business_id": line.get("business_id"),
        }

        # Keep only complete records; a missing or null field discards the photo.
        if None in photo.values():
            continue

        photos_data.append(photo)
photos = pd.DataFrame(data=photos_data)

In [6]:
# Keyword stems matched as substrings of the lower-cased, space-stripped raw
# category string (so e.g. "hotdog" and "icecream" match multi-word labels).
categories = [
    "mexic", "sandwich", "vietnam", "pizza", "burger",
    "taco", "seafood", "chinese", "wing", "japan",
    "asia", "salad", "sushi", "barbecue", "bbq",
    "noodle", "italian", "india", "soup", "dessert",
    "thai", "chicken", "hotdog", "mediterran", "korea",
    "hawaii", "greek", "steak", "poke", "cajun",
    "creole", "bagel", "hotpot", "icecream", "filipino",
    "ramen", "donut", "cheesesteak", "dimsum", "pasta",
]


def _matching_stems(squashed_categories):
    """Return the set of stems from `categories` contained in the given string."""
    return {stem for stem in categories if stem in squashed_categories}


# Normalise the raw strings first, then map each one to its set of matching stems.
squashed = restaurants["categories_raw"].str.lower().str.replace(" ", "")
restaurants["categories"] = squashed.apply(_matching_stems)

# Record each restaurant's row label as an explicit column for later joins.
restaurants["restaurant_index"] = restaurants.index

# Number of restaurants per stem; restaurants with an empty set drop out via dropna().
category_counts = restaurants["categories"].explode().dropna().value_counts()

In [7]:
# Reviews per user, counted over every loaded review.
user_review_counts = reviews["user_id"].value_counts()

# Users with more than 10 reviews get consecutive integer ids 0..k-1.
is_active = user_review_counts > 10
active_user_ids = user_review_counts[is_active].index.to_series(name="user_id")
user_indices = (
    active_user_ids
    .reset_index(drop=True)
    .rename_axis("user_index")
    .reset_index()
)

# Attach restaurant and user indices; both joins are inner, so reviews of
# businesses outside `restaurants` or by users outside `user_indices` are dropped.
# NOTE: this rebinds `reviews`, so re-running the cell without reloading the
# reviews operates on the already-merged frame.
restaurant_lookup = restaurants[["business_id", "restaurant_index"]]
reviews = (
    reviews
    .merge(restaurant_lookup, on="business_id")
    .merge(user_indices, on="user_id", how="inner")
)

In [8]:
# Photos per business. `size()` is the built-in row count per group; it gives
# the same numbers as `apply(len)` without calling Python once per group.
photo_counts = photos.groupby("business_id").size().rename("photo_count").reset_index()

# Inner join: only restaurants that have at least one photo remain.
restaurant_counts = restaurants.merge(photo_counts, on="business_id")

# Category frequencies restricted to restaurants with 10 or more photos.
# This rebinds `category_counts`.
has_enough_photos = restaurant_counts["photo_count"] >= 10
category_counts = (
    restaurant_counts.loc[has_enough_photos, "categories"]
    .explode()
    .dropna()
    .value_counts()
)

In [9]:
from scipy.sparse import csr_matrix, identity

# Problem dimensions. The combined vector is laid out as
# [restaurant 0..m-1, user 0..n-1], i.e. user j sits at position m + j.
n = reviews.user_index.max() + 1  # users
# Size the restaurant block from the full `restaurants` frame rather than from
# the largest index that survives in `reviews`: the rows of `A` below are built
# from every row of `restaurants`, so taking `m` from `reviews` would leave `A`
# with more columns than the (m + n)-vector whenever the highest-indexed
# restaurants have no remaining reviews.
m = len(restaurants)  # restaurants

# Symmetric sparse matrix: a -1 at (restaurant, user) and (user, restaurant)
# for every review row, plus (n + m) on the diagonal.
restaurant_nodes = reviews.restaurant_index.to_numpy()
user_nodes = (reviews.user_index + m).to_numpy()
P = csr_matrix((
    -np.ones(2 * len(reviews)),
    (np.concatenate([restaurant_nodes, user_nodes]),
     np.concatenate([user_nodes, restaurant_nodes])
    )
), shape=(n+m,n+m)) + identity(n+m) * (n+m)

# One row per category in `category_counts`: 1 where the restaurant's category
# set contains that category, 0 elsewhere (including all user positions).
A = np.concatenate([
    np.concatenate([
        np.where(restaurants["categories"].apply(lambda x: category in x), 1, 0),
        np.zeros(n)
    ]).reshape(1, -1)
    for category in category_counts.index
])

# Per-position vector: (n + m) for restaurants, (n + m + 10) for users.
# Not referenced by the other cells visible in this notebook.
b = np.concatenate([(n+m) * np.ones(m), (n+m+10) * np.ones(n)])

In [None]:
import cvxpy as cp

# One decision variable per restaurant/user position.
x = cp.Variable(m + n)

all_ones = np.ones(m + n)

# Quadratic term in P minus (n + m) times the sum of the entries of x.
objective = cp.quad_form(x, P, assume_PSD=True) - (n + m) * all_ones @ x

# Box bounds on every entry, bounds on the total, and at least 5 per category row of A.
box_bounds = [0 <= x, x <= 1]
total_bounds = [all_ones @ x >= 1000, all_ones @ x <= 3000]
category_coverage = [A @ x >= 5]
constraints = box_bounds + total_bounds + category_coverage

prob = cp.Problem(cp.Minimize(objective), constraints)
prob.solve()

In [None]:
# Display the value currently held by the decision variable `x` as the cell output.
x.value