In [2]:
import pandas as pd
import numpy as np
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

DATA = Path("../data/processed")
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [3]:
# load interactions
reviews_path = DATA / "ny_food_reviews.parquet"
df = pd.read_parquet(reviews_path)

def temporal_split(df):
    df = df.dropna(subset=["user_id","gmap_id","rating","time"]).copy()
    df = df.sort_values(["user_id","time"])
    tri, vai, tei = [], [], []
    for uid, g in df.groupby("user_id", sort=False):
        n = len(g)
        if n < 5:  # skip ultra-short histories
            continue
        t1, t2 = int(0.7*n), int(0.8*n)
        tri.append(g.iloc[:t1]); vai.append(g.iloc[t1:t2]); tei.append(g.iloc[t2:])
    tr = pd.concat(tri); va = pd.concat(vai); te = pd.concat(tei)
    return tr, va, te

split_files = [DATA/"ny_train.parquet", DATA/"ny_val.parquet", DATA/"ny_test.parquet"]
if all(p.exists() for p in split_files):
    train_df = pd.read_parquet(split_files[0])
    val_df   = pd.read_parquet(split_files[1])
    test_df  = pd.read_parquet(split_files[2])
else:
    train_df, val_df, test_df = temporal_split(df)
    train_df.to_parquet(split_files[0], index=False)
    val_df.to_parquet(split_files[1], index=False)
    test_df.to_parquet(split_files[2], index=False)

len(train_df), len(val_df), len(test_df)

(2871735, 444233, 968857)

In [19]:
### sampling
SEED = 42
MIN_USER_HIST = 10     
SAMPLE_USERS = None     # None for now
MAX_ITEMS = 30000       

rng = np.random.default_rng(SEED)

# 1) filter users by history in TRAIN (so train/val/test remain consistent)
hist = train_df.groupby("user_id")["gmap_id"].count()
eligible_users = hist[hist >= MIN_USER_HIST].index

if SAMPLE_USERS is not None and len(eligible_users) > SAMPLE_USERS:
    sampled_users = pd.Index(rng.choice(eligible_users.values, size=SAMPLE_USERS, replace=False))
else:
    sampled_users = eligible_users

train_df = train_df[train_df.user_id.isin(sampled_users)].copy()
val_df   = val_df[val_df.user_id.isin(sampled_users)].copy()
test_df  = test_df[test_df.user_id.isin(sampled_users)].copy()

if MAX_ITEMS is not None:
    pop_items = train_df.groupby("gmap_id")["user_id"].count().sort_values(ascending=False)
    keep_items = pop_items.index[:MAX_ITEMS]
    train_df = train_df[train_df.gmap_id.isin(keep_items)].copy()
    val_df   = val_df[val_df.gmap_id.isin(keep_items)].copy()
    test_df  = test_df[test_df.gmap_id.isin(keep_items)].copy()

print("After lite sampling:")
print(" users:", train_df.user_id.nunique(), "| items:", train_df.gmap_id.nunique())
print(" train/val/test sizes:", len(train_df), len(val_df), len(test_df))

After lite sampling:
 users: 5000 | items: 18732
 train/val/test sizes: 107519 15050 30554


In [14]:
### build id maps from ALL splits to avoid unknown ids at test time
all_users = pd.Index(pd.concat([train_df.user_id, val_df.user_id, test_df.user_id]).unique())
all_items = pd.Index(pd.concat([train_df.gmap_id, val_df.gmap_id, test_df.gmap_id]).unique())
u2i = {u:i for i,u in enumerate(all_users)}
v2i = {v:i for i,v in enumerate(all_items)}

def encode(df):
    df = df.copy()
    df["u"] = df.user_id.map(u2i)
    df["i"] = df.gmap_id.map(v2i)
    df["r"] = df.rating.astype("float32")
    return df.dropna(subset=["u","i"])

train_e = encode(train_df)
val_e   = encode(val_df)
n_users, n_items = len(all_users), len(all_items)
n_users, n_items

(5000, 18732)

In [15]:
### pytorch dataset and model
class RatingDS(Dataset):
    def __init__(self, df):
        self.u = torch.tensor(df.u.values, dtype=torch.long)
        self.i = torch.tensor(df.i.values, dtype=torch.long)
        self.r = torch.tensor(df.r.values, dtype=torch.float32)
    def __len__(self): return len(self.u)
    def __getitem__(self, idx): return self.u[idx], self.i[idx], self.r[idx]

class MF(nn.Module):
    def __init__(self, n_users, n_items, k=64, bias=True):
        super().__init__()
        self.P = nn.Embedding(n_users, k)
        self.Q = nn.Embedding(n_items, k)
        nn.init.normal_(self.P.weight, std=0.01)
        nn.init.normal_(self.Q.weight, std=0.01)
        self.bias = bias
        if bias:
            self.ub = nn.Embedding(n_users, 1)
            self.ib = nn.Embedding(n_items, 1)
            self.mu = nn.Parameter(torch.zeros(1))
    def forward(self, u, i):
        s = (self.P(u) * self.Q(i)).sum(-1)
        if self.bias:
            s = s + self.ub(u).squeeze(-1) + self.ib(i).squeeze(-1) + self.mu
        return s

In [16]:
### TRAIN MF (MSE on explicit ratings)
batch_size=4096; epochs=6; k=64
model = MF(n_users, n_items, k=k, bias=True).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=1e-4)
loss_fn = nn.MSELoss()
dl = DataLoader(RatingDS(train_e), batch_size=batch_size, shuffle=True)

for ep in range(epochs):
    model.train(); total=0
    for u,i,r in dl:
        u,i,r = u.to(device), i.to(device), r.to(device)
        pred = model(u,i)
        loss = loss_fn(pred, r)
        opt.zero_grad(); loss.backward(); opt.step()
        total += loss.item()*len(u)
    print(f"Epoch {ep+1}: train MSE={total/len(train_e):.4f}")

Epoch 1: train MSE=18.6812
Epoch 2: train MSE=13.1024
Epoch 3: train MSE=5.2518
Epoch 4: train MSE=1.7436
Epoch 5: train MSE=1.0950
Epoch 6: train MSE=0.9342


In [17]:
### generate top-200 candidates (base_score)
@torch.no_grad()
def topk_candidates_streaming(model, K=200, users_idx=None):
    """
    Memory-safe: for each user u, compute scores = Q @ P[u] (vector-matrix),
    then take top-K. Avoids building full P @ Q^T.
    """
    model.eval()
    P = model.P.weight.to(device)   # [n_users, k]
    Q = model.Q.weight.to(device)   # [n_items, k]

    if users_idx is None:
        users_idx = range(P.shape[0])

    rows = []
    for u in users_idx:
        uvec = P[u]                          # [k]
        scores = torch.mv(Q, uvec)           # [n_items]
        vals, idxs = torch.topk(scores, min(K, Q.shape[0]))
        uid = all_users[u]
        idxs = idxs.cpu().numpy(); vals = vals.cpu().numpy()
        rows.extend((uid, all_items[i], float(s)) for i, s in zip(idxs, vals))
    return pd.DataFrame(rows, columns=["user_id","item_id","base_score"])

cand_val  = topk_candidates_streaming(model, K=200)
cand_test = cand_val.copy()  # or regenerate after retraining on train+val

In [18]:
from collections import defaultdict
import math

### Evaluation

def build_gt(df):
    gt = defaultdict(set)
    for u,i in zip(df.user_id, df.gmap_id): gt[u].add(i)
    return gt

def recall_ndcg(cand_df, gt, K=10):
    recs = cand_df.groupby("user_id")["item_id"].apply(list)
    r_list, n_list = [], []
    for u, items in recs.items():
        hits = [1 if it in gt.get(u,set()) else 0 for it in items[:K]]
        if not hits: continue
        rec = sum(hits)/min(K, max(1,len(gt.get(u,set()))))
        dcg = sum(h/math.log2(idx+2) for idx,h in enumerate(hits))
        idcg = sum(1/math.log2(i+2) for i in range(min(K, len(gt.get(u,set())))))
        ndcg = dcg/(idcg or 1)
        r_list.append(rec); n_list.append(ndcg)
    return float(np.mean(r_list or [0])), float(np.mean(n_list or [0]))

gt_val = build_gt(val_df)
r10, n10 = recall_ndcg(cand_val, gt_val, K=10)
r20, n20 = recall_ndcg(cand_val, gt_val, K=20)
{"recall@10":round(r10,4), "ndcg@10":round(n10,4), "recall@20":round(r20,4), "ndcg@20":round(n20,4)}

{'recall@10': 0.0081,
 'ndcg@10': 0.0053,
 'recall@20': 0.0134,
 'ndcg@20': 0.0071}