Environment and Import Preparation

In [2]:
!pip install torch torchvision torchaudio -q
!pip install torch-geometric -q
!pip install dgl -q  # generic DGL (CPU/GPU autodetect)
!pip install torchmetrics==1.4.0.post0 scikit-learn pandas numpy tqdm geopy haversine -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os, json, math, random, gc, time
from dataclasses import dataclass
from typing import Dict, Tuple, List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torch_geometric.data import HeteroData
from torch_geometric.utils import to_undirected, coalesce
from torch_geometric.nn import HGTConv, SAGEConv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from haversine import haversine

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED);
if DEVICE.type == 'cuda':
    torch.cuda.manual_seed_all(SEED)

print("Device:", DEVICE)

Device: cuda


JSON processing

In [10]:
import json

with open("filter_all_t.json", "r") as f:
    data = json.load(f)

print(type(data))
print(list(data.keys())[:10] if isinstance(data, dict) else data[0])

<class 'dict'>
['train', 'val', 'test']


In [11]:
import json
import pandas as pd

# Load the JSON file
with open("filter_all_t.json", "r") as f:
    data = json.load(f)

# Loop through each dataset and save separately
for split_name, records in data.items():
    df = pd.DataFrame(records)
    csv_name = f"{split_name}.csv"
    df.to_csv(csv_name, index=False)
    print(f"✅ Saved {csv_name} with {len(df)} rows.")

✅ Saved train.csv with 87013 rows.
✅ Saved val.csv with 10860 rows.
✅ Saved test.csv with 11015 rows.


Data loader for train/test/val

In [9]:
!fusermount -u /content/drive
!rm -rf /content/drive

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

fusermount: failed to unmount /content/drive: Invalid argument
Mounted at /content/drive


In [10]:
# Loader for train/test/val sets

import os, time
import numpy as np
import pandas as pd

# Paths to your three CSVs
DATA_DIR = "/content/drive/MyDrive/CS224W/Data"
os.makedirs(DATA_DIR, exist_ok=True)

TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
VAL_CSV   = os.path.join(DATA_DIR, "val.csv")
TEST_CSV  = os.path.join(DATA_DIR, "test.csv")

MIN_USER_INTERACTIONS = 2
MIN_ITEM_INTERACTIONS = 2

def _ensure_required_cols(df: pd.DataFrame, split_name: str) -> pd.DataFrame:
    df = df.copy()

    # Map your headings to the schema the pipeline expects
    # business_id -> item_id (keep the original column intact for reference)
    if "business_id" in df.columns and "item_id" not in df.columns:
        df["item_id"] = df["business_id"].astype(str)

    # Ensure user_id is string
    if "user_id" in df.columns:
        df["user_id"] = df["user_id"].astype(str)
    else:
        raise ValueError(f"{split_name} is missing 'user_id' column.")

    # Ensure rating exists and is numeric
    if "rating" not in df.columns:
        df["rating"] = 5.0
    df["rating"] = pd.to_numeric(df["rating"], errors="coerce").fillna(5.0)

    # Timestamp (not present in CSV) — create a stable dummy ts
    # Use the row index to create increasing integers (works for sorting)
    if "ts" not in df.columns:
        df["ts"] = np.arange(len(df), dtype=np.int64)

    # Placeholders for metadata we don't have yet
    for c in ["user_lat", "user_lon", "item_lat", "item_lon", "price"]:
        if c not in df.columns:
            df[c] = np.nan

    # Categories placeholder (list-like)
    if "categories" not in df.columns:
        df["categories"] = [[] for _ in range(len(df))]

    # Tag the split (handy later)
    df["split"] = split_name
    return df

def load_google_restaurants() -> pd.DataFrame:
    """Loads train/val/test CSVs (with headers: business_id, user_id, rating, review_text, pics, history_reviews),
    standardizes to the expected schema, and returns a single combined DataFrame with a 'split' column.
    """
    existing = [p for p in [TRAIN_CSV, VAL_CSV, TEST_CSV] if os.path.exists(p)]
    if not existing:
        raise FileNotFoundError("Couldn't find train.csv, val.csv, or test.csv in DATA_DIR")

    dfs = []
    for path in existing:
        split_name = os.path.splitext(os.path.basename(path))[0]  # "train"/"val"/"test"
        raw = pd.read_csv(path)
        std = _ensure_required_cols(raw, split_name)
        dfs.append(std)

    df_all = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(df_all)} rows from {len(dfs)} files: {[os.path.basename(p) for p in existing]}")
    return df_all

df = load_google_restaurants()
print(df.head(), "\n", df.shape)


Loaded 108888 rows from 3 files: ['train.csv', 'val.csv', 'test.csv']
                business_id                user_id  rating  \
0  60567465d335d0abfb415b26  101074926318992653684       4   
1  6050fa9f5b4ccec8d5cae994  117065749986299237881       5   
2  604be10877e81aaed3cc9a1e  106700937793048450809       4   
3  60411e017cd8bf130362365a  101643045857250355161       5   
4  604139dd7cd8bf1303624208  109802745326785766951       4   

                                         review_text  \
0  The tang of the tomato sauce is outstanding. A...   
1              Chicken and waffles were really good!   
2  The appetizer of colossal shrimp was very good...   
3  The fish tacos here  omg! The salad was great ...   
4  Ribs are great, as are the mac and cheese, fri...   

                                                pics  \
0  ['AF1QipM-2IRmvitARbcJr7deWfe5hyVBg_ArPMQSYvq0...   
1   ['AF1QipMpfxIZUT_aymQ3qPGO-QgGYzxbtLZGmHufAp2s']   
2  ['AF1QipMNnqM5X9sSyZ9pXRZ1jvrURHN9bZhGdzuEXoP8...

Filter/encode/split and build the graph with train edges

In [11]:
import math
import torch
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import HeteroData
from torch_geometric.utils import to_undirected, coalesce

DEVICE = "cpu"  # change to "cuda" if you have a GPU

# ---------- Filtering ----------
def filter_min_interactions(df, umin=MIN_USER_INTERACTIONS, imin=MIN_ITEM_INTERACTIONS):
    """Filter out users/items globally with < umin/imin interactions."""
    grouped_u = df.groupby("user_id").size()
    keep_users = set(grouped_u[grouped_u >= umin].index)
    grouped_i = df.groupby("item_id").size()
    keep_items = set(grouped_i[grouped_i >= imin].index)
    out = df[df.user_id.isin(keep_users) & df.item_id.isin(keep_items)].copy()
    return out

df_all = filter_min_interactions(df)
print("After filtering:", df_all.shape)

# ---------- Global encoding ----------
u_enc = LabelEncoder().fit(df_all["user_id"])
i_enc = LabelEncoder().fit(df_all["item_id"])

def encode_df(subdf: pd.DataFrame) -> pd.DataFrame:
    subdf = subdf.copy()
    # Only keep rows that survived global filtering
    subdf = subdf[subdf["user_id"].isin(u_enc.classes_) & subdf["item_id"].isin(i_enc.classes_)]
    subdf["u"] = u_enc.transform(subdf["user_id"])
    subdf["i"] = i_enc.transform(subdf["item_id"])
    return subdf

train_df = encode_df(df_all[df_all["split"] == "train"])
val_df   = encode_df(df_all[df_all["split"] == "val"])
test_df  = encode_df(df_all[df_all["split"] == "test"])

num_users = len(u_enc.classes_)
num_items = len(i_enc.classes_)
print(f"num_users={num_users}, num_items={num_items}")

# ---------- Build hetero graph (train edges only) ----------
data = HeteroData()
data["user"].num_nodes = num_users
data["item"].num_nodes = num_items

# user-item edges from train
ui_src = torch.tensor(train_df["u"].values, dtype=torch.long)
ui_dst = torch.tensor(train_df["i"].values, dtype=torch.long)
edge_index = torch.stack([ui_src, ui_dst], dim=0)
data["user", "rates", "item"].edge_index = edge_index
data["item", "rev_by", "user"].edge_index = edge_index.flip(0)

# ---------- Item features (metadata not yet available) ----------
# Keep the expected 3-dim feature shape [item_lat, item_lon, price], all zeros.
# This matches downstream code expecting a (num_items, 3) tensor.
item_x = torch.zeros((num_items, 3), dtype=torch.float)
data["item"].x = item_x

# ---------- Item-item edges ----------
# Co-review edges from train: connect items co-rated by the same user at least twice.
co_counts = {}
for u, grp in train_df.groupby("u"):
    items = grp["i"].tolist()
    for a in items:
        for b in items:
            if a >= b:
                continue
            co_counts[(a, b)] = co_counts.get((a, b), 0) + 1

pairs = [(a, b) for (a, b), c in co_counts.items() if c >= 2]
if len(pairs) > 0:
    ii_src = [a for a, b in pairs]
    ii_dst = [b for a, b in pairs]
    ii_edge = torch.tensor([ii_src, ii_dst], dtype=torch.long)
    ii_edge = to_undirected(ii_edge)
    data["item", "similar", "item"].edge_index = coalesce(ii_edge, num_nodes=num_items)
else:
    # If no pairs meet the threshold, create an empty edge_index
    data["item", "similar", "item"].edge_index = torch.empty((2,0), dtype=torch.long)

data = data.to(DEVICE)
print(data)

After filtering: (96003, 15)
num_users=36364, num_items=17946
HeteroData(
  user={ num_nodes=36364 },
  item={
    num_nodes=17946,
    x=[17946, 3],
  },
  (user, rates, item)={ edge_index=[2, 76769] },
  (item, rev_by, user)={ edge_index=[2, 76769] },
  (item, similar, item)={ edge_index=[2, 3652] }
)


Sampling, Metrics, and Utilities

In [12]:
import random
import math
import torch
import numpy as np
from typing import Optional, Dict, List, Tuple

# ---------- BPR Sampler ----------
def bpr_triplet_sampler(train_df: pd.DataFrame, num_items: int,
                        radius_km: Optional[float]=None,
                        item_latlon: Optional[List[Tuple[float,float]]]=None,
                        user_pos_map: Optional[Dict[int,set]]=None,
                        user_home_latlon: Optional[Dict[int,Tuple[float,float]]]=None,
                        batch_size: int = 2048):
    """
    Yields batches of (u, pos_i, neg_j) for BPR.
    Works even if geographic metadata is missing.
    """
    # Build user -> positive item map
    if user_pos_map is None:
        user_pos_map = {u: set(g["i"].values.tolist()) for u, g in train_df.groupby("u")}
    users = list(user_pos_map.keys())

    # Precompute safe item_latlon array if not provided
    if item_latlon is None or len(item_latlon) != num_items:
        item_latlon = [(np.nan, np.nan)] * num_items

    while True:
        uu, ii, jj = [], [], []
        for _ in range(batch_size):
            u = random.choice(users)
            pos_i = random.choice(list(user_pos_map[u]))

            # Negative sampling
            if radius_km is not None and item_latlon is not None:
                neg_j = None
                # Determine user's location center (home or positive item)
                center = None
                if user_home_latlon and u in user_home_latlon:
                    center = user_home_latlon[u]
                else:
                    lat_i, lon_i = item_latlon[pos_i]
                    if not (math.isnan(lat_i) or math.isnan(lon_i)):
                        center = (lat_i, lon_i)

                # If we have a center, attempt radius-aware sampling
                if center:
                    latc, lonc = center
                    candidates = [k for k in range(num_items) if k not in user_pos_map[u]]
                    random.shuffle(candidates)
                    for k in candidates:
                        latk, lonk = item_latlon[k]
                        # skip if no coordinates
                        if math.isnan(latk) or math.isnan(lonk):
                            continue
                        if haversine((latc, lonc), (latk, lonk)) <= radius_km:
                            neg_j = k
                            break
                # Fallback to uniform negative sampling
                if neg_j is None:
                    while True:
                        k = random.randrange(num_items)
                        if k not in user_pos_map[u]:
                            neg_j = k
                            break
            else:
                # Uniform negative sampling (without geo info for now)
                while True:
                    k = random.randrange(num_items)
                    if k not in user_pos_map[u]:
                        neg_j = k
                        break

            uu.append(u)
            ii.append(pos_i)
            jj.append(neg_j)

        yield (
            torch.tensor(uu, device=DEVICE),
            torch.tensor(ii, device=DEVICE),
            torch.tensor(jj, device=DEVICE)
        )

# ---------- Ranking Metrics Helper Functions ----------
def recall_at_k(ranked_items, ground_truth, k=10):
    hits = sum([1 for x in ranked_items[:k] if x in ground_truth])
    return hits / float(min(k, len(ground_truth))) if ground_truth else 0.0

def ndcg_at_k(ranked_items, ground_truth, k=10):
    dcg = sum([1.0 / math.log2(idx + 2) for idx, it in enumerate(ranked_items[:k]) if it in ground_truth])
    idcg = sum([1.0 / math.log2(i + 2) for i in range(min(k, len(ground_truth)))])
    return dcg / idcg if idcg > 0 else 0.0

def geo_discount(distance_km, R=5.0):
    return math.exp(-distance_km / R)

def geo_ndcg_at_k(ranked_items, ground_truth, user_loc, item_latlon, k=10, R=5.0):
    dcg = 0.0
    for idx, it in enumerate(ranked_items[:k], start=1):
        if it in ground_truth:
            w = 1.0
            if user_loc and not any(np.isnan(user_loc)):
                latu, lonu = user_loc
                lati, loni = item_latlon[it]
                if not math.isnan(lati) and not math.isnan(loni):
                    d = haversine((latu, lonu), (lati, loni))
                    w = geo_discount(d, R=R)
            dcg += w / math.log2(idx + 1)
    idcg = sum([1.0 / math.log2(i + 2) for i in range(min(k, len(ground_truth)))])
    return dcg / idcg if idcg > 0 else 0.0

def mrr(ranked_items, ground_truth, k=10):
    for idx, it in enumerate(ranked_items[:k], start=1):
        if it in ground_truth:
            return 1.0 / idx
    return 0.0

def rmse(preds, trues):
    return float(np.sqrt(np.mean((np.array(preds) - np.array(trues)) ** 2)))

def mae(preds, trues):
    return float(np.mean(np.abs(np.array(preds) - np.array(trues))))

# ---------- Dataset Helpers ----------
# For now, items have dummy coordinates → all NaNs
item_latlon = [(float(v[0]), float(v[1])) for v in data["item"].x[:, :2].tolist()]
user_home = {}  # optional; keep empty unless you have user locations


Baseline Model: LightGCN

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---------- Basic LightGCN implementation ----------
class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, emb_dim=64, num_layers=4, alpha=None):
        super().__init__()
        self.num_users, self.num_items = num_users, num_items
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)
        nn.init.normal_(self.user_emb.weight, std=0.1)
        nn.init.normal_(self.item_emb.weight, std=0.1)
        self.num_layers = num_layers
        self.alpha = alpha if alpha is not None else [1/(num_layers+1)]*(num_layers+1)

        # Check if the graph has user-item edges
        if ("user", "rates", "item") in data.edge_types:
            edge = data["user", "rates", "item"].edge_index
            u, i = edge[0], edge[1]
            # compute degrees
            deg_u = torch.bincount(u, minlength=num_users).float()
            deg_i = torch.bincount(i, minlength=num_items).float()
        else:
            # if no edges (edge_index empty)
            u = torch.tensor([], dtype=torch.long)
            i = torch.tensor([], dtype=torch.long)
            deg_u = torch.ones(num_users)
            deg_i = torch.ones(num_items)

        self.pairs = (u.to(DEVICE), i.to(DEVICE), deg_u.to(DEVICE), deg_i.to(DEVICE))

    def propagate(self, user_x, item_x):
        u, i, deg_u, deg_i = self.pairs
        all_user = [user_x]
        all_item = [item_x]

        for _ in range(self.num_layers):
            msg_u = torch.zeros_like(user_x)
            msg_i = torch.zeros_like(item_x)

            if len(u) > 0:  # only if edges exist
                msg_u.index_add_(
                    0,
                    u,
                    item_x[i] / torch.sqrt(deg_u[u].unsqueeze(1) * deg_i[i].unsqueeze(1) + 1e-8),
                )
                msg_i.index_add_(
                    0,
                    i,
                    user_x[u] / torch.sqrt(deg_i[i].unsqueeze(1) * deg_u[u].unsqueeze(1) + 1e-8),
                )

            user_x, item_x = msg_u, msg_i
            all_user.append(user_x)
            all_item.append(item_x)

        # Weighted layer-wise average
        alpha_tensor = torch.tensor(self.alpha, device=user_x.device).view(-1, 1, 1)
        user_out = (alpha_tensor * torch.stack(all_user)).sum(0)
        item_out = (alpha_tensor * torch.stack(all_item)).sum(0)
        return user_out, item_out

    def forward(self):
        u0 = self.user_emb.weight
        i0 = self.item_emb.weight
        return self.propagate(u0, i0)

    def score(self, users, items, user_emb=None, item_emb=None):
        if user_emb is None or item_emb is None:
            user_emb, item_emb = self.forward()
        return (user_emb[users] * item_emb[items]).sum(dim=1)


# ---------- BPR Loss Function ----------
def bpr_loss(pos_scores, neg_scores, reg=None, params: list = []):
    loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    if reg:
        loss = loss + reg * sum(p.norm(2).pow(2) for p in params) / len(params)
    return loss


# ---------- Training Loop ----------
def train_lightgcn(epochs=5, emb_dim=64, batch_size=2048, lr=1e-3, reg=1e-4):
    model = LightGCN(num_users, num_items, emb_dim=emb_dim).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    sampler = bpr_triplet_sampler(train_df, num_items, batch_size=batch_size)
    user_pos = {u: set(g["i"].values.tolist()) for u, g in train_df.groupby("u")}

    for ep in range(1, epochs + 1):
        model.train()
        total = 0.0
        steps = max(1, len(train_df) // batch_size)
        for step in range(steps):
            u, i, j = next(sampler)
            user_emb, item_emb = model()
            pos = model.score(u, i, user_emb, item_emb)
            neg = model.score(u, j, user_emb, item_emb)
            loss = bpr_loss(pos, neg, reg, [model.user_emb.weight, model.item_emb.weight])

            opt.zero_grad()
            loss.backward()
            opt.step()
            total += loss.detach().item()
        print(f"[LightGCN] Epoch {ep:02d} | Loss = {total/steps:.4f}")

    return model


# ---------- Run Training ----------
# lightgcn_model = train_lightgcn(epochs=30, emb_dim=128, batch_size=1024, lr=5e-4, reg=1e-4)

[LightGCN] Epoch 01 | Loss = 3.2987


KeyboardInterrupt: 

In [17]:
import math, copy, torch
import numpy as np

# ---------- Basic LightGCN implementation (+ early stopping) ----------
def train_lightgcn_with_eval(
    epochs=30,
    eval_every=5,
    emb_dim=128,
    num_layers=3,
    batch_size=1024,
    lr=5e-4,
    reg=1e-4,
    k_list=[5, 10, 20],
    early_stop_patience=5,      # epochs without improvement on NDCG@10
):
    model = LightGCN(num_users, num_items, emb_dim=emb_dim, num_layers=num_layers).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    sampler = bpr_triplet_sampler(train_df, num_items, batch_size=batch_size)

    history = {"epoch": [], "loss": [], "val": []}
    best_metric = -1.0
    best_state = copy.deepcopy(model.state_dict())
    no_improve = 0

    steps_per_epoch = max(1, len(train_df) // batch_size)

    for ep in range(1, epochs + 1):
        model.train()
        running = 0.0

        for _ in range(steps_per_epoch):
            u, i, j = next(sampler)
            user_emb, item_emb = model()
            pos = model.score(u, i, user_emb, item_emb)
            neg = model.score(u, j, user_emb, item_emb)
            loss = bpr_loss(pos, neg, reg, [model.user_emb.weight, model.item_emb.weight])

            opt.zero_grad()
            loss.backward()
            opt.step()

            running += loss.detach().item()   # <- no warning

        avg_loss = running / steps_per_epoch
        history["epoch"].append(ep)
        history["loss"].append(avg_loss)

        print(f"[LightGCN] Epoch {ep:02d} | Loss = {avg_loss:.4f}")

        # ---- periodic validation for early stopping ----
        if (ep % eval_every == 0) or (ep == 1) or (ep == epochs):
            val_results = evaluate_model(model, train_df, val_df, k_list=k_list)
            history["val"].append((ep, val_results))
            metric = val_results.get("NDCG@10", 0.0)  # select your key metric

            pretty = " | ".join([f"{k}:{v:.4f}" for k, v in val_results.items()])
            print(f"Val @Epoch {ep}: {pretty}")

            # early stopping on the chosen metric
            if metric > best_metric + 1e-6:
                best_metric = metric
                best_state = copy.deepcopy(model.state_dict())
                no_improve = 0
                print("New best model (by NDCG@10).")
            else:
                no_improve += 1
                if no_improve >= early_stop_patience:
                    print(f"Early stopping (no improvement for {early_stop_patience} evals).")
                    break

    # load best and report final val/test
    model.load_state_dict(best_state)
    print("\nLoaded best model (by NDCG@10).")

    val_results  = evaluate_model(model, train_df, val_df,  k_list=k_list)
    test_results = evaluate_model(model, train_df, test_df, k_list=k_list)

    print("\n✅ Final Validation:")
    for k, v in val_results.items():
        print(f"  {k}: {v:.4f}")

    print("\n✅ Final Test:")
    for k, v in test_results.items():
        print(f"  {k}: {v:.4f}")

    return model, {"history": history, "val": val_results, "test": test_results}

# ---- Run it ----
lightgcn_model, eval_summary = train_lightgcn_with_eval(
    epochs=30,            # increase if still improving
    eval_every=5,         # validate every 5 epochs
    emb_dim=128,          # try 32/64/128
    num_layers=3,         # try 2–4
    batch_size=1024,      # adjust for memory
    lr=5e-4,
    reg=1e-4,
    k_list=[5, 10, 20],
    early_stop_patience=4
)

[LightGCN] Epoch 01 | Loss = 3.2955
Evaluating on 3628 users...


100%|██████████| 3628/3628 [00:02<00:00, 1609.76it/s]


Val @Epoch 1: Recall@5:0.0002 | NDCG@5:0.0002 | MRR@5:0.0003 | Recall@10:0.0006 | NDCG@10:0.0003 | MRR@10:0.0004 | Recall@20:0.0011 | NDCG@20:0.0005 | MRR@20:0.0005
New best model (by NDCG@10).
[LightGCN] Epoch 02 | Loss = 2.1546
[LightGCN] Epoch 03 | Loss = 1.5223
[LightGCN] Epoch 04 | Loss = 1.1664
[LightGCN] Epoch 05 | Loss = 0.9640
Evaluating on 3628 users...


100%|██████████| 3628/3628 [00:02<00:00, 1705.41it/s]


Val @Epoch 5: Recall@5:0.0012 | NDCG@5:0.0010 | MRR@5:0.0012 | Recall@10:0.0016 | NDCG@10:0.0011 | MRR@10:0.0013 | Recall@20:0.0021 | NDCG@20:0.0013 | MRR@20:0.0014
New best model (by NDCG@10).
[LightGCN] Epoch 06 | Loss = 0.8483
[LightGCN] Epoch 07 | Loss = 0.7820
[LightGCN] Epoch 08 | Loss = 0.7440
[LightGCN] Epoch 09 | Loss = 0.7222
[LightGCN] Epoch 10 | Loss = 0.7096
Evaluating on 3628 users...


100%|██████████| 3628/3628 [00:02<00:00, 1735.44it/s]


Val @Epoch 10: Recall@5:0.0018 | NDCG@5:0.0019 | MRR@5:0.0022 | Recall@10:0.0018 | NDCG@10:0.0019 | MRR@10:0.0022 | Recall@20:0.0025 | NDCG@20:0.0021 | MRR@20:0.0023
New best model (by NDCG@10).
[LightGCN] Epoch 11 | Loss = 0.7025
[LightGCN] Epoch 12 | Loss = 0.6984
[LightGCN] Epoch 13 | Loss = 0.6961
[LightGCN] Epoch 14 | Loss = 0.6948
[LightGCN] Epoch 15 | Loss = 0.6940
Evaluating on 3628 users...


100%|██████████| 3628/3628 [00:02<00:00, 1695.69it/s]


Val @Epoch 15: Recall@5:0.0021 | NDCG@5:0.0022 | MRR@5:0.0025 | Recall@10:0.0022 | NDCG@10:0.0022 | MRR@10:0.0026 | Recall@20:0.0025 | NDCG@20:0.0023 | MRR@20:0.0026
New best model (by NDCG@10).
[LightGCN] Epoch 16 | Loss = 0.6936
[LightGCN] Epoch 17 | Loss = 0.6934
[LightGCN] Epoch 18 | Loss = 0.6933
[LightGCN] Epoch 19 | Loss = 0.6932
[LightGCN] Epoch 20 | Loss = 0.6932
Evaluating on 3628 users...


100%|██████████| 3628/3628 [00:15<00:00, 229.27it/s]


Val @Epoch 20: Recall@5:0.0019 | NDCG@5:0.0020 | MRR@5:0.0022 | Recall@10:0.0021 | NDCG@10:0.0021 | MRR@10:0.0023 | Recall@20:0.0028 | NDCG@20:0.0023 | MRR@20:0.0024
[LightGCN] Epoch 21 | Loss = 0.6931
[LightGCN] Epoch 22 | Loss = 0.6931
[LightGCN] Epoch 23 | Loss = 0.6931
[LightGCN] Epoch 24 | Loss = 0.6931
[LightGCN] Epoch 25 | Loss = 0.6931
Evaluating on 3628 users...


100%|██████████| 3628/3628 [00:52<00:00, 69.44it/s]


Val @Epoch 25: Recall@5:0.0022 | NDCG@5:0.0022 | MRR@5:0.0025 | Recall@10:0.0025 | NDCG@10:0.0023 | MRR@10:0.0026 | Recall@20:0.0045 | NDCG@20:0.0030 | MRR@20:0.0029
New best model (by NDCG@10).
[LightGCN] Epoch 26 | Loss = 0.6931
[LightGCN] Epoch 27 | Loss = 0.6931
[LightGCN] Epoch 28 | Loss = 0.6931
[LightGCN] Epoch 29 | Loss = 0.6931
[LightGCN] Epoch 30 | Loss = 0.6931
Evaluating on 3628 users...


100%|██████████| 3628/3628 [00:53<00:00, 67.85it/s]


Val @Epoch 30: Recall@5:0.0022 | NDCG@5:0.0022 | MRR@5:0.0024 | Recall@10:0.0042 | NDCG@10:0.0029 | MRR@10:0.0029 | Recall@20:0.0046 | NDCG@20:0.0031 | MRR@20:0.0030
New best model (by NDCG@10).

Loaded best model (by NDCG@10).
Evaluating on 3628 users...


100%|██████████| 3628/3628 [00:53<00:00, 68.32it/s]


Evaluating on 3642 users...


100%|██████████| 3642/3642 [00:52<00:00, 68.82it/s]


✅ Final Validation:
  Recall@5: 0.0022
  NDCG@5: 0.0022
  MRR@5: 0.0024
  Recall@10: 0.0042
  NDCG@10: 0.0029
  MRR@10: 0.0029
  Recall@20: 0.0046
  NDCG@20: 0.0031
  MRR@20: 0.0030

✅ Final Test:
  Recall@5: 0.0018
  NDCG@5: 0.0021
  MRR@5: 0.0029
  Recall@10: 0.0029
  NDCG@10: 0.0025
  MRR@10: 0.0033
  Recall@20: 0.0038
  NDCG@20: 0.0028
  MRR@20: 0.0034





In [14]:
import torch
import numpy as np
from tqdm import tqdm

# ---------- Eval helpers ----------
def evaluate_model(model, train_df, eval_df, k_list=[5, 10, 20]):
    """
    Evaluate a trained LightGCN model using Recall@K, NDCG@K, and MRR@K.
    """
    model.eval()
    with torch.no_grad():
        user_emb, item_emb = model.forward()

    # Build user->train_items map to exclude seen items from ranking
    train_user_pos = train_df.groupby("u")["i"].apply(set).to_dict()
    eval_user_pos = eval_df.groupby("u")["i"].apply(set).to_dict()

    recall_scores = {k: [] for k in k_list}
    ndcg_scores = {k: [] for k in k_list}
    mrr_scores = {k: [] for k in k_list}

    print(f"Evaluating on {len(eval_user_pos)} users...")
    for u, gt_items in tqdm(eval_user_pos.items()):
        if len(gt_items) == 0:
            continue

        # Compute scores for all items
        user_vec = user_emb[u].unsqueeze(0)
        scores = torch.matmul(user_vec, item_emb.T).squeeze(0)

        # Mask out items the user has already interacted with (training set)
        seen_items = train_user_pos.get(u, set())
        scores[list(seen_items)] = -1e9

        # Get top-K recommendations
        ranked_items = torch.topk(scores, k=max(k_list)).indices.cpu().numpy().tolist()

        # Compute metrics for each K
        for k in k_list:
            recall_scores[k].append(recall_at_k(ranked_items, gt_items, k))
            ndcg_scores[k].append(ndcg_at_k(ranked_items, gt_items, k))
            mrr_scores[k].append(mrr(ranked_items, gt_items, k))

    # Aggregate metrics
    results = {}
    for k in k_list:
        results[f"Recall@{k}"] = np.mean(recall_scores[k]) if recall_scores[k] else 0.0
        results[f"NDCG@{k}"] = np.mean(ndcg_scores[k]) if ndcg_scores[k] else 0.0
        results[f"MRR@{k}"] = np.mean(mrr_scores[k]) if mrr_scores[k] else 0.0

    return results

Evaluating on 3628 users...


100%|██████████| 3628/3628 [00:01<00:00, 2680.22it/s]


Evaluating on 3642 users...


100%|██████████| 3642/3642 [00:01<00:00, 2612.05it/s]


✅ Validation Results:
  Recall@5: 0.0021
  NDCG@5: 0.0020
  MRR@5: 0.0020
  Recall@10: 0.0024
  NDCG@10: 0.0021
  MRR@10: 0.0022
  Recall@20: 0.0029
  NDCG@20: 0.0023
  MRR@20: 0.0023

✅ Test Results:
  Recall@5: 0.0007
  NDCG@5: 0.0008
  MRR@5: 0.0012
  Recall@10: 0.0007
  NDCG@10: 0.0008
  MRR@10: 0.0012
  Recall@20: 0.0019
  NDCG@20: 0.0011
  MRR@20: 0.0013





In [None]:
# ---------- Evaluate model ----------
val_results = evaluate_model(lightgcn_model, train_df, val_df, k_list=[5, 10, 20])
test_results = evaluate_model(lightgcn_model, train_df, test_df, k_list=[5, 10, 20])

print("\n✅ Validation Results:")
for k, v in val_results.items():
    print(f"  {k}: {v:.4f}")

print("\n✅ Test Results:")
for k, v in test_results.items():
    print(f"  {k}: {v:.4f}")

Extension Model 1: LightGCL

Elements:
1. Contrastive self-supervision

In [None]:
# Extension Model 1: LightGCL
# --------------------------
# Fixes vs. the original draft:
#  (1) Build the SVD "global view" from a *sparse* user–item matrix (no dense .to_dense()).
#  (2) Use in-batch InfoNCE (scales to large |U| and |I|) instead of full N x N logits.

import numpy as np

def build_svd_view(train_df: pd.DataFrame,
                   num_users: int,
                   num_items: int,
                   rank: int = 64,
                   use_ratings: bool = False,
                   seed: int = 42):
    """
    Returns SVD (global) embeddings:
      Uk  = U_k * sqrt(Sigma_k)   (shape: [num_users, k])
      Vk  = V_k * sqrt(Sigma_k)   (shape: [num_items, k])

    Uses sparse SVD (SciPy) to avoid materializing a dense user–item matrix.
    Falls back to sklearn TruncatedSVD if svds fails to converge.
    """
    import scipy.sparse as sp
    from scipy.sparse.linalg import svds

    rows = train_df["u"].to_numpy(dtype=np.int64)
    cols = train_df["i"].to_numpy(dtype=np.int64)

    if use_ratings and "rating" in train_df.columns:
        vals = train_df["rating"].to_numpy(dtype=np.float32)
    else:
        vals = np.ones(len(train_df), dtype=np.float32)

    M = sp.coo_matrix((vals, (rows, cols)), shape=(num_users, num_items), dtype=np.float32)

    k = int(min(rank, min(num_users, num_items) - 1))
    if k <= 0:
        # Degenerate edge case: return tiny random views.
        rng = np.random.default_rng(seed)
        Uk = rng.normal(scale=0.01, size=(num_users, 1)).astype(np.float32)
        Vk = rng.normal(scale=0.01, size=(num_items, 1)).astype(np.float32)
        return torch.from_numpy(Uk).to(DEVICE), torch.from_numpy(Vk).to(DEVICE)

    try:
        U, S, Vt = svds(M, k=k)  # singular values in ascending order
        order = np.argsort(-S)
        S = S[order]
        U = U[:, order]
        Vt = Vt[order, :]

        sqrtS = np.sqrt(np.maximum(S, 1e-12)).astype(np.float32)
        Uk = (U.astype(np.float32) * sqrtS[None, :])         # U * sqrt(S)
        Vk = (Vt.T.astype(np.float32) * sqrtS[None, :])      # V * sqrt(S)
    except Exception as e:
        # Fallback: TruncatedSVD (more stable on some matrices).
        from sklearn.decomposition import TruncatedSVD
        svd = TruncatedSVD(n_components=k, random_state=seed)
        svd.fit(M)

        Vt = svd.components_.astype(np.float32)             # (k, num_items)
        S  = svd.singular_values_.astype(np.float32)        # (k,)
        sqrtS = np.sqrt(np.maximum(S, 1e-12)).astype(np.float32)

        # X = M @ V  == U * Sigma
        X = (M @ Vt.T).astype(np.float32)                   # (num_users, k)
        Uk = X / sqrtS[None, :]                             # (U*Sigma)/sqrt(Sigma) == U*sqrt(Sigma)
        Vk = (Vt.T * sqrtS[None, :])                        # V*sqrt(Sigma)

    Uk = torch.from_numpy(Uk).to(torch.float32)
    Vk = torch.from_numpy(Vk).to(torch.float32)
    return Uk.to(DEVICE), Vk.to(DEVICE)

def info_nce_inbatch(z_q: torch.Tensor, z_k: torch.Tensor, temperature: float = 0.2) -> torch.Tensor:
    """
    In-batch InfoNCE with aligned positives (diagonal) and other batch elements as negatives.
    z_q, z_k: [B, D] with matching rows as positives.
    """
    z_q = F.normalize(z_q, dim=1)
    z_k = F.normalize(z_k, dim=1)
    logits = (z_q @ z_k.T) / temperature
    labels = torch.arange(z_q.size(0), device=z_q.device)
    return F.cross_entropy(logits, labels)

class LightGCL(nn.Module):
    """
    LightGCN trained with a LightGCL-style global collaborative view (low-rank SVD)
    and contrastive alignment between the two embedding spaces.
    """
    def __init__(self,
                 base: LightGCN,
                 lambda_cl: float = 0.1,
                 temperature: float = 0.2,
                 svd_rank: int = 64,
                 use_ratings: bool = False):
        super().__init__()
        self.base = base
        self.lambda_cl = float(lambda_cl)
        self.temperature = float(temperature)

        # Precompute global-view embeddings once.
        Uk, Vk = build_svd_view(train_df, base.num_users, base.num_items, rank=svd_rank, use_ratings=use_ratings)
        self.register_buffer("Uk", Uk)
        self.register_buffer("Vk", Vk)

    def training_step(self, batch, opt, reg: float = 1e-4) -> float:
        u, i, j = batch

        # Full embeddings from LightGCN (as in the baseline)
        user_emb_all, item_emb_all = self.base()

        # BPR supervised term
        pos = self.base.score(u, i, user_emb_all, item_emb_all)
        neg = self.base.score(u, j, user_emb_all, item_emb_all)
        loss_bpr = bpr_loss(pos, neg, reg, [self.base.user_emb.weight, self.base.item_emb.weight])

        # Contrastive alignment (in-batch negatives)
        u_uniq = torch.unique(u)
        it_uniq = torch.unique(torch.cat([i, j], dim=0))

        cl_u = info_nce_inbatch(user_emb_all[u_uniq], self.Uk[u_uniq].to(user_emb_all.device), temperature=self.temperature)
        cl_i = info_nce_inbatch(item_emb_all[it_uniq], self.Vk[it_uniq].to(item_emb_all.device), temperature=self.temperature)

        loss = loss_bpr + self.lambda_cl * (cl_u + cl_i)

        opt.zero_grad()
        loss.backward()
        opt.step()
        return float(loss.item())

def train_lightgcl(epochs: int = 5,
                   emb_dim: int = 64,
                   svd_rank: int = None,
                   batch_size: int = 2048,
                   lr: float = 1e-3,
                   reg: float = 1e-4,
                   lambda_cl: float = 0.1,
                   temperature: float = 0.2):
    base = LightGCN(num_users, num_items, emb_dim=emb_dim).to(DEVICE)
    model = LightGCL(
        base,
        lambda_cl=lambda_cl,
        temperature=temperature,
        svd_rank=(svd_rank if svd_rank is not None else emb_dim),
    ).to(DEVICE)

    opt = torch.optim.Adam(base.parameters(), lr=lr)
    sampler = bpr_triplet_sampler(train_df, num_items, batch_size=batch_size)

    for ep in range(1, epochs + 1):
        total = 0.0
        steps = max(1, len(train_df) // batch_size)
        for _ in range(steps):
            u, i, j = next(sampler)
            total += model.training_step((u, i, j), opt, reg=reg)
        print(f"[LightGCL] epoch {ep} | loss {total / steps:.4f}")

    return model

lightgcl_model = train_lightgcl(epochs=5, emb_dim=64, lambda_cl=0.1)

Extension Model 2: LightGCL + Geographical Awareness

Elements:
1. Contrastive self-supervision
2. Distance-aware scoring
3. Radius-aware negative sampling

In [None]:
# Extension Model 2: LightGCL + Geographical Awareness
# ---------------------------------------------------
# Adds:
#  (1) Distance-aware scoring: dot(u,i) + beta * exp(-d(u,i)/rho)
#  (2) Radius-aware negative sampling via bpr_triplet_sampler(..., radius_km=R)
#  (3) Optional geo-smoothing regularizer on nearby restaurants (items)

import ast
import re
from collections import defaultdict

def _parse_categories(x):
    """Best-effort parse of categories that may be stored as list, stringified list, or comma-separated string."""
    if isinstance(x, list):
        return [str(t).strip() for t in x if pd.notna(t) and str(t).strip()]
    if pd.isna(x):
        return []
    if isinstance(x, str):
        s = x.strip()
        # stringified Python list
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                y = ast.literal_eval(s)
                if isinstance(y, (list, tuple)):
                    return [str(t).strip() for t in y if pd.notna(t) and str(t).strip()]
            except Exception:
                pass
        # fallback: split
        parts = [p.strip() for p in re.split(r"[;,|]", s) if p.strip()]
        return parts
    return [str(x).strip()] if str(x).strip() else []

def _safe_float(x):
    try:
        return float(x)
    except Exception:
        return float("nan")

def build_item_latlon(df_any: pd.DataFrame, num_items: int):
    """
    Builds a list of (lat, lon) for every encoded item index i in [0, num_items).
    Works even if coordinates are missing: returns (nan, nan) for those items.
    """
    # Ensure an encoded item index exists
    if "i" not in df_any.columns:
        tmp = df_any.copy()
        tmp["i"] = i_enc.transform(tmp["item_id"])
        df_any = tmp

    # Find usable columns (prefer canonical names)
    lat_candidates = ["item_lat", "business_lat", "business_latitude", "latitude", "lat", "y"]
    lon_candidates = ["item_lon", "business_lon", "business_longitude", "longitude", "lon", "lng", "x"]

    lat_col = next((c for c in lat_candidates if c in df_any.columns), None)
    lon_col = next((c for c in lon_candidates if c in df_any.columns), None)

    if lat_col is None or lon_col is None:
        return [(math.nan, math.nan) for _ in range(num_items)]

    # Coerce to numeric
    tmp = df_any[["i", lat_col, lon_col]].copy()
    tmp[lat_col] = pd.to_numeric(tmp[lat_col], errors="coerce")
    tmp[lon_col] = pd.to_numeric(tmp[lon_col], errors="coerce")

    g = tmp.groupby("i")[[lat_col, lon_col]].mean()
    arr = np.full((num_items, 2), np.nan, dtype=np.float32)
    arr[g.index.to_numpy(dtype=np.int64)] = g.to_numpy(dtype=np.float32)

    return [(float(a), float(b)) for a, b in arr]

def build_user_home_latlon(train_df: pd.DataFrame, item_latlon):
    """
    Best-effort user "home" location:
      - If user_lat/user_lon exist and are usable, use their mean.
      - Else, estimate from the mean lat/lon of items the user interacted with in train.
    """
    user_home = {}

    if "user_lat" in train_df.columns and "user_lon" in train_df.columns:
        tmp = train_df[["u", "user_lat", "user_lon"]].copy()
        tmp["user_lat"] = pd.to_numeric(tmp["user_lat"], errors="coerce")
        tmp["user_lon"] = pd.to_numeric(tmp["user_lon"], errors="coerce")
        g = tmp.groupby("u")[["user_lat", "user_lon"]].mean()
        for u, row in g.iterrows():
            lat, lon = float(row["user_lat"]), float(row["user_lon"])
            if not (math.isnan(lat) or math.isnan(lon)):
                user_home[int(u)] = (lat, lon)

    # Fill remaining users from interacted item coordinates
    for u, g in train_df.groupby("u"):
        u = int(u)
        if u in user_home:
            continue
        coords = np.array([item_latlon[int(it)] for it in g["i"].values], dtype=np.float64)
        m = ~np.isnan(coords).any(axis=1)
        if m.any():
            user_home[u] = (float(coords[m, 0].mean()), float(coords[m, 1].mean()))

    return user_home

def precompute_geo_knn(item_latlon, k: int = 10, rho_r_km: float = 5.0):
    """
    Precompute kNN neighbors for each item using haversine distance (BallTree).
    Returns (neighbors, weights), where neighbors[i] is a list of item indices, and weights[i] aligns.
    If coords missing, returns (None, None).
    """
    coords = np.array(item_latlon, dtype=np.float64)  # (N,2)
    valid = ~np.isnan(coords).any(axis=1)
    idx_valid = np.where(valid)[0]
    if len(idx_valid) < 2:
        return None, None

    # BallTree expects radians for haversine metric.
    from sklearn.neighbors import BallTree
    rad = np.radians(coords[idx_valid])
    tree = BallTree(rad, metric="haversine")

    qk = min(k + 1, len(idx_valid))
    dist_rad, ind = tree.query(rad, k=qk)  # includes self at position 0
    dist_km = dist_rad * 6371.0

    neighbors = [[] for _ in range(len(coords))]
    weights   = [[] for _ in range(len(coords))]

    for row, item_idx in enumerate(idx_valid):
        neigh_local = ind[row, 1:]                 # drop self
        neigh_items = idx_valid[neigh_local]
        d = dist_km[row, 1:]
        w = np.exp(-d / float(rho_r_km))
        neighbors[item_idx] = neigh_items.tolist()
        weights[item_idx] = w.astype(np.float32).tolist()

    return neighbors, weights

class LightGCL_Geo(LightGCL):
    def __init__(self,
                 base: LightGCN,
                 item_latlon,
                 user_home_latlon,
                 beta: float = 0.2,
                 radius_km: float = 5.0,
                 rho_km: float = None,
                 lambda_cl: float = 0.1,
                 temperature: float = 0.2,
                 mu_geo_smooth: float = 0.0,
                 knn_k: int = 10,
                 rho_r_km: float = 5.0):
        super().__init__(base,
                         lambda_cl=lambda_cl,
                         temperature=temperature,
                         svd_rank=base.user_emb.embedding_dim)

        self.beta = float(beta)
        self.radius_km = float(radius_km)
        self.rho_km = float(rho_km) if rho_km is not None else float(radius_km)

        self.item_latlon = item_latlon
        self.user_home = user_home_latlon

        # Optional geo-smoothing regularizer
        self.mu_geo_smooth = float(mu_geo_smooth)
        if self.mu_geo_smooth > 0.0:
            self._geo_neighbors, self._geo_weights = precompute_geo_knn(item_latlon, k=knn_k, rho_r_km=rho_r_km)
        else:
            self._geo_neighbors, self._geo_weights = None, None

    def _geo_kernel(self, d_km: float) -> float:
        return math.exp(-d_km / self.rho_km)

    def _batch_geo_bias(self, users: torch.Tensor, items: torch.Tensor, centers_latlon=None) -> torch.Tensor:
        """
        Returns exp(-d/rho) for each (u, item) in the batch.
        If a user's location is missing, uses centers_latlon[u] if provided (e.g., pos item coords).
        """
        out = []
        u_list = users.tolist()
        i_list = items.tolist()

        if centers_latlon is None:
            centers_latlon = [None] * len(u_list)

        for idx, (u, it) in enumerate(zip(u_list, i_list)):
            center = self.user_home.get(int(u))
            if center is None or any(np.isnan(center)):
                center = centers_latlon[idx]

            lat_i, lon_i = self.item_latlon[int(it)]
            if center is None or any(np.isnan(center)) or math.isnan(lat_i) or math.isnan(lon_i):
                out.append(0.0)
                continue

            d = haversine(center, (lat_i, lon_i))  # km
            out.append(self._geo_kernel(d))

        return torch.tensor(out, device=DEVICE, dtype=torch.float32)

    def _geo_smooth_loss(self, item_emb_all: torch.Tensor, items_uniq: torch.Tensor) -> torch.Tensor:
        if self._geo_neighbors is None:
            return torch.zeros((), device=item_emb_all.device)

        loss = 0.0
        count = 0
        for it in items_uniq.tolist():
            neigh = self._geo_neighbors[it]
            if not neigh:
                continue
            w = torch.tensor(self._geo_weights[it], device=item_emb_all.device, dtype=torch.float32).unsqueeze(1)
            nbr = torch.tensor(neigh, device=item_emb_all.device, dtype=torch.long)
            diff = item_emb_all[it].unsqueeze(0) - item_emb_all[nbr]
            loss = loss + (w * diff.pow(2)).sum()
            count += len(neigh)

        if count == 0:
            return torch.zeros((), device=item_emb_all.device)
        return loss / count

    def training_step(self, batch, opt, reg: float = 1e-4) -> float:
        u, i, j = batch

        user_emb_all, item_emb_all = self.base()

        # Build per-sample centers: user_home if known else pos-item coordinate
        centers = []
        for it in i.tolist():
            latc, lonc = self.item_latlon[int(it)]
            centers.append(None if (math.isnan(latc) or math.isnan(lonc)) else (latc, lonc))

        pos = self.base.score(u, i, user_emb_all, item_emb_all) + self.beta * self._batch_geo_bias(u, i, centers_latlon=centers)
        neg = self.base.score(u, j, user_emb_all, item_emb_all) + self.beta * self._batch_geo_bias(u, j, centers_latlon=centers)

        loss_bpr = bpr_loss(pos, neg, reg, [self.base.user_emb.weight, self.base.item_emb.weight])

        # Contrastive alignment (in-batch)
        u_uniq = torch.unique(u)
        it_uniq = torch.unique(torch.cat([i, j], dim=0))

        cl_u = info_nce_inbatch(user_emb_all[u_uniq], self.Uk[u_uniq].to(user_emb_all.device), temperature=self.temperature)
        cl_i = info_nce_inbatch(item_emb_all[it_uniq], self.Vk[it_uniq].to(item_emb_all.device), temperature=self.temperature)

        loss = loss_bpr + self.lambda_cl * (cl_u + cl_i)

        # Geo smoothing (optional)
        if self.mu_geo_smooth > 0.0:
            loss = loss + self.mu_geo_smooth * self._geo_smooth_loss(item_emb_all, it_uniq)

        opt.zero_grad()
        loss.backward()
        opt.step()
        return float(loss.item())

def train_lightgcl_geo(epochs: int = 5,
                       emb_dim: int = 64,
                       batch_size: int = 2048,
                       lr: float = 1e-3,
                       reg: float = 1e-4,
                       lambda_cl: float = 0.1,
                       temperature: float = 0.2,
                       beta: float = 0.3,
                       R: float = 5.0,
                       mu_geo_smooth: float = 0.0):
    # Build coordinate tables (robust to missing metadata)
    df_enc_all = encode_df(df_all)  # adds u/i columns
    item_latlon = build_item_latlon(df_enc_all, num_items)
    user_home = build_user_home_latlon(train_df, item_latlon)

    base = LightGCN(num_users, num_items, emb_dim=emb_dim).to(DEVICE)
    model = LightGCL_Geo(
        base=base,
        item_latlon=item_latlon,
        user_home_latlon=user_home,
        beta=beta,
        radius_km=R,
        rho_km=R,
        lambda_cl=lambda_cl,
        temperature=temperature,
        mu_geo_smooth=mu_geo_smooth,
    ).to(DEVICE)

    opt = torch.optim.Adam(base.parameters(), lr=lr)

    sampler = bpr_triplet_sampler(
        train_df,
        num_items,
        radius_km=R,
        item_latlon=item_latlon,
        user_pos_map=None,           # let sampler build it from train_df
        user_home_latlon=user_home,  # enables radius-aware negatives when possible
        batch_size=batch_size
    )

    for ep in range(1, epochs + 1):
        total = 0.0
        steps = max(1, len(train_df) // batch_size)
        for _ in range(steps):
            u, i, j = next(sampler)
            total += model.training_step((u, i, j), opt, reg=reg)
        print(f"[LightGCL+Geo] epoch {ep} | loss {total / steps:.4f}")

    # expose for evaluation helpers
    return model, item_latlon, user_home

lightgcl_geo_model, item_latlon, user_home = train_lightgcl_geo(
    epochs=5, emb_dim=64, beta=0.3, R=5.0, lambda_cl=0.1
)

Comparative Model 1: PinSage

In [None]:
# Comparative Model 1: PinSage (practical PyG approximation)
# ----------------------------------------------------------
# A full PinSage implementation requires random-walk neighbor sampling + importance pooling.
# For this project notebook, we implement a *drop-in runnable* approximation:
#   - Build an item-item graph from co-review edges (already in `data["item","similar","item"]`)
#   - Learn item embeddings with GraphSAGE over that item graph
#   - Learn user embeddings directly (trainable table)
#   - Train with the same BPR triplets as the baseline
#
# This preserves the "item-graph recommender" spirit and runs on the current dataset format.

class PinSageRecommender(nn.Module):
    def __init__(self, num_users: int, num_items: int, hidden_dim: int = 64, num_layers: int = 2, dropout: float = 0.1):
        super().__init__()
        self.num_users, self.num_items = num_users, num_items
        self.hidden_dim = hidden_dim

        self.user_emb = nn.Embedding(num_users, hidden_dim)
        self.item_emb = nn.Embedding(num_items, hidden_dim)
        nn.init.normal_(self.user_emb.weight, std=0.1)
        nn.init.normal_(self.item_emb.weight, std=0.1)

        self.convs = nn.ModuleList([SAGEConv(hidden_dim, hidden_dim) for _ in range(num_layers)])
        self.dropout = float(dropout)

    def forward(self, item_edge_index: torch.Tensor):
        # Item graph message passing
        x = self.item_emb.weight
        if item_edge_index is not None and item_edge_index.numel() > 0:
            for conv in self.convs:
                x = conv(x, item_edge_index)
                x = F.relu(x)
                x = F.dropout(x, p=self.dropout, training=self.training)
        return self.user_emb.weight, x

    def score(self, users, items, user_emb=None, item_emb=None, item_edge_index=None):
        if user_emb is None or item_emb is None:
            user_emb, item_emb = self.forward(item_edge_index)
        return (user_emb[users] * item_emb[items]).sum(dim=1)

def train_pinsage(epochs: int = 5,
                  hidden_dim: int = 64,
                  num_layers: int = 2,
                  batch_size: int = 2048,
                  lr: float = 1e-3,
                  reg: float = 1e-4):
    model = PinSageRecommender(num_users, num_items, hidden_dim=hidden_dim, num_layers=num_layers).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    ii = data["item", "similar", "item"].edge_index.to(DEVICE)
    sampler = bpr_triplet_sampler(train_df, num_items, batch_size=batch_size)

    for ep in range(1, epochs + 1):
        model.train()
        total = 0.0
        steps = max(1, len(train_df) // batch_size)
        for _ in range(steps):
            u, i, j = next(sampler)
            ue, ie = model(ii)
            pos = (ue[u] * ie[i]).sum(1)
            neg = (ue[u] * ie[j]).sum(1)
            loss = bpr_loss(pos, neg, reg, [model.user_emb.weight, model.item_emb.weight])
            opt.zero_grad()
            loss.backward()
            opt.step()
            total += float(loss.item())
        print(f"[PinSage(GraphSAGE)] epoch {ep} | loss {total / steps:.4f}")

    model.eval()
    return model

pinsage_model = train_pinsage(epochs=5, hidden_dim=64, num_layers=2)

Comparative Model 2: LightGCN + HGT

Elements:
1. Base Model LightGCN
2. HGT item representation layer

In [None]:
# Comparative Model 2: LightGCN + HGT (heterogeneous metadata)
# ------------------------------------------------------------
# Fixes vs. the original draft:
#  (1) Uses the encoded item index `i` (not the raw `df` without encoding).
#  (2) Robustly parses categories/price and builds a HeteroData graph with reverse edges.
#  (3) Uses the PyG HGTConv API (metadata-based constructor).

import ast
import re

def _parse_price(x):
    if pd.isna(x):
        return 0
    if isinstance(x, str):
        s = x.strip()
        # "$$", "$$$" style
        if s and all(ch == "$" for ch in s):
            return len(s)
        try:
            return int(float(s))
        except Exception:
            return 0
    try:
        return int(x)
    except Exception:
        return 0

def _parse_cats(x):
    if isinstance(x, list):
        return [str(t).strip() for t in x if pd.notna(t) and str(t).strip()]
    if pd.isna(x):
        return []
    if isinstance(x, str):
        s = x.strip()
        if (s.startswith("[") and s.endswith("]")) or (s.startswith("(") and s.endswith(")")):
            try:
                y = ast.literal_eval(s)
                if isinstance(y, (list, tuple)):
                    return [str(t).strip() for t in y if pd.notna(t) and str(t).strip()]
            except Exception:
                pass
        return [p.strip() for p in re.split(r"[;,|]", s) if p.strip()]
    return [str(x).strip()] if str(x).strip() else []

# Build an encoded "all interactions" DF with i (item index)
df_enc_all = encode_df(df_all)  # contains u/i + original columns

# Collect per-item metadata (categories / price)
item_meta = {}
for it, g in df_enc_all.groupby("i"):
    it = int(it)
    # categories
    cats = []
    if "categories" in g.columns:
        for v in g["categories"].tolist():
            cats = _parse_cats(v)
            if len(cats) > 0:
                break
    # price
    p = 0
    if "price" in g.columns:
        for v in g["price"].tolist():
            p = _parse_price(v)
            if p != 0:
                break
    item_meta[it] = {"cats": cats, "price": p}

# Category vocabulary
all_cats = []
for it in range(num_items):
    all_cats += item_meta.get(it, {}).get("cats", [])
if len(all_cats) == 0:
    all_cats = ["unknown"]

cat_encoder = LabelEncoder()
cat_encoder.fit(sorted(set(all_cats)))
num_cats = len(cat_encoder.classes_)

# Price vocabulary (ensure 0 exists)
all_prices = [_parse_price(item_meta.get(i, {}).get("price", 0)) for i in range(num_items)]
price_levels = sorted(set(all_prices + [0]))
price_to_idx = {p: idx for idx, p in enumerate(price_levels)}
num_prices = len(price_levels)

# Build hetero metadata graph
meta = HeteroData()
meta["item"].num_nodes = num_items
meta["cat"].num_nodes = num_cats
meta["price"].num_nodes = num_prices

# item <-> cat edges
src_item, dst_cat = [], []
for it in range(num_items):
    cats = item_meta.get(it, {}).get("cats", [])
    if not cats:
        cats = [cat_encoder.classes_[0]]
    for c in cats[:3]:
        src_item.append(it)
        dst_cat.append(int(cat_encoder.transform([c])[0]))

edge_ic = torch.tensor([src_item, dst_cat], dtype=torch.long)
meta["item", "has_cat", "cat"].edge_index = edge_ic
meta["cat", "rev_has_cat", "item"].edge_index = edge_ic.flip(0)

# item <-> price edges
src_item, dst_price = [], []
for it in range(num_items):
    p = _parse_price(item_meta.get(it, {}).get("price", 0))
    src_item.append(it)
    dst_price.append(int(price_to_idx.get(p, 0)))

edge_ip = torch.tensor([src_item, dst_price], dtype=torch.long)
meta["item", "has_price", "price"].edge_index = edge_ip
meta["price", "rev_has_price", "item"].edge_index = edge_ip.flip(0)

meta = meta.to(DEVICE)

class HGTItemEncoder(nn.Module):
    def __init__(self, hidden: int = 64, heads: int = 2, layers: int = 2, metadata=None):
        super().__init__()
        self.hidden = hidden
        self.emb = nn.ModuleDict({
            "item": nn.Embedding(num_items, hidden),
            "cat": nn.Embedding(num_cats, hidden),
            "price": nn.Embedding(num_prices, hidden),
        })
        for k in self.emb:
            nn.init.xavier_uniform_(self.emb[k].weight)

        if metadata is None:
            metadata = meta.metadata()

        self.convs = nn.ModuleList([
            HGTConv(hidden, hidden, metadata, heads=heads)
            for _ in range(layers)
        ])

    def forward(self, meta_data: HeteroData):
        x_dict = {k: self.emb[k].weight for k in meta_data.node_types}
        for conv in self.convs:
            x_dict = conv(x_dict, meta_data.edge_index_dict)
            x_dict = {k: F.relu(v) for k, v in x_dict.items()}
        return x_dict["item"]

class LightGCN_HGT(nn.Module):
    def __init__(self, emb_dim: int = 64, hgt_hidden: int = 64, heads: int = 2, layers: int = 2):
        super().__init__()
        self.lgcn = LightGCN(num_users, num_items, emb_dim=emb_dim).to(DEVICE)
        self.hgt = HGTItemEncoder(hidden=hgt_hidden, heads=heads, layers=layers, metadata=meta.metadata()).to(DEVICE)
        self.fuse = nn.Linear(emb_dim + hgt_hidden, emb_dim)

    def fused_item_emb(self):
        ue, ie = self.lgcn()
        hgt_item = self.hgt(meta)
        fused = self.fuse(torch.cat([ie, hgt_item], dim=1))
        return ue, fused

def train_lightgcn_hgt(epochs: int = 5,
                       emb_dim: int = 64,
                       hgt_hidden: int = 64,
                       batch_size: int = 2048,
                       lr: float = 1e-3,
                       reg: float = 1e-4):
    model = LightGCN_HGT(emb_dim=emb_dim, hgt_hidden=hgt_hidden).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    sampler = bpr_triplet_sampler(train_df, num_items, batch_size=batch_size)

    for ep in range(1, epochs + 1):
        model.train()
        total = 0.0
        steps = max(1, len(train_df) // batch_size)
        for _ in range(steps):
            u, i, j = next(sampler)
            ue, fused = model.fused_item_emb()
            pos = (ue[u] * fused[i]).sum(1)
            neg = (ue[u] * fused[j]).sum(1)
            loss = bpr_loss(pos, neg, reg, list(model.parameters()))
            opt.zero_grad()
            loss.backward()
            opt.step()
            total += float(loss.item())
        print(f"[LightGCN+HGT] epoch {ep} | loss {total / steps:.4f}")

    model.eval()
    return model

lghgt_model = train_lightgcn_hgt(epochs=5, emb_dim=64, hgt_hidden=64)

Evaluation Helpers

In [None]:
# Evaluation Helpers
# ------------------
# Computes ranking metrics (Recall/NDCG/GeoNDCG/MRR) for each trained model.
# Fixes:
#  - Evaluates the *actual* model objects (not just their base components).
#  - Handles each model's embedding/scoring interface consistently.

def _vectorized_haversine_km(lat1, lon1, lat2, lon2):
    """
    Vectorized haversine distance from a single point (lat1, lon1) to arrays lat2/lon2.
    Inputs are degrees, output is km.
    """
    lat1 = np.radians(lat1); lon1 = np.radians(lon1)
    lat2 = np.radians(lat2); lon2 = np.radians(lon2)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2
    c = 2.0 * np.arcsin(np.minimum(1.0, np.sqrt(a)))
    return 6371.0 * c

def full_ranking_scores(model, model_name: str, K_list=(5, 10), geo_R: float = 5.0):
    K = int(max(K_list))

    # train positives (masking) + heldout ground truth
    train_pos = {u: set(g["i"].values.tolist()) for u, g in train_df.groupby("u")}
    gt_val = {u: set(g["i"].values.tolist()) for u, g in val_df.groupby("u")}
    gt_test = {u: set(g["i"].values.tolist()) for u, g in test_df.groupby("u")}

    # Precompute item coords arrays for fast geo scoring (if available)
    if "item_latlon" in globals() and item_latlon is not None:
        _item_lat = np.array([x[0] for x in item_latlon], dtype=np.float64)
        _item_lon = np.array([x[1] for x in item_latlon], dtype=np.float64)
        _item_has = ~np.isnan(_item_lat) & ~np.isnan(_item_lon)
    else:
        _item_lat = _item_lon = None
        _item_has = None

    with torch.no_grad():
        # Get embeddings / scoring behavior per model
        if isinstance(model, LightGCL_Geo):
            ue, ie = model.base()
            beta = model.beta
            rho = model.rho_km
            uhome = model.user_home

            def score_u(u):
                s = (ue[u].unsqueeze(0) * ie).sum(1)

                # geo bonus
                loc = uhome.get(int(u))
                if loc is not None and _item_lat is not None and not any(np.isnan(loc)):
                    latu, lonu = loc
                    d = np.full(num_items, np.inf, dtype=np.float64)
                    d[_item_has] = _vectorized_haversine_km(latu, lonu, _item_lat[_item_has], _item_lon[_item_has])
                    geo = np.exp(-d / float(rho)).astype(np.float32)
                    geo[~_item_has] = 0.0
                    s = s + beta * torch.tensor(geo, device=DEVICE)

                for it in train_pos.get(u, []):
                    s[it] = -1e9
                return s

        elif isinstance(model, LightGCL):
            ue, ie = model.base()

            def score_u(u):
                s = (ue[u].unsqueeze(0) * ie).sum(1)
                for it in train_pos.get(u, []):
                    s[it] = -1e9
                return s

        elif isinstance(model, LightGCN):
            ue, ie = model()

            def score_u(u):
                s = (ue[u].unsqueeze(0) * ie).sum(1)
                for it in train_pos.get(u, []):
                    s[it] = -1e9
                return s

        elif isinstance(model, PinSageRecommender):
            ii = data["item", "similar", "item"].edge_index.to(DEVICE)
            ue, ie = model(ii)

            def score_u(u):
                s = (ue[u].unsqueeze(0) * ie).sum(1)
                for it in train_pos.get(u, []):
                    s[it] = -1e9
                return s

        elif isinstance(model, LightGCN_HGT):
            ue, ie = model.fused_item_emb()

            def score_u(u):
                s = (ue[u].unsqueeze(0) * ie).sum(1)
                for it in train_pos.get(u, []):
                    s[it] = -1e9
                return s

        else:
            raise ValueError(f"Unknown model type for evaluation: {type(model)}")

        def evaluate(gt_dict):
            recs, ndcgs, geondcgs, mrrs = [], [], [], []
            for u, truth in gt_dict.items():
                if not truth:
                    continue
                scores = score_u(u)
                topk = torch.topk(scores, k=K).indices.tolist()
                recs.append(recall_at_k(topk, truth, k=K))
                ndcgs.append(ndcg_at_k(topk, truth, k=K))
                # geo-aware NDCG (if coords exist)
                u_loc = None
                if isinstance(model, LightGCL_Geo):
                    u_loc = model.user_home.get(int(u))
                elif "user_home" in globals():
                    u_loc = user_home.get(int(u))
                geondcgs.append(geo_ndcg_at_k(topk, truth, u_loc, item_latlon, k=K, R=geo_R) if ("item_latlon" in globals() and item_latlon is not None) else ndcg_at_k(topk, truth, k=K))
                mrrs.append(mrr(topk, truth, k=K))

            return (
                float(np.mean(recs)) if recs else 0.0,
                float(np.mean(ndcgs)) if ndcgs else 0.0,
                float(np.mean(geondcgs)) if geondcgs else 0.0,
                float(np.mean(mrrs)) if mrrs else 0.0,
            )

        val_scores = evaluate(gt_val)
        test_scores = evaluate(gt_test)

    print(f"[{model_name}] Val  Recall@{K}:{val_scores[0]:.4f}  NDCG@{K}:{val_scores[1]:.4f}  GeoNDCG@{K}:{val_scores[2]:.4f}  MRR@{K}:{val_scores[3]:.4f}")
    print(f"[{model_name}] Test Recall@{K}:{test_scores[0]:.4f}  NDCG@{K}:{test_scores[1]:.4f}  GeoNDCG@{K}:{test_scores[2]:.4f}  MRR@{K}:{test_scores[3]:.4f}")

    return {
        "model": model_name,
        "val_recall": val_scores[0],
        "val_ndcg": val_scores[1],
        "val_geondcg": val_scores[2],
        "val_mrr": val_scores[3],
        "test_recall": test_scores[0],
        "test_ndcg": test_scores[1],
        "test_geondcg": test_scores[2],
        "test_mrr": test_scores[3],
    }

results = []
results.append(full_ranking_scores(lightgcn_model, "LightGCN"))
results.append(full_ranking_scores(lightgcl_model, "LightGCL"))
results.append(full_ranking_scores(lightgcl_geo_model, "LightGCL+Geo", geo_R=5.0))
results.append(full_ranking_scores(pinsage_model, "PinSage (item-item GraphSAGE)"))
results.append(full_ranking_scores(lghgt_model, "LightGCN+HGT"))

pd.DataFrame(results)

Rating Metrics (RMSE/MAE)

In [None]:
# Rating Metrics (RMSE/MAE)
# -------------------------
# Note: Models are trained with implicit-feedback BPR, so raw dot-product scores are *not*
# calibrated to the 1–5 rating scale. We still compute RMSE/MAE on these scores as a
# relative comparison (lower is better).

def pointwise_eval(model, which: str = "val"):
    df_eval = val_df if which == "val" else test_df
    preds, trues = [], []

    with torch.no_grad():
        if isinstance(model, LightGCL_Geo):
            ue, ie = model.base()
            for _, r in df_eval.iterrows():
                u = int(r.u); it = int(r.i)
                s = float((ue[u] * ie[it]).sum().item())
                # geo bonus (if possible)
                loc = model.user_home.get(u)
                lat_i, lon_i = model.item_latlon[it]
                if loc is not None and not any(np.isnan(loc)) and not (math.isnan(lat_i) or math.isnan(lon_i)):
                    s += float(model.beta * math.exp(-haversine(loc, (lat_i, lon_i)) / model.rho_km))
                preds.append(s)
                trues.append(float(r.rating))

        elif isinstance(model, LightGCL):
            ue, ie = model.base()
            for _, r in df_eval.iterrows():
                preds.append(float((ue[int(r.u)] * ie[int(r.i)]).sum().item()))
                trues.append(float(r.rating))

        elif isinstance(model, LightGCN):
            ue, ie = model()
            for _, r in df_eval.iterrows():
                preds.append(float((ue[int(r.u)] * ie[int(r.i)]).sum().item()))
                trues.append(float(r.rating))

        elif isinstance(model, PinSageRecommender):
            ii = data["item", "similar", "item"].edge_index.to(DEVICE)
            ue, ie = model(ii)
            for _, r in df_eval.iterrows():
                preds.append(float((ue[int(r.u)] * ie[int(r.i)]).sum().item()))
                trues.append(float(r.rating))

        elif isinstance(model, LightGCN_HGT):
            ue, ie = model.fused_item_emb()
            for _, r in df_eval.iterrows():
                preds.append(float((ue[int(r.u)] * ie[int(r.i)]).sum().item()))
                trues.append(float(r.rating))

        else:
            raise ValueError(f"Unknown model type: {type(model)}")

    return {"rmse": rmse(preds, trues), "mae": mae(preds, trues), "n": len(trues)}

for name, mdl in [
    ("LightGCN", lightgcn_model),
    ("LightGCL", lightgcl_model),
    ("LightGCL+Geo", lightgcl_geo_model),
    ("PinSage", pinsage_model),
    ("LightGCN+HGT", lghgt_model),
]:
    out = pointwise_eval(mdl, "test")
    print(f"[{name}] RMSE: {out['rmse']:.4f} | MAE: {out['mae']:.4f} | n={out['n']}")

Vistualization: Comparison Table

In [None]:
# Visualization: Comparison Table
# -------------------------------
summary = pd.DataFrame(results).copy()

models_in_order = [lightgcn_model, lightgcl_model, lightgcl_geo_model, pinsage_model, lghgt_model]
summary["rmse"] = [pointwise_eval(m, "test")["rmse"] for m in models_in_order]
summary["mae"]  = [pointwise_eval(m, "test")["mae"]  for m in models_in_order]

summary = summary[["model", "test_recall", "test_ndcg", "test_geondcg", "test_mrr", "rmse", "mae"]]
summary.sort_values("test_ndcg", ascending=False)