Environment and Import Preparation

In [1]:
!pip install torch torchvision torchaudio -q
!pip install torch-geometric -q
!pip install dgl -q  # generic DGL (CPU/GPU autodetect)
!pip install torchmetrics==1.4.0.post0 scikit-learn pandas numpy tqdm geopy haversine -q

In [7]:
import os, json, math, random, gc, time
from dataclasses import dataclass
from typing import Dict, Tuple, List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torch_geometric.data import HeteroData
from torch_geometric.utils import to_undirected, coalesce
from torch_geometric.nn import HGTConv, SAGEConv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from haversine import haversine

# Fixed seed shared by every RNG used in the notebook.
SEED = 42
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed Python, NumPy and PyTorch in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if DEVICE.type == 'cuda':
    # Also seed every visible CUDA device explicitly.
    torch.cuda.manual_seed_all(SEED)

print("Device:", DEVICE)

Device: cuda


Data Loading and Graph Construction

In [10]:
import json

# Read the raw dump once and peek at its top-level structure.
with open("filter_all_t.json", "r") as fh:
    data = json.load(fh)

print(type(data))
if isinstance(data, dict):
    # Mapping payload: show at most the first ten keys.
    preview = list(data.keys())[:10]
else:
    # Sequence payload: show the first element.
    preview = data[0]
print(preview)

<class 'dict'>
['train', 'val', 'test']


In [11]:
import json
import pandas as pd

# Re-read the JSON dump; each top-level key is written out as its own CSV.
with open("filter_all_t.json", "r") as fh:
    data = json.load(fh)

# One CSV per top-level key, named "<key>.csv", without the index column.
for split_name, records in data.items():
    csv_name = f"{split_name}.csv"
    df = pd.DataFrame(records)
    df.to_csv(csv_name, index=False)
    print(f"✅ Saved {csv_name} with {len(df)} rows.")

✅ Saved train.csv with 87013 rows.
✅ Saved val.csv with 10860 rows.
✅ Saved test.csv with 11015 rows.


In [None]:
# Minimum number of rows a user / an item needs to survive filtering.
MIN_USER_INTERACTIONS = 2
MIN_ITEM_INTERACTIONS = 2

# Root folder of the dataset; change this to your path in Drive or Colab.
DATA_DIR = "/content/data"
# Candidate input files inside DATA_DIR (restaurants.json is the nested form).
REVIEWS_CSV, REVIEWS_JSONL, RESTAURANTS_JSON = (
    os.path.join(DATA_DIR, fname)
    for fname in ("reviews.csv", "reviews.jsonl", "restaurants.json")
)

def _safe_float(x, default=np.nan):
    try:
        return float(x)
    except:
        return default

def _normalize_price(p):
    """Map a raw price value to a numeric level, or NaN when it is unusable.

    Strings containing ``$`` map to the number of dollar signs (``"$$"`` -> 2);
    any other value is parsed as a number and truncated to ``int``.
    """
    if isinstance(p, str) and "$" in p:
        # Count only the dollar signs so surrounding whitespace or text
        # (e.g. "$$ ") cannot inflate the level.
        return p.count("$")
    value = _safe_float(p)
    if not math.isfinite(value):
        return np.nan
    return int(value)


def _parse_review_ts(raw, fallback):
    """Return ``raw`` as an integer timestamp, or ``fallback`` if it is missing/unparseable."""
    if raw is None:
        return fallback
    try:
        # Accepts ints, floats and numeric strings alike.
        return int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return fallback


def load_google_restaurants() -> pd.DataFrame:
    """Load the review interactions from the first input file that exists.

    Files are probed in this order: ``REVIEWS_CSV``, ``REVIEWS_JSONL``,
    ``RESTAURANTS_JSON``. The nested restaurants file (one JSON array, or one
    JSON object per line) is flattened to one row per review.

    Returns a DataFrame with columns:
    user_id, item_id, rating, ts (unix int),
    user_lat, user_lon (optional), item_lat, item_lon,
    price (int 1-4 or str like '$$'), categories (list[str])

    Raises:
        FileNotFoundError: none of the three candidate files exists.
        KeyError: the loaded table has no ``user_id`` / ``item_id`` column.
    """
    if os.path.exists(REVIEWS_CSV):
        df = pd.read_csv(REVIEWS_CSV)
    elif os.path.exists(REVIEWS_JSONL):
        df = pd.read_json(REVIEWS_JSONL, lines=True)
    elif os.path.exists(RESTAURANTS_JSON):
        # Expect either one JSON array or JSONL with one restaurant per line
        with open(RESTAURANTS_JSON, 'r', encoding="utf-8") as f:
            txt = f.read().strip()
        if txt.startswith('['):
            items = json.loads(txt)
        else:
            items = [json.loads(line) for line in txt.splitlines() if line.strip()]

        # One shared fallback timestamp so every review lacking a usable time
        # gets the same value instead of a value that drifts while parsing.
        now = int(time.time())
        rows = []
        for place in tqdm(items, desc="Parsing restaurants.json"):
            pid = place.get("gmap_url") or place.get("place_id") or place.get("name")
            ilat = _safe_float(place.get("Latitude"))
            ilon = _safe_float(place.get("Longitude"))
            price = _normalize_price(place.get("price", np.nan))
            cats = place.get("category", [])
            if isinstance(cats, str):
                cats = [cats]
            revs = place.get("Reviews") or place.get("reviews") or []
            for r in revs:
                uid = next((r[k] for k in ("user_id", "userId", "uid") if r.get(k) is not None), None)
                if uid is None:
                    # Without an id every anonymous review would collapse into
                    # one pseudo-user called "None"; skip the review instead.
                    continue
                rows.append({
                    "user_id": str(uid),
                    "item_id": str(pid),
                    "rating": _safe_float(r.get("Rating") or r.get("rating") or 0.0),
                    "ts": _parse_review_ts(r.get("time"), now),
                    "user_lat": np.nan, "user_lon": np.nan,
                    "item_lat": ilat, "item_lon": ilon,
                    "price": price, "categories": cats
                })
        # Fix the column set so an input with no reviews still yields a usable frame.
        df = pd.DataFrame(rows, columns=[
            "user_id", "item_id", "rating", "ts",
            "user_lat", "user_lon", "item_lat", "item_lon",
            "price", "categories",
        ])
    else:
        raise FileNotFoundError("Place your dataset at /content/data. Supported files: reviews.csv, reviews.jsonl, or restaurants.json")

    # Clean/standardize
    missing = [col for col in ("user_id", "item_id") if col not in df]
    if missing:
        raise KeyError(f"Input is missing required column(s): {missing}")
    for col in ["user_id", "item_id"]:
        df[col] = df[col].astype(str)
    # Fill optional columns with neutral defaults so downstream cells can rely on them.
    if "rating" not in df: df["rating"] = 5.0
    if "ts" not in df: df["ts"] = int(time.time())
    for c in ["item_lat", "item_lon", "user_lat", "user_lon"]:
        if c not in df: df[c] = np.nan
    if "price" not in df: df["price"] = np.nan
    if "categories" not in df: df["categories"] = [[] for _ in range(len(df))]
    return df

df = load_google_restaurants()
print(df.head(), "\n", df.shape)

In [None]:
# Filter to users/items with minimum interactions
def filter_min_interactions(df, umin=MIN_USER_INTERACTIONS, imin=MIN_ITEM_INTERACTIONS):
    """Return a copy of ``df`` restricted to sufficiently active users and items.

    Counts are taken once on the input frame (single pass): a row is kept when
    its ``user_id`` occurs at least ``umin`` times and its ``item_id`` occurs
    at least ``imin`` times in ``df``.
    """
    user_counts = df["user_id"].value_counts()
    item_counts = df["item_id"].value_counts()
    active_users = user_counts[user_counts >= umin].index
    active_items = item_counts[item_counts >= imin].index
    keep_mask = df["user_id"].isin(active_users) & df["item_id"].isin(active_items)
    return df.loc[keep_mask].copy()

df = filter_min_interactions(df)
print("After filtering:", df.shape)

# Map raw string ids onto integer indices, one encoder per id space.
u_enc, i_enc = LabelEncoder(), LabelEncoder()
df["u"] = u_enc.fit_transform(df["user_id"])
df["i"] = i_enc.fit_transform(df["item_id"])

# Node counts are taken as the largest index + 1.
num_users, num_items = (df[col].max() + 1 for col in ("u", "i"))
print(f"num_users={num_users}, num_items={num_items}")

# Order rows by timestamp so the per-user split below can hold out the latest ones.
df = df.sort_values(by="ts")
def split_by_user(group):
    """Leave-one-out split of one user's rows, taken in the order given.

    The last row becomes the test row and the row before it the validation
    row; everything earlier is training data.

    Args:
        group: DataFrame holding the rows of a single user.

    Returns:
        ``(train, val, test)`` DataFrames. With two rows ``train`` is empty.
        With a single row only ``test`` is populated: previously the same row
        was returned as both ``val`` and ``test``, duplicating it across the
        two held-out sets.
    """
    empty = group.iloc[0:0]
    n_rows = len(group)
    if n_rows == 0:
        return empty, empty, empty
    test = group.iloc[[-1]]
    if n_rows == 1:
        return empty, empty, test
    val = group.iloc[[-2]]
    # Positional slicing, so duplicate index labels cannot drop extra rows.
    train = group.iloc[:-2]
    return train, val, test

# Apply the per-user split and stack the pieces into three frames.
train_rows, val_rows, test_rows = [], [], []
for uid, g in df.groupby("u"):
    tr, va, te = split_by_user(g)
    train_rows.append(tr); val_rows.append(va); test_rows.append(te)
train_df = pd.concat(train_rows); val_df = pd.concat(val_rows); test_df = pd.concat(test_rows)

# Tunables for the item-item graph (previously inline magic numbers).
CO_REVIEW_MIN_COUNT = 2     # keep a co-review pair once it has been counted this many times
GEO_EDGE_RADIUS_KM = 2.0    # link two items whose haversine distance is at most this
GEO_EDGE_WINDOW = 300       # only compare item a with items a+1 .. a+WINDOW-1 (by index)

# Build hetero graph (user<->item); also build item<->item edges for PinSage
data = HeteroData()
data["user"].num_nodes = num_users
data["item"].num_nodes = num_items

# user-item edges (train only for supervision)
ui_src = torch.tensor(train_df["u"].values, dtype=torch.long)
ui_dst = torch.tensor(train_df["i"].values, dtype=torch.long)
edge_index = torch.stack([ui_src, ui_dst], dim=0)
data["user","rates","item"].edge_index = edge_index
data["item","rev_by","user"].edge_index = edge_index.flip(0)

# item coordinates + metadata (optional features), taken from the first row
# of each item in the current (timestamp-sorted) frame
item_lat = torch.full((num_items,), float('nan'))
item_lon = torch.full((num_items,), float('nan'))
price = torch.zeros(num_items)
first_rows = df.drop_duplicates("i")
for i, lat, lon, pr in zip(first_rows["i"], first_rows["item_lat"],
                           first_rows["item_lon"], first_rows["price"]):
    i = int(i)
    if pd.notna(lat):
        item_lat[i] = float(lat)
    if pd.notna(lon):
        item_lon[i] = float(lon)
    if pd.notna(pr):
        price[i] = float(pr)
# Features are [lat, lon, price]; unknown coordinates become 0.0 here only.
# item_lat / item_lon themselves keep their NaNs.
data["item"].x = torch.stack([
    torch.nan_to_num(item_lat, nan=0.0),
    torch.nan_to_num(item_lon, nan=0.0),
    price
], dim=1)

# Quick item-item edges via co-review + geo proximity
co_counts = {}
for u, grp in train_df.groupby("u"):
    items = grp["i"].tolist()
    # Visit every unordered pair of rows once (same counts as scanning all
    # ordered pairs and keeping a < b, with half the iterations).
    for x in range(len(items)):
        for y in range(x + 1, len(items)):
            a, b = items[x], items[y]
            if a == b:
                continue
            key = (a, b) if a < b else (b, a)
            co_counts[key] = co_counts.get(key, 0) + 1
pairs = [(a, b, c) for (a, b), c in co_counts.items() if c >= CO_REVIEW_MIN_COUNT]
ii_src = [a for a, b, _ in pairs]; ii_dst = [b for a, b, _ in pairs]

# geo proximity: convert the tensors to plain Python floats once instead of
# indexing 0-d tensors inside the double loop
lat_list, lon_list = item_lat.tolist(), item_lon.tolist()
has_coords = [not (math.isnan(la) or math.isnan(lo)) for la, lo in zip(lat_list, lon_list)]
for a in range(num_items):
    if not has_coords[a]:
        continue
    point_a = (lat_list[a], lon_list[a])
    for b in range(a + 1, min(num_items, a + GEO_EDGE_WINDOW)):  # local window to keep edges sparse
        if not has_coords[b]:
            continue
        if haversine(point_a, (lat_list[b], lon_list[b])) <= GEO_EDGE_RADIUS_KM:
            ii_src.append(a); ii_dst.append(b)

ii_edge = torch.tensor([ii_src, ii_dst], dtype=torch.long)
ii_edge = to_undirected(ii_edge)
data["item","similar","item"].edge_index = coalesce(ii_edge, num_nodes=num_items)

data = data.to(DEVICE)
print(data)

Sampling, Metrics, and Utilities

In [None]:
def bpr_triplet_sampler(train_df: pd.DataFrame, num_items: int,
                        radius_km: Optional[float]=None,
                        item_latlon: Optional[List[Tuple[float,float]]]=None,
                        user_pos_map: Optional[Dict[int,set]]=None,
                        user_home_latlon: Optional[Dict[int,Tuple[float,float]]]=None,
                        batch_size: int = 2048):
    """
    Endless generator yielding batches of (u, pos_i, neg_j) tensors for BPR.

    Each triplet picks a user uniformly, one of that user's positives uniformly,
    and a negative item the user has not interacted with.
    If radius_km and item_latlon are set, the negative is drawn from items within
    radius_km of the user's home (or of the positive item if the home is unknown);
    when no such item exists, or no centre is available, it falls back to a
    uniform negative.

    Args:
        train_df: interactions with integer columns "u" and "i"; only read when
            user_pos_map is None.
        num_items: size of the item index space negatives are drawn from.
        radius_km: radius for geo-aware negatives, or None for uniform sampling.
        item_latlon: per-item (lat, lon); NaN marks unknown coordinates.
        user_pos_map: optional precomputed {user: set(positive items)}.
        user_home_latlon: optional {user: (lat, lon)}.
        batch_size: number of triplets per yielded batch.

    Raises:
        ValueError: no user has both a positive and an available negative item.
    """
    if user_pos_map is None:
        user_pos_map = {u:set(g["i"].values.tolist()) for u,g in train_df.groupby("u")}
    # A user with no positives cannot supply pos_i, and a user who has seen every
    # item would make the rejection loop below spin forever; leave both out.
    users = [u for u, seen in user_pos_map.items() if 0 < len(seen) < num_items]
    if not users:
        raise ValueError("bpr_triplet_sampler: no user has both a positive and a negative item")
    # Materialise each positive set as a list once instead of once per sample.
    pos_lists = {u: list(user_pos_map[u]) for u in users}
    geo_enabled = radius_km is not None and item_latlon is not None

    def _uniform_negative(seen):
        """Rejection-sample an item index that is not in ``seen``."""
        while True:
            k = random.randrange(num_items)
            if k not in seen:
                return k

    def _radius_negative(u, pos_i):
        """Pick an unseen item within radius_km of the user's centre, else a uniform negative."""
        seen = user_pos_map[u]
        center = None
        if user_home_latlon and u in user_home_latlon:
            center = user_home_latlon[u]
        else:
            lat_i, lon_i = item_latlon[pos_i]
            if not math.isnan(lat_i) and not math.isnan(lon_i):
                center = (lat_i, lon_i)
        if center:
            latc, lonc = center
            # NOTE(review): this builds and shuffles an O(num_items) list for
            # every sample; kept as-is so the random stream is unchanged.
            candidates = [k for k in range(num_items) if k not in seen]
            random.shuffle(candidates)
            # filter by distance lazily: stop at the first item inside the radius
            for k in candidates:
                latk, lonk = item_latlon[k]
                if math.isnan(latk) or math.isnan(lonk):
                    continue
                if haversine((latc, lonc), (latk, lonk)) <= radius_km:
                    return k
        return _uniform_negative(seen)

    while True:
        uu, ii, jj = [], [], []
        for _ in range(batch_size):
            u = random.choice(users)
            pos_i = random.choice(pos_lists[u])
            if geo_enabled:
                neg_j = _radius_negative(u, pos_i)
            else:
                neg_j = _uniform_negative(user_pos_map[u])
            uu.append(u); ii.append(pos_i); jj.append(neg_j)
        yield torch.tensor(uu, device=DEVICE), torch.tensor(ii, device=DEVICE), torch.tensor(jj, device=DEVICE)

# Ranking metrics
def recall_at_k(ranked_items, ground_truth, k=10):
    """Hits among the first ``k`` ranked items divided by ``min(k, len(ground_truth))``.

    Returns 0.0 when ``ground_truth`` is empty.
    """
    hits = 0
    for candidate in ranked_items[:k]:
        if candidate in ground_truth:
            hits += 1
    if not ground_truth:
        return 0.0
    return hits / float(min(k, len(ground_truth)))

def ndcg_at_k(ranked_items, ground_truth, k=10):
    """Binary-relevance NDCG over the first ``k`` ranked items.

    A hit at 1-based rank r contributes 1/log2(r+1); the ideal DCG places
    ``min(k, len(ground_truth))`` hits at the top. Returns 0.0 when the ideal
    DCG is zero.
    """
    gain = 0.0
    for pos, candidate in enumerate(ranked_items[:k]):
        if candidate in ground_truth:
            gain += 1.0 / math.log2(pos + 2)
    n_ideal = min(k, len(ground_truth))
    ideal = sum([1.0 / math.log2(pos + 2) for pos in range(n_ideal)])
    if idcg_positive := ideal > 0:
        return gain / ideal
    return 0.0

def geo_discount(distance_km, R=5.0):
    """Exponential distance decay: exp(-distance_km / R)."""
    scaled = distance_km / R
    return math.exp(-scaled)

def geo_ndcg_at_k(ranked_items, ground_truth, user_loc, item_latlon, k=10, R=5.0):
    """NDCG@k in which each hit's gain is scaled by a distance-based weight.

    The weight of a hit is ``geo_discount(distance, R)`` between ``user_loc``
    and the item's coordinates; it is 1.0 when ``user_loc`` is missing or
    contains NaN, or when the item's coordinates are NaN. The ideal DCG assumes
    weight 1.0 for every positive, so the result is bounded by plain NDCG.
    """
    def _hit_weight(item):
        # Evaluated only for hits, exactly like the inline checks it replaces.
        if not user_loc or any(np.isnan(user_loc)):
            return 1.0
        latu, lonu = user_loc
        lati, loni = item_latlon[item]
        if math.isnan(lati) or math.isnan(loni):
            return 1.0
        return geo_discount(haversine((latu, lonu), (lati, loni)), R=R)

    gain = 0.0
    for pos, candidate in enumerate(ranked_items[:k]):
        if candidate in ground_truth:
            gain += _hit_weight(candidate) / math.log2(pos + 2)
    # ideal geo-dcg assume w=1 for positives (upper bound)
    n_ideal = min(k, len(ground_truth))
    ideal = sum([1.0 / math.log2(pos + 2) for pos in range(n_ideal)])
    if ideal > 0:
        return gain / ideal
    return 0.0

def mrr(ranked_items, ground_truth, k=10):
    """Reciprocal 1-based rank of the first hit within the top ``k``; 0.0 if there is none."""
    hit_ranks = (
        rank
        for rank, candidate in enumerate(ranked_items[:k], start=1)
        if candidate in ground_truth
    )
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0.0
    return 1.0 / first_hit

def rmse(preds, trues):
    """Root-mean-square error between two equally shaped sequences, as a Python float."""
    residual = np.array(preds) - np.array(trues)
    mean_sq = np.mean(residual ** 2)
    return float(np.sqrt(mean_sq))

def mae(preds, trues):
    """Mean absolute error between two equally shaped sequences, as a Python float."""
    residual = np.array(preds) - np.array(trues)
    return float(np.mean(np.abs(residual)))

# Helper: materialize per-item (lat, lon) for the geo-aware pieces.
# Read the raw item_lat / item_lon tensors rather than data["item"].x: the
# feature matrix was built with nan_to_num, so unknown coordinates are 0.0
# there, which would defeat every math.isnan() guard downstream and place
# items without coordinates at (0, 0).
item_latlon = list(zip(item_lat.tolist(), item_lon.tolist()))
user_home = {}  # if you have per-user coords, populate {u: (lat,lon)}

Baseline Model: LightGCN

In [None]:
class LightGCN(nn.Module):
    """LightGCN-style recommender over the training user-item edges.

    Learns one embedding per user and per item, propagates them along the
    ("user", "rates", "item") edges of the module-level ``data`` graph with
    symmetric degree normalisation, and combines the per-layer outputs with
    the weights in ``alpha``.

    Args:
        num_users: number of user nodes.
        num_items: number of item nodes.
        emb_dim: embedding width.
        num_layers: number of propagation layers.
        alpha: optional per-layer weights, one per layer output including the
            input embeddings (``num_layers + 1`` values); defaults to a
            uniform average.

    Raises:
        ValueError: ``alpha`` does not have ``num_layers + 1`` entries.
    """

    def __init__(self, num_users, num_items, emb_dim=64, num_layers=3, alpha=None):
        super().__init__()
        self.num_users, self.num_items = num_users, num_items
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)
        nn.init.normal_(self.user_emb.weight, std=0.1)
        nn.init.normal_(self.item_emb.weight, std=0.1)
        self.num_layers = num_layers
        self.alpha = alpha if alpha is not None else [1/(num_layers+1)]*(num_layers+1)
        if len(self.alpha) != num_layers + 1:
            # zip() in propagate() would otherwise silently drop layer outputs.
            raise ValueError(
                f"alpha must have num_layers + 1 = {num_layers + 1} entries, got {len(self.alpha)}"
            )
        # precompute normalized adjacency indices for propagation (bipartite)
        edge = data["user","rates","item"].edge_index
        u,i = edge[0], edge[1]
        deg_u = torch.bincount(u, minlength=num_users).float()
        deg_i = torch.bincount(i, minlength=num_items).float()
        self.pairs = (u,i,deg_u,deg_i)
        # Per-edge normaliser sqrt(deg_u * deg_i + eps), shape (num_edges, 1).
        # It depends only on the graph, so compute it once here instead of
        # twice per layer on every forward pass.
        self.edge_norm = torch.sqrt(deg_u[u] * deg_i[i] + 1e-8).unsqueeze(1)

    def propagate(self, user_x, item_x):
        """Run ``num_layers`` rounds of message passing and return the alpha-weighted sum.

        Returns:
            ``(user_out, item_out)`` with the same shapes as the inputs.
        """
        u, i = self.pairs[0], self.pairs[1]
        norm = self.edge_norm
        all_user = [user_x]; all_item = [item_x]
        for _ in range(self.num_layers):
            # message passing along normalized bipartite edges
            msg_u = torch.zeros_like(user_x)
            msg_i = torch.zeros_like(item_x)
            # item -> user
            msg_u.index_add_(0, u, item_x[i] / norm)
            # user -> item
            msg_i.index_add_(0, i, user_x[u] / norm)
            user_x, item_x = msg_u, msg_i
            all_user.append(user_x); all_item.append(item_x)
        # layer-wise weighted combination (uniform average by default)
        user_out = torch.stack([a*b for a,b in zip(self.alpha, all_user)]).sum(0)
        item_out = torch.stack([a*b for a,b in zip(self.alpha, all_item)]).sum(0)
        return user_out, item_out

    def forward(self):
        """Propagate the raw embedding tables; returns ``(user_emb, item_emb)``."""
        u0 = self.user_emb.weight
        i0 = self.item_emb.weight
        return self.propagate(u0, i0)

    def score(self, users, items, user_emb=None, item_emb=None):
        """Dot-product score for aligned ``users`` / ``items`` index tensors.

        Propagated embeddings are recomputed via ``forward()`` unless both
        ``user_emb`` and ``item_emb`` are supplied.
        """
        if user_emb is None or item_emb is None:
            user_emb, item_emb = self.forward()
        return (user_emb[users] * item_emb[items]).sum(dim=1)

def bpr_loss(pos_scores, neg_scores, reg=None, params: Optional[List[torch.Tensor]]=None):
    """Bayesian Personalised Ranking loss with optional L2 regularisation.

    Args:
        pos_scores: scores of the positive items.
        neg_scores: scores of the paired negative items (same shape).
        reg: regularisation strength; falsy disables the penalty.
        params: tensors to regularise. The penalty is the mean over these
            tensors of their squared L2 norm, scaled by ``reg``.

    Returns:
        Scalar loss tensor ``-mean(logsigmoid(pos - neg))`` plus the penalty.
    """
    loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    # Skip the penalty when there is nothing to regularise; dividing by
    # len(params) == 0 used to raise ZeroDivisionError.
    if reg and params:
        loss = loss + reg*sum(p.norm(2).pow(2) for p in params)/len(params)
    return loss

def train_lightgcn(epochs=5, emb_dim=64, batch_size=2048, lr=1e-3, reg=1e-4):
    """Train a LightGCN model with BPR loss on the module-level ``train_df``.

    Each epoch runs ``max(1, len(train_df) // batch_size)`` optimisation steps
    on uniformly sampled triplets and prints the mean step loss.

    Returns:
        The trained ``LightGCN`` model (on ``DEVICE``).
    """
    model = LightGCN(num_users, num_items, emb_dim=emb_dim).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    sampler = bpr_triplet_sampler(train_df, num_items, batch_size=batch_size)
    steps_per_epoch = max(1, len(train_df)//batch_size)
    # Only the raw embedding tables are L2-regularised.
    reg_params = [model.user_emb.weight, model.item_emb.weight]
    for ep in range(1, epochs+1):
        model.train()
        total = 0.0
        for _ in range(steps_per_epoch):
            u,i,j = next(sampler)
            user_emb, item_emb = model()
            pos = model.score(u,i,user_emb,item_emb)
            neg = model.score(u,j,user_emb,item_emb)
            loss = bpr_loss(pos,neg,reg, reg_params)
            opt.zero_grad(); loss.backward(); opt.step()
            total += float(loss)
        print(f"[LightGCN] epoch {ep} loss {total/steps_per_epoch:.4f}")
    return model

lightgcn_model = train_lightgcn(epochs=5, emb_dim=64, batch_size=2048)


Extension Model 1: LightGCL

Elements:
1. Contrastive self-supervision

In [None]:
def build_svd_view(train_df, num_users, num_items, rank=64):
    """Rank-``rank`` SVD factors of the dense user-item interaction matrix.

    The matrix has one unit entry per row of ``train_df`` (columns "u", "i").
    Both returned factors are scaled by the leading singular values and moved
    to ``DEVICE``.

    Returns:
        ``(Uk, Vk)``: the user-side and item-side factors.
    """
    user_idx = torch.tensor(train_df["u"].values, dtype=torch.long)
    item_idx = torch.tensor(train_df["i"].values, dtype=torch.long)
    ones = torch.ones(len(train_df), dtype=torch.float32)
    interactions = torch.sparse_coo_tensor(
        indices=torch.stack([user_idx, item_idx]), values=ones, size=(num_users, num_items)
    )
    # Densified for simplicity; if too big, sample or chunk.
    dense = interactions.to_dense()
    left, sing, right_t = torch.linalg.svd(dense, full_matrices=False)
    top_sing = sing[:rank]
    user_view = left[:, :rank] * top_sing
    item_view = right_t[:rank, :].T * top_sing
    return user_view.to(DEVICE), item_view.to(DEVICE)

def info_nce(z, z_tgt, temperature=0.2):
    """InfoNCE loss pairing row k of ``z`` with row k of ``z_tgt``.

    Both inputs are L2-normalised per row; the temperature-scaled similarity
    matrix is scored with cross-entropy against the diagonal.
    """
    anchors = F.normalize(z, dim=1)
    targets = F.normalize(z_tgt, dim=1)
    similarity = torch.matmul(anchors, targets.T)
    logits = similarity / temperature
    # Row k's positive is column k.
    diag_labels = torch.arange(anchors.size(0), device=anchors.device)
    return F.cross_entropy(logits, diag_labels)

class LightGCL(nn.Module):
    """LightGCN wrapper that adds a contrastive term against an SVD view.

    ``Uk`` / ``Vk`` are computed once from the module-level ``train_df`` and
    kept as fixed alignment targets for the user / item embeddings.
    """

    def __init__(self, base: LightGCN, lambda_cl=0.1, svd_rank=64):
        super().__init__()
        self.base = base
        self.lambda_cl = lambda_cl
        self.Uk, self.Vk = build_svd_view(train_df, base.num_users, base.num_items, rank=svd_rank)

    def training_step(self, batch, opt, reg=1e-4):
        """Run one optimisation step on ``batch`` and return the loss as a float.

        The loss is BPR (with L2 on the raw embedding tables) plus
        ``lambda_cl`` times the sum of the user and item InfoNCE terms.
        """
        users, pos_items, neg_items = batch
        user_emb, item_emb = self.base()
        pos = self.base.score(users, pos_items, user_emb, item_emb)
        neg = self.base.score(users, neg_items, user_emb, item_emb)
        reg_params = [self.base.user_emb.weight, self.base.item_emb.weight]
        loss_bpr = bpr_loss(pos, neg, reg, reg_params)
        # contrastive loss: align learned embeddings with SVD embeddings
        cl_u = info_nce(user_emb, self.Uk)
        cl_i = info_nce(item_emb, self.Vk)
        loss = loss_bpr + self.lambda_cl*(cl_u + cl_i)
        opt.zero_grad()
        loss.backward()
        opt.step()
        return float(loss.item())

def train_lightgcl(epochs=5, emb_dim=64, batch_size=2048, lr=1e-3, reg=1e-4, lambda_cl=0.1):
    """Train LightGCL (LightGCN + SVD-view contrastive loss) and return the wrapper.

    Only the parameters of the underlying LightGCN are optimised. The SVD rank
    is set to ``emb_dim``.
    """
    base = LightGCN(num_users, num_items, emb_dim=emb_dim).to(DEVICE)
    model = LightGCL(base, lambda_cl=lambda_cl, svd_rank=emb_dim).to(DEVICE)
    opt = torch.optim.Adam(base.parameters(), lr=lr)
    sampler = bpr_triplet_sampler(train_df, num_items, batch_size=batch_size)
    steps_per_epoch = max(1, len(train_df)//batch_size)
    for ep in range(1, epochs+1):
        total = 0.0
        for _ in range(steps_per_epoch):
            batch = next(sampler)
            total += model.training_step(tuple(batch), opt, reg=reg)
        print(f"[LightGCL] epoch {ep} loss {total/steps_per_epoch:.4f}")
    return model

lightgcl_model = train_lightgcl(epochs=5, emb_dim=64, lambda_cl=0.1)


Extension Model 2: LightGCL + Geographical Awareness

Elements:
1. Contrastive self-supervision
2. Distance-aware scoring
3. Radius-aware negative sampling

In [None]:
class LightGCL_Geo(LightGCL):
    """LightGCL whose pairwise scores get an additive distance-decay bonus.

    ``beta`` scales the bonus and ``R`` is the decay length in km.
    """

    def __init__(self, base: LightGCN, beta=0.2, R=5.0, lambda_cl=0.1):
        super().__init__(base, lambda_cl=lambda_cl, svd_rank=base.user_emb.embedding_dim)
        self.beta = beta
        self.R = R

    def geo_term(self, users, items, user_home, item_latlon):
        """Per-pair bonus exp(-d / R) for aligned ``users`` / ``items`` index tensors.

        The bonus is 0.0 when the item's coordinates are NaN. When the user has
        no entry in ``user_home`` the item's own location stands in for the
        home (weak fallback), so the distance is measured from the item to
        itself.
        NOTE(review): the ranking evaluation uses weight 0.0 for users without
        a home, which differs from this fallback - confirm which is intended.
        """
        bonuses = []
        for u, i in zip(users.tolist(), items.tolist()):
            lat_i, lon_i = item_latlon[i]
            if math.isnan(lat_i) or math.isnan(lon_i):
                bonuses.append(0.0)
                continue
            home = user_home.get(u)
            anchor = (lat_i, lon_i) if home is None else home
            dist_km = haversine(anchor, (lat_i, lon_i))
            bonuses.append(math.exp(-dist_km/self.R))
        return torch.tensor(bonuses, device=DEVICE, dtype=torch.float32)

    def training_step(self, batch, opt, reg=1e-4):
        """One optimisation step: geo-adjusted BPR plus the contrastive terms; returns the loss."""
        users, pos_items, neg_items = batch
        user_emb, item_emb = self.base()
        pos_geo = self.geo_term(users, pos_items, user_home, item_latlon)
        pos = self.base.score(users, pos_items, user_emb, item_emb) + self.beta * pos_geo
        neg_geo = self.geo_term(users, neg_items, user_home, item_latlon)
        neg = self.base.score(users, neg_items, user_emb, item_emb) + self.beta * neg_geo
        reg_params = [self.base.user_emb.weight, self.base.item_emb.weight]
        loss_bpr = bpr_loss(pos, neg, reg, reg_params)
        cl_u = info_nce(user_emb, self.Uk)
        cl_i = info_nce(item_emb, self.Vk)
        loss = loss_bpr + self.lambda_cl*(cl_u + cl_i)
        opt.zero_grad()
        loss.backward()
        opt.step()
        return float(loss.item())

def train_lightgcl_geo(epochs=5, emb_dim=64, batch_size=2048, lr=1e-3, reg=1e-4, lambda_cl=0.1, beta=0.2, R=5.0):
    """Train the geo-aware LightGCL variant and return the wrapper model.

    Negatives are drawn with the radius-aware sampler, using ``R`` both as the
    sampling radius and as the decay length of the geo bonus. Only the
    underlying LightGCN parameters are optimised.
    """
    base = LightGCN(num_users, num_items, emb_dim=emb_dim).to(DEVICE)
    model = LightGCL_Geo(base, beta=beta, R=R, lambda_cl=lambda_cl).to(DEVICE)
    opt = torch.optim.Adam(base.parameters(), lr=lr)
    sampler = bpr_triplet_sampler(
        train_df, num_items,
        radius_km=R,
        item_latlon=item_latlon,
        user_pos_map=None,
        user_home_latlon=user_home,
        batch_size=batch_size,
    )
    steps_per_epoch = max(1, len(train_df)//batch_size)
    for ep in range(1, epochs+1):
        total = 0.0
        for _ in range(steps_per_epoch):
            batch = next(sampler)
            total += model.training_step(tuple(batch), opt, reg=reg)
        print(f"[LightGCL+Geo] epoch {ep} loss {total/steps_per_epoch:.4f}")
    return model

lightgcl_geo_model = train_lightgcl_geo(epochs=5, emb_dim=64, beta=0.3, R=5.0)


Comparative Model 1: PinSage

In [None]:
class PinSageItemEncoder(nn.Module):
    """Item encoder: optional linear projection followed by stacked GraphSAGE layers.

    Args:
        in_dim: width of the raw item features; 0 disables the projection.
        hidden_dim: width of every hidden/output representation.
        num_layers: number of convolution layers.
    """

    def __init__(self, in_dim, hidden_dim=64, num_layers=2):
        super().__init__()
        self.proj = nn.Linear(in_dim, hidden_dim) if in_dim>0 else None
        # Uses the SAGEConv imported at the top of the notebook; the previous
        # name (PinSAGEConv) is not defined anywhere and SAGEConv takes no
        # `heads` argument.
        self.layers = nn.ModuleList([
            SAGEConv(in_channels=hidden_dim, out_channels=hidden_dim)
            for _ in range(num_layers)
        ])

    def forward(self, x, edge_index):
        """Return L2-normalised item embeddings for features ``x`` on graph ``edge_index``."""
        h = x
        if self.proj is not None:
            h = self.proj(h)
        for conv in self.layers:
            h = conv(h, edge_index)
            h = F.relu(h)
        return F.normalize(h, dim=1)

class PinSageRecommender(nn.Module):
    """Item-embedding recommender built on ``PinSageItemEncoder``."""

    def __init__(self, item_in_dim, hidden_dim=64):
        super().__init__()
        # The encoder class is spelled PinSageItemEncoder; the previous
        # spelling (PinSAGEItemEncoder) raised NameError on construction.
        self.item_enc = PinSageItemEncoder(item_in_dim, hidden_dim=hidden_dim)

    def forward(self, item_x, ii_edge):
        """Encode all items; returns the encoder's normalised embeddings."""
        return self.item_enc(item_x, ii_edge)

    def user_repr(self, user_pos_items, item_emb):
        """User vector of shape (1, dim): normalised mean of the user's positive item embeddings."""
        # mean pool over positives
        return F.normalize(item_emb[user_pos_items].mean(0, keepdim=True), dim=1)

def train_pinsage(epochs=5, hidden_dim=64, batch_size=2048, lr=1e-3):
    """Train the PinSage-style item encoder with a BPR objective.

    The sampled positive item serves as the anchor (approximate stand-in for a
    user vector), so each step pushes the negative item's similarity to the
    positive item down. The sampled user index is not used in the loss.

    Returns:
        The trained ``PinSageRecommender``.
    """
    item_x = data['item'].x
    ii = data['item','similar','item'].edge_index
    model = PinSageRecommender(item_x.size(1), hidden_dim=hidden_dim).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    user_pos_map = {u:set(g["i"].values.tolist()) for u,g in train_df.groupby("u")}
    # Hand the map to the sampler so it does not rebuild the same groupby.
    sampler = bpr_triplet_sampler(train_df, num_items, user_pos_map=user_pos_map,
                                  batch_size=batch_size)
    steps_per_epoch = max(1, len(train_df)//batch_size)
    for ep in range(1, epochs+1):
        total = 0.0
        for _ in range(steps_per_epoch):
            u, i, j = next(sampler)
            item_emb = model(item_x, ii)
            u_repr = item_emb[i]  # treat pos item as anchor (approximate)
            pos = (u_repr * item_emb[i]).sum(1)
            neg = (u_repr * item_emb[j]).sum(1)
            loss = -F.logsigmoid(pos - neg).mean()
            opt.zero_grad(); loss.backward(); opt.step()
            total += float(loss)
        print(f"[PinSage] epoch {ep} loss {total/steps_per_epoch:.4f}")
    return model

pinsage_model = train_pinsage(epochs=5, hidden_dim=64)


Comparative Model 2: LightGCN + HGT

Elements:
1. Base Model LightGCN
2. HGT item representation layer

In [None]:
# Build minimal hetero metadata graph for items: (item)-[has_cat]->(cat), (item)-[has_price]->(price_level)
# From df categories/price
UNKNOWN_CAT = "unknown"   # explicit bucket for items without any category
MAX_CATS_PER_ITEM = 3     # at most this many category edges per item


def _as_cat_list(raw):
    """Normalise one ``categories`` cell to a list of category strings.

    Lists/tuples/arrays are kept (elements stringified, missing ones dropped),
    a missing scalar becomes ``[]`` and any other scalar a one-element list.
    """
    if isinstance(raw, (list, tuple, np.ndarray)):
        return [str(c) for c in raw if c is not None]
    if pd.isna(raw):
        return []
    return [str(raw)]


# Category vocabulary over every row. UNKNOWN_CAT is always part of it so that
# items without categories get their own node instead of being attached to
# whichever real category happens to sort first.
all_cats = {UNKNOWN_CAT}
for row in df["categories"]:
    all_cats.update(_as_cat_list(row))
cat_encoder = LabelEncoder()
cat_encoder.fit(sorted(all_cats))
num_cats = len(cat_encoder.classes_)
# One dict lookup per edge instead of one LabelEncoder.transform call per edge.
cat_to_idx = {c: k for k, c in enumerate(cat_encoder.classes_)}

price_levels = sorted(list(set([int(p) if pd.notna(p) else 0 for p in df["price"]])))
price_to_idx = {p:i for i,p in enumerate(price_levels)}
num_prices = len(price_levels)

meta = HeteroData()
meta["item"].num_nodes = num_items
meta["cat"].num_nodes = num_cats
meta["price"].num_nodes = num_prices

# Metadata comes from the first row of each item in the current frame order.
item_first_rows = df.drop_duplicates("i").sort_values("i")

# item->cat edges
src, dst = [], []
for i, raw_cats in zip(item_first_rows["i"], item_first_rows["categories"]):
    cats = _as_cat_list(raw_cats)[:MAX_CATS_PER_ITEM] or [UNKNOWN_CAT]
    for c in cats:
        src.append(int(i)); dst.append(cat_to_idx[c])
item_cat_edge = torch.tensor([src,dst], dtype=torch.long)
meta["item","has_cat","cat"].edge_index = item_cat_edge
# Reverse relation: without an edge type pointing at "item", item nodes never
# receive messages in a heterogeneous convolution.
meta["cat","rev_has_cat","item"].edge_index = item_cat_edge.flip(0)

# item->price edges
src, dst = [], []
for i, p in zip(item_first_rows["i"], item_first_rows["price"]):
    p = int(p) if pd.notna(p) else 0
    src.append(int(i)); dst.append(price_to_idx.get(p, 0))
item_price_edge = torch.tensor([src,dst], dtype=torch.long)
meta["item","has_price","price"].edge_index = item_price_edge
meta["price","rev_has_price","item"].edge_index = item_price_edge.flip(0)
meta = meta.to(DEVICE)

class HGTItemEncoder(nn.Module):
    """Heterogeneous Graph Transformer over the item / cat / price metadata graph.

    Every node type starts from a learnable embedding table; the module returns
    the item representations after ``layers`` HGT convolutions.

    Args:
        hidden: embedding and hidden width (must be divisible by ``heads``).
        heads: number of attention heads per convolution.
        layers: number of stacked convolutions.
    """

    def __init__(self, hidden=64, heads=2, layers=2):
        super().__init__()
        self.emb = nn.ModuleDict({
            'item': nn.Embedding(num_items, hidden),
            'cat': nn.Embedding(num_cats, hidden),
            'price': nn.Embedding(num_prices, hidden),
        })
        for k in self.emb:
            nn.init.xavier_uniform_(self.emb[k].weight)
        # HGTConv is configured from the graph's (node_types, edge_types)
        # metadata, taken here from the module-level `meta` graph; it has no
        # hidden_channels / num_types / num_relations arguments.
        self.layers = nn.ModuleList([
            HGTConv(in_channels=hidden, out_channels=hidden, metadata=meta.metadata(), heads=heads)
            for _ in range(layers)
        ])

    def forward(self, meta: HeteroData):
        """Return the item representations, shape (num_items, hidden)."""
        x_dict = {k: self.emb[k].weight for k in ['item','cat','price']}
        for conv in self.layers:
            out_dict = conv(x_dict, meta.edge_index_dict)
            # A node type that receives no messages may be missing (or None) in
            # the output; carry its previous features forward in that case.
            x_dict = {
                k: F.relu(out_dict[k]) if out_dict.get(k) is not None else prev
                for k, prev in x_dict.items()
            }
        return x_dict['item']

class LightGCN_HGT(nn.Module):
    """LightGCN whose item embeddings are fused with HGT metadata embeddings.

    The fusion is a linear layer over the concatenation of the two item
    representations, mapping back to the LightGCN embedding width.
    """

    def __init__(self, lightgcn: LightGCN, hgt_hidden=64):
        super().__init__()
        self.lgcn = lightgcn
        self.hgt = HGTItemEncoder(hidden=hgt_hidden)
        emb_dim = self.lgcn.user_emb.embedding_dim
        self.fuse = nn.Linear(emb_dim + hgt_hidden, emb_dim)

    def fused_item_emb(self):
        """Return ``(user_emb, fused_item_emb)`` for all users and items."""
        user_e, item_e = self.lgcn()
        hgt_item = self.hgt(meta)
        stacked = torch.cat([item_e, hgt_item], dim=1)
        return user_e, self.fuse(stacked)

    def score(self, users, items):
        """Dot-product score between aligned ``users`` and fused ``items``."""
        user_e, item_e = self.fused_item_emb()
        return (user_e[users] * item_e[items]).sum(1)

def train_lightgcn_hgt(epochs=5, emb_dim=64, batch_size=2048, lr=1e-3, reg=1e-4):
    """Train the LightGCN + HGT fusion model with BPR loss and return it.

    All parameters of the LightGCN, the HGT encoder and the fusion layer are
    optimised and included in the L2 penalty.
    """
    base = LightGCN(num_users, num_items, emb_dim=emb_dim).to(DEVICE)
    model = LightGCN_HGT(base, hgt_hidden=emb_dim).to(DEVICE)
    params = [p for part in (model.lgcn, model.hgt, model.fuse) for p in part.parameters()]
    opt = torch.optim.Adam(params, lr=lr)
    sampler = bpr_triplet_sampler(train_df, num_items, batch_size=batch_size)
    steps_per_epoch = max(1, len(train_df)//batch_size)
    for ep in range(1, epochs+1):
        total = 0.0
        for _ in range(steps_per_epoch):
            u, i, j = next(sampler)
            # LightGCN pass, HGT pass and fusion, in that order.
            ue, fused = model.fused_item_emb()
            user_vecs = ue[u]
            pos = (user_vecs * fused[i]).sum(1)
            neg = (user_vecs * fused[j]).sum(1)
            loss = bpr_loss(pos, neg, reg, params)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total += float(loss)
        print(f"[LightGCN+HGT] epoch {ep} loss {total/steps_per_epoch:.4f}")
    return model

lghgt_model = train_lightgcn_hgt(epochs=5, emb_dim=64)


Evaluation Helpers

In [None]:
def full_ranking_scores(model, model_name, K_list=(5,10), geo_R=5.0, geo_beta=0.0):
    """Full-ranking evaluation of ``model`` on the validation and test hold-outs.

    For every user with a held-out item, all items are scored, the user's
    training positives are masked out, and Recall / NDCG / geo-NDCG / MRR are
    computed on the top of the ranking.

    Args:
        model: a ``LightGCN``, ``LightGCL``, ``LightGCL_Geo``,
            ``PinSageRecommender`` or ``LightGCN_HGT`` instance. Pass the
            wrapper itself (not ``.base`` / ``.lgcn``), otherwise the geo term
            or the HGT fusion is not part of the evaluated scores.
        model_name: label used in the printout and the result row.
        K_list: cut-offs; only ``max(K_list)`` is evaluated.
        geo_R: decay length (km) of the geo-NDCG metric.
        geo_beta: currently unused; kept for interface compatibility.

    Returns:
        Dict with the model name and val_/test_ recall, ndcg, geondcg and mrr.

    Raises:
        ValueError: ``model`` is of an unsupported type.
    """
    k_eval = max(K_list)
    # prepare user->train positives and heldouts
    train_pos = {u:set(g["i"].values.tolist()) for u,g in train_df.groupby("u")}
    gt_val = {u:set(g["i"].values.tolist()) for u,g in val_df.groupby("u")}
    gt_test = {u:set(g["i"].values.tolist()) for u,g in test_df.groupby("u")}

    with torch.no_grad():
        # Compute every embedding table once, up front; ue stays None for the
        # item-only PinSage model, whose user vector is pooled per user.
        geo_model = None
        if isinstance(model, LightGCN):
            ue, ie = model()
        elif isinstance(model, LightGCL):
            ue, ie = model.base()
            if isinstance(model, LightGCL_Geo):
                geo_model = model
        elif isinstance(model, PinSageRecommender):
            ue = None
            ie = model(data['item'].x, data['item','similar','item'].edge_index)
        elif isinstance(model, LightGCN_HGT):
            # Hoisted out of the per-user scorer: the fused table is the same
            # for every user.
            ue, ie = model.fused_item_emb()
        else:
            raise ValueError("Unknown model for evaluation")

        def geo_bonus(u):
            """Per-item weight exp(-d / R) from the user's home; all zeros if the home is unknown."""
            uh = user_home.get(u)
            if not uh or any(np.isnan(uh)):
                return torch.zeros(num_items, device=DEVICE)
            weights = []
            for it in range(num_items):
                lati, loni = item_latlon[it]
                if math.isnan(lati) or math.isnan(loni):
                    weights.append(0.0)
                else:
                    weights.append(math.exp(-haversine(uh, (lati, loni)) / geo_model.R))
            return torch.tensor(weights, device=DEVICE)

        def score_u(u):
            """Scores of all items for user ``u`` with training positives masked out."""
            seen = list(train_pos.get(u, ()))
            if ue is None:
                # Mean of the user's training items; item 0 stands in when there are none.
                uvec = ie[seen or [0]].mean(0, keepdim=True)
            else:
                uvec = ue[u].unsqueeze(0)
            s = (uvec * ie).sum(1)
            if geo_model is not None:
                s = s + geo_model.beta * geo_bonus(u)
            if seen:
                # mask train positives for ranking
                s[seen] = -1e9
            return s

        def evaluate(gt_dict):
            rec, ndcg, gndcg, rr = [], [], [], []
            for u, truth in gt_dict.items():
                s = score_u(u)
                ranked = torch.topk(s, k=min(100, num_items)).indices.tolist()
                rec.append(recall_at_k(ranked, truth, k=k_eval))
                ndcg.append(ndcg_at_k(ranked, truth, k=k_eval))
                # geo-aware NDCG
                u_loc = user_home.get(u)
                gndcg.append(geo_ndcg_at_k(ranked, truth, u_loc, item_latlon, k=k_eval, R=geo_R))
                rr.append(mrr(ranked, truth, k=k_eval))
            return np.mean(rec), np.mean(ndcg), np.mean(gndcg), np.mean(rr)

        val_scores = evaluate(gt_val)
        test_scores = evaluate(gt_test)

    print(f"[{model_name}] Val  Recall@K:{val_scores[0]:.4f}  NDCG@K:{val_scores[1]:.4f}  GeoNDCG@K:{val_scores[2]:.4f}  MRR:{val_scores[3]:.4f}")
    print(f"[{model_name}] Test Recall@K:{test_scores[0]:.4f}  NDCG@K:{test_scores[1]:.4f}  GeoNDCG@K:{test_scores[2]:.4f}  MRR:{test_scores[3]:.4f}")
    return {"model": model_name, "val_recall":val_scores[0], "val_ndcg":val_scores[1], "val_geondcg":val_scores[2], "val_mrr":val_scores[3],
            "test_recall":test_scores[0], "test_ndcg":test_scores[1], "test_geondcg":test_scores[2], "test_mrr":test_scores[3]}

# Evaluate the wrapper models themselves: passing `.base` / `.lgcn` would score
# a plain LightGCN and silently leave out the geo term and the HGT fusion.
results = []
results.append(full_ranking_scores(lightgcn_model, "LightGCN"))
results.append(full_ranking_scores(lightgcl_model, "LightGCL (base view)"))
results.append(full_ranking_scores(lightgcl_geo_model, "LightGCL+Geo (inference uses geo)", geo_R=5.0))
results.append(full_ranking_scores(pinsage_model, "PinSage"))
results.append(full_ranking_scores(lghgt_model, "LightGCN+HGT (fused)"))
pd.DataFrame(results)


Rating Metrics (RMSE/MAE)

In [None]:
def _pairwise_dot(user_emb, item_emb, users, items):
    """Return ``user_emb[u] . item_emb[i]`` for each (u, i) pair as Python floats.

    ``users`` and ``items`` are equal-length sequences of integer row indices.
    All pairs are gathered and reduced in one batched operation.
    """
    u_idx = torch.as_tensor(users, dtype=torch.long, device=user_emb.device)
    i_idx = torch.as_tensor(items, dtype=torch.long, device=item_emb.device)
    return (user_emb[u_idx] * item_emb[i_idx]).sum(dim=-1).tolist()


def pointwise_eval(model, which="val"):
    """Compare per-interaction model scores with observed ratings.

    For every row of the chosen split the model's score for the (u, i) pair is
    computed (an embedding dot product, plus a distance-decay bonus for
    ``LightGCL_Geo``) and compared with the row's ``rating`` via the notebook's
    ``rmse`` / ``mae`` helpers.

    Args:
        model: one of LightGCN, LightGCL, LightGCL_Geo, PinSageRecommender or
            LightGCN_HGT.
        which: ``"val"`` selects ``val_df``; any other value selects ``test_df``.

    Returns:
        dict with keys ``"rmse"``, ``"mae"`` and ``"n"`` (number of rows
        scored). For PinSage, rows whose user has no training interactions are
        skipped, so ``n`` can be smaller than the split size.

    Raises:
        ValueError: if ``model`` is none of the supported types.
    """
    df_eval = val_df if which == "val" else test_df
    # Pull the columns out as plain Python ints/floats once. Iterating with
    # DataFrame.iterrows() upcasts integer ids to float when the frame is
    # all-numeric, and a float cannot be used to index an embedding tensor.
    users = df_eval["u"].astype("int64").tolist()
    items = df_eval["i"].astype("int64").tolist()
    ratings = df_eval["rating"].astype(float).tolist()

    with torch.no_grad():
        # Checked before LightGCL so the geo branch stays reachable even if
        # LightGCL_Geo is implemented as a subclass of LightGCL.
        if isinstance(model, LightGCL_Geo):
            ue, ie = model.base()
            preds = _pairwise_dot(ue, ie, users, items)
            # float() keeps the predictions plain Python floats even if
            # beta / R are stored as tensors or parameters.
            beta, radius = float(model.beta), float(model.R)
            for row, (u, i) in enumerate(zip(users, items)):
                uh = user_home.get(u)
                lati, loni = item_latlon[i]
                # Explicit None/len test instead of truthiness, which is
                # ambiguous when the home location is stored as an array.
                has_home = uh is not None and len(uh) == 2 and not any(np.isnan(uh))
                if has_home and not math.isnan(lati) and not math.isnan(loni):
                    d = haversine(uh, (lati, loni))
                    # add geo term: bonus decays exponentially with distance
                    preds[row] += beta * math.exp(-d / radius)
            trues = ratings
        elif isinstance(model, LightGCN):
            ue, ie = model()
            preds, trues = _pairwise_dot(ue, ie, users, items), ratings
        elif isinstance(model, LightGCL):
            ue, ie = model.base()
            preds, trues = _pairwise_dot(ue, ie, users, items), ratings
        elif isinstance(model, PinSageRecommender):
            item_x = data['item'].x
            ii = data['item','similar','item'].edge_index
            item_emb = model(item_x, ii)
            # Group the training items per user once instead of filtering
            # train_df again for every evaluated row.
            train_items = {
                int(u): grp.astype("int64").tolist()
                for u, grp in train_df.groupby("u")["i"]
            }
            user_vecs = {}
            preds, trues = [], []
            for u, i, rating in zip(users, items, ratings):
                pos = train_items.get(u)
                if not pos:
                    continue  # no training history -> no user vector
                if u not in user_vecs:
                    # approximate user vector by mean of their train items
                    pos_idx = torch.as_tensor(pos, dtype=torch.long, device=item_emb.device)
                    user_vecs[u] = item_emb[pos_idx].mean(0)
                preds.append(float((user_vecs[u] * item_emb[i]).sum().item()))
                trues.append(rating)
        elif isinstance(model, LightGCN_HGT):
            ue, ie = model.fused_item_emb()
            preds, trues = _pairwise_dot(ue, ie, users, items), ratings
        else:
            raise ValueError("unknown model")
    return {"rmse": rmse(preds,trues), "mae": mae(preds,trues), "n": len(trues)}

# Print the test-split rating error of each trained model, one line per model.
# Insertion order of the dict fixes the order of the printed lines.
rating_models = {
    "LightGCN": lightgcn_model,
    "LightGCL": lightgcl_model,
    "LightGCL+Geo": lightgcl_geo_model,
    "PinSage": pinsage_model,
    "LightGCN+HGT": lghgt_model,
}

for name, mdl in rating_models.items():
    pe = pointwise_eval(mdl, which="test")
    report_line = f"{name} rating RMSE={pe['rmse']:.4f} MAE={pe['mae']:.4f} (n={pe['n']})"
    print(report_line)


Visualization: Comparison Table

In [None]:
# Models whose rating metrics are attached to the ranking results. The columns
# are assigned positionally, so this order must match the order in which the
# rows were appended to `results`.
summary_models = [lightgcn_model, lightgcl_model, lightgcl_geo_model, pinsage_model, lghgt_model]

# Run the pointwise evaluation once per model and reuse it for both columns.
# Evaluating separately for rmse and mae doubles the cost and lets the two
# columns come from different forward passes.
summary_rating = [pointwise_eval(m, 'test') for m in summary_models]

summary = pd.DataFrame(results).assign(
    rmse=[pe["rmse"] for pe in summary_rating],
    mae=[pe["mae"] for pe in summary_rating],
)
summary = summary[["model","test_recall","test_ndcg","test_geondcg","test_mrr","rmse","mae"]]
summary.sort_values("test_ndcg", ascending=False)