In [7]:
import os, math, time, random, sys, platform
import numpy as np
import pandas as pd
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
device = (torch.device("mps") if torch.backends.mps.is_available()
          else (torch.device("cuda") if torch.cuda.is_available()
                else torch.device("cpu")))
print("device:", device)

device: mps


In [15]:
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

from torch_geometric.data import HeteroData
from torch_geometric.nn import HeteroConv, GCNConv, Linear

In [16]:
def read_json_in_chunks(file_path, chunk_size=10000):
    chunks, chunk = [], []
    with open(file_path, 'r') as f:
        for i, line in enumerate(f):
            chunk.append(line)
            if (i + 1) % chunk_size == 0:
                chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
        if chunk:
            chunks.append(pd.read_json('\n'.join(chunk), lines=True))
    return pd.concat(chunks, ignore_index=True)

reviews_path = "../data/processed/sf-sampled-reviews.json"
items_path   = "../data/processed/sf-restaurants.json"

reviews_df   = read_json_in_chunks(reviews_path)
items_df     = read_json_in_chunks(items_path)

assert {'user_id','gmap_id','time'}.issubset(reviews_df.columns)
assert {'gmap_id','avg_rating','num_of_reviews','price'}.issubset(items_df.columns)
print(reviews_df.shape, items_df.shape)

  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chunk = []
  chunks.append(pd.read_json('\n'.join(chunk), lines=True)); chu

(411496, 6) (3721, 15)


In [17]:
def chrono_split_per_user(df, train_ratio=0.8, min_hist=3):
    df = df.sort_values(by=['user_id','time'])
    train, test = [], []
    for u, g in df.groupby('user_id', sort=False):
        if len(g) < min_hist: 
            continue
        t = int(len(g) * train_ratio)
        if t == 0 or len(g) - t == 0:
            continue
        train.append(g.iloc[:t]); test.append(g.iloc[t:])
    return pd.concat(train), pd.concat(test)

train_df, test_df = chrono_split_per_user(reviews_df, 0.8, 3)
print("train:", train_df.shape, "test:", test_df.shape)

train: (224017, 6) test: (75747, 6)


In [18]:
## ID encoding (fit on TRAIN only) + cold-start filtering for TEST
le_user = LabelEncoder()
le_item = LabelEncoder()

train_df["user_idx"] = le_user.fit_transform(train_df["user_id"].values)
train_df["item_idx"] = le_item.fit_transform(train_df["gmap_id"].values)

seen_users = set(train_df["user_id"])
seen_items = set(train_df["gmap_id"])

test_df = test_df[test_df["user_id"].isin(seen_users) & test_df["gmap_id"].isin(seen_items)].copy()
test_df["user_idx"] = le_user.transform(test_df["user_id"].values)
test_df["item_idx"] = le_item.transform(test_df["gmap_id"].values)

n_users = train_df["user_idx"].nunique()
n_items = train_df["item_idx"].nunique()
n_users, n_items

(44335, 3708)

### Build Hetero Graph (users & restaurants)
Construct the core **user–restaurant graph** that the GNN will learn from.
- Each **user node** represents an individual reviewer (no explicit features yet; only an ID embedding).
- Each **restaurant node** includes a small numeric feature vector containing:
  - `avg_rating`: the restaurant’s average rating (quality signal)
  - `num_of_reviews`: number of reviews (popularity signal)
  - `price_level`: encoded from `$`, `$$`, etc. (cost signal)
- Each **edge** `(user → restaurant)` corresponds to a past review or interaction, with a reverse edge added for message passing (`restaurant → user`).
 
We use only these three features initially because they are **dense, reliable, and numeric**, allowing the model to focus on learning the **structural relationships** between users and restaurants.
 
Other fields (e.g. `category`, `description`, `latitude`, `MISC`) are intentionally excluded for now since they are sparse, text-heavy, or require additional preprocessing.  
Once the base GNN is stable, these richer attributes can be incrementally integrated either as additional node features or new node types (e.g. category, aspect, or location nodes).

In [21]:
# simple item features from metadata
def coerce_price(p):
    if pd.isna(p): 
        return 0
    return int(str(p).count('$'))

items_small = items_df[["gmap_id","avg_rating","num_of_reviews","price"]].copy()
items_small["price_level"] = items_small["price"].apply(coerce_price)
items_small = items_small.drop(columns=["price"]).fillna({"avg_rating":0.0,"num_of_reviews":0,"price_level":0})

# keep only items in train index space
items_small = items_small[items_small["gmap_id"].isin(seen_items)].copy()
items_small["item_idx"] = le_item.transform(items_small["gmap_id"].values)
items_small = items_small.set_index("item_idx").sort_index()

item_feats = torch.tensor(items_small[["avg_rating","num_of_reviews","price_level"]].values, dtype=torch.float)

# edges (train interactions)
ui = torch.tensor(train_df[["user_idx","item_idx"]].values.T, dtype=torch.long)

data = HeteroData()
data["user"].num_nodes = n_users
data["restaurant"].num_nodes = n_items
data["restaurant"].x = item_feats  # users: ID-emb only for v1

# both directions for message passing
data["user","interacts","restaurant"].edge_index = ui
data["restaurant","rev_interacts","user"].edge_index = ui.flip(0)

# ----- Aspect hooks (add later when ABSA aggregates are ready) -----
# data["aspect"].x = torch.eye(8)
# data["user","expresses","aspect"].edge_index = ua_edge_index
# data["aspect","rev_expresses","user"].edge_index = ua_edge_index.flip(0)
# data["restaurant","has","aspect"].edge_index = ia_edge_index
# data["aspect","rev_has","restaurant"].edge_index = ia_edge_index.flip(0)
# ---------------------------------------------------------------

data = data.to(device)
data

HeteroData(
  user={ num_nodes=44335 },
  restaurant={
    num_nodes=3708,
    x=[3708, 3],
  },
  (user, interacts, restaurant)={ edge_index=[2, 224017] },
  (restaurant, rev_interacts, user)={ edge_index=[2, 224017] }
)

In [20]:
train_pos = train_df.groupby('user_idx')['item_idx'].apply(set).to_dict()
test_pos  = test_df.groupby('user_idx')['item_idx'].apply(list).to_dict()
test_users = sorted(test_pos.keys())
len(train_pos), len(test_users)

(44335, 44304)

### Model: HetRecGNN (HeteroConv with GCNConv per relation)

In [None]:
class HetRecGNN(nn.Module):
    def __init__(self, n_users, n_items, hidden=64, layers=2, use_item_feats=True):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.use_item_feats = use_item_feats

        # ID embeddings
        self.user_emb = nn.Embedding(n_users, hidden)
        self.item_emb = nn.Embedding(n_items, hidden)
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.item_emb.weight)

        # fuse basic item metadata (optional)
        self.item_feat_mlp = None
        if use_item_feats and data["restaurant"].x is not None:
            in_dim = data["restaurant"].x.size(-1)
            self.item_feat_mlp = nn.Sequential(
                nn.Linear(in_dim, hidden), nn.ReLU(), nn.Linear(hidden, hidden)
            )

        # relation-specific convs
        self.convs = nn.ModuleList([
            HeteroConv({
                ("user","interacts","restaurant"):     GCNConv((-1, -1), hidden),
                ("restaurant","rev_interacts","user"): GCNConv((-1, -1), hidden),
                # add aspect relations here later:
                # ("user","expresses","aspect"):         GCNConv((-1,-1), hidden),
                # ("aspect","rev_expresses","user"):     GCNConv((-1,-1), hidden),
                # ("restaurant","has","aspect"):         GCNConv((-1,-1), hidden),
                # ("aspect","rev_has","restaurant"):     GCNConv((-1,-1), hidden),
            }, aggr="sum")
            for _ in range(layers)
        ])

    def forward(self, data: HeteroData):
        x = {
            "user": self.user_emb.weight,
            "restaurant": self.item_emb.weight
        }
        if self.item_feat_mlp is not None:
            x["restaurant"] = x["restaurant"] + self.item_feat_mlp(data["restaurant"].x)

        for conv in self.convs:
            x = conv(x, data.edge_index_dict)
            x = {k: F.relu(v) for k, v in x.items()}
        return x["user"], x["restaurant"]

    @staticmethod
    def dot_predict(user_z, item_z, pairs):
        u = user_z[pairs[0]]; i = item_z[pairs[1]]
        return (u * i).sum(dim=1)