# Environment and Import Preparation

In [None]:
!pip install torch torchvision torchaudio
!pip install torch-geometric
!pip install dgl # generic DGL (CPU/GPU autodetect)
!pip install torchmetrics==1.4.0.post0 scikit-learn pandas numpy tqdm geopy haversine

Collecting torch
  Downloading torch-2.9.1-cp312-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting torchvision
  Downloading torchvision-0.24.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.9 kB)
Collecting torchaudio
  Downloading torchaudio-2.9.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.9 kB)
Collecting filelock (from torch)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting setuptools (from torch)
  Using cached setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Downloading networkx-3.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=0.8.5 (from torch)
  Downloading fsspec-2025.12.0-

In [3]:
!pip install ipywidgets -q

In [None]:
import os, json, math, random, gc, time
from dataclasses import dataclass
from typing import Dict, Tuple, List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torch_geometric.data import HeteroData
from torch_geometric.utils import to_undirected, coalesce
from torch_geometric.nn import HGTConv, SAGEConv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from haversine import haversine

# Choose the compute device in order of preference: CUDA, then MPS, then CPU.
# The conditional chain is evaluated lazily, so the MPS backend is only queried
# when CUDA is not available.
DEVICE = torch.device(
    'cuda' if torch.cuda.is_available()
    else 'mps' if (torch.backends.mps.is_available() and torch.backends.mps.is_built())
    else 'cpu'
)

# Seed every random number generator the notebook imports with the same value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Additionally seed the backend-specific generator of the selected device.
if DEVICE.type == 'cuda':
    torch.cuda.manual_seed_all(SEED)
elif DEVICE.type == 'mps':
    torch.mps.manual_seed(SEED)

print("Device:", DEVICE)

True
True
Device: mps


# JSON Processing

In [5]:
import json

### Metadata

In [6]:
# Locations of the two input files, given relative to the working directory:
# the place metadata file and the review file opened by the loading cells.
metadata_path = "data/meta-Vermont.json"
review_path = "data/review-Vermont_10.json"

In [7]:
# Lower-case keywords that mark a place as food-and-drink related.
# Each keyword is listed exactly once (a set would silently drop repeats).
# NOTE(review): the filtering cell tests these as substrings of each category
# label, so short entries such as "bar", "pub", "tea" or "deli" also hit labels
# that merely contain them (e.g. "bar" is a substring of "barber") — confirm
# that this breadth is intended.
keywords = {
    "restaurant",
    "cafe",
    "bar",
    "beer",
    "pub",
    "brewery",
    "winery",
    "distillery",
    "brewpub",
    "deli",
    "sandwich",
    "coffee",
    "tea",
    "juice",
    "smoothie",
}

In [8]:
# Load the place metadata from a JSON Lines file (one JSON object per line).
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere and can mis-decode
# or reject non-ASCII text in the records.
with open(metadata_path, "r", encoding="utf-8") as f:
    # Strip each line first so blank or whitespace-only lines are skipped.
    metadata = [json.loads(line) for line in map(str.strip, f) if line]

print(f"Loaded {len(metadata)} records")
print(f"Type: {type(metadata)}")
if metadata:
    print(f"First record keys: {list(metadata[0].keys())}")

Loaded 11291 records
Type: <class 'list'>
First record keys: ['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude', 'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', 'relative_results', 'url']


In [9]:
# Display the second loaded metadata record to inspect its fields and value types.
metadata[1]

{'name': 'Foxglove Farm and Forest',
 'address': 'Foxglove Farm and Forest, 777 Delorm Rd, Leicester, VT 05733',
 'gmap_id': '0x4cb549e8877cf0d7:0xe8f003e6d73392ae',
 'description': None,
 'latitude': 43.855742899999996,
 'longitude': -73.08818,
 'category': ['Indoor lodging', 'Farm', 'Gift shop'],
 'avg_rating': 5,
 'num_of_reviews': 3,
 'price': None,
 'hours': None,
 'MISC': None,
 'state': None,
 'relative_results': None,
 'url': 'https://www.google.com/maps/place//data=!4m2!3m1!1s0x4cb549e8877cf0d7:0xe8f003e6d73392ae?authuser=-1&hl=en&gl=us'}

In [10]:
# Keep only the metadata entries for which at least one category label contains
# at least one keyword. Matching is a case-insensitive *substring* test.
# NOTE(review): substring matching means short keywords also match unrelated
# labels that merely contain them (e.g. "bar" inside "barber") — confirm intent.

# Lower-case the keywords once instead of once per (entry, category) pair.
keywords_lower = tuple(keyword.lower() for keyword in keywords)

filtered_metadata = [
    entry
    for entry in metadata
    if any(
        keyword in str(category).lower()
        # A missing or None 'category' is treated as "no categories".
        for category in (entry.get('category') or [])
        if category is not None
        for keyword in keywords_lower
    )
]

n_meta_total = len(metadata)
n_meta_kept = len(filtered_metadata)
n_meta_dropped = n_meta_total - n_meta_kept
# Guard the percentage so an empty metadata list reports 0.0% instead of
# raising ZeroDivisionError.
pct_meta_dropped = 100 * n_meta_dropped / n_meta_total if n_meta_total else 0.0

print(f"Original metadata: {n_meta_total} records")
print(f"Filtered metadata: {n_meta_kept} records")
print(f"Filtered {n_meta_dropped} records ({pct_meta_dropped:.1f}%)")

Original metadata: 11291 records
Filtered metadata: 2305 records
Filtered 8986 records (79.6%)


In [11]:
# Build a lookup from gmap_id to its metadata entry for the filtered places,
# plus the set of those ids for fast membership tests.
# Entries without a truthy gmap_id are skipped; if two entries share a gmap_id,
# the later one replaces the earlier one in the mapping.
gmap_id_to_metadata = {
    place['gmap_id']: place
    for place in filtered_metadata
    if place.get('gmap_id')
}
filtered_gmap_ids = set(gmap_id_to_metadata)

print(f"Created set of {len(filtered_gmap_ids)} unique gmap_ids from filtered_metadata")

Created set of 2304 unique gmap_ids from filtered_metadata


In [12]:
# Load the reviews from a JSON Lines file (one JSON object per line).
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere and can mis-decode
# or reject non-ASCII review text.
reviews = []
with open(review_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Loading reviews"):
        line = line.strip()
        if line:  # Skip empty lines
            reviews.append(json.loads(line))

print(f"Loaded {len(reviews)} reviews")
if reviews:
    print(f"First review keys: {list(reviews[0].keys())}")

Loading reviews: 0it [00:00, ?it/s]

Loading reviews: 324725it [00:00, 376331.32it/s]

Loaded 324725 reviews
First review keys: ['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id']





In [13]:
# Keep only the reviews whose gmap_id belongs to a filtered place, and enrich
# each kept review with the attributes of that place.
filtered_reviews = []

for review in tqdm(reviews, desc="Filtering and merging reviews"):
    gmap_id = review.get('gmap_id')

    # Skip reviews with no gmap_id or for places outside the filtered set.
    if not gmap_id or gmap_id not in filtered_gmap_ids:
        continue

    # Work on a shallow copy so the record in `reviews` is left untouched.
    merged_review = review.copy()

    # Add metadata attributes only where the review has no such key, so the
    # review's own values win on conflict. Metadata records and review records
    # both carry 'name' and 'gmap_id'; for those keys the review's value is kept
    # and the metadata value is not copied.
    for key, value in gmap_id_to_metadata[gmap_id].items():
        merged_review.setdefault(key, value)

    filtered_reviews.append(merged_review)

n_reviews_total = len(reviews)
n_reviews_kept = len(filtered_reviews)
n_reviews_dropped = n_reviews_total - n_reviews_kept
# Guard the percentage so an empty review list reports 0.0% instead of raising
# ZeroDivisionError.
pct_reviews_dropped = 100 * n_reviews_dropped / n_reviews_total if n_reviews_total else 0.0

print(f"Original reviews: {n_reviews_total}")
print(f"Filtered reviews: {n_reviews_kept}")
print(f"Filtered {n_reviews_dropped} reviews ({pct_reviews_dropped:.1f}%)")
if filtered_reviews:
    print(f"Sample filtered review keys: {list(filtered_reviews[0].keys())}")

Filtering and merging reviews: 100%|██████████| 324725/324725 [00:00<00:00, 569496.29it/s]

Original reviews: 324725
Filtered reviews: 157215
Filtered 167510 reviews (51.6%)
Sample filtered review keys: ['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id', 'address', 'description', 'latitude', 'longitude', 'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', 'relative_results', 'url']





In [14]:
# Display one merged review record (review fields plus the place attributes).
# NOTE(review): the saved output of this cell shows a reviewer's name and
# user_id; consider clearing it before sharing the notebook.
filtered_reviews[1]

{'user_id': '103949805645203359878',
 'name': 'Greg Carlton',
 'time': 1623279276514,
 'rating': 5,
 'text': 'Consistently hits the spot for a straightforward pizza. Staff is always friendly.',
 'pics': [{'url': ['https://lh5.googleusercontent.com/p/AF1QipM3-rDOfWTvsofRy9bygNpf-n46z3mHdgpGSdxW=w150-h150-k-no-p']}],
 'resp': None,
 'gmap_id': '0x4cca9d0401cd8345:0xfd0f1a365e865e14',
 'address': 'Rockers Pizzeria, 191 Main St, Vergennes, VT 05491',
 'description': None,
 'latitude': 44.167623299999995,
 'longitude': -73.2520566,
 'category': ['Restaurant'],
 'avg_rating': 4.4,
 'num_of_reviews': 38,
 'price': None,
 'hours': [['Wednesday', '11AM–8PM'],
  ['Thursday', '11AM–8PM'],
  ['Friday', '11AM–8PM'],
  ['Saturday', '11AM–8PM'],
  ['Sunday', 'Closed'],
  ['Monday', '11AM–8PM'],
  ['Tuesday', '11AM–8PM']],
 'MISC': {'Service options': ['Curbside pickup',
   'No-contact delivery',
   'Delivery',
   'Takeout'],
  'Health & safety': ['Staff required to disinfect surfaces between visits']

### Train-Val-Test Split



In [15]:
# Sort filtered_reviews by time to avoid data leakage: the splits below are
# contiguous slices of this ordering, so train precedes val precedes test.
# A missing *or* None 'time' is treated as 0 (sorted first) rather than raising
# TypeError when None is compared with a number.
filtered_reviews_sorted = sorted(filtered_reviews, key=lambda x: x.get('time') or 0)

# Add review_id field in ascending order (this writes into the review dicts
# themselves, which are the same objects held by filtered_reviews).
for idx, review in enumerate(filtered_reviews_sorted):
    review['review_id'] = idx

# Map review_id -> review text; the value is None when a review has no text.
review_id_to_text = {
    review['review_id']: review.get('text')
    for review in filtered_reviews_sorted
}

print(f"Sorted {len(filtered_reviews_sorted)} reviews by time")
if filtered_reviews_sorted:
    print(f"Earliest time: {filtered_reviews_sorted[0].get('time')}")
    print(f"Latest time: {filtered_reviews_sorted[-1].get('time')}")

# Split into 80:10:10 train:val:test
total = len(filtered_reviews_sorted)
train_size = int(0.8 * total)
val_size = int(0.1 * total)
test_size = total - train_size - val_size  # Remaining goes to test

train_reviews = filtered_reviews_sorted[:train_size]
val_reviews = filtered_reviews_sorted[train_size:train_size + val_size]
test_reviews = filtered_reviews_sorted[train_size + val_size:]


def _split_share(count):
    """Return `count` as a percentage of `total`, or 0.0 when there are no reviews."""
    return 100 * count / total if total else 0.0


print(f"\nSplit results:")
print(f"Train: {len(train_reviews)} reviews ({_split_share(len(train_reviews)):.1f}%)")
print(f"Val: {len(val_reviews)} reviews ({_split_share(len(val_reviews)):.1f}%)")
print(f"Test: {len(test_reviews)} reviews ({_split_share(len(test_reviews)):.1f}%)")

Sorted 157215 reviews by time
Earliest time: 1182960009677
Latest time: 1629671965744

Split results:
Train: 125772 reviews (80.0%)
Val: 15721 reviews (10.0%)
Test: 15722 reviews (10.0%)


In [16]:
# Display the last (most recent) review of the training slice.
# NOTE(review): the saved output of this cell shows a reviewer's name and
# user_id; consider clearing it before sharing the notebook.
train_reviews[-1]

{'user_id': '116185219049396177446',
 'name': 'Mary Santini',
 'time': 1577663323710,
 'rating': 4,
 'text': None,
 'pics': None,
 'resp': None,
 'gmap_id': '0x89e0247d3c5e0b3b:0x31f9c4d988f48ec7',
 'address': 'China Kitchen, 178 West St, Rutland, VT 05701',
 'description': None,
 'latitude': 43.6074285,
 'longitude': -72.9827102,
 'category': ['Chinese restaurant'],
 'avg_rating': 4.1,
 'num_of_reviews': 168,
 'price': '$',
 'hours': [['Friday', '11AM–10:30PM'],
  ['Saturday', '11AM–10:30PM'],
  ['Sunday', '12–10PM'],
  ['Monday', '11AM–10PM'],
  ['Tuesday', '11AM–10PM'],
  ['Wednesday', '11AM–10PM'],
  ['Thursday', '11AM–10PM']],
 'MISC': {'Service options': ['Takeout', 'Delivery', 'Dine-in'],
  'Health & safety': ['Mask required'],
  'Popular for': ['Lunch', 'Dinner', 'Solo dining'],
  'Accessibility': ['Wheelchair accessible entrance'],
  'Offerings': ['Comfort food',
   'Quick bite',
   'Small plates',
   'Vegetarian options'],
  'Amenities': ['Good for kids'],
  'Atmosphere': ['C

## Textual Embeddings

In [17]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.2.1-py3-none-any.whl.metadata (13 kB)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading regex-2025.11.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from tra

In [18]:
# Name of the pre-trained SentenceTransformer model used to embed review text.
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim

In [21]:
# Setup Sentence-BERT embeddings for each review
from sentence_transformers import SentenceTransformer

# Reuse the notebook-wide DEVICE (already chosen as CUDA > MPS > CPU) instead of
# repeating the detection cascade here, so the two can never disagree.
embedding_device = DEVICE.type

print(f"Using device: {embedding_device} for embeddings")

# Load the model on the specified device
model = SentenceTransformer(model_name, device=embedding_device)

# Collect review_ids and texts (only for reviews with non-None text); the two
# lists are index-aligned.
review_ids_with_text = [
    review_id for review_id, text in review_id_to_text.items() if text is not None
]
review_texts = [review_id_to_text[review_id] for review_id in review_ids_with_text]

print(f"Generating embeddings for {len(review_texts)} reviews with text (out of {len(filtered_reviews_sorted)} total)")

# Generate embeddings
embeddings = model.encode(
    review_texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,  # L2-normalize (often helpful)
)

# Map review_id -> embedding row. Only reviews with text get an entry, so
# reviews without text are absent from this dictionary.
review_id_to_embedding = dict(zip(review_ids_with_text, embeddings))

print(f"Created embeddings dictionary with {len(review_id_to_embedding)} entries")
print(f"Embedding shape: {embeddings.shape}")  # (num_reviews_with_text, 384) for MiniLM

Using device: mps for embeddings
Generating embeddings for 89108 reviews with text (out of 157215 total)


Batches: 100%|██████████| 1393/1393 [01:53<00:00, 12.22it/s]


Created embeddings dictionary with 89108 entries
Embedding shape: (89108, 384)


In [25]:
# Display the embedding stored for review_id 0.
# NOTE(review): the dictionary only holds reviews that have text, so this lookup
# raises KeyError if review 0 has none; it also prints the full vector.
review_id_to_embedding[0]

array([-4.16470431e-02, -2.51492974e-03,  3.40744965e-02, -4.88018356e-02,
       -7.00416192e-02,  3.29151675e-02,  3.72760780e-02,  4.48724860e-03,
       -1.68774407e-02, -3.24930460e-03,  2.73592230e-02, -3.28951180e-02,
        7.28193857e-03, -4.55975905e-02,  3.45369726e-02, -3.59975733e-02,
        1.77359402e-01, -1.38382256e-01,  1.84881110e-02, -8.38461667e-02,
       -1.00873746e-01, -2.55082585e-02,  2.55317986e-02,  1.46996872e-02,
        5.81086148e-03,  8.26392919e-02,  3.39584574e-02,  3.41941714e-02,
       -2.52507571e-02,  5.79340942e-03, -5.23107089e-02,  2.90929358e-02,
       -1.06076291e-02, -2.78808083e-02,  1.58691946e-02,  2.25422662e-02,
        1.39843747e-01, -2.03393698e-02,  5.25584593e-02,  5.49879298e-02,
       -5.81298284e-02, -1.11694285e-03,  5.79708517e-02, -3.58999409e-02,
       -3.75791714e-02, -2.35325247e-02, -1.51160164e-02,  2.73245927e-02,
        3.87939736e-02, -2.75379755e-02,  2.70045567e-02,  1.27133112e-02,
        7.15237111e-02, -