Environment and Import Preparation

In [1]:
!pip install torch torchvision torchaudio
!pip install torch-geometric
!pip install dgl # generic DGL (CPU/GPU autodetect)
!pip install torchmetrics==1.4.0.post0 scikit-learn pandas numpy tqdm geopy haversine



In [2]:
!pip install ipywidgets -q

In [3]:
import os, json, math, random, gc, time
from dataclasses import dataclass
from typing import Dict, Tuple, List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torch_geometric.data import HeteroData
from torch_geometric.utils import to_undirected, coalesce
from torch_geometric.nn import HGTConv, SAGEConv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from haversine import haversine

# Set device: CUDA > MPS > CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED);
if DEVICE.type == 'cuda':
    torch.cuda.manual_seed_all(SEED)
if DEVICE.type == 'mps':
    torch.mps.manual_seed(SEED)

print("Device:", DEVICE)

Device: mps


# JSON Processing

In [4]:
import json

### Metadata

In [9]:
metadata_path = "meta-District_of_Columbia.json"
review_path = "review-District_of_Columbia_10.json"

In [10]:
keywords = {
    "restaurant",
    "cafe",
    "bar",
    "beer",
    "pub",
    "brewery",
    "winery",
    "distillery",
    "brewpub",
    "brewery",
    "brewpub",
    "deli",
    "sandwich",
    "coffee",
    "tea",
    "juice",
    "smoothie"
}

In [11]:
# Load JSONL file (one JSON object per line)
metadata = []
with open(metadata_path, "r") as f:
    for line in f:
        line = line.strip()
        if line:  # Skip empty lines
            metadata.append(json.loads(line))

print(f"Loaded {len(metadata)} records")
print(f"Type: {type(metadata)}")
if metadata:
    print(f"First record keys: {list(metadata[0].keys())}")

Loaded 11060 records
Type: <class 'list'>
First record keys: ['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude', 'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', 'relative_results', 'url']


In [12]:
metadata[1]

{'name': "Joseph's Barbershop",
 'address': "Joseph's Barbershop, 2624B Georgia Ave NW, Washington, DC 20001",
 'gmap_id': '0x89b7b797548dfcfd:0xe3a4b60261c60313',
 'description': None,
 'latitude': 38.9249134,
 'longitude': -77.0228857,
 'category': ['Barber shop'],
 'avg_rating': 4.3,
 'num_of_reviews': 8,
 'price': None,
 'hours': [['Thursday', '10AM–7PM'],
  ['Friday', '10AM–7PM'],
  ['Saturday', '10AM–7PM'],
  ['Sunday', 'Closed'],
  ['Monday', '10AM–7PM'],
  ['Tuesday', '10AM–7PM'],
  ['Wednesday', '10AM–7PM']],
 'MISC': None,
 'state': 'Permanently closed',
 'relative_results': ['0x89b7c81aec442c2f:0x2df3cce722072454',
  '0x89b7c86180bf7b99:0xf9c20a51d787e071'],
 'url': 'https://www.google.com/maps/place//data=!4m2!3m1!1s0x89b7b797548dfcfd:0xe3a4b60261c60313?authuser=-1&hl=en&gl=us'}

In [13]:
# Filter metadata to only include entries with at least one keyword in categories
filtered_metadata = []

for entry in metadata:
    categories = entry.get('category', [])
    if categories is None:
        continue
    
    # Check if any keyword appears in any category string (case-insensitive)
    matches = False
    for category in categories:
        if category is None:
            continue
        category_lower = str(category).lower()
        for keyword in keywords:
            if keyword.lower() in category_lower:
                matches = True
                break
        if matches:
            break
    
    if matches:
        filtered_metadata.append(entry)

print(f"Original metadata: {len(metadata)} records")
print(f"Filtered metadata: {len(filtered_metadata)} records")
print(f"Filtered {len(metadata) - len(filtered_metadata)} records ({100 * (len(metadata) - len(filtered_metadata)) / len(metadata):.1f}%)")

Original metadata: 11060 records
Filtered metadata: 3719 records
Filtered 7341 records (66.4%)


In [14]:
# Pre-process filtered_metadata: create set of gmap_ids and mapping to metadata entries
filtered_gmap_ids = set()
gmap_id_to_metadata = {}

for entry in filtered_metadata:
    gmap_id = entry.get('gmap_id')
    if gmap_id:
        filtered_gmap_ids.add(gmap_id)
        gmap_id_to_metadata[gmap_id] = entry

print(f"Created set of {len(filtered_gmap_ids)} unique gmap_ids from filtered_metadata")

Created set of 3707 unique gmap_ids from filtered_metadata


In [15]:
# Load reviews from JSONL file
reviews = []
with open(review_path, "r") as f:
    for line in tqdm(f, desc="Loading reviews"):
        line = line.strip()
        if line:  # Skip empty lines
            reviews.append(json.loads(line))

print(f"Loaded {len(reviews)} reviews")
if reviews:
    print(f"First review keys: {list(reviews[0].keys())}")

Loading reviews: 564783it [00:02, 220773.50it/s]

Loaded 564783 reviews
First review keys: ['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id']





In [16]:
# Filter reviews and merge metadata attributes
filtered_reviews = []

for review in tqdm(reviews, desc="Filtering and merging reviews"):
    gmap_id = review.get('gmap_id')
    
    # Only keep reviews with gmap_id in filtered_metadata
    if gmap_id and gmap_id in filtered_gmap_ids:
        # Create a copy of the review to avoid modifying the original
        merged_review = review.copy()
        
        # Get the corresponding metadata entry
        metadata_entry = gmap_id_to_metadata[gmap_id]
        
        # Append all attributes from metadata to the review
        # Use update to merge, which will overwrite if keys conflict (review takes precedence)
        for key, value in metadata_entry.items():
            if key not in merged_review:  # Only add if not already in review
                merged_review[key] = value
            else:
                # If key exists, you might want to prefix it or handle differently
                # For now, we'll keep the review's original value
                pass
        
        filtered_reviews.append(merged_review)

print(f"Original reviews: {len(reviews)}")
print(f"Filtered reviews: {len(filtered_reviews)}")
print(f"Filtered {len(reviews) - len(filtered_reviews)} reviews ({100 * (len(reviews) - len(filtered_reviews)) / len(reviews):.1f}%)")
if filtered_reviews:
    print(f"Sample filtered review keys: {list(filtered_reviews[0].keys())}")

Filtering and merging reviews: 100%|██████████| 564783/564783 [00:01<00:00, 406886.91it/s]

Original reviews: 564783
Filtered reviews: 347984
Filtered 216799 reviews (38.4%)
Sample filtered review keys: ['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id', 'address', 'description', 'latitude', 'longitude', 'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', 'relative_results', 'url']





In [17]:
filtered_reviews[1]

{'user_id': '108642560086289718425',
 'name': 'Keshava Mysore',
 'time': 1510920735052,
 'rating': 4,
 'text': 'High prices for not so big portions! But I think it is the only decent food available on the National Mall area without walking a few miles away from there.',
 'pics': None,
 'resp': None,
 'gmap_id': '0x89b7b7851b06ef6b:0x5f356b1eb1da27',
 'address': 'Cascade Café, 599 Constitution Ave. NW, Washington, DC 20565',
 'description': 'Cafeteria-style dining at the National Gallery of Art, with casual fare such as pizza & sandwiches.',
 'latitude': 38.8920767,
 'longitude': -77.0199082,
 'category': ['American restaurant'],
 'avg_rating': 2.6,
 'num_of_reviews': 28,
 'price': None,
 'hours': [['Thursday', '11AM–3PM'],
  ['Friday', '11AM–3PM'],
  ['Saturday', '11AM–3PM'],
  ['Sunday', '11AM–4PM'],
  ['Monday', '11AM–3PM'],
  ['Tuesday', '11AM–3PM'],
  ['Wednesday', '11AM–3PM']],
 'MISC': {'Service options': ['Takeout', 'Dine-in', 'Delivery'],
  'Highlights': ['Great coffee'],
  'Po

### Train-Val-Test Split



In [18]:
# Sort filtered_reviews by time to avoid data leakage
filtered_reviews_sorted = sorted(filtered_reviews, key=lambda x: x.get('time', 0))

# Add review_id field in ascending order
for idx, review in enumerate(filtered_reviews_sorted):
    review['review_id'] = idx

# Extract text field into a dictionary from review_id to review text
review_id_to_text = {}
for review in filtered_reviews_sorted:
    review_id = review['review_id']
    text = review.get('text')
    review_id_to_text[review_id] = text  # Will be None if text is None

print(f"Sorted {len(filtered_reviews_sorted)} reviews by time")
if filtered_reviews_sorted:
    print(f"Earliest time: {filtered_reviews_sorted[0].get('time')}")
    print(f"Latest time: {filtered_reviews_sorted[-1].get('time')}")

# Split into 80:10:10 train:val:test
total = len(filtered_reviews_sorted)
train_size = int(0.8 * total)
val_size = int(0.1 * total)
test_size = total - train_size - val_size  # Remaining goes to test

train_reviews = filtered_reviews_sorted[:train_size]
val_reviews = filtered_reviews_sorted[train_size:train_size + val_size]
test_reviews = filtered_reviews_sorted[train_size + val_size:]

print(f"\nSplit results:")
print(f"Train: {len(train_reviews)} reviews ({100 * len(train_reviews) / total:.1f}%)")
print(f"Val: {len(val_reviews)} reviews ({100 * len(val_reviews) / total:.1f}%)")
print(f"Test: {len(test_reviews)} reviews ({100 * len(test_reviews) / total:.1f}%)")

Sorted 347984 reviews by time
Earliest time: 662601600000
Latest time: 1630966701405

Split results:
Train: 278387 reviews (80.0%)
Val: 34798 reviews (10.0%)
Test: 34799 reviews (10.0%)


In [19]:
train_reviews[-1]

{'user_id': '114719101114449289199',
 'name': 'Alejandro Suarez',
 'time': 1571933437638,
 'rating': 5,
 'text': None,
 'pics': None,
 'resp': None,
 'gmap_id': '0x89b7b82e9ef154c3:0xa310bf3ea7d34a66',
 'address': 'Capitol Lounge, 229 Pennsylvania Ave. SE, Washington, DC 20003',
 'description': 'Neighborhood fixture drawing lots of Hill staffers with a lengthy beer list & happy-hour specials.',
 'latitude': 38.8869652,
 'longitude': -77.0024373,
 'category': ['Bar',
  'American restaurant',
  'Beer hall',
  'Event venue',
  'Pool hall',
  'Sports bar'],
 'avg_rating': 4.2,
 'num_of_reviews': 458,
 'price': '$',
 'hours': [['Sunday', '10AM–2AM'],
  ['Monday', '4PM–2AM'],
  ['Tuesday', '4PM–2AM'],
  ['Wednesday', '4PM–2AM'],
  ['Thursday', '11AM–2AM'],
  ['Friday', '11AM–2AM'],
  ['Saturday', '10AM–3AM']],
 'MISC': {'Service options': ['Delivery', 'Takeout', 'Dine-in'],
  'Accessibility': ['Wheelchair accessible entrance',
   'Wheelchair accessible seating'],
  'Offerings': ['Alcohol',
 

In [None]:
import json
import pandas as pd

train_df = pd.json_normalize(train_reviews)
train_df.to_csv("train_reviews.csv", index=False, escapechar="\\")

  train = train.applymap(


In [21]:
val_df = pd.json_normalize(val_reviews)

val = val_df.copy()
val = val.applymap(
    lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x
)

val.to_csv("val_reviews.csv", index=False, escapechar="\\")

  val = val.applymap(


In [22]:
test_df = pd.json_normalize(test_reviews)

test = test_df.copy()
test = test.applymap(
    lambda x: json.dumps(x) if isinstance(x, (list, dict)) else x
)

test.to_csv("test_reviews.csv", index=False, escapechar="\\")

  test = test.applymap(


## Textual Embeddings

In [17]:
!pip install sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.2.1-py3-none-any.whl.metadata (13 kB)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading regex-2025.11.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from tra

In [18]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim

In [21]:
# Setup Sentence-BERT embeddings for each review
from sentence_transformers import SentenceTransformer

# Determine device for SentenceTransformer (CUDA > MPS > CPU)
if torch.cuda.is_available():
    embedding_device = 'cuda'
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    embedding_device = 'mps'
else:
    embedding_device = 'cpu'

print(f"Using device: {embedding_device} for embeddings")

# Load the model on the specified device
model = SentenceTransformer(model_name, device=embedding_device)

# Collect review_ids and texts (only for reviews with non-None text)
review_ids_with_text = []
review_texts = []

for review_id, text in review_id_to_text.items():
    if text is not None:
        review_ids_with_text.append(review_id)
        review_texts.append(text)

print(f"Generating embeddings for {len(review_texts)} reviews with text (out of {len(filtered_reviews_sorted)} total)")

# Generate embeddings
embeddings = model.encode(
    review_texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,  # L2-normalize (often helpful)
)

# Create dictionary mapping review_id to embeddings (only for reviews with text)
review_id_to_embedding = {}
for idx, review_id in enumerate(review_ids_with_text):
    review_id_to_embedding[review_id] = embeddings[idx]

print(f"Created embeddings dictionary with {len(review_id_to_embedding)} entries")
print(f"Embedding shape: {embeddings.shape}")  # (num_reviews_with_text, 384) for MiniLM

Using device: mps for embeddings
Generating embeddings for 89108 reviews with text (out of 157215 total)


Batches: 100%|██████████| 1393/1393 [01:53<00:00, 12.22it/s]


Created embeddings dictionary with 89108 entries
Embedding shape: (89108, 384)


In [25]:
review_id_to_embedding[0]

array([-4.16470431e-02, -2.51492974e-03,  3.40744965e-02, -4.88018356e-02,
       -7.00416192e-02,  3.29151675e-02,  3.72760780e-02,  4.48724860e-03,
       -1.68774407e-02, -3.24930460e-03,  2.73592230e-02, -3.28951180e-02,
        7.28193857e-03, -4.55975905e-02,  3.45369726e-02, -3.59975733e-02,
        1.77359402e-01, -1.38382256e-01,  1.84881110e-02, -8.38461667e-02,
       -1.00873746e-01, -2.55082585e-02,  2.55317986e-02,  1.46996872e-02,
        5.81086148e-03,  8.26392919e-02,  3.39584574e-02,  3.41941714e-02,
       -2.52507571e-02,  5.79340942e-03, -5.23107089e-02,  2.90929358e-02,
       -1.06076291e-02, -2.78808083e-02,  1.58691946e-02,  2.25422662e-02,
        1.39843747e-01, -2.03393698e-02,  5.25584593e-02,  5.49879298e-02,
       -5.81298284e-02, -1.11694285e-03,  5.79708517e-02, -3.58999409e-02,
       -3.75791714e-02, -2.35325247e-02, -1.51160164e-02,  2.73245927e-02,
        3.87939736e-02, -2.75379755e-02,  2.70045567e-02,  1.27133112e-02,
        7.15237111e-02, -