Environment and Import Preparation

In [1]:
!pip install torch torchvision torchaudio -q
!pip install torch-geometric -q
!pip install dgl -q  # generic DGL (CPU/GPU autodetect)
!pip install torchmetrics==1.4.0.post0 scikit-learn pandas numpy tqdm geopy haversine -q

In [21]:
import os, json, math, random, gc, time
from dataclasses import dataclass
from typing import Dict, Tuple, List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torch_geometric.data import HeteroData
from torch_geometric.utils import to_undirected, coalesce
from torch_geometric.nn import HGTConv, SAGEConv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from haversine import haversine

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED);
if DEVICE.type == 'cuda':
    torch.cuda.manual_seed_all(SEED)

print("Device:", DEVICE)

Device: cpu


# JSON Processing

In [22]:
import json

### Metadata

In [36]:
metadata_path = "meta-Vermont.json"
review_path = "review-Vermont_10.json"

In [27]:
keywords = {
    "restaurant",
    "cafe",
    "bar",
    "beer",
    "pub",
    "brewery",
    "winery",
    "distillery",
    "brewpub",
    "brewery",
    "brewpub",
    "deli",
    "sandwich",
    "coffee",
    "tea",
    "juice",
    "smoothie"
}

In [28]:
# Load JSONL file (one JSON object per line)
metadata = []
with open(metadata_path, "r") as f:
    for line in f:
        line = line.strip()
        if line:  # Skip empty lines
            metadata.append(json.loads(line))

print(f"Loaded {len(metadata)} records")
print(f"Type: {type(metadata)}")
if metadata:
    print(f"First record keys: {list(metadata[0].keys())}")

Loaded 11291 records
Type: <class 'list'>
First record keys: ['name', 'address', 'gmap_id', 'description', 'latitude', 'longitude', 'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', 'relative_results', 'url']


In [29]:
metadata[1]

{'name': 'Foxglove Farm and Forest',
 'address': 'Foxglove Farm and Forest, 777 Delorm Rd, Leicester, VT 05733',
 'gmap_id': '0x4cb549e8877cf0d7:0xe8f003e6d73392ae',
 'description': None,
 'latitude': 43.855742899999996,
 'longitude': -73.08818,
 'category': ['Indoor lodging', 'Farm', 'Gift shop'],
 'avg_rating': 5,
 'num_of_reviews': 3,
 'price': None,
 'hours': None,
 'MISC': None,
 'state': None,
 'relative_results': None,
 'url': 'https://www.google.com/maps/place//data=!4m2!3m1!1s0x4cb549e8877cf0d7:0xe8f003e6d73392ae?authuser=-1&hl=en&gl=us'}

In [31]:
# Filter metadata to only include entries with at least one keyword in categories
filtered_metadata = []

for entry in metadata:
    categories = entry.get('category', [])
    if categories is None:
        continue
    
    # Check if any keyword appears in any category string (case-insensitive)
    matches = False
    for category in categories:
        if category is None:
            continue
        category_lower = str(category).lower()
        for keyword in keywords:
            if keyword.lower() in category_lower:
                matches = True
                break
        if matches:
            break
    
    if matches:
        filtered_metadata.append(entry)

print(f"Original metadata: {len(metadata)} records")
print(f"Filtered metadata: {len(filtered_metadata)} records")
print(f"Filtered {len(metadata) - len(filtered_metadata)} records ({100 * (len(metadata) - len(filtered_metadata)) / len(metadata):.1f}%)")

Original metadata: 11291 records
Filtered metadata: 2305 records
Filtered 8986 records (79.6%)


In [32]:
# Pre-process filtered_metadata: create set of gmap_ids and mapping to metadata entries
filtered_gmap_ids = set()
gmap_id_to_metadata = {}

for entry in filtered_metadata:
    gmap_id = entry.get('gmap_id')
    if gmap_id:
        filtered_gmap_ids.add(gmap_id)
        gmap_id_to_metadata[gmap_id] = entry

print(f"Created set of {len(filtered_gmap_ids)} unique gmap_ids from filtered_metadata")

Created set of 2304 unique gmap_ids from filtered_metadata


In [37]:
# Load reviews from JSONL file
reviews = []
with open(review_path, "r") as f:
    for line in tqdm(f, desc="Loading reviews"):
        line = line.strip()
        if line:  # Skip empty lines
            reviews.append(json.loads(line))

print(f"Loaded {len(reviews)} reviews")
if reviews:
    print(f"First review keys: {list(reviews[0].keys())}")

Loading reviews: 324725it [00:00, 372355.23it/s]

Loaded 324725 reviews
First review keys: ['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id']





In [38]:
# Filter reviews and merge metadata attributes
filtered_reviews = []

for review in tqdm(reviews, desc="Filtering and merging reviews"):
    gmap_id = review.get('gmap_id')
    
    # Only keep reviews with gmap_id in filtered_metadata
    if gmap_id and gmap_id in filtered_gmap_ids:
        # Create a copy of the review to avoid modifying the original
        merged_review = review.copy()
        
        # Get the corresponding metadata entry
        metadata_entry = gmap_id_to_metadata[gmap_id]
        
        # Append all attributes from metadata to the review
        # Use update to merge, which will overwrite if keys conflict (review takes precedence)
        for key, value in metadata_entry.items():
            if key not in merged_review:  # Only add if not already in review
                merged_review[key] = value
            else:
                # If key exists, you might want to prefix it or handle differently
                # For now, we'll keep the review's original value
                pass
        
        filtered_reviews.append(merged_review)

print(f"Original reviews: {len(reviews)}")
print(f"Filtered reviews: {len(filtered_reviews)}")
print(f"Filtered {len(reviews) - len(filtered_reviews)} reviews ({100 * (len(reviews) - len(filtered_reviews)) / len(reviews):.1f}%)")
if filtered_reviews:
    print(f"Sample filtered review keys: {list(filtered_reviews[0].keys())}")

Filtering and merging reviews: 100%|██████████| 324725/324725 [00:00<00:00, 447365.37it/s] 

Original reviews: 324725
Filtered reviews: 157215
Filtered 167510 reviews (51.6%)
Sample filtered review keys: ['user_id', 'name', 'time', 'rating', 'text', 'pics', 'resp', 'gmap_id', 'address', 'description', 'latitude', 'longitude', 'category', 'avg_rating', 'num_of_reviews', 'price', 'hours', 'MISC', 'state', 'relative_results', 'url']





In [39]:
filtered_reviews[1]

{'user_id': '103949805645203359878',
 'name': 'Greg Carlton',
 'time': 1623279276514,
 'rating': 5,
 'text': 'Consistently hits the spot for a straightforward pizza. Staff is always friendly.',
 'pics': [{'url': ['https://lh5.googleusercontent.com/p/AF1QipM3-rDOfWTvsofRy9bygNpf-n46z3mHdgpGSdxW=w150-h150-k-no-p']}],
 'resp': None,
 'gmap_id': '0x4cca9d0401cd8345:0xfd0f1a365e865e14',
 'address': 'Rockers Pizzeria, 191 Main St, Vergennes, VT 05491',
 'description': None,
 'latitude': 44.167623299999995,
 'longitude': -73.2520566,
 'category': ['Restaurant'],
 'avg_rating': 4.4,
 'num_of_reviews': 38,
 'price': None,
 'hours': [['Wednesday', '11AM–8PM'],
  ['Thursday', '11AM–8PM'],
  ['Friday', '11AM–8PM'],
  ['Saturday', '11AM–8PM'],
  ['Sunday', 'Closed'],
  ['Monday', '11AM–8PM'],
  ['Tuesday', '11AM–8PM']],
 'MISC': {'Service options': ['Curbside pickup',
   'No-contact delivery',
   'Delivery',
   'Takeout'],
  'Health & safety': ['Staff required to disinfect surfaces between visits']

### Train-Val-Test Split



In [40]:
# Sort filtered_reviews by time to avoid data leakage
filtered_reviews_sorted = sorted(filtered_reviews, key=lambda x: x.get('time', 0))

print(f"Sorted {len(filtered_reviews_sorted)} reviews by time")
if filtered_reviews_sorted:
    print(f"Earliest time: {filtered_reviews_sorted[0].get('time')}")
    print(f"Latest time: {filtered_reviews_sorted[-1].get('time')}")

# Split into 80:10:10 train:val:test
total = len(filtered_reviews_sorted)
train_size = int(0.8 * total)
val_size = int(0.1 * total)
test_size = total - train_size - val_size  # Remaining goes to test

train_reviews = filtered_reviews_sorted[:train_size]
val_reviews = filtered_reviews_sorted[train_size:train_size + val_size]
test_reviews = filtered_reviews_sorted[train_size + val_size:]

print(f"\nSplit results:")
print(f"Train: {len(train_reviews)} reviews ({100 * len(train_reviews) / total:.1f}%)")
print(f"Val: {len(val_reviews)} reviews ({100 * len(val_reviews) / total:.1f}%)")
print(f"Test: {len(test_reviews)} reviews ({100 * len(test_reviews) / total:.1f}%)")

Sorted 157215 reviews by time
Earliest time: 1182960009677
Latest time: 1629671965744

Split results:
Train: 125772 reviews (80.0%)
Val: 15721 reviews (10.0%)
Test: 15722 reviews (10.0%)


In [41]:
train_reviews[-1]

{'user_id': '116185219049396177446',
 'name': 'Mary Santini',
 'time': 1577663323710,
 'rating': 4,
 'text': None,
 'pics': None,
 'resp': None,
 'gmap_id': '0x89e0247d3c5e0b3b:0x31f9c4d988f48ec7',
 'address': 'China Kitchen, 178 West St, Rutland, VT 05701',
 'description': None,
 'latitude': 43.6074285,
 'longitude': -72.9827102,
 'category': ['Chinese restaurant'],
 'avg_rating': 4.1,
 'num_of_reviews': 168,
 'price': '$',
 'hours': [['Friday', '11AM–10:30PM'],
  ['Saturday', '11AM–10:30PM'],
  ['Sunday', '12–10PM'],
  ['Monday', '11AM–10PM'],
  ['Tuesday', '11AM–10PM'],
  ['Wednesday', '11AM–10PM'],
  ['Thursday', '11AM–10PM']],
 'MISC': {'Service options': ['Takeout', 'Delivery', 'Dine-in'],
  'Health & safety': ['Mask required'],
  'Popular for': ['Lunch', 'Dinner', 'Solo dining'],
  'Accessibility': ['Wheelchair accessible entrance'],
  'Offerings': ['Comfort food',
   'Quick bite',
   'Small plates',
   'Vegetarian options'],
  'Amenities': ['Good for kids'],
  'Atmosphere': ['C

In [43]:
val_reviews[-1]

{'user_id': '112219725312542091926',
 'name': 'Chris',
 'time': 1598254522776,
 'rating': 4,
 'text': None,
 'pics': None,
 'resp': None,
 'gmap_id': '0x4cca6fc65d72c235:0xd5599b65c5d0ae99',
 'address': "McDonald's, 44 S Park Dr, Colchester, VT 05446",
 'description': 'Classic, long-running fast-food chain known for its burgers, fries & shakes.',
 'latitude': 44.503427099999996,
 'longitude': -73.1810126,
 'category': ['Fast food restaurant',
  'Breakfast restaurant',
  'Coffee shop',
  'Hamburger restaurant',
  'Restaurant',
  'Sandwich shop'],
 'avg_rating': 3.5,
 'num_of_reviews': 658,
 'price': '$',
 'hours': [['Saturday', '5AM–12AM'],
  ['Sunday', '5AM–12AM'],
  ['Monday', '5AM–12AM'],
  ['Tuesday', '5AM–12AM'],
  ['Wednesday', '5AM–12AM'],
  ['Thursday', '5AM–12AM'],
  ['Friday', '5AM–12AM']],
 'MISC': {'Service options': ['Curbside pickup',
   'No-contact delivery',
   'Delivery',
   'Drive-through',
   'Takeout',
   'Dine-in'],
  'Highlights': ['Fast service', 'Great coffee'],


In [44]:
val_reviews[1]

{'user_id': '101883984291248544171',
 'name': 'Micah Raymond',
 'time': 1577664489275,
 'rating': 5,
 'text': None,
 'pics': None,
 'resp': None,
 'gmap_id': '0x4cca9d9cccf4bd75:0xa2dee32ccba80a',
 'address': '3 Squares Cafe, 141 Main St, Vergennes, VT 05491',
 'description': 'Inventive American dishes with a local focus are presented in a vintage dining room & patio.',
 'latitude': 44.1680377,
 'longitude': -73.2510102,
 'category': ['American restaurant', 'Cafe'],
 'avg_rating': 4.5,
 'num_of_reviews': 468,
 'price': '$$',
 'hours': [['Tuesday', '9AM–7PM'],
  ['Wednesday', '9AM–7PM'],
  ['Thursday', '9AM–7PM'],
  ['Friday', '9AM–7PM'],
  ['Saturday', '9AM–7PM'],
  ['Sunday', '9AM–7PM'],
  ['Monday', '9AM–7PM']],
 'MISC': {'Service options': ['Outdoor seating',
   'Curbside pickup',
   'Takeout',
   'Dine-in',
   'Delivery'],
  'Health & safety': ['Mask required',
   'Staff wear masks',
   'Staff get temperature checks',
   'Staff required to disinfect surfaces between visits'],
  'Hi