# Notebook 5: Web Data Preparation
## Shade-Optimized Pedestrian Routing to Transit

**Author:** Kavana Raju  
**Course:** MUSA 5500 - Geospatial Data Science with Python  
**Date:** December 2025

---

This notebook prepares data for the interactive web application:
1. Load processed network data
2. Convert to web-friendly JSON format
3. Optimize for browser loading
4. Create data files for React app
5. Validate output

## Setup & Imports

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import json
from pathlib import Path
import warnings
# NOTE(review): with no category/module arguments this suppresses every
# warning for the rest of the session, including deprecation and CRS-related
# warnings from the geospatial stack — confirm that is intended.
warnings.filterwarnings('ignore')

print("✓ Imports successful")

## 1. Load Processed Data

In [None]:
print("Loading processed data from previous notebooks...\n")

# Street-network edges; shade scores are stored one column per scenario,
# named shade_<scenario> (from Notebook 2)
edges = gpd.read_file('data/processed/network_edges_with_shade.geojson')
print(f"✓ Edges loaded: {len(edges):,}")

# Network nodes
nodes = gpd.read_file('data/processed/network_nodes.geojson')
print(f"✓ Nodes loaded: {len(nodes):,}")

# SEPTA transit stops
septa = gpd.read_file('data/processed/septa_stops.geojson')
print(f"✓ SEPTA stops loaded: {len(septa)}")

# Study-area boundary
study_area = gpd.read_file('data/processed/study_area.geojson')
print("✓ Study area loaded")

# Derive the scenario names from the shade_<scenario> columns.
# Only the leading prefix is stripped: str.replace() would also delete any
# later 'shade_' inside the name (e.g. 'shade_summer_shade_noon' would become
# 'summer_noon'), and rebuilding f'shade_{scenario}' downstream would then
# point at a column that does not exist.
SHADE_PREFIX = 'shade_'
shade_cols = [c for c in edges.columns if c.startswith(SHADE_PREFIX)]
scenarios = [c[len(SHADE_PREFIX):] for c in shade_cols]

print(f"\n✓ Found {len(scenarios)} temporal scenarios:")
for s in scenarios:
    print(f"  • {s}")

if not scenarios:
    # Every exported edge would carry an empty 'shade' object in this case.
    print("  ⚠ Warning: no shade_* columns found in the edges file")

## 2. Convert to WGS84 (Web Standard)

In [None]:
print("Converting to WGS84 (EPSG:4326) for web compatibility...\n")

# Reproject the four layers to the same CRS, in the order they were loaded
edges_web, nodes_web, septa_web, study_area_web = (
    layer.to_crs('EPSG:4326')
    for layer in (edges, nodes, septa, study_area)
)

print("✓ All data converted to WGS84")
# Echo the resulting CRS of the two network layers as a sanity check
for label, layer in (("Edges", edges_web), ("Nodes", nodes_web)):
    print(f"  {label}: {layer.crs}")

## 3. Prepare Nodes Data

In [None]:
print("Preparing nodes data for web application...\n")

# By default gpd.read_file() gives the frame a positional 0..n-1 index, so the
# index value is only a row number. The edges cell reads its endpoints from
# 'u'/'v' columns, so when the nodes file carries an explicit identifier
# column, use it to keep node ids and edge endpoints in the same id space.
# NOTE(review): the candidate column names below are assumptions about what
# the upstream notebooks wrote — confirm against network_nodes.geojson.
NODE_ID_COLUMNS = ('osmid', 'node_id', 'id')
node_id_col = next((c for c in NODE_ID_COLUMNS if c in nodes_web.columns), None)

if node_id_col is not None:
    raw_node_ids = nodes_web[node_id_col]
    print(f"  Node ids taken from column '{node_id_col}'")
else:
    # No identifier column: keep the original behaviour (index value as id)
    raw_node_ids = nodes_web.index
    print("  Node ids taken from the GeoDataFrame index")

nodes_data = []
skipped_nodes = 0  # rows without a usable point geometry

for raw_id, geom in zip(raw_node_ids, nodes_web.geometry):
    # A missing/empty geometry has no x/y; skip it rather than abort the cell
    if geom is None or geom.is_empty:
        skipped_nodes += 1
        continue

    # Integers stay integers in the JSON; anything else is exported as a string
    node_id = int(raw_id) if isinstance(raw_id, (int, np.integer)) else str(raw_id)

    nodes_data.append({
        'id': node_id,
        'lat': float(geom.y),
        'lon': float(geom.x)
    })

if skipped_nodes:
    print(f"  ⚠ Warning: skipped {skipped_nodes:,} nodes without geometry")

print(f"✓ Prepared {len(nodes_data):,} nodes")
if nodes_data:
    print("\nSample node:")
    print(f"  {nodes_data[0]}")

## 4. Prepare Edges Data with Shade Scores

In [None]:
print("Preparing edges data with shade scores...\n")
print("This may take a few minutes...\n")

edges_data = []
skipped_edges = 0  # rows rejected by the except-branch below

for idx, edge in edges_web.iterrows():
    try:
        # Endpoints come from a (u, v[, key]) tuple index when there is one,
        # otherwise from the first matching column name.
        if isinstance(idx, tuple) and len(idx) >= 2:
            u, v = idx[0], idx[1]
            key = idx[2] if len(idx) > 2 else 0
        else:
            u = edge.get('u', edge.get('node_start', edge.get('from')))
            v = edge.get('v', edge.get('node_end', edge.get('to')))
            key = edge.get('key', 0)

        # An edge without both endpoints cannot be linked into the graph.
        # Reject it here; the str() fallback below would otherwise export the
        # literal strings "None" / "nan" as node ids.
        if pd.isna(u) or pd.isna(v):
            raise ValueError("missing endpoint id (u/v)")

        # Integers stay integers in the JSON; anything else becomes a string
        u = int(u) if isinstance(u, (int, np.integer)) else str(u)
        v = int(v) if isinstance(v, (int, np.integer)) else str(v)
        key = int(key)

        # Keep only x/y so geometries that carry a z value are still exported
        # as [lon, lat] pairs instead of failing the two-value unpacking.
        coordinates = [[float(pt[0]), float(pt[1])] for pt in edge.geometry.coords]

        # json.dump writes NaN/Infinity as bare tokens, which are not valid
        # JSON, so fall back to the same 0.0 used when the column is absent.
        # NOTE(review): a 0-length edge is "free" to a router — consider
        # skipping such edges instead if that matters for the web app.
        length = float(edge.get('length', 0))
        if not np.isfinite(length):
            length = 0.0

        # One entry per scenario; scenarios whose column is missing or whose
        # value is null for this edge are simply left out.
        shade_scores = {}
        for scenario in scenarios:
            shade_val = edge.get(f'shade_{scenario}')
            if pd.notna(shade_val):
                shade_scores[scenario] = float(shade_val)

        edges_data.append({
            'u': u,
            'v': v,
            'key': key,
            'length': length,
            'coordinates': coordinates,
            'shade': shade_scores
        })

        # Progress indicator
        if len(edges_data) % 2000 == 0:
            print(f"  Processed {len(edges_data):,} / {len(edges_web):,} edges")

    except Exception as e:
        # Deliberately best-effort: one malformed row must not stop the export
        skipped_edges += 1
        print(f"  ⚠ Warning: Could not process edge {idx}: {e}")

if skipped_edges:
    print(f"\n⚠ Skipped {skipped_edges:,} of {len(edges_web):,} edges (see warnings above)")

print(f"\n✓ Prepared {len(edges_data):,} edges with shade scores")
if edges_data:
    print("\nSample edge:")
    print(json.dumps(edges_data[0], indent=2))

## 5. Prepare Transit Stops Data

In [None]:
print("Preparing transit stops data...\n")

# One flat record per stop. The display name falls back from 'stop_name' to
# 'name' to a generic label; the category defaults to 'Transit'.
stops_data = [
    {
        'name': str(row.get('stop_name', row.get('name', 'Transit Stop'))),
        'category': str(row.get('category', 'Transit')),
        'lat': float(row.geometry.y),
        'lon': float(row.geometry.x),
    }
    for _, row in septa_web.iterrows()
]

print(f"✓ Prepared {len(stops_data)} transit stops")
print("\nSample stop:")
print(f"  {stops_data[0]}")

## 6. Calculate Metadata and Bounds

In [None]:
print("Calculating metadata and bounds...\n")

# Latitude and longitude of every exported node, as two parallel lists
all_lats, all_lons = (
    [rec[axis] for rec in nodes_data] for axis in ('lat', 'lon')
)

# Bounding box = extreme node coordinates
bounds = dict(
    north=float(max(all_lats)),
    south=float(min(all_lats)),
    east=float(max(all_lons)),
    west=float(min(all_lons)),
)

# Map centre = midpoint of the bounding box on each axis
center = {
    axis: (bounds[upper] + bounds[lower]) / 2
    for axis, upper, lower in (('lat', 'north', 'south'), ('lon', 'east', 'west'))
}

# Summary written next to the data files (counts, extent, CRS, timestamp)
metadata = dict(
    scenarios=scenarios,
    num_nodes=len(nodes_data),
    num_edges=len(edges_data),
    num_stops=len(stops_data),
    bounds=bounds,
    center=center,
    crs='EPSG:4326',
    generated=pd.Timestamp.now().isoformat(),
)

print("✓ Metadata created")
print("\nBounds:")
for side in ('north', 'south', 'east', 'west'):
    # Pad the label to six characters so the values line up in one column
    print(f"  {side.capitalize() + ':':<6} {bounds[side]:.4f}")
print(f"\nCenter: ({center['lat']:.4f}, {center['lon']:.4f})")

## 7. Save Data Files for Web Application

In [None]:
print("Saving data files for web application...\n")

# Create output directory
output_dir = Path('website/data')
output_dir.mkdir(parents=True, exist_ok=True)

BYTES_PER_MB = 1024 * 1024


def _write_json(payload, target, **dump_options):
    """Serialise *payload* to *target* and return the resulting file size in bytes."""
    with open(target, 'w') as handle:
        json.dump(payload, handle, **dump_options)
    return target.stat().st_size


# Nodes
print("1. Saving nodes.json...")
nodes_size = _write_json(nodes_data, output_dir / 'nodes.json') / BYTES_PER_MB
print(f"   ✓ Saved: {nodes_size:.2f} MB")

# Edges (by far the largest file)
print("\n2. Saving edges.json...")
print("   (This may take a moment for large networks)")
edges_size = _write_json(edges_data, output_dir / 'edges.json') / BYTES_PER_MB
print(f"   ✓ Saved: {edges_size:.2f} MB")

# Transit stops
print("\n3. Saving stops.json...")
stops_size = _write_json(stops_data, output_dir / 'stops.json') / BYTES_PER_MB
print(f"   ✓ Saved: {stops_size:.2f} MB")

# Metadata: small, so it is pretty-printed and its size is reported in KB
print("\n4. Saving metadata.json...")
metadata_size = _write_json(metadata, output_dir / 'metadata.json', indent=2) / 1024
print(f"   ✓ Saved: {metadata_size:.2f} KB")

print(f"\n✓ All files saved to: {output_dir}/")

## 8. Create Compressed Versions (Optional)

In [None]:
import gzip
import shutil

print("Creating compressed versions for faster loading...\n")

BYTES_PER_MB = 1024 * 1024

# Files worth compressing; each gets a sibling "<name>.gz"
files_to_compress = ['nodes.json', 'edges.json', 'stops.json']

for filename in files_to_compress:
    source_path = output_dir / filename
    if not source_path.exists():
        continue

    print(f"Compressing {filename}...")
    archive_path = Path(str(source_path) + '.gz')

    # Stream the original into the gzip file without loading it into memory
    with open(source_path, 'rb') as raw, gzip.open(str(archive_path), 'wb') as packed:
        shutil.copyfileobj(raw, packed)

    # Report how much smaller the compressed copy is
    original_size = source_path.stat().st_size / BYTES_PER_MB
    compressed_size = archive_path.stat().st_size / BYTES_PER_MB
    ratio = (1 - compressed_size / original_size) * 100

    print(f"  Original: {original_size:.2f} MB")
    print(f"  Compressed: {compressed_size:.2f} MB")
    print(f"  Savings: {ratio:.1f}%\n")

print("✓ Compressed versions created")
print("\nNote: Web servers can serve .gz files directly with Content-Encoding: gzip")

## 9. Validate Output Data

In [None]:
print("Validating output data...\n")

# Every problem found is collected here and reported together at the end
validation_errors = []


def _reject_constant(token):
    """Fail on NaN / Infinity / -Infinity.

    Python's json module accepts these tokens, but they are not valid JSON,
    so a file containing them would pass a plain json.load() here and still
    be rejected by a strict parser such as the browser's JSON.parse.
    """
    raise ValueError(f"non-standard JSON constant {token}")


def _load_for_validation(filename):
    """Strictly parse one output file; on failure record the error and return None.

    Each file is handled on its own so that one unreadable file does not hide
    the results for the others.
    """
    try:
        with open(output_dir / filename) as f:
            return json.load(f, parse_constant=_reject_constant)
    except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
        validation_errors.append(f"File loading error: {filename}: {e}")
        return None


def _require_fields(records, label, fields):
    """Record an error for every required field that is absent from any record."""
    if not records:
        validation_errors.append(f"{label} file contains no records")
        return
    for field in fields:
        if any(rec.get(field) is None for rec in records):
            validation_errors.append(f"{label} missing '{field}' field")


# Validate nodes
nodes_check = _load_for_validation('nodes.json')
if nodes_check is not None:
    print(f"✓ nodes.json: {len(nodes_check):,} nodes loaded")
    _require_fields(nodes_check, "Nodes", ('id', 'lat', 'lon'))

# Validate edges
edges_check = _load_for_validation('edges.json')
if edges_check is not None:
    print(f"✓ edges.json: {len(edges_check):,} edges loaded")
    _require_fields(edges_check, "Edges", ('u', 'v', 'shade'))

    # Show the scenarios present on the first edge as a quick visual check
    edge_scenarios = list(edges_check[0].get('shade', {}).keys()) if edges_check else []
    print(f"  Shade scenarios: {len(edge_scenarios)}")
    for s in edge_scenarios[:3]:
        print(f"    • {s}")

# Cross-check: edge endpoints should exist as node ids.
# NOTE(review): presumably the web app resolves u/v against nodes.json ids;
# if it builds the graph from edge coordinates alone, treat this as advisory.
if nodes_check and edges_check:
    known_node_ids = {rec.get('id') for rec in nodes_check}
    dangling = sum(
        1
        for rec in edges_check
        for endpoint in (rec.get('u'), rec.get('v'))
        if endpoint not in known_node_ids
    )
    if dangling:
        validation_errors.append(
            f"{dangling:,} edge endpoint(s) reference ids not present in nodes.json"
        )

# Validate stops
stops_check = _load_for_validation('stops.json')
if stops_check is not None:
    print(f"✓ stops.json: {len(stops_check)} stops loaded")

# Validate metadata
metadata_check = _load_for_validation('metadata.json')
if metadata_check is not None:
    meta_scenarios = metadata_check.get('scenarios')
    if meta_scenarios is None:
        validation_errors.append("Metadata missing 'scenarios' field")
    else:
        print(f"✓ metadata.json: {len(meta_scenarios)} scenarios")

    # The counts stored in the metadata must match what is actually on disk
    for count_field, loaded in (
        ('num_nodes', nodes_check),
        ('num_edges', edges_check),
        ('num_stops', stops_check),
    ):
        if loaded is not None and metadata_check.get(count_field) != len(loaded):
            validation_errors.append(
                f"Metadata '{count_field}' ({metadata_check.get(count_field)}) "
                f"does not match file contents ({len(loaded)})"
            )

# Report validation
print("\n" + "="*70)
if not validation_errors:
    print("VALIDATION PASSED - ALL DATA FILES ARE VALID")
else:
    print("VALIDATION ISSUES FOUND:")
    for error in validation_errors:
        print(f"  • {error}")
print("="*70)

## 10. Summary and Next Steps

In [None]:
banner = "=" * 70

print("\n" + banner)
print("WEB DATA PREPARATION COMPLETE")
print(banner)

print(f"\nOutput Directory: {output_dir.absolute()}")

# Per-file sizes; metadata.json is reported in KB and is not part of the total
print("\nFiles Created:")
print(f"  • nodes.json:     {nodes_size:.2f} MB ({len(nodes_data):,} nodes)")
print(f"  • edges.json:     {edges_size:.2f} MB ({len(edges_data):,} edges)")
print(f"  • stops.json:     {stops_size:.2f} MB ({len(stops_data)} stops)")
print(f"  • metadata.json:  {metadata_size:.2f} KB")
total_mb = nodes_size + edges_size + stops_size
print(f"\n  Total: {total_mb:.2f} MB")

print("\nData Includes:")
for item in (
    f"{len(scenarios)} temporal scenarios",
    "Shade scores for all street segments",
    "Network structure for routing",
    "Transit stop locations",
):
    print(f"  • {item}")

print("\nNext Steps:")
next_steps = (
    "Copy website/data/ folder to your web hosting",
    "Deploy React application with InteractiveShadeRouting component",
    "Configure web server to serve compressed .gz files",
    "Test routing functionality",
)
for step_no, step in enumerate(next_steps, start=1):
    print(f"  {step_no}. {step}")

# Only the edges file is large enough to warrant loading advice (> 20 MB)
print("\nPerformance Tips:")
if edges_size <= 20:
    print("  ✓ File sizes are reasonable for web loading")
else:
    print(f"  ⚠ edges.json is {edges_size:.1f} MB - consider:")
    for tip in (
        "Using .gz compressed version",
        "Implementing lazy loading",
        "Adding loading progress indicator",
    ):
        print(f"    • {tip}")

print("\nReady for Interactive Web Application!")
print(banner)