# Uber NYC Ride-Hailing: Geographic Evolution 2018-2025

Comparative spatial analysis of Uber pickup and dropoff patterns across New York City taxi zones, using K-means clustering on geographic coordinates, Lorenz curve concentration measures, and Local Indicators of Spatial Association (LISA) to quantify how ride-hailing demand has redistributed over seven years.

---
## 1. Setup and Configuration

In [1]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy.stats import ks_2samp, levene
from math import radians, cos, sin, asin, sqrt
import geopandas as gpd
import libpysal
from esda.moran import Moran, Moran_Local
import json
import gc
import os

# ── Paths ──────────────────────────────────────────────────────────────────
BASE_DIR = '/Users/leoss/Desktop/Portfolio/Website-/'
DATA_DIR = BASE_DIR + 'projects/uber/data/'
OUTPUT_DIR = BASE_DIR + 'projects/uber/outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

PATH_2018 = DATA_DIR + 'fhv_tripdata_2018-01.parquet'
PATH_2025 = DATA_DIR + 'fhvhv_tripdata_2025-01.parquet'
PATH_CENTROIDS = DATA_DIR + 'zone_centroids.csv'
PATH_SHAPEFILE = DATA_DIR + 'taxi_zones/taxi_zones.shp'

# ── Analysis parameters ────────────────────────────────────────────────────
SAMPLE_SIZE = 20_000_000
N_CLUSTERS = 6

UBER_2018_BASES = ['B02512', 'B02598', 'B02617', 'B02682', 'B02764', 'B02765', 'B02835', 'B02836']
UBER_2025_LICENSE = 'HV0003'

AIRPORT_ZONE_IDS = {132, 138, 1}
AIRPORT_LABELS = {132: 'JFK', 138: 'LaGuardia', 1: 'Newark (EWR)'}

# ── Unified style system ──────────────────────────────────────────────────
STYLE = {
    'font_family': 'IBM Plex Sans, -apple-system, BlinkMacSystemFont, sans-serif',
    'tick_size': 11,
    'axis_title_size': 13,
    'legend_size': 11,
    'template': 'plotly_white',
    'plot_bg': 'rgba(0,0,0,0)',
    'paper_bg': 'white',
    'chart_height': 550,
    'margin_default': dict(l=60, r=40, t=20, b=50),
    'margin_map': dict(l=0, r=0, t=20, b=0),
    'grid_color': '#e5e7eb',
    'grid_width': 0.5,
    'hover_font_size': 13,
    'hover_font_color': '#1a2744',
    # Year comparison palette
    'year_2018': '#ff6b6b',
    'year_2025': '#4ecdc4',
    # Cluster palette (6 clusters)
    'cluster_colors': ['#e6194b', '#3cb44b', '#4363d8', '#f58231', '#911eb4', '#42d4f4'],
    # Borough palette
    'borough_colors': {
        'Manhattan': '#4363d8', 'Brooklyn': '#3cb44b', 'Queens': '#f58231',
        'Bronx': '#e6194b', 'Staten Island': '#911eb4', 'EWR': '#42d4f4',
    },
    # LISA palette
    'lisa_colors': {
        'HH': '#d7191c', 'LL': '#2c7bb6', 'HL': '#fdae61',
        'LH': '#abd9e9', 'ns': '#e8e8e8',
    },
    'lisa_labels': {
        'HH': 'High-High (hot spot)', 'LL': 'Low-Low (cold spot)',
        'HL': 'High-Low (outlier)', 'LH': 'Low-High (outlier)',
        'ns': 'Not significant',
    },
    # Map defaults
    'map_style': 'carto-positron-nolabels',
    'map_center': {'lat': 40.7128, 'lon': -73.9352},
    'map_zoom': 9.5,
}


def base_layout(height=None, width=None, **kwargs):
    """Standard layout applied to every chart."""
    layout = dict(
        title='',
        font=dict(family=STYLE['font_family']),
        template=STYLE['template'],
        plot_bgcolor=STYLE['plot_bg'],
        paper_bgcolor=STYLE['paper_bg'],
        height=height or STYLE['chart_height'],
        margin=STYLE['margin_default'],
        hoverlabel=dict(
            font_size=STYLE['hover_font_size'],
            font_color=STYLE['hover_font_color'],
        ),
    )
    if width:
        layout['width'] = width
    layout.update(kwargs)
    return layout


def styled_axis(**kwargs):
    """Standard axis styling."""
    return dict(
        tickfont=dict(size=STYLE['tick_size']),
        title_font=dict(size=STYLE['axis_title_size']),
        gridcolor=STYLE['grid_color'],
        gridwidth=STYLE['grid_width'],
        **kwargs,
    )


def save_html(fig, filename):
    """Save figure as CDN-loaded HTML with mode bar suppressed."""
    fig.write_html(
        OUTPUT_DIR + filename,
        include_plotlyjs='cdn',
        config={'displayModeBar': False},
    )
    print(f"  Saved: {filename}")


print(f"Configuration:")
print(f"  Sample size: {SAMPLE_SIZE:,} trips per year")
print(f"  Clusters: {N_CLUSTERS}")
print(f"  Output: {OUTPUT_DIR}")

Configuration:
  Sample size: 20,000,000 trips per year
  Clusters: 6
  Output: /Users/leoss/Desktop/Portfolio/Website-/projects/uber/outputs/


---
## 2. Helper Functions

In [2]:
def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points in km."""
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    return 2 * 6371 * asin(sqrt(a))


def name_cluster_by_location(center_lat, center_lon, zone_centroids):
    """Name cluster based on nearest major zone."""
    zone_distances = np.sqrt(
        (zone_centroids['latitude'] - center_lat) ** 2 +
        (zone_centroids['longitude'] - center_lon) ** 2
    )
    nearest_idx = zone_distances.idxmin()
    nearest_zone = zone_centroids.iloc[nearest_idx]
    return f"{nearest_zone['borough']}: {nearest_zone['zone_name']}"


def match_clusters_by_proximity(centers_2018, centers_2025):
    """Match 2018 to 2025 clusters by geographic proximity (greedy nearest)."""
    distances = cdist(centers_2018, centers_2025, metric='cityblock')
    matches = {}
    used_2025 = set()
    pairs = []
    for i in range(len(centers_2018)):
        for j in range(len(centers_2025)):
            pairs.append((i, j, distances[i, j]))
    pairs.sort(key=lambda x: x[2])
    for i, j, dist in pairs:
        if i not in matches and j not in used_2025:
            matches[i] = j
            used_2025.add(j)
    return matches


def lorenz_data(df, zone_col='PU_zone_id'):
    """Return (cumulative share of zones, cumulative share of trips)."""
    zone_counts = df.groupby(zone_col).size().sort_values().values
    cum_zones = np.arange(1, len(zone_counts) + 1) / len(zone_counts)
    cum_trips = np.cumsum(zone_counts) / zone_counts.sum()
    return cum_zones, cum_trips


def gini_from_lorenz(cum_zones, cum_trips):
    """Gini coefficient via trapezoidal integration under the Lorenz curve."""
    area_under = np.trapz(cum_trips, cum_zones)
    return 1 - 2 * area_under


def merge_zone_info(df, zone_col, centroids, prefix):
    """Merge zone centroid info onto a dataframe, with column prefix."""
    merged = df.merge(
        centroids[['zone_id', 'zone_name', 'borough', 'latitude', 'longitude']],
        left_on=zone_col, right_on='zone_id', how='left'
    )
    merged = merged.rename(columns={
        'zone_id': f'{prefix}_zone_id',
        'zone_name': f'{prefix}_zone_name',
        'borough': f'{prefix}_borough',
        'latitude': f'{prefix}_lat',
        'longitude': f'{prefix}_lon',
    })
    return merged


def mismatch_ratios(df, pu_col='PU_zone_id', do_col='DO_zone_id'):
    """Compute PU/(PU+DO) ratio per zone. 0.5 = perfectly balanced."""
    pu = df.groupby(pu_col).size().rename('pu')
    do = df.dropna(subset=[do_col]).groupby(do_col).size().rename('do')
    combined = pd.concat([pu, do], axis=1).fillna(0)
    combined['ratio'] = combined['pu'] / (combined['pu'] + combined['do'])
    combined['total'] = combined['pu'] + combined['do']
    return combined


def short_cluster_label(full_name):
    """Extract borough from 'Borough: Zone Name' format."""
    return full_name.split(':')[0].strip()


def make_short_labels(cluster_names, n_clusters):
    """Create short labels, appending (2), (3) for duplicate boroughs."""
    labels = {}
    seen = {}
    for i in range(n_clusters):
        base = short_cluster_label(cluster_names[i])
        if base in seen:
            seen[base] += 1
            labels[i] = f"{base} ({seen[base]})"
        else:
            seen[base] = 1
            labels[i] = base
    return labels

---
## 3. Load Zone Centroids and Shapefile

In [3]:
zone_centroids = pd.read_csv(PATH_CENTROIDS)
print(f"Loaded {len(zone_centroids)} zone centroids")

gdf_raw = gpd.read_file(PATH_SHAPEFILE)
gdf_raw = gdf_raw.to_crs(epsg=4326)
taxi_zones_geo_4326 = json.loads(gdf_raw.to_json())

for f in taxi_zones_geo_4326['features']:
    f['properties']['LocationID'] = str(int(f['properties']['LocationID']))

all_zone_ids = [f['properties']['LocationID'] for f in taxi_zones_geo_4326['features']]
print(f"Loaded {len(all_zone_ids)} taxi zone geometries")

Loaded 263 zone centroids
Loaded 263 taxi zone geometries


---
## 4. 2018 Uber Data: Load, Process, Cluster

In [4]:
print("[4.1] Loading 2018 data...")
table_2018 = pq.read_table(PATH_2018, columns=[])
total_2018 = table_2018.num_rows
print(f"  Total rows: {total_2018:,}")

columns_2018 = ['pickup_datetime', 'PUlocationID', 'DOlocationID', 'dispatching_base_num']
table_2018 = pq.read_table(PATH_2018, columns=columns_2018)

df_2018_full = table_2018.to_pandas()
df_2018_full = df_2018_full[df_2018_full['dispatching_base_num'].isin(UBER_2018_BASES)].copy()

uber_count_2018 = len(df_2018_full)
print(f"  Uber trips: {uber_count_2018:,} ({100 * uber_count_2018 / total_2018:.1f}%)")

df_2018 = df_2018_full.sample(n=min(SAMPLE_SIZE, uber_count_2018), random_state=42)
del df_2018_full, table_2018
gc.collect()

print("[4.2] Processing temporal features...")
df_2018['pickup_datetime'] = pd.to_datetime(df_2018['pickup_datetime'])
df_2018['hour'] = df_2018['pickup_datetime'].dt.hour
df_2018['day_of_week'] = df_2018['pickup_datetime'].dt.dayofweek
df_2018['day_name'] = df_2018['pickup_datetime'].dt.day_name()

print("[4.3] Merging zone info...")
df_2018 = df_2018.dropna(subset=['PUlocationID'])
df_2018['PUlocationID'] = df_2018['PUlocationID'].astype(int)
df_2018 = merge_zone_info(df_2018, 'PUlocationID', zone_centroids, 'PU')
df_2018 = df_2018.dropna(subset=['PU_lat', 'PU_lon'])

df_2018['DOlocationID'] = pd.to_numeric(df_2018['DOlocationID'], errors='coerce')
df_2018.loc[df_2018['DOlocationID'].notna(), 'DOlocationID'] = \
    df_2018.loc[df_2018['DOlocationID'].notna(), 'DOlocationID'].astype(int)
df_2018 = merge_zone_info(df_2018, 'DOlocationID', zone_centroids, 'DO')

print("[4.4] Clustering on pickup coordinates...")
coords_2018 = df_2018[['PU_lat', 'PU_lon']].values
kmeans_2018 = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df_2018['PU_cluster'] = kmeans_2018.fit_predict(coords_2018)

cluster_names_2018 = {}
for i in range(N_CLUSTERS):
    lat, lon = kmeans_2018.cluster_centers_[i]
    cluster_names_2018[i] = name_cluster_by_location(lat, lon, zone_centroids)

df_2018['PU_cluster_name'] = df_2018['PU_cluster'].map(cluster_names_2018)

do_mask_2018 = df_2018['DO_lat'].notna() & df_2018['DO_lon'].notna()
df_2018.loc[do_mask_2018, 'DO_cluster'] = kmeans_2018.predict(
    df_2018.loc[do_mask_2018, ['DO_lat', 'DO_lon']].values
)
df_2018['DO_cluster_name'] = df_2018['DO_cluster'].map(cluster_names_2018)

print("  Cluster distribution (2018):")
for i in range(N_CLUSTERS):
    count = (df_2018['PU_cluster'] == i).sum()
    pct = 100 * count / len(df_2018)
    print(f"    {cluster_names_2018[i]}: {count:>7,} ({pct:>5.1f}%)")

[4.1] Loading 2018 data...
  Total rows: 19,808,094
  Uber trips: 4,502,999 (22.7%)
[4.2] Processing temporal features...
[4.3] Merging zone info...
[4.4] Clustering on pickup coordinates...
  Cluster distribution (2018):
    Bronx: East Tremont: 477,684 ( 10.6%)
    Manhattan: Gramercy: 1,883,665 ( 41.7%)
    Brooklyn: Ocean Hill: 626,926 ( 13.9%)
    Queens: Briarwood/Jamaica Hills: 373,460 (  8.3%)
    Brooklyn: Borough Park: 388,956 (  8.6%)
    Manhattan: Yorkville East: 769,118 ( 17.0%)


---
## 5. 2025 Uber Data: Load, Process, Cluster

In [5]:
print("[5.1] Loading 2025 data...")
table_2025 = pq.read_table(PATH_2025, columns=[])
total_2025 = table_2025.num_rows
print(f"  Total rows: {total_2025:,}")

columns_2025 = ['pickup_datetime', 'PULocationID', 'DOLocationID', 'hvfhs_license_num']
table_2025 = pq.read_table(PATH_2025, columns=columns_2025)

df_2025_full = table_2025.to_pandas()
df_2025_full = df_2025_full[df_2025_full['hvfhs_license_num'] == UBER_2025_LICENSE].copy()

uber_count_2025 = len(df_2025_full)
print(f"  Uber trips: {uber_count_2025:,} ({100 * uber_count_2025 / total_2025:.1f}%)")

df_2025 = df_2025_full.sample(n=min(SAMPLE_SIZE, uber_count_2025), random_state=42)
del df_2025_full, table_2025
gc.collect()

print("[5.2] Processing temporal features...")
df_2025['pickup_datetime'] = pd.to_datetime(df_2025['pickup_datetime'])
df_2025['hour'] = df_2025['pickup_datetime'].dt.hour
df_2025['day_of_week'] = df_2025['pickup_datetime'].dt.dayofweek
df_2025['day_name'] = df_2025['pickup_datetime'].dt.day_name()

print("[5.3] Merging zone info...")
df_2025 = df_2025.dropna(subset=['PULocationID'])
df_2025['PULocationID'] = df_2025['PULocationID'].astype(int)
df_2025 = merge_zone_info(df_2025, 'PULocationID', zone_centroids, 'PU')
df_2025 = df_2025.dropna(subset=['PU_lat', 'PU_lon'])

df_2025['DOLocationID'] = pd.to_numeric(df_2025['DOLocationID'], errors='coerce')
df_2025.loc[df_2025['DOLocationID'].notna(), 'DOLocationID'] = \
    df_2025.loc[df_2025['DOLocationID'].notna(), 'DOLocationID'].astype(int)
df_2025 = merge_zone_info(df_2025, 'DOLocationID', zone_centroids, 'DO')

print("[5.4] Clustering on pickup coordinates...")
coords_2025 = df_2025[['PU_lat', 'PU_lon']].values
kmeans_2025 = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df_2025['PU_cluster'] = kmeans_2025.fit_predict(coords_2025)

cluster_names_2025 = {}
for i in range(N_CLUSTERS):
    lat, lon = kmeans_2025.cluster_centers_[i]
    cluster_names_2025[i] = name_cluster_by_location(lat, lon, zone_centroids)

df_2025['PU_cluster_name'] = df_2025['PU_cluster'].map(cluster_names_2025)

do_mask_2025 = df_2025['DO_lat'].notna() & df_2025['DO_lon'].notna()
df_2025.loc[do_mask_2025, 'DO_cluster'] = kmeans_2025.predict(
    df_2025.loc[do_mask_2025, ['DO_lat', 'DO_lon']].values
)
df_2025['DO_cluster_name'] = df_2025['DO_cluster'].map(cluster_names_2025)

print("  Cluster distribution (2025):")
for i in range(N_CLUSTERS):
    count = (df_2025['PU_cluster'] == i).sum()
    pct = 100 * count / len(df_2025)
    print(f"    {cluster_names_2025[i]}: {count:>7,} ({pct:>5.1f}%)")

[5.1] Loading 2025 data...
  Total rows: 20,405,666
  Uber trips: 15,356,455 (75.3%)
[5.2] Processing temporal features...
[5.3] Merging zone info...


  df_2025.loc[df_2025['DOLocationID'].notna(), 'DOLocationID'] = \


[5.4] Clustering on pickup coordinates...
  Cluster distribution (2025):
    Bronx: Claremont/Bathgate: 2,714,803 ( 17.6%)
    Queens: Baisley Park: 1,358,465 (  8.8%)
    Staten Island: Westerleigh: 241,368 (  1.6%)
    Manhattan: Murray Hill: 5,992,333 ( 38.8%)
    Brooklyn: Prospect-Lefferts Gardens: 3,282,505 ( 21.2%)
    Queens: Elmhurst: 1,857,722 ( 12.0%)


---
## 6. Cross-Year Cluster Matching

Clusters are fitted independently for each year. To enable meaningful comparison, 2018 and 2025 clusters are matched by greedy nearest-centroid proximity (cityblock distance on lat/lon), and a shared color mapping is imposed so that geographically corresponding clusters share the same color across all charts.

In [6]:
centers_2018 = kmeans_2018.cluster_centers_
centers_2025 = kmeans_2025.cluster_centers_

cluster_matches = match_clusters_by_proximity(centers_2018, centers_2025)

print("Cluster matches (2018 -> 2025):")
for idx_2018, idx_2025 in sorted(cluster_matches.items()):
    lat1, lon1 = centers_2018[idx_2018]
    lat2, lon2 = centers_2025[idx_2025]
    dist = haversine_km(lat1, lon1, lat2, lon2)
    print(f"  {cluster_names_2018[idx_2018]} -> {cluster_names_2025[idx_2025]} ({dist:.2f} km)")

# Build color maps: matched clusters share the same color
cluster_color_map_2018 = {
    cluster_names_2018[i]: STYLE['cluster_colors'][i % len(STYLE['cluster_colors'])]
    for i in range(N_CLUSTERS)
}
cluster_color_map_2025 = {
    cluster_names_2025[idx_25]: STYLE['cluster_colors'][idx_18 % len(STYLE['cluster_colors'])]
    for idx_18, idx_25 in cluster_matches.items()
}

short_labels_2018 = make_short_labels(cluster_names_2018, N_CLUSTERS)
short_labels_2025 = make_short_labels(cluster_names_2025, N_CLUSTERS)

Cluster matches (2018 -> 2025):
  Bronx: East Tremont -> Bronx: Claremont/Bathgate (0.72 km)
  Manhattan: Gramercy -> Manhattan: Murray Hill (1.22 km)
  Brooklyn: Ocean Hill -> Brooklyn: Prospect-Lefferts Gardens (3.03 km)
  Queens: Briarwood/Jamaica Hills -> Queens: Baisley Park (2.59 km)
  Brooklyn: Borough Park -> Staten Island: Westerleigh (11.62 km)
  Manhattan: Yorkville East -> Queens: Elmhurst (6.14 km)


---
## 7. Cluster Maps

Choropleth maps showing the dominant pickup cluster per taxi zone for each year, followed by centroid shift vectors illustrating how cluster centers migrated between 2018 and 2025.

In [7]:
# ── 2018 Cluster Map ──────────────────────────────────────────────────────
print("[7.1] 2018 cluster map...")

zone_clusters = df_2018.groupby('PU_zone_id').agg({
    'PU_cluster': lambda x: x.mode()[0],
    'PU_cluster_name': lambda x: x.mode()[0],
    'PU_zone_name': 'first',
    'PU_borough': 'first'
}).reset_index()
zone_clusters.columns = ['zone_id', 'cluster', 'cluster_name', 'zone_name', 'borough']
zone_clusters['zone_id'] = zone_clusters['zone_id'].astype(float).astype(int).astype(str)

zone_ids_with_data = set(zone_clusters['zone_id'].values)
filtered_geojson = {
    'type': 'FeatureCollection',
    'features': [f for f in taxi_zones_geo_4326['features']
                 if f['properties']['LocationID'] in zone_ids_with_data]
}

fig = go.Figure()

# Background: all zones in light grey
fig.add_trace(go.Choroplethmap(
    geojson=taxi_zones_geo_4326,
    locations=all_zone_ids,
    featureidkey='properties.LocationID',
    z=[1] * len(all_zone_ids),
    colorscale=[[0, '#e8e8e8'], [1, '#e8e8e8']],
    marker_opacity=0.4, marker_line_width=0.3, marker_line_color='#ccc',
    showscale=False, hoverinfo='skip',
))

# Overlay: cluster-colored zones
fig_tmp = px.choropleth_map(
    zone_clusters, geojson=filtered_geojson,
    locations='zone_id', featureidkey='properties.LocationID',
    color='cluster_name', color_discrete_map=cluster_color_map_2018,
    map_style=STYLE['map_style'], zoom=STYLE['map_zoom'],
    center=STYLE['map_center'], opacity=0.95,
    hover_data={'zone_id': False, 'zone_name': True, 'borough': True, 'cluster_name': True},
    labels={'cluster_name': 'Cluster'}
)
fig_tmp.update_traces(marker=dict(line=dict(width=0.8, color='rgba(255,255,255,0.8)')))
for trace in fig_tmp.data:
    fig.add_trace(trace)

# Centroid markers
for i, (lat, lon) in enumerate(centers_2018):
    name = cluster_names_2018[i]
    fig.add_trace(go.Scattermap(
        lat=[lat], lon=[lon], mode='markers+text',
        marker=dict(size=0, color='black', opacity=0.85),
        text=str(i), textfont=dict(size=10, color='white', family='Arial Black'),
        textposition='middle center', name=name, showlegend=False,
        hovertemplate=f'<b>Cluster {i}</b><br>{name}<extra></extra>',
    ))

fig.update_layout(
    **base_layout(height=650, width=1100, margin=STYLE['margin_map']),
    legend=dict(
        title="Clusters", yanchor="top", y=0.99, xanchor="left", x=0.01,
        bgcolor="rgba(255,255,255,0.95)", bordercolor="#dde1e7", borderwidth=1,
        font=dict(size=STYLE['legend_size'], family=STYLE['font_family']),
    ),
    map=dict(style=STYLE['map_style'], center=STYLE['map_center'], zoom=STYLE['map_zoom']),
)

save_html(fig, 'cluster_map_2018.html')

[7.1] 2018 cluster map...
  Saved: cluster_map_2018.html


In [8]:
# ── 2025 Cluster Map ──────────────────────────────────────────────────────
print("[7.2] 2025 cluster map...")

zone_clusters_2025 = df_2025.groupby('PU_zone_id').agg({
    'PU_cluster': lambda x: x.mode()[0],
    'PU_cluster_name': lambda x: x.mode()[0],
    'PU_zone_name': 'first',
    'PU_borough': 'first'
}).reset_index()
zone_clusters_2025.columns = ['zone_id', 'cluster', 'cluster_name', 'zone_name', 'borough']
zone_clusters_2025['zone_id'] = zone_clusters_2025['zone_id'].astype(float).astype(int).astype(str)

zone_ids_2025 = set(zone_clusters_2025['zone_id'].values)
filtered_geojson_2025 = {
    'type': 'FeatureCollection',
    'features': [f for f in taxi_zones_geo_4326['features']
                 if f['properties']['LocationID'] in zone_ids_2025]
}

fig_2025 = go.Figure()

fig_2025.add_trace(go.Choroplethmap(
    geojson=taxi_zones_geo_4326,
    locations=all_zone_ids,
    featureidkey='properties.LocationID',
    z=[1] * len(all_zone_ids),
    colorscale=[[0, '#e8e8e8'], [1, '#e8e8e8']],
    marker_opacity=0.4, marker_line_width=0.3, marker_line_color='#ccc',
    showscale=False, hoverinfo='skip',
))

fig_tmp_2025 = px.choropleth_map(
    zone_clusters_2025, geojson=filtered_geojson_2025,
    locations='zone_id', featureidkey='properties.LocationID',
    color='cluster_name', color_discrete_map=cluster_color_map_2025,
    map_style=STYLE['map_style'], zoom=STYLE['map_zoom'],
    center=STYLE['map_center'], opacity=0.95,
    hover_data={'zone_id': False, 'zone_name': True, 'borough': True, 'cluster_name': True},
    labels={'cluster_name': 'Cluster'}
)
fig_tmp_2025.update_traces(marker=dict(line=dict(width=0.8, color='rgba(255,255,255,0.8)')))
for trace in fig_tmp_2025.data:
    fig_2025.add_trace(trace)

for i, (lat, lon) in enumerate(centers_2025):
    name = cluster_names_2025[i]
    fig_2025.add_trace(go.Scattermap(
        lat=[lat], lon=[lon], mode='markers+text',
        marker=dict(size=0, color='black', opacity=0.85),
        text=str(i), textfont=dict(size=11, color='white', family='Arial Black'),
        textposition='middle center', name=name, showlegend=False,
        hovertemplate=f'<b>Cluster {i}</b><br>{name}<extra></extra>',
    ))

fig_2025.update_layout(
    **base_layout(height=650, width=1100, margin=STYLE['margin_map']),
    legend=dict(
        title="Clusters", yanchor="top", y=0.99, xanchor="left", x=0.01,
        bgcolor="rgba(255,255,255,0.95)", bordercolor="#dde1e7", borderwidth=1,
        font=dict(size=STYLE['legend_size'], family=STYLE['font_family']),
    ),
    map=dict(style=STYLE['map_style'], center=STYLE['map_center'], zoom=STYLE['map_zoom']),
)

save_html(fig_2025, 'cluster_map_2025.html')

[7.2] 2025 cluster map...
  Saved: cluster_map_2025.html


In [9]:
# ── Cluster Centroid Shifts ───────────────────────────────────────────────
print("[7.3] Cluster shifts map...")

shift_data = []
for idx_18, idx_25 in cluster_matches.items():
    lat1, lon1 = centers_2018[idx_18]
    lat2, lon2 = centers_2025[idx_25]
    shift_data.append({
        'idx_18': idx_18, 'idx_25': idx_25,
        'name_18': cluster_names_2018[idx_18],
        'name_25': cluster_names_2025[idx_25],
        'dist_km': haversine_km(lat1, lon1, lat2, lon2)
    })

max_shift = max(s['dist_km'] for s in shift_data)
min_shift = min(s['dist_km'] for s in shift_data)

fig_shift = go.Figure()

fig_shift.add_trace(go.Choroplethmap(
    geojson=taxi_zones_geo_4326,
    locations=all_zone_ids,
    featureidkey='properties.LocationID',
    z=[1] * len(all_zone_ids),
    colorscale=[[0, '#e8e8e8'], [1, '#e8e8e8']],
    marker_opacity=0.35, marker_line_width=0.3, marker_line_color='#ccc',
    showscale=False, hoverinfo='skip',
))

# Shift vectors
for sa in shift_data:
    lat1, lon1 = centers_2018[sa['idx_18']]
    lat2, lon2 = centers_2025[sa['idx_25']]
    if max_shift > min_shift:
        width = 1.5 + 3.5 * (sa['dist_km'] - min_shift) / (max_shift - min_shift)
    else:
        width = 3
    fig_shift.add_trace(go.Scattermap(
        lat=[lat1, lat2], lon=[lon1, lon2],
        mode='lines', line=dict(width=width, color='#333'),
        showlegend=False,
        hovertemplate=(
            f"<b>{sa['name_18']}</b> -> <b>{sa['name_25']}</b>"
            f"<br>Shift: {sa['dist_km']:.2f} km<extra></extra>"
        )
    ))

# 2018 centroids (circles)
for i in range(len(centers_2018)):
    fig_shift.add_trace(go.Scattermap(
        lat=[centers_2018[i, 0]], lon=[centers_2018[i, 1]],
        mode='markers',
        marker=dict(size=16, color=STYLE['cluster_colors'][i % len(STYLE['cluster_colors'])], opacity=0.9),
        name=f'2018: {cluster_names_2018[i]}',
        legendgroup='2018', legendgrouptitle_text='2018 Centroids',
        hovertemplate=f'<b>2018 Cluster {i}</b><br>{cluster_names_2018[i]}<extra></extra>',
    ))

# 2025 centroids (squares)
for i in range(len(centers_2025)):
    matched_18 = [k for k, v in cluster_matches.items() if v == i]
    color_idx = matched_18[0] if matched_18 else i
    fig_shift.add_trace(go.Scattermap(
        lat=[centers_2025[i, 0]], lon=[centers_2025[i, 1]],
        mode='markers',
        marker=dict(
            size=16,
            color=STYLE['cluster_colors'][color_idx % len(STYLE['cluster_colors'])],
            opacity=0.9, symbol='square',
        ),
        name=f'2025: {cluster_names_2025[i]}',
        legendgroup='2025', legendgrouptitle_text='2025 Centroids',
        hovertemplate=f'<b>2025 Cluster {i}</b><br>{cluster_names_2025[i]}<extra></extra>',
    ))

fig_shift.update_layout(
    **base_layout(height=750, width=1100, margin=STYLE['margin_map']),
    map_style=STYLE['map_style'],
    map_zoom=9.8,
    map_center=dict(lat=40.72, lon=-73.94),
    legend=dict(
        yanchor='top', y=0.99, xanchor='left', x=0.01,
        bgcolor='rgba(255,255,255,0.92)', bordercolor='#dde1e7',
        borderwidth=1, font=dict(size=10, family=STYLE['font_family']),
    ),
)

save_html(fig_shift, 'cluster_centroid_shifts.html')

[7.3] Cluster shifts map...
  Saved: cluster_centroid_shifts.html


---
## 8. Pickup Density Change

Zone-level change in pickup share (percentage points) between 2018 and 2025, capped at the 95th percentile of absolute values to prevent outlier zones from compressing the color range.

In [10]:
print("[8.1] Pickup density change...")

pu_counts_2018 = df_2018.groupby(['PU_zone_id', 'PU_zone_name', 'PU_borough']).size().reset_index(name='count_2018')
pu_counts_2025 = df_2025.groupby(['PU_zone_id', 'PU_zone_name', 'PU_borough']).size().reset_index(name='count_2025')
pu_counts_2018.columns = ['zone_id', 'zone_name', 'borough', 'count_2018']
pu_counts_2025.columns = ['zone_id', 'zone_name', 'borough', 'count_2025']

zone_comparison = pu_counts_2018.merge(
    pu_counts_2025, on=['zone_id', 'zone_name', 'borough'], how='outer'
).fillna(0)
zone_comparison['share_2018'] = 100 * zone_comparison['count_2018'] / zone_comparison['count_2018'].sum()
zone_comparison['share_2025'] = 100 * zone_comparison['count_2025'] / zone_comparison['count_2025'].sum()
zone_comparison['share_change'] = zone_comparison['share_2025'] - zone_comparison['share_2018']

zone_change = zone_comparison[['zone_id', 'zone_name', 'borough', 'share_change']].copy()
zone_change['zone_id'] = zone_change['zone_id'].astype(float).astype(int).astype(str)

zone_ids_change = set(zone_change['zone_id'].values)
filtered_geojson_change = {
    'type': 'FeatureCollection',
    'features': [f for f in taxi_zones_geo_4326['features']
                 if f['properties']['LocationID'] in zone_ids_change]
}

cap = zone_change['share_change'].abs().quantile(0.95)
zone_change['share_change_capped'] = zone_change['share_change'].clip(-cap, cap)

fig_pu_change = go.Figure()

fig_pu_change.add_trace(go.Choroplethmap(
    geojson=taxi_zones_geo_4326,
    locations=all_zone_ids,
    featureidkey='properties.LocationID',
    z=[0] * len(all_zone_ids),
    colorscale=[[0, '#e8e8e8'], [1, '#e8e8e8']],
    marker_opacity=0.4, marker_line_width=0.3, marker_line_color='#ccc',
    showscale=False, hoverinfo='skip',
))

fig_pu_change.add_trace(go.Choroplethmap(
    geojson=filtered_geojson_change,
    locations=zone_change['zone_id'],
    featureidkey='properties.LocationID',
    z=zone_change['share_change_capped'],
    colorscale='RdBu', zmid=0,
    marker_opacity=0.85,
    marker_line_width=0.5,
    marker_line_color='rgba(255,255,255,0.6)',
    colorbar=dict(
        title='Change (pp)', ticksuffix=' pp', x=0.99,
        titlefont=dict(family=STYLE['font_family']),
        tickfont=dict(family=STYLE['font_family']),
    ),
    customdata=np.column_stack([
        zone_change['zone_name'], zone_change['borough'], zone_change['share_change']
    ]),
    hovertemplate=(
        '<b>%{customdata[0]}</b> (%{customdata[1]})<br>'
        'Change: %{customdata[2]:.2f} pp<extra></extra>'
    ),
))

fig_pu_change.update_layout(
    **base_layout(height=700, width=1100, margin=STYLE['margin_map']),
    map_style=STYLE['map_style'],
    map_zoom=STYLE['map_zoom'],
    map_center=dict(**STYLE['map_center']),
)

save_html(fig_pu_change, 'pickup_density_change.html')

[8.1] Pickup density change...


ValueError: Invalid property specified for object of type plotly.graph_objs.choroplethmap.ColorBar: 'titlefont'

Did you mean "tickfont"?

    Valid properties:
        bgcolor
            Sets the color of padded area.
        bordercolor
            Sets the axis line color.
        borderwidth
            Sets the width (in px) or the border enclosing this
            color bar.
        dtick
            Sets the step in-between ticks on this axis. Use with
            `tick0`. Must be a positive number, or special strings
            available to "log" and "date" axes. If the axis `type`
            is "log", then ticks are set every 10^(n*dtick) where n
            is the tick number. For example, to set a tick mark at
            1, 10, 100, 1000, ... set dtick to 1. To set tick marks
            at 1, 100, 10000, ... set dtick to 2. To set tick marks
            at 1, 5, 25, 125, 625, 3125, ... set dtick to
            log_10(5), or 0.69897000433. "log" has several special
            values; "L<f>", where `f` is a positive number, gives
            ticks linearly spaced in value (but not position). For
            example `tick0` = 0.1, `dtick` = "L0.5" will put ticks
            at 0.1, 0.6, 1.1, 1.6 etc. To show powers of 10 plus
            small digits between, use "D1" (all digits) or "D2"
            (only 2 and 5). `tick0` is ignored for "D1" and "D2".
            If the axis `type` is "date", then you must convert the
            time to milliseconds. For example, to set the interval
            between ticks to one day, set `dtick` to 86400000.0.
            "date" also has special values "M<n>" gives ticks
            spaced by a number of months. `n` must be a positive
            integer. To set ticks on the 15th of every third month,
            set `tick0` to "2000-01-15" and `dtick` to "M3". To set
            ticks every 4 years, set `dtick` to "M48"
        exponentformat
            Determines a formatting rule for the tick exponents.
            For example, consider the number 1,000,000,000. If
            "none", it appears as 1,000,000,000. If "e", 1e+9. If
            "E", 1E+9. If "power", 1x10^9 (with 9 in a super
            script). If "SI", 1G. If "B", 1B. "SI" uses prefixes
            from "femto" f (10^-15) to "tera" T (10^12). *SI
            extended* covers instead the full SI range from
            "quecto" q (10^-30) to "quetta" Q (10^30). If "SI" or
            *SI extended* is used and the exponent is beyond the
            above ranges, the formatting rule will automatically be
            switched to the power notation.
        labelalias
            Replacement text for specific tick or hover labels. For
            example using {US: 'USA', CA: 'Canada'} changes US to
            USA and CA to Canada. The labels we would have shown
            must match the keys exactly, after adding any
            tickprefix or ticksuffix. For negative numbers the
            minus sign symbol used (U+2212) is wider than the
            regular ascii dash. That means you need to use −1
            instead of -1. labelalias can be used with any axis
            type, and both keys (if needed) and values (if desired)
            can include html-like tags or MathJax.
        len
            Sets the length of the color bar This measure excludes
            the padding of both ends. That is, the color bar length
            is this length minus the padding on both ends.
        lenmode
            Determines whether this color bar's length (i.e. the
            measure in the color variation direction) is set in
            units of plot "fraction" or in *pixels. Use `len` to
            set the value.
        minexponent
            Hide SI prefix for 10^n if |n| is below this number.
            This only has an effect when `tickformat` is "SI" or
            "B".
        nticks
            Specifies the maximum number of ticks for the
            particular axis. The actual number of ticks will be
            chosen automatically to be less than or equal to
            `nticks`. Has an effect only if `tickmode` is set to
            "auto".
        orientation
            Sets the orientation of the colorbar.
        outlinecolor
            Sets the axis line color.
        outlinewidth
            Sets the width (in px) of the axis line.
        separatethousands
            If "true", even 4-digit integers are separated
        showexponent
            If "all", all exponents are shown besides their
            significands. If "first", only the exponent of the
            first tick is shown. If "last", only the exponent of
            the last tick is shown. If "none", no exponents appear.
        showticklabels
            Determines whether or not the tick labels are drawn.
        showtickprefix
            If "all", all tick labels are displayed with a prefix.
            If "first", only the first tick is displayed with a
            prefix. If "last", only the last tick is displayed with
            a suffix. If "none", tick prefixes are hidden.
        showticksuffix
            Same as `showtickprefix` but for tick suffixes.
        thickness
            Sets the thickness of the color bar This measure
            excludes the size of the padding, ticks and labels.
        thicknessmode
            Determines whether this color bar's thickness (i.e. the
            measure in the constant color direction) is set in
            units of plot "fraction" or in "pixels". Use
            `thickness` to set the value.
        tick0
            Sets the placement of the first tick on this axis. Use
            with `dtick`. If the axis `type` is "log", then you
            must take the log of your starting tick (e.g. to set
            the starting tick to 100, set the `tick0` to 2) except
            when `dtick`=*L<f>* (see `dtick` for more info). If the
            axis `type` is "date", it should be a date string, like
            date data. If the axis `type` is "category", it should
            be a number, using the scale where each category is
            assigned a serial number from zero in the order it
            appears.
        tickangle
            Sets the angle of the tick labels with respect to the
            horizontal. For example, a `tickangle` of -90 draws the
            tick labels vertically.
        tickcolor
            Sets the tick color.
        tickfont
            Sets the color bar's tick label font
        tickformat
            Sets the tick label formatting rule using d3 formatting
            mini-languages which are very similar to those in
            Python. For numbers, see:
            https://github.com/d3/d3-format/tree/v1.4.5#d3-format.
            And for dates see: https://github.com/d3/d3-time-
            format/tree/v2.2.3#locale_format. We add two items to
            d3's date formatter: "%h" for half of the year as a
            decimal number as well as "%{n}f" for fractional
            seconds with n digits. For example, *2016-10-13
            09:15:23.456* with tickformat "%H~%M~%S.%2f" would
            display "09~15~23.46"
        tickformatstops
            A tuple of :class:`plotly.graph_objects.choroplethmap.c
            olorbar.Tickformatstop` instances or dicts with
            compatible properties
        tickformatstopdefaults
            When used in a template (as layout.template.data.chorop
            lethmap.colorbar.tickformatstopdefaults), sets the
            default property values to use for elements of
            choroplethmap.colorbar.tickformatstops
        ticklabeloverflow
            Determines how we handle tick labels that would
            overflow either the graph div or the domain of the
            axis. The default value for inside tick labels is *hide
            past domain*. In other cases the default is *hide past
            div*.
        ticklabelposition
            Determines where tick labels are drawn relative to the
            ticks. Left and right options are used when
            `orientation` is "h", top and bottom when `orientation`
            is "v".
        ticklabelstep
            Sets the spacing between tick labels as compared to the
            spacing between ticks. A value of 1 (default) means
            each tick gets a label. A value of 2 means shows every
            2nd label. A larger value n means only every nth tick
            is labeled. `tick0` determines which labels are shown.
            Not implemented for axes with `type` "log" or
            "multicategory", or when `tickmode` is "array".
        ticklen
            Sets the tick length (in px).
        tickmode
            Sets the tick mode for this axis. If "auto", the number
            of ticks is set via `nticks`. If "linear", the
            placement of the ticks is determined by a starting
            position `tick0` and a tick step `dtick` ("linear" is
            the default value if `tick0` and `dtick` are provided).
            If "array", the placement of the ticks is set via
            `tickvals` and the tick text is `ticktext`. ("array" is
            the default value if `tickvals` is provided).
        tickprefix
            Sets a tick label prefix.
        ticks
            Determines whether ticks are drawn or not. If "", this
            axis' ticks are not drawn. If "outside" ("inside"),
            this axis' are drawn outside (inside) the axis lines.
        ticksuffix
            Sets a tick label suffix.
        ticktext
            Sets the text displayed at the ticks position via
            `tickvals`. Only has an effect if `tickmode` is set to
            "array". Used with `tickvals`.
        ticktextsrc
            Sets the source reference on Chart Studio Cloud for
            `ticktext`.
        tickvals
            Sets the values at which ticks on this axis appear.
            Only has an effect if `tickmode` is set to "array".
            Used with `ticktext`.
        tickvalssrc
            Sets the source reference on Chart Studio Cloud for
            `tickvals`.
        tickwidth
            Sets the tick width (in px).
        title
            :class:`plotly.graph_objects.choroplethmap.colorbar.Tit
            le` instance or dict with compatible properties
        x
            Sets the x position with respect to `xref` of the color
            bar (in plot fraction). When `xref` is "paper",
            defaults to 1.02 when `orientation` is "v" and 0.5 when
            `orientation` is "h". When `xref` is "container",
            defaults to 1 when `orientation` is "v" and 0.5 when
            `orientation` is "h". Must be between 0 and 1 if `xref`
            is "container" and between "-2" and 3 if `xref` is
            "paper".
        xanchor
            Sets this color bar's horizontal position anchor. This
            anchor binds the `x` position to the "left", "center"
            or "right" of the color bar. Defaults to "left" when
            `orientation` is "v" and "center" when `orientation` is
            "h".
        xpad
            Sets the amount of padding (in px) along the x
            direction.
        xref
            Sets the container `x` refers to. "container" spans the
            entire `width` of the plot. "paper" refers to the width
            of the plotting area only.
        y
            Sets the y position with respect to `yref` of the color
            bar (in plot fraction). When `yref` is "paper",
            defaults to 0.5 when `orientation` is "v" and 1.02 when
            `orientation` is "h". When `yref` is "container",
            defaults to 0.5 when `orientation` is "v" and 1 when
            `orientation` is "h". Must be between 0 and 1 if `yref`
            is "container" and between "-2" and 3 if `yref` is
            "paper".
        yanchor
            Sets this color bar's vertical position anchor This
            anchor binds the `y` position to the "top", "middle" or
            "bottom" of the color bar. Defaults to "middle" when
            `orientation` is "v" and "bottom" when `orientation` is
            "h".
        ypad
            Sets the amount of padding (in px) along the y
            direction.
        yref
            Sets the container `y` refers to. "container" spans the
            entire `height` of the plot. "paper" refers to the
            height of the plotting area only.
        
Did you mean "tickfont"?

Bad property path:
titlefont
^^^^^^^^^

---
## 9. Temporal Analysis

Hourly and day-of-week demand profiles compared across years, both at the aggregate level and disaggregated by cluster.

In [None]:
# ── Hourly pickup profile ────────────────────────────────────────────────
print("[9.1] Hourly patterns...")

hourly_2018 = df_2018.groupby('hour').size()
hourly_2025 = df_2025.groupby('hour').size()
hourly_2018_pct = 100 * hourly_2018 / hourly_2018.sum()
hourly_2025_pct = 100 * hourly_2025 / hourly_2025.sum()

fig_hourly = go.Figure()
fig_hourly.add_trace(go.Scatter(
    x=hourly_2018_pct.index, y=hourly_2018_pct.values,
    name='2018', mode='lines+markers',
    line=dict(color=STYLE['year_2018'], width=2.5),
    marker=dict(size=6),
    hovertemplate='Hour %{x}: %{y:.1f}%<extra>2018</extra>',
))
fig_hourly.add_trace(go.Scatter(
    x=hourly_2025_pct.index, y=hourly_2025_pct.values,
    name='2025', mode='lines+markers',
    line=dict(color=STYLE['year_2025'], width=2.5),
    marker=dict(size=6),
    hovertemplate='Hour %{x}: %{y:.1f}%<extra>2025</extra>',
))

fig_hourly.update_layout(
    **base_layout(height=500, width=900),
    xaxis=styled_axis(title_text='Hour of Day'),
    yaxis=styled_axis(title_text='Share of Daily Trips (%)'),
    legend=dict(x=0.02, y=0.98, font=dict(family=STYLE['font_family'])),
)

save_html(fig_hourly, 'hourly_patterns.html')

In [None]:
# ── Hourly profile by cluster (2x3 subplots) ─────────────────────────────
print("[9.2] Cluster hourly profiles...")

fig_hourly_cluster = make_subplots(
    rows=2, cols=3,
    subplot_titles=[f"Cluster {i}: {short_labels_2018[i]}" for i in range(N_CLUSTERS)],
    shared_yaxes=True, shared_xaxes=True,
    vertical_spacing=0.12, horizontal_spacing=0.05,
)

for i in range(N_CLUSTERS):
    row = i // 3 + 1
    col = i % 3 + 1

    cluster_data_18 = df_2018[df_2018['PU_cluster'] == i]
    hourly_18 = cluster_data_18.groupby('hour').size()
    hourly_18_pct = 100 * hourly_18 / hourly_18.sum()

    matched_idx = cluster_matches[i]
    cluster_data_25 = df_2025[df_2025['PU_cluster'] == matched_idx]
    hourly_25 = cluster_data_25.groupby('hour').size()
    hourly_25_pct = 100 * hourly_25 / hourly_25.sum()

    fig_hourly_cluster.add_trace(go.Scatter(
        x=hourly_18_pct.index, y=hourly_18_pct.values,
        name='2018', mode='lines',
        line=dict(color=STYLE['year_2018'], width=2),
        showlegend=(i == 0),
        hovertemplate='Hour %{x}: %{y:.1f}%<extra>2018</extra>',
    ), row=row, col=col)

    fig_hourly_cluster.add_trace(go.Scatter(
        x=hourly_25_pct.index, y=hourly_25_pct.values,
        name='2025', mode='lines',
        line=dict(color=STYLE['year_2025'], width=2),
        showlegend=(i == 0),
        hovertemplate='Hour %{x}: %{y:.1f}%<extra>2025</extra>',
    ), row=row, col=col)

fig_hourly_cluster.update_layout(
    **base_layout(height=500, width=1100),
    legend=dict(x=0.92, y=0.98, font=dict(family=STYLE['font_family'])),
)
fig_hourly_cluster.update_xaxes(title_text='Hour', row=2)
fig_hourly_cluster.update_yaxes(title_text='% of cluster trips', col=1)

save_html(fig_hourly_cluster, 'cluster_hourly_profiles.html')

In [None]:
# ── Demand heatmap: Hour x Day (2018 | 2025 | Diff) ─────────────────────
print("[9.3] Demand heatmaps...")

day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

fig_heat = make_subplots(
    rows=1, cols=3,
    subplot_titles=('2018', '2025', '2025 minus 2018'),
    horizontal_spacing=0.06,
)

pivot_tables = {}
for label, df_year, col_idx in [('2018', df_2018, 1), ('2025', df_2025, 2)]:
    pivot = df_year.groupby(['day_name', 'hour']).size().reset_index(name='trips')
    pivot_table = pivot.pivot(index='day_name', columns='hour', values='trips').reindex(day_order)
    pivot_pct = 100 * pivot_table / pivot_table.sum().sum()
    pivot_tables[label] = pivot_pct

    fig_heat.add_trace(go.Heatmap(
        z=pivot_pct.values, x=pivot_pct.columns, y=pivot_pct.index,
        colorscale='Viridis', showscale=(col_idx == 2),
        colorbar=dict(
            title='% of Total', x=0.63,
            titlefont=dict(family=STYLE['font_family']),
            tickfont=dict(family=STYLE['font_family']),
        ) if col_idx == 2 else None,
        hovertemplate='<b>%{y}, Hour %{x}</b><br>Share: %{z:.2f}%<extra></extra>'
    ), row=1, col=col_idx)

diff = pivot_tables['2025'].values - pivot_tables['2018'].values
fig_heat.add_trace(go.Heatmap(
    z=diff, x=pivot_tables['2018'].columns, y=day_order,
    colorscale='RdBu', zmid=0, showscale=True,
    colorbar=dict(
        title='Change (pp)', x=1.0,
        titlefont=dict(family=STYLE['font_family']),
        tickfont=dict(family=STYLE['font_family']),
    ),
    hovertemplate='<b>%{y}, Hour %{x}</b><br>Change: %{z:.3f} pp<extra></extra>'
), row=1, col=3)

fig_heat.update_layout(**base_layout(height=450, width=1200))

save_html(fig_heat, 'demand_heatmaps.html')

---
## 10. Borough Distribution and Spatial Concentration

Pickup counts by borough (grouped bar) and Lorenz curves measuring how concentrated demand is across zones, with Gini coefficients quantifying the degree of inequality.

In [None]:
# ── Borough pickup distribution ──────────────────────────────────────────
print("[10.1] Borough analysis...")

borough_df_2018 = df_2018.groupby('PU_borough').size().reset_index(name='trips_2018')
borough_df_2025 = df_2025.groupby('PU_borough').size().reset_index(name='trips_2025')
borough_df = borough_df_2018.merge(borough_df_2025, on='PU_borough')

fig_borough = go.Figure()
fig_borough.add_trace(go.Bar(
    x=borough_df['PU_borough'], y=borough_df['trips_2018'],
    name='2018', marker_color=STYLE['year_2018'],
))
fig_borough.add_trace(go.Bar(
    x=borough_df['PU_borough'], y=borough_df['trips_2025'],
    name='2025', marker_color=STYLE['year_2025'],
))

fig_borough.update_layout(
    **base_layout(height=500, width=900),
    xaxis=styled_axis(title_text='Borough'),
    yaxis=styled_axis(title_text='Number of Trips'),
    barmode='group',
    legend=dict(x=0.85, y=0.98, font=dict(family=STYLE['font_family'])),
)

save_html(fig_borough, 'borough_analysis.html')

In [None]:
# ── Lorenz curve ─────────────────────────────────────────────────────────
print("[10.2] Lorenz curve...")

cz_18, ct_18 = lorenz_data(df_2018, 'PU_zone_id')
cz_25, ct_25 = lorenz_data(df_2025, 'PU_zone_id')
gini_18 = gini_from_lorenz(cz_18, ct_18)
gini_25 = gini_from_lorenz(cz_25, ct_25)

fig_lorenz = go.Figure()
fig_lorenz.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1], mode='lines',
    line=dict(color='#888', width=1.5, dash='dash'),
    name='Perfect equality', hoverinfo='skip'
))
fig_lorenz.add_trace(go.Scatter(
    x=np.concatenate([[0], cz_18]), y=np.concatenate([[0], ct_18]),
    mode='lines', line=dict(color=STYLE['year_2018'], width=2.5),
    name=f'2018 (Gini = {gini_18:.3f})',
    hovertemplate='%{x:.0%} of zones -> %{y:.0%} of trips<extra>2018</extra>'
))
fig_lorenz.add_trace(go.Scatter(
    x=np.concatenate([[0], cz_25]), y=np.concatenate([[0], ct_25]),
    mode='lines', line=dict(color=STYLE['year_2025'], width=2.5),
    name=f'2025 (Gini = {gini_25:.3f})',
    hovertemplate='%{x:.0%} of zones -> %{y:.0%} of trips<extra>2025</extra>'
))

fig_lorenz.update_layout(
    **base_layout(height=600, width=850),
    xaxis=styled_axis(title_text='Cumulative Share of Zones', range=[0, 1], tickformat='.0%'),
    yaxis=styled_axis(title_text='Cumulative Share of Trips', range=[0, 1], tickformat='.0%'),
    legend=dict(x=0.05, y=0.95, font=dict(family=STYLE['font_family'])),
)

save_html(fig_lorenz, 'lorenz_curve.html')

---
## 11. Intra-Cluster Trip Share

Fraction of trips where pickup and dropoff fall within the same geographic cluster, indicating the degree of within-area versus cross-area travel.

In [None]:
print("[11.1] Intra-cluster share...")

intra_2018 = df_2018.dropna(subset=['PU_cluster', 'DO_cluster'])
intra_2018_count = (intra_2018['PU_cluster'] == intra_2018['DO_cluster']).sum()
intra_2018_pct = 100 * intra_2018_count / len(intra_2018)

intra_2025 = df_2025.dropna(subset=['PU_cluster', 'DO_cluster'])
intra_2025_count = (intra_2025['PU_cluster'] == intra_2025['DO_cluster']).sum()
intra_2025_pct = 100 * intra_2025_count / len(intra_2025)

fig_intra = go.Figure()
fig_intra.add_trace(go.Bar(
    x=['2018', '2025'],
    y=[intra_2018_pct, intra_2025_pct],
    marker_color=[STYLE['year_2018'], STYLE['year_2025']],
    text=[f'{intra_2018_pct:.1f}%', f'{intra_2025_pct:.1f}%'],
    textposition='outside',
    width=0.4,
))

fig_intra.update_layout(
    **base_layout(height=500, width=900, margin=dict(l=80, r=80, t=20, b=50)),
    xaxis=styled_axis(range=[-0.5, 1.5]),
    yaxis=styled_axis(title_text='% of Trips (Same Cluster PU and DO)'),
    showlegend=False,
)

save_html(fig_intra, 'intra_cluster_share.html')

---
## 12. Spatial Autocorrelation: LISA Maps

Local Indicators of Spatial Association (Moran's Local I) identify statistically significant clusters and outliers in the distribution of trip share across taxi zones. Hot spots (High-High) indicate zones with high demand surrounded by high-demand neighbors; cold spots (Low-Low) indicate the opposite. The spatial weights matrix uses K=6 nearest neighbors with row standardization. Global Moran's I is reported as an annotation.

In [None]:
# ── Pickup/dropoff mismatch statistics ───────────────────────────────────
print("[12.0] Mismatch ratio statistics...")

mr_2018 = mismatch_ratios(df_2018)
mr_2025 = mismatch_ratios(df_2025)

ks_stat, ks_p = ks_2samp(mr_2018['ratio'].dropna(), mr_2025['ratio'].dropna())
lev_stat, lev_p = levene(mr_2018['ratio'].dropna(), mr_2025['ratio'].dropna())

print(f"  KS test: stat={ks_stat:.4f}, p={ks_p:.4e}")
print(f"  Levene test: stat={lev_stat:.4f}, p={lev_p:.4e}")

In [None]:
# ── LISA cluster maps ────────────────────────────────────────────────────
print("[12.1] LISA cluster maps...")

w = libpysal.weights.KNN.from_dataframe(gdf_raw, k=6)
w.transform = 'r'

for year_label, df_year in [('2018', df_2018), ('2025', df_2025)]:
    zone_counts = df_year.groupby('PU_zone_id').size()
    gdf_tmp = gdf_raw.copy()
    gdf_tmp['LocationID_int'] = gdf_tmp['LocationID'].astype(int)
    gdf_tmp['trips'] = gdf_tmp['LocationID_int'].map(zone_counts).fillna(0)
    gdf_tmp['trip_share'] = 100 * gdf_tmp['trips'] / gdf_tmp['trips'].sum()

    lisa = Moran_Local(gdf_tmp['trip_share'].values, w, permutations=999)
    mi_global = Moran(gdf_tmp['trip_share'].values, w)

    sig = lisa.p_sim < 0.05
    quadrant_map = {1: 'HH', 2: 'LH', 3: 'LL', 4: 'HL'}
    gdf_tmp['lisa_class'] = 'ns'
    for idx in range(len(gdf_tmp)):
        if sig[idx]:
            gdf_tmp.iloc[idx, gdf_tmp.columns.get_loc('lisa_class')] = \
                quadrant_map.get(lisa.q[idx], 'ns')

    gdf_tmp['zone_id_str'] = gdf_tmp['LocationID_int'].astype(str)
    gdf_tmp['color'] = gdf_tmp['lisa_class'].map(STYLE['lisa_colors'])

    zc_lookup = zone_centroids.drop_duplicates(subset='zone_id').set_index('zone_id')
    gdf_tmp['zone_name'] = gdf_tmp['LocationID_int'].map(zc_lookup['zone_name'])
    gdf_tmp['borough_name'] = gdf_tmp['LocationID_int'].map(zc_lookup['borough'])

    geojson_all = json.loads(gdf_tmp.to_json())
    for feat, zid, lclass in zip(
        geojson_all['features'],
        gdf_tmp['zone_id_str'],
        gdf_tmp['lisa_class']
    ):
        feat['properties']['zone_id_str'] = zid
        feat['properties']['lisa_class'] = lclass

    fig_lisa = go.Figure()

    fig_lisa.add_trace(go.Choroplethmap(
        geojson=taxi_zones_geo_4326,
        locations=all_zone_ids,
        featureidkey='properties.LocationID',
        z=[1] * len(all_zone_ids),
        colorscale=[[0, '#f5f5f5'], [1, '#f5f5f5']],
        marker_opacity=0.3, marker_line_width=0.3, marker_line_color='#ccc',
        showscale=False, hoverinfo='skip',
    ))

    for lclass in ['HH', 'LL', 'HL', 'LH', 'ns']:
        subset = gdf_tmp[gdf_tmp['lisa_class'] == lclass]
        if len(subset) == 0:
            continue

        subset_geojson = {
            'type': 'FeatureCollection',
            'features': [f for f in geojson_all['features']
                         if f['properties']['lisa_class'] == lclass]
        }

        fig_lisa.add_trace(go.Choroplethmap(
            geojson=subset_geojson,
            locations=subset['zone_id_str'].values,
            featureidkey='properties.zone_id_str',
            z=[1] * len(subset),
            colorscale=[[0, STYLE['lisa_colors'][lclass]], [1, STYLE['lisa_colors'][lclass]]],
            marker_opacity=0.8 if lclass != 'ns' else 0.35,
            marker_line_width=0.5,
            marker_line_color='rgba(255,255,255,0.6)',
            showscale=False,
            name=STYLE['lisa_labels'][lclass],
            showlegend=True,
            customdata=np.column_stack([
                subset['zone_name'].fillna('Unknown').values,
                subset['borough_name'].fillna('Unknown').values,
                subset['trip_share'].values,
            ]),
            hovertemplate=(
                '<b>%{customdata[0]}</b> (%{customdata[1]})<br>'
                'Trip share: %{customdata[2]:.2f}%<br>'
                f'LISA: {STYLE["lisa_labels"][lclass]}'
                '<extra></extra>'
            ),
        ))

    fig_lisa.update_layout(
        **base_layout(height=700, width=1100, margin=STYLE['margin_map']),
        map_style=STYLE['map_style'],
        map_zoom=STYLE['map_zoom'],
        map_center=dict(**STYLE['map_center']),
        legend=dict(
            title="LISA Classification",
            yanchor='top', y=0.99, xanchor='left', x=0.01,
            bgcolor='rgba(255,255,255,0.95)', bordercolor='#dde1e7',
            borderwidth=1, font=dict(size=STYLE['legend_size'], family=STYLE['font_family']),
        ),
        annotations=[dict(
            x=0.99, y=0.01, xref='paper', yref='paper',
            xanchor='right', yanchor='bottom',
            text=f"Global Moran's I: {mi_global.I:.4f} (p={mi_global.p_sim:.4f})",
            showarrow=False, font=dict(size=11, family=STYLE['font_family']),
            bgcolor='rgba(255,255,255,0.9)', bordercolor='#ddd', borderwidth=1,
        )]
    )

    save_html(fig_lisa, f'lisa_map_{year_label}.html')

---
## 13. Summary

In [None]:
matched_shifts = [
    haversine_km(
        centers_2018[i18, 0], centers_2018[i18, 1],
        centers_2025[i25, 0], centers_2025[i25, 1]
    )
    for i18, i25 in cluster_matches.items()
]
avg_shift = np.mean(matched_shifts)

print("=" * 70)
print("ANALYSIS COMPLETE")
print("=" * 70)
print(f"  Market share: {100 * uber_count_2018 / total_2018:.1f}% -> {100 * uber_count_2025 / total_2025:.1f}%")
print(f"  Gini: {gini_18:.3f} -> {gini_25:.3f}")
print(f"  Avg cluster shift: {avg_shift:.2f} km")
print(f"  Intra-cluster trips: {intra_2018_pct:.1f}% -> {intra_2025_pct:.1f}%")
print(f"  KS mismatch test: stat={ks_stat:.4f}, p={ks_p:.4e}")
print(f"  Levene mismatch test: stat={lev_stat:.4f}, p={lev_p:.4e}")
print("=" * 70)

print(f"\nOutputs saved to: {OUTPUT_DIR}")
print("  Charts:")
for fname in [
    'cluster_map_2018.html',
    'cluster_map_2025.html',
    'cluster_centroid_shifts.html',
    'pickup_density_change.html',
    'hourly_patterns.html',
    'cluster_hourly_profiles.html',
    'demand_heatmaps.html',
    'borough_analysis.html',
    'lorenz_curve.html',
    'intra_cluster_share.html',
    'lisa_map_2018.html',
    'lisa_map_2025.html',
]:
    print(f"    {fname}")