In [24]:
"""
Generate zone_centroids.csv from the TLC taxi zones shapefile.
Computes the centroid of each zone polygon in WGS84 coordinates.
"""

import geopandas as gpd

PATH_SHAPEFILE = '/Users/leoss/Desktop/Portfolio/Website-/uber_visualizations/data/taxi_zones/taxi_zones.shp'
OUTPUT_PATH = '/Users/leoss/Desktop/Portfolio/Website-/uber_visualizations/data/zone_centroids.csv'

gdf = gpd.read_file(PATH_SHAPEFILE)
gdf = gdf.to_crs(epsg=4326)
centroids_proj = gdf.geometry.centroid
centroids_wgs84 = gpd.GeoSeries(centroids_proj, crs=gdf.crs).to_crs(epsg=4326)

gdf['latitude'] = centroids_wgs84.y
gdf['longitude'] = centroids_wgs84.x


centroids = gdf[['LocationID', 'zone', 'borough', 'latitude', 'longitude']].copy()
centroids.columns = ['zone_id', 'zone_name', 'borough', 'latitude', 'longitude']
centroids['zone_id'] = centroids['zone_id'].astype(int)
centroids = centroids.sort_values('zone_id').reset_index(drop=True)

centroids.to_csv(OUTPUT_PATH, index=False)

print(f"Saved {len(centroids)} zone centroids to {OUTPUT_PATH}")
print(f"\nSample:")
print(centroids.head(10).to_string(index=False))
print(f"\nEWR check (zone_id=1):")
print(centroids[centroids['zone_id'] == 1].to_string(index=False))

Saved 263 zone centroids to /Users/leoss/Desktop/Portfolio/Website-/uber_visualizations/data/zone_centroids.csv

Sample:
 zone_id               zone_name       borough  latitude  longitude
       1          Newark Airport           EWR 40.691831 -74.174000
       2             Jamaica Bay        Queens 40.616745 -73.831299
       3 Allerton/Pelham Gardens         Bronx 40.864474 -73.847422
       4           Alphabet City     Manhattan 40.723752 -73.976968
       5           Arden Heights Staten Island 40.552659 -74.188484
       6 Arrochar/Fort Wadsworth Staten Island 40.600324 -74.071771
       7                 Astoria        Queens 40.761493 -73.919694
       8            Astoria Park        Queens 40.778559 -73.923086
       9              Auburndale        Queens 40.751035 -73.787949
      10            Baisley Park        Queens 40.678953 -73.790986

EWR check (zone_id=1):
 zone_id      zone_name borough  latitude  longitude
       1 Newark Airport     EWR 40.691831    -74.174



Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.




In [25]:
"""
UBER GEOGRAPHIC EVOLUTION: COMPREHENSIVE ANALYSIS
- Clusters matched by geographic proximity (not index)
- Consistent naming based on nearest major zone
- Methodologically sound comparisons
"""

import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from math import radians, cos, sin, asin, sqrt
import geopandas as gpd
import json
import gc

print("=" * 70)
print("UBER COMPREHENSIVE ANALYSIS: 2018 → 2025")
print("=" * 70)

# ============================================================================
# CONFIGURATION
# ============================================================================
SAMPLE_SIZE = 20_000_000
N_CLUSTERS = 6

OUTPUT_DIR = '/Users/leoss/Desktop/Portfolio/Website-/uber_visualizations/outputs/'
PATH_2018 = '/Users/leoss/Desktop/Portfolio/Website-/uber_visualizations/data/fhv_tripdata_2018-01.parquet'
PATH_2025 = '/Users/leoss/Desktop/Portfolio/Website-/uber_visualizations/data/fhvhv_tripdata_2025-01.parquet'
PATH_CENTROIDS = '/Users/leoss/Desktop/Portfolio/Website-/uber_visualizations/data/zone_centroids.csv'
PATH_SHAPEFILE = '/Users/leoss/Desktop/Portfolio/Website-/uber_visualizations/data/taxi_zones/taxi_zones.shp'

UBER_2018_BASES = ['B02512', 'B02598', 'B02617', 'B02682', 'B02764', 'B02765', 'B02835', 'B02836']
UBER_2025_LICENSE = 'HV0003'

# Cluster color palette (shared across all cluster visualizations)

CLUSTER_COLORS = ['#e6194b', '#3cb44b', '#4363d8', '#f58231', '#911eb4', '#42d4f4']


# Airport zone IDs (TLC)
AIRPORT_ZONE_IDS = {132, 138, 1}
AIRPORT_LABELS = {132: 'JFK', 138: 'LaGuardia', 1: 'Newark (EWR)'}

print(f"\nConfiguration:")
print(f"  Sample size: {SAMPLE_SIZE:,} trips per year")
print(f"  Clusters: {N_CLUSTERS}")

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def haversine_km(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points in km."""
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    return 2 * 6371 * asin(sqrt(a))


def name_cluster_by_location(center_lat, center_lon, zone_centroids):
    """Name cluster based on nearest major zone."""
    zone_distances = np.sqrt(
        (zone_centroids['latitude'] - center_lat) ** 2 +
        (zone_centroids['longitude'] - center_lon) ** 2
    )
    nearest_idx = zone_distances.idxmin()
    nearest_zone = zone_centroids.iloc[nearest_idx]
    return f"{nearest_zone['borough']}: {nearest_zone['zone_name']}"


def match_clusters_by_proximity(centers_2018, centers_2025):
    """Match 2018 to 2025 clusters by geographic proximity (greedy)."""
    distances = cdist(centers_2018, centers_2025, metric='cityblock')
    matches = {}
    used_2025 = set()

    pairs = []
    for i in range(len(centers_2018)):
        for j in range(len(centers_2025)):
            pairs.append((i, j, distances[i, j]))

    pairs.sort(key=lambda x: x[2])

    for i, j, dist in pairs:
        if i not in matches and j not in used_2025:
            matches[i] = j
            used_2025.add(j)

    return matches


def lorenz_data(df, zone_col='zone_id'):
    """Return (cumulative share of zones, cumulative share of trips)."""
    zone_counts = df.groupby(zone_col).size().sort_values().values
    cum_zones = np.arange(1, len(zone_counts) + 1) / len(zone_counts)
    cum_trips = np.cumsum(zone_counts) / zone_counts.sum()
    return cum_zones, cum_trips


def gini_from_lorenz(cum_zones, cum_trips):
    """Gini coefficient via trapezoidal integration under the Lorenz curve."""
    area_under = np.trapz(cum_trips, cum_zones)
    return 1 - 2 * area_under


# ============================================================================
# LOAD ZONE CENTROIDS & SHAPEFILE
# ============================================================================
print("\n" + "=" * 70)
print("LOADING ZONE CENTROIDS & SHAPEFILE")
print("=" * 70)

zone_centroids = pd.read_csv(PATH_CENTROIDS)
print(f"  Loaded {len(zone_centroids)} zone centroids")

gdf_raw = gpd.read_file(PATH_SHAPEFILE)
gdf_raw = gdf_raw.to_crs(epsg=4326)
taxi_zones_geo_4326 = json.loads(gdf_raw.to_json())

for f in taxi_zones_geo_4326['features']:
    f['properties']['LocationID'] = str(int(f['properties']['LocationID']))

all_zone_ids = [f['properties']['LocationID'] for f in taxi_zones_geo_4326['features']]
print(f"  Loaded {len(all_zone_ids)} taxi zone geometries")

# ============================================================================
# PART 1: 2018 UBER DATA
# ============================================================================
print("\n" + "=" * 70)
print("PART 1: 2018 UBER ANALYSIS")
print("=" * 70)

print("\n[1.1] Loading 2018 data...")
table_2018 = pq.read_table(PATH_2018, columns=[])
total_2018 = table_2018.num_rows
print(f"  Total rows: {total_2018:,}")

columns_2018 = ['pickup_datetime', 'PUlocationID', 'dispatching_base_num']
table_2018 = pq.read_table(PATH_2018, columns=columns_2018)

df_2018_full = table_2018.to_pandas()
df_2018_full = df_2018_full[df_2018_full['dispatching_base_num'].isin(UBER_2018_BASES)].copy()

uber_count_2018 = len(df_2018_full)
print(f"  Uber trips: {uber_count_2018:,} ({100 * uber_count_2018 / total_2018:.1f}%)")

df_2018 = df_2018_full.sample(n=min(SAMPLE_SIZE, uber_count_2018), random_state=42)
del df_2018_full, table_2018
gc.collect()

print("\n[1.2] Processing temporal and geographic features...")
df_2018['pickup_datetime'] = pd.to_datetime(df_2018['pickup_datetime'])
df_2018['hour'] = df_2018['pickup_datetime'].dt.hour
df_2018['day_of_week'] = df_2018['pickup_datetime'].dt.dayofweek
df_2018['day_name'] = df_2018['pickup_datetime'].dt.day_name()

df_2018 = df_2018.dropna(subset=['PUlocationID'])
df_2018['PUlocationID'] = df_2018['PUlocationID'].astype(int)
df_2018 = df_2018.merge(
    zone_centroids[['zone_id', 'zone_name', 'borough', 'latitude', 'longitude']],
    left_on='PUlocationID',
    right_on='zone_id',
    how='left'
)
df_2018 = df_2018.dropna(subset=['latitude', 'longitude'])

print("\n[1.3] Clustering on geographic coordinates...")
coords_2018 = df_2018[['latitude', 'longitude']].values
kmeans_2018 = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df_2018['cluster'] = kmeans_2018.fit_predict(coords_2018)

cluster_names_2018 = {}
for i in range(N_CLUSTERS):
    lat, lon = kmeans_2018.cluster_centers_[i]
    cluster_names_2018[i] = name_cluster_by_location(lat, lon, zone_centroids)

df_2018['cluster_name'] = df_2018['cluster'].map(cluster_names_2018)

cluster_counts_2018 = df_2018['cluster'].value_counts().sort_index()
print("  Cluster distribution:")
for i, count in enumerate(cluster_counts_2018):
    pct = 100 * count / len(df_2018)
    print(f"    Cluster {i} - {cluster_names_2018[i]}: {count:>7,} ({pct:>5.1f}%)")

# ============================================================================
# PART 2: 2025 UBER DATA
# ============================================================================
print("\n" + "=" * 70)
print("PART 2: 2025 UBER ANALYSIS")
print("=" * 70)

print("\n[2.1] Loading 2025 data...")
table_2025 = pq.read_table(PATH_2025, columns=[])
total_2025 = table_2025.num_rows
print(f"  Total rows: {total_2025:,}")

columns_2025 = ['pickup_datetime', 'PULocationID', 'hvfhs_license_num']
table_2025 = pq.read_table(PATH_2025, columns=columns_2025)

df_2025_full = table_2025.to_pandas()
df_2025_full = df_2025_full[df_2025_full['hvfhs_license_num'] == UBER_2025_LICENSE].copy()

uber_count_2025 = len(df_2025_full)
print(f"  Uber trips: {uber_count_2025:,} ({100 * uber_count_2025 / total_2025:.1f}%)")

df_2025 = df_2025_full.sample(n=min(SAMPLE_SIZE, uber_count_2025), random_state=42)
del df_2025_full, table_2025
gc.collect()

UBER COMPREHENSIVE ANALYSIS: 2018 → 2025

Configuration:
  Sample size: 20,000,000 trips per year
  Clusters: 6

LOADING ZONE CENTROIDS & SHAPEFILE
  Loaded 263 zone centroids
  Loaded 263 taxi zone geometries

PART 1: 2018 UBER ANALYSIS

[1.1] Loading 2018 data...
  Total rows: 19,808,094
  Uber trips: 4,502,999 (22.7%)

[1.2] Processing temporal and geographic features...

[1.3] Clustering on geographic coordinates...
  Cluster distribution:
    Cluster 0 - Manhattan: Yorkville East: 765,644 ( 17.0%)
    Cluster 1 - Brooklyn: Borough Park: 388,895 (  8.6%)
    Cluster 2 - Queens: Jamaica: 369,831 (  8.2%)
    Cluster 3 - Manhattan: Gramercy: 1,882,965 ( 41.7%)
    Cluster 4 - Bronx: East Tremont: 477,419 ( 10.6%)
    Cluster 5 - Brooklyn: Ocean Hill: 626,327 ( 13.9%)

PART 2: 2025 UBER ANALYSIS

[2.1] Loading 2025 data...
  Total rows: 20,405,666
  Uber trips: 15,356,455 (75.3%)


33

In [None]:
print("\n[2.2] Processing temporal and geographic features...")
df_2025['pickup_datetime'] = pd.to_datetime(df_2025['pickup_datetime'])
df_2025['hour'] = df_2025['pickup_datetime'].dt.hour
df_2025['day_of_week'] = df_2025['pickup_datetime'].dt.dayofweek
df_2025['day_name'] = df_2025['pickup_datetime'].dt.day_name()

df_2025 = df_2025.dropna(subset=['PULocationID'])
df_2025['PULocationID'] = df_2025['PULocationID'].astype(int)
df_2025 = df_2025.merge(
    zone_centroids[['zone_id', 'zone_name', 'borough', 'latitude', 'longitude']],
    left_on='PULocationID',
    right_on='zone_id',
    how='left'
)
df_2025 = df_2025.dropna(subset=['latitude', 'longitude'])

print("\n[2.3] Clustering on geographic coordinates...")
coords_2025 = df_2025[['latitude', 'longitude']].values
kmeans_2025 = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df_2025['cluster'] = kmeans_2025.fit_predict(coords_2025)

cluster_names_2025 = {}
for i in range(N_CLUSTERS):
    lat, lon = kmeans_2025.cluster_centers_[i]
    cluster_names_2025[i] = name_cluster_by_location(lat, lon, zone_centroids)

df_2025['cluster_name'] = df_2025['cluster'].map(cluster_names_2025)

cluster_counts_2025 = df_2025['cluster'].value_counts().sort_index()
print("  Cluster distribution:")
for i, count in enumerate(cluster_counts_2025):
    pct = 100 * count / len(df_2025)
    print(f"    Cluster {i} - {cluster_names_2025[i]}: {count:>7,} ({pct:>5.1f}%)")

# ============================================================================
# MATCH CLUSTERS
# ============================================================================
print("\n" + "=" * 70)
print("MATCHING CLUSTERS BY GEOGRAPHIC PROXIMITY")
print("=" * 70)

centers_2018 = kmeans_2018.cluster_centers_
centers_2025 = kmeans_2025.cluster_centers_

cluster_matches = match_clusters_by_proximity(centers_2018, centers_2025)

print(f"\n  {'2018 Cluster':<55} → {'2025 Cluster':<55} {'Shift (km)':>10}")
print("  " + "-" * 125)

for idx_2018, idx_2025 in sorted(cluster_matches.items()):
    lat1, lon1 = centers_2018[idx_2018]
    lat2, lon2 = centers_2025[idx_2025]
    dist = haversine_km(lat1, lon1, lat2, lon2)
    print(f"  {cluster_names_2018[idx_2018]:<55} → {cluster_names_2025[idx_2025]:<55} {dist:>10.2f}")


cluster_color_map_2018 = {
    cluster_names_2018[i]: CLUSTER_COLORS[i % len(CLUSTER_COLORS)]
    for i in range(N_CLUSTERS)
}

cluster_color_map_2025 = {
    cluster_names_2025[idx_25]: CLUSTER_COLORS[idx_18 % len(CLUSTER_COLORS)]
    for idx_18, idx_25 in cluster_matches.items()
}

# ============================================================================
# PART 3: VISUALIZATIONS
# ============================================================================
print("\n" + "=" * 70)
print("PART 3: CREATING VISUALIZATIONS")
print("=" * 70)

# --- 3.1a: 2018 Cluster Choropleth Map ---
print("\n[3.1a] 2018 cluster choropleth map...")

zone_clusters = df_2018.groupby('zone_id').agg({
    'cluster': lambda x: x.mode()[0],
    'cluster_name': lambda x: x.mode()[0],
    'zone_name': 'first',
    'borough': 'first'
}).reset_index()
zone_clusters['zone_id'] = zone_clusters['zone_id'].astype(float).astype(int).astype(str)

zone_ids_with_data = set(zone_clusters['zone_id'].values)
filtered_geojson = {
    'type': 'FeatureCollection',
    'features': [f for f in taxi_zones_geo_4326['features']
                 if f['properties']['LocationID'] in zone_ids_with_data]
}
print(f"  Filtered to {len(filtered_geojson['features'])} zones with data")

# Build figure: grey background FIRST, then choropleth on top
fig = go.Figure()

# Grey background — all taxi zones
fig.add_trace(go.Choroplethmap(
    geojson=taxi_zones_geo_4326,
    locations=all_zone_ids,
    featureidkey='properties.LocationID',
    z=[1] * len(all_zone_ids),
    colorscale=[[0, '#e8e8e8'], [1, '#e8e8e8']],
    marker_opacity=0.4,
    marker_line_width=0.3,
    marker_line_color='#ccc',
    showscale=False,
    hoverinfo='skip',
))
# 2018 map
fig_tmp = px.choropleth_map(
    zone_clusters,
    geojson=filtered_geojson,
    locations='zone_id',
    featureidkey='properties.LocationID',
    color='cluster_name',
    color_discrete_map=cluster_color_map_2018,  # ← replaces color_discrete_sequence
    map_style='carto-positron-nolabels',
    zoom=9.5,
    center={'lat': 40.7128, 'lon': -73.9352},
    opacity=0.95,
    hover_data={'zone_id': False, 'zone_name': True, 'borough': True, 'cluster_name': True},
    labels={'cluster_name': 'Cluster'}
)


fig_tmp.update_traces(marker=dict(line=dict(width=0.8, color='rgba(255,255,255,0.8)')))
for trace in fig_tmp.data:
    fig.add_trace(trace)

# Centroid markers
for i, (lat, lon) in enumerate(centers_2018):
    name = cluster_names_2018[i]
    fig.add_trace(go.Scattermap(
        lat=[lat], lon=[lon],
        mode='markers+text',
        marker=dict(size=20, color='black', opacity=0.85),
        text=str(i),
        textfont=dict(size=11, color='white', family='Arial Black'),
        textposition='middle center',
        name=name,
        showlegend=False,
        hovertemplate=f'<b>Cluster {i}</b><br>{name}<extra></extra>',
    ))

fig.update_layout(
    title='2018 Uber: Geographic Demand Clusters',
    legend=dict(
        title="Demand Clusters",
        yanchor="top", y=0.99, xanchor="left", x=0.01,
        bgcolor="rgba(255,255,255,0.95)",
        bordercolor="#dde1e7", borderwidth=1, font=dict(size=12),
    ),
    margin=dict(l=0, r=0, t=50, b=0),
    map=dict(
        style='carto-positron-nolabels',
        center={'lat': 40.7128, 'lon': -73.9352},
        zoom=9.5,
    ),
    height=700, width=1200,
)

fig.write_html(OUTPUT_DIR + '1_uber_2018_clusters.html')
fig.show()
print("  ✓ Saved: 1_uber_2018_clusters.html")

# --- 3.1b: 2025 Cluster Choropleth Map ---
print("\n[3.1b] 2025 cluster choropleth map...")

zone_clusters_2025 = df_2025.groupby('zone_id').agg({
    'cluster': lambda x: x.mode()[0],
    'cluster_name': lambda x: x.mode()[0],
    'zone_name': 'first',
    'borough': 'first'
}).reset_index()
zone_clusters_2025['zone_id'] = zone_clusters_2025['zone_id'].astype(float).astype(int).astype(str)

zone_ids_2025 = set(zone_clusters_2025['zone_id'].values)
filtered_geojson_2025 = {
    'type': 'FeatureCollection',
    'features': [f for f in taxi_zones_geo_4326['features']
                 if f['properties']['LocationID'] in zone_ids_2025]
}
print(f"  Filtered to {len(filtered_geojson_2025['features'])} zones with data")

fig_2025 = go.Figure()

# Grey background
fig_tmp_2025 = px.choropleth_map(
    zone_clusters_2025,
    geojson=filtered_geojson_2025,
    locations='zone_id',
    featureidkey='properties.LocationID',
    color='cluster_name',
    color_discrete_map=cluster_color_map_2025,  # ← same color for matched clusters
    map_style='carto-positron-nolabels',
    zoom=9.5,
    center={'lat': 40.7128, 'lon': -73.9352},
    opacity=0.95,
    hover_data={'zone_id': False, 'zone_name': True, 'borough': True, 'cluster_name': True},
    labels={'cluster_name': 'Cluster'}
)


fig_tmp_2025.update_traces(marker=dict(line=dict(width=0.8, color='rgba(255,255,255,0.8)')))
for trace in fig_tmp_2025.data:
    fig_2025.add_trace(trace)

for i, (lat, lon) in enumerate(centers_2025):
    name = cluster_names_2025[i]
    fig_2025.add_trace(go.Scattermap(
        lat=[lat], lon=[lon],
        mode='markers+text',
        marker=dict(size=0, color='black', opacity=0.85),
        text=str(i),
        textfont=dict(size=11, color='white', family='Arial Black'),
        textposition='middle center',
        name=name,
        showlegend=False,
        hovertemplate=f'<b>Cluster {i}</b><br>{name}<extra></extra>',
    ))

fig_2025.update_layout(
    title='2025 Uber: Geographic Demand Clusters',
    legend=dict(
        title="Demand Clusters",
        yanchor="top", y=0.99, xanchor="left", x=0.01,
        bgcolor="rgba(255,255,255,0.95)",
        bordercolor="#dde1e7", borderwidth=1, font=dict(size=12),
    ),
    margin=dict(l=0, r=0, t=50, b=0),
    map=dict(
        style='carto-positron-nolabels',
        center={'lat': 40.7128, 'lon': -73.9352},
        zoom=9.5,
    ),
    height=700, width=1200,
)

fig_2025.write_html(OUTPUT_DIR + '2_uber_2025_clusters.html')
fig_2025.show()
print("  ✓ Saved: 2_uber_2025_clusters.html")

# --- 3.2: Top zones comparison ---
print("\n[3.2] Creating top zones comparison...")

zone_counts_2018 = df_2018.groupby(['zone_id', 'zone_name', 'borough']).size().reset_index(name='count_2018')
zone_counts_2025 = df_2025.groupby(['zone_id', 'zone_name', 'borough']).size().reset_index(name='count_2025')

zone_comparison = zone_counts_2018.merge(
    zone_counts_2025, on=['zone_id', 'zone_name', 'borough'], how='outer'
).fillna(0)
zone_comparison['share_2018'] = 100 * zone_comparison['count_2018'] / zone_comparison['count_2018'].sum()
zone_comparison['share_2025'] = 100 * zone_comparison['count_2025'] / zone_comparison['count_2025'].sum()
zone_comparison['share_change'] = zone_comparison['share_2025'] - zone_comparison['share_2018']

top_zones = zone_comparison.nlargest(20, 'share_2025')

fig_top_zones = go.Figure()
fig_top_zones.add_trace(go.Bar(
    name='2018',
    y=top_zones['zone_name'] + ' (' + top_zones['borough'] + ')',
    x=top_zones['share_2018'],
    orientation='h',
    marker_color='#ff6b6b',
    text=[f'{x:.1f}%' for x in top_zones['share_2018']],
    textposition='outside'
))
fig_top_zones.add_trace(go.Bar(
    name='2025',
    y=top_zones['zone_name'] + ' (' + top_zones['borough'] + ')',
    x=top_zones['share_2025'],
    orientation='h',
    marker_color='#4ecdc4',
    text=[f'{x:.1f}%' for x in top_zones['share_2025']],
    textposition='outside'
))

fig_top_zones.update_layout(
    title='Top 20 Pickup Zones: 2018 vs 2025 (% of Total Trips)',
    xaxis_title='Share of Total Trips (%)',
    barmode='group',
    height=800, width=1200,
    template='plotly_white'
)

fig_top_zones.write_html(OUTPUT_DIR + '3_top_zones_comparison.html')
print(f"  ✓ Saved: 3_top_zones_comparison.html")

# --- 3.3: Hourly patterns (normalized) ---
print("\n[3.3] Creating temporal analysis (normalized)...")

hourly_2018 = df_2018.groupby('hour').size()
hourly_2025 = df_2025.groupby('hour').size()
hourly_2018_pct = 100 * hourly_2018 / hourly_2018.sum()
hourly_2025_pct = 100 * hourly_2025 / hourly_2025.sum()

fig_hourly = go.Figure()
fig_hourly.add_trace(go.Scatter(
    x=hourly_2018_pct.index, y=hourly_2018_pct.values, name='2018',
    mode='lines+markers', line=dict(color='#ff6b6b', width=3),
    hovertemplate='<b>Hour %{x}</b><br>Share: %{y:.2f}%<extra></extra>'
))
fig_hourly.add_trace(go.Scatter(
    x=hourly_2025_pct.index, y=hourly_2025_pct.values, name='2025',
    mode='lines+markers', line=dict(color='#4ecdc4', width=3),
    hovertemplate='<b>Hour %{x}</b><br>Share: %{y:.2f}%<extra></extra>'
))

fig_hourly.update_layout(
    title='Hourly Demand Pattern: 2018 vs 2025 (% of Daily Trips)',
    xaxis_title='Hour of Day',
    yaxis_title='Share of Total Trips (%)',
    template='plotly_white',
    height=500, width=1200,
    hovermode='x unified'
)

fig_hourly.write_html(OUTPUT_DIR + '4_hourly_patterns.html')
print(f"  ✓ Saved: 4_hourly_patterns.html")

# --- 3.4: Daily patterns (normalized) ---
print("\n[3.4] Creating day-of-week analysis (normalized)...")

day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_2018 = df_2018.groupby('day_name').size().reindex(day_order)
daily_2025 = df_2025.groupby('day_name').size().reindex(day_order)
daily_2018_pct = 100 * daily_2018 / daily_2018.sum()
daily_2025_pct = 100 * daily_2025 / daily_2025.sum()

fig_daily = go.Figure()
fig_daily.add_trace(go.Bar(
    name='2018', x=day_order, y=daily_2018_pct.values, marker_color='#ff6b6b',
    text=[f'{x:.1f}%' for x in daily_2018_pct.values], textposition='outside'
))
fig_daily.add_trace(go.Bar(
    name='2025', x=day_order, y=daily_2025_pct.values, marker_color='#4ecdc4',
    text=[f'{x:.1f}%' for x in daily_2025_pct.values], textposition='outside'
))

fig_daily.update_layout(
    title='Weekly Demand Pattern: 2018 vs 2025 (% of Weekly Trips)',
    xaxis_title='Day of Week',
    yaxis_title='Share of Total Trips (%)',
    barmode='group',
    template='plotly_white',
    height=500, width=1200
)

fig_daily.write_html(OUTPUT_DIR + '5_daily_patterns.html')
print(f"  ✓ Saved: 5_daily_patterns.html")

# --- 3.5: Heatmaps (normalized) ---
print("\n[3.5] Creating demand heatmaps (normalized)...")

fig_heat = make_subplots(rows=2, cols=1, subplot_titles=('2018', '2025'), vertical_spacing=0.12)

for df, row in [(df_2018, 1), (df_2025, 2)]:
    pivot = df.groupby(['day_name', 'hour']).size().reset_index(name='trips')
    pivot_table = pivot.pivot(index='day_name', columns='hour', values='trips').reindex(day_order)
    pivot_table_pct = 100 * pivot_table / pivot_table.sum().sum()

    fig_heat.add_trace(go.Heatmap(
        z=pivot_table_pct.values,
        x=pivot_table_pct.columns,
        y=pivot_table_pct.index,
        colorscale='Viridis',
        showscale=(row == 2),
        colorbar=dict(title='% of Total<br>Trips') if row == 2 else None,
        hovertemplate='<b>%{y}, Hour %{x}</b><br>Share: %{z:.2f}%<extra></extra>'
    ), row=row, col=1)

fig_heat.update_layout(title='Demand Heatmaps: Hour × Day (% of Total Trips)', height=700, width=1200)
fig_heat.write_html(OUTPUT_DIR + '6_demand_heatmaps.html')
print(f"  ✓ Saved: 6_demand_heatmaps.html")

# --- 3.6: Borough analysis (normalized) ---
print("\n[3.6] Creating borough analysis (normalized)...")

borough_2018 = df_2018.groupby('borough').size()
borough_2025 = df_2025.groupby('borough').size()
borough_2018_pct = 100 * borough_2018 / borough_2018.sum()
borough_2025_pct = 100 * borough_2025 / borough_2025.sum()

fig_borough = go.Figure()
fig_borough.add_trace(go.Bar(
    name='2018', x=borough_2018_pct.index, y=borough_2018_pct.values, marker_color='#ff6b6b',
    text=[f'{x:.1f}%' for x in borough_2018_pct.values], textposition='outside'
))
fig_borough.add_trace(go.Bar(
    name='2025', x=borough_2025_pct.index, y=borough_2025_pct.values, marker_color='#4ecdc4',
    text=[f'{x:.1f}%' for x in borough_2025_pct.values], textposition='outside'
))

fig_borough.update_layout(
    title='Demand by Borough: 2018 vs 2025 (% of Total Trips)',
    xaxis_title='Borough',
    yaxis_title='Share of Total Trips (%)',
    barmode='group',
    template='plotly_white',
    height=500, width=1000
)

fig_borough.write_html(OUTPUT_DIR + '7_borough_analysis.html')
print(f"  ✓ Saved: 7_borough_analysis.html")

# --- 3.7: Cluster shift map (matched, Haversine) ---
print("\n[3.7] Creating cluster shift map...")

fig_shifts = go.Figure()

for i in range(len(centers_2018)):
    fig_shifts.add_trace(go.Scattermap(
        lat=[centers_2018[i, 0]], lon=[centers_2018[i, 1]],
        mode='markers+text',
        marker=dict(size=25, color='#ff6b6b', opacity=0.8),
        text=f"18-{i}",
        textfont=dict(size=10, color='white', family='Arial Black'),
        textposition='middle center',
        name=f'2018: {cluster_names_2018[i]}',
        hovertemplate=f'<b>2018 Cluster {i}</b><br>{cluster_names_2018[i]}<extra></extra>'
    ))

for i in range(len(centers_2025)):
    fig_shifts.add_trace(go.Scattermap(
        lat=[centers_2025[i, 0]], lon=[centers_2025[i, 1]],
        mode='markers+text',
        marker=dict(size=25, color='#4ecdc4', opacity=0.8),
        text=f"25-{i}",
        textfont=dict(size=10, color='white', family='Arial Black'),
        textposition='middle center',
        name=f'2025: {cluster_names_2025[i]}',
        hovertemplate=f'<b>2025 Cluster {i}</b><br>{cluster_names_2025[i]}<extra></extra>'
    ))

for idx_2018, idx_2025 in cluster_matches.items():
    lat1, lon1 = centers_2018[idx_2018]
    lat2, lon2 = centers_2025[idx_2025]
    dist = haversine_km(lat1, lon1, lat2, lon2)

    fig_shifts.add_trace(go.Scattermap(
        lat=[lat1, lat2], lon=[lon1, lon2],
        mode='lines',
        line=dict(width=3, color='black'),
        showlegend=False,
        hovertemplate=(
            f'<b>Shift: {dist:.2f} km</b><br>'
            f'From: {cluster_names_2018[idx_2018]}<br>'
            f'To: {cluster_names_2025[idx_2025]}<extra></extra>'
        )
    ))

fig_shifts.update_layout(
    title='Cluster Center Shifts: 2018 → 2025 (Matched by Geographic Proximity)',
    map_style='carto-positron',
    map_zoom=10,
    map_center=dict(lat=40.75, lon=-73.95),
    height=800, width=1400,
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01, bgcolor="rgba(255,255,255,0.9)")
)

fig_shifts.write_html(OUTPUT_DIR + '8_cluster_shifts.html')
print(f"  ✓ Saved: 8_cluster_shifts.html")

# ============================================================================
# PART 4: ADDITIONAL VISUALIZATIONS
# ============================================================================
print("\n" + "=" * 70)
print("PART 4: ADDITIONAL VISUALIZATIONS")
print("=" * 70)

# --- 4.1: Lorenz Curve ---
print("\n[4.1] Creating Lorenz curve...")

cz_18, ct_18 = lorenz_data(df_2018)
cz_25, ct_25 = lorenz_data(df_2025)
gini_18 = gini_from_lorenz(cz_18, ct_18)
gini_25 = gini_from_lorenz(cz_25, ct_25)

fig_lorenz = go.Figure()

fig_lorenz.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines', line=dict(color='#888', width=1.5, dash='dash'),
    name='Perfect equality', hoverinfo='skip'
))
fig_lorenz.add_trace(go.Scatter(
    x=np.concatenate([[0], cz_18]), y=np.concatenate([[0], ct_18]),
    mode='lines', line=dict(color='#ff6b6b', width=2.5),
    name=f'2018 (Gini = {gini_18:.3f})',
    hovertemplate='%{x:.0%} of zones → %{y:.0%} of trips<extra>2018</extra>'
))
fig_lorenz.add_trace(go.Scatter(
    x=np.concatenate([[0], cz_25]), y=np.concatenate([[0], ct_25]),
    mode='lines', line=dict(color='#4ecdc4', width=2.5),
    name=f'2025 (Gini = {gini_25:.3f})',
    hovertemplate='%{x:.0%} of zones → %{y:.0%} of trips<extra>2025</extra>'
))

fig_lorenz.update_layout(
    title='Lorenz Curve: Spatial Concentration of Uber Pickups by Zone',
    xaxis_title='Cumulative Share of Zones (ranked by trip count)',
    yaxis_title='Cumulative Share of Trips',
    xaxis=dict(range=[0, 1], tickformat='.0%'),
    yaxis=dict(range=[0, 1], tickformat='.0%'),
    template='plotly_white',
    height=600, width=800,
    legend=dict(x=0.05, y=0.95),
    annotations=[dict(
        x=0.65, y=0.25,
        text=f'Gini 2018: {gini_18:.3f}<br>Gini 2025: {gini_25:.3f}',
        showarrow=False, font=dict(size=13),
        bgcolor='rgba(255,255,255,0.85)',
        bordercolor='#ddd', borderwidth=1
    )]
)

fig_lorenz.write_html(OUTPUT_DIR + '9_lorenz_curve.html')
print(f"  ✓ Saved: 9_lorenz_curve.html")

# --- 4.2: Manhattan vs Airports ---
print("\n[4.2] Creating Manhattan vs Airports comparison...")


def manhattan_airport_shares(df, zone_col='zone_id', borough_col='borough'):
    total = len(df)
    result = {}
    result['Manhattan'] = len(df[df[borough_col] == 'Manhattan']) / total * 100

    for zid, label in AIRPORT_LABELS.items():
        result[label] = len(df[df[zone_col] == zid]) / total * 100

    result['All Airports'] = len(df[df[zone_col].isin(AIRPORT_ZONE_IDS)]) / total * 100
    result['Other'] = 100 - result['Manhattan'] - result['All Airports']
    return result


shares_18 = manhattan_airport_shares(df_2018)
shares_25 = manhattan_airport_shares(df_2025)

categories = ['Manhattan', 'JFK', 'LaGuardia', 'Newark (EWR)', 'All Airports', 'Other']
vals_18 = [shares_18[c] for c in categories]
vals_25 = [shares_25[c] for c in categories]
changes = [v25 - v18 for v18, v25 in zip(vals_18, vals_25)]

fig_ma = make_subplots(
    rows=2, cols=1,
    row_heights=[0.65, 0.35],
    shared_xaxes=True,
    vertical_spacing=0.08,
    subplot_titles=('Trip Share by Area: 2018 vs 2025', 'Percentage-Point Change (2025 − 2018)')
)

fig_ma.add_trace(go.Bar(
    x=categories, y=vals_18, name='2018', marker_color='#ff6b6b',
    text=[f'{v:.1f}%' for v in vals_18], textposition='outside'
), row=1, col=1)
fig_ma.add_trace(go.Bar(
    x=categories, y=vals_25, name='2025', marker_color='#4ecdc4',
    text=[f'{v:.1f}%' for v in vals_25], textposition='outside'
), row=1, col=1)

bar_colors = ['#2ecc71' if c > 0 else '#e74c3c' for c in changes]
fig_ma.add_trace(go.Bar(
    x=categories, y=changes, marker_color=bar_colors,
    text=[f'{c:+.1f} pp' for c in changes], textposition='outside',
    showlegend=False
), row=2, col=1)
fig_ma.add_hline(y=0, line_dash='dot', line_color='#888', row=2, col=1)

fig_ma.update_layout(
    barmode='group', template='plotly_white',
    height=750, width=1000,
    legend=dict(x=0.85, y=0.98),
)
fig_ma.update_yaxes(title_text='Share of Total Trips (%)', row=1, col=1)
fig_ma.update_yaxes(title_text='Change (pp)', row=2, col=1)

fig_ma.write_html(OUTPUT_DIR + '10_manhattan_vs_airports.html')
print(f"  ✓ Saved: 10_manhattan_vs_airports.html")

# --- 4.3: Improved Cluster Centroid Shift Map ---
print("\n[4.3] Creating improved cluster centroid shift map...")

shift_data = []
for idx_18, idx_25 in cluster_matches.items():
    lat1, lon1 = centers_2018[idx_18]
    lat2, lon2 = centers_2025[idx_25]
    shift_data.append({
        'idx_18': idx_18, 'idx_25': idx_25,
        'name_18': cluster_names_2018[idx_18],
        'name_25': cluster_names_2025[idx_25],
        'dist_km': haversine_km(lat1, lon1, lat2, lon2)
    })

fig_shift2 = go.Figure()

# Grey background
fig_shift2.add_trace(go.Choroplethmap(
    geojson=taxi_zones_geo_4326,
    locations=all_zone_ids,
    featureidkey='properties.LocationID',
    z=[1] * len(all_zone_ids),
    colorscale=[[0, '#e8e8e8'], [1, '#e8e8e8']],
    marker_opacity=0.35,
    marker_line_width=0.3,
    marker_line_color='#ccc',
    showscale=False,
    hoverinfo='skip',
))

# Shift lines
for sa in shift_data:
    lat1, lon1 = centers_2018[sa['idx_18']]
    lat2, lon2 = centers_2025[sa['idx_25']]
    fig_shift2.add_trace(go.Scattermap(
        lat=[lat1, lat2], lon=[lon1, lon2],
        mode='lines', line=dict(width=2.5, color='#333'),
        showlegend=False,
        hovertemplate=(
            f"<b>{sa['name_18']}</b> → <b>{sa['name_25']}</b>"
            f"<br>Shift: {sa['dist_km']:.2f} km<extra></extra>"
        )
    ))

# 2018 centroids — circles
for i in range(len(centers_2018)):
    fig_shift2.add_trace(go.Scattermap(
        lat=[centers_2018[i, 0]], lon=[centers_2018[i, 1]],
        mode='markers',
        marker=dict(size=16, color=CLUSTER_COLORS[i % len(CLUSTER_COLORS)], opacity=0.9),
        name=f'2018: {cluster_names_2018[i]}',
        legendgroup='2018', legendgrouptitle_text='2018 Centroids',
        hovertemplate=f'<b>2018 — Cluster {i}</b><br>{cluster_names_2018[i]}<extra></extra>',
    ))

# 2025 centroids — squares, same color as matched 2018 cluster
for i in range(len(centers_2025)):
    matched_18 = [k for k, v in cluster_matches.items() if v == i]
    color_idx = matched_18[0] if matched_18 else i
    fig_shift2.add_trace(go.Scattermap(
        lat=[centers_2025[i, 0]], lon=[centers_2025[i, 1]],
        mode='markers',
        marker=dict(size=16, color=CLUSTER_COLORS[color_idx % len(CLUSTER_COLORS)],
                    opacity=0.9, symbol='square'),
        name=f'2025: {cluster_names_2025[i]}',
        legendgroup='2025', legendgrouptitle_text='2025 Centroids',
        hovertemplate=f'<b>2025 — Cluster {i}</b><br>{cluster_names_2025[i]}<extra></extra>',
    ))

fig_shift2.update_layout(
    title='Cluster Centroid Shifts: 2018 → 2025 (Haversine distances, matched by proximity)',
    map_style='carto-positron-nolabels',
    map_zoom=9.8,
    map_center=dict(lat=40.72, lon=-73.94),
    height=750, width=1200,
    legend=dict(
        yanchor='top', y=0.99, xanchor='left', x=0.01,
        bgcolor='rgba(255,255,255,0.92)',
        bordercolor='#dde1e7', borderwidth=1, font=dict(size=11),
    ),
    margin=dict(l=0, r=0, t=50, b=0),
)

fig_shift2.write_html(OUTPUT_DIR + '11_cluster_shifts_improved.html')
print(f"  ✓ Saved: 11_cluster_shifts_improved.html")

# --- 4.4: Borough Share Treemaps ---
print("\n[4.4] Creating borough share treemaps...")

BOROUGH_COLORS = {
    'Manhattan': '#4363d8', 'Brooklyn': '#3cb44b', 'Queens': '#f58231',
    'Bronx': '#e6194b', 'Staten Island': '#911eb4', 'EWR': '#42d4f4',
}

borough_df_2018 = df_2018.groupby('borough').size().reset_index(name='trips')
borough_df_2018['share'] = borough_df_2018['trips'] / borough_df_2018['trips'].sum()
borough_df_2018['label'] = borough_df_2018.apply(
    lambda r: f"{r['borough']}<br>{r['share']:.1%}", axis=1
)

borough_df_2025 = df_2025.groupby('borough').size().reset_index(name='trips')
borough_df_2025['share'] = borough_df_2025['trips'] / borough_df_2025['trips'].sum()
borough_df_2025['label'] = borough_df_2025.apply(
    lambda r: f"{r['borough']}<br>{r['share']:.1%}", axis=1
)

fig_tree = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'treemap'}, {'type': 'treemap'}]],
    subplot_titles=('January 2018', 'January 2025'),
    horizontal_spacing=0.03,
)

fig_tree.add_trace(go.Treemap(
    labels=borough_df_2018['label'],
    parents=['' for _ in range(len(borough_df_2018))],
    values=borough_df_2018['trips'],
    marker=dict(
        colors=[BOROUGH_COLORS.get(b, '#999') for b in borough_df_2018['borough']],
        line_width=1.5
    ),
    textinfo='label', textfont=dict(size=15),
    hovertemplate='<b>%{label}</b><br>Trips: %{value:,.0f}<extra></extra>',
), row=1, col=1)

fig_tree.add_trace(go.Treemap(
    labels=borough_df_2025['label'],
    parents=['' for _ in range(len(borough_df_2025))],
    values=borough_df_2025['trips'],
    marker=dict(
        colors=[BOROUGH_COLORS.get(b, '#999') for b in borough_df_2025['borough']],
        line_width=1.5
    ),
    textinfo='label', textfont=dict(size=15),
    hovertemplate='<b>%{label}</b><br>Trips: %{value:,.0f}<extra></extra>',
), row=1, col=2)

fig_tree.update_layout(
    height=550, width=1100,
    template='plotly_white',
    margin=dict(t=80, b=20, l=10, r=10),
)

fig_tree.write_html(OUTPUT_DIR + '12_borough_treemaps.html')
print(f"  ✓ Saved: 12_borough_treemaps.html")

# ============================================================================
# PART 5: METRICS & SUMMARY
# ============================================================================
print("\n" + "=" * 70)
print("PART 5: METRICS & SUMMARY")
print("=" * 70)

peak_hour_2018 = df_2018.groupby('hour').size().idxmax()
peak_hour_2025 = df_2025.groupby('hour').size().idxmax()

top_zone_2018 = df_2018.groupby('zone_name').size().idxmax()
top_zone_2025 = df_2025.groupby('zone_name').size().idxmax()

matched_shifts = [
    haversine_km(
        centers_2018[i18, 0], centers_2018[i18, 1],
        centers_2025[i25, 0], centers_2025[i25, 1]
    )
    for i18, i25 in cluster_matches.items()
]
avg_shift = np.mean(matched_shifts)

summary = pd.DataFrame({
    'Metric': [
        'Total Trips (All Companies)',
        'Uber Trips',
        'Uber Market Share (%)',
        'Sample Size',
        'Unique Zones',
        'Top Pickup Zone',
        'Peak Hour',
        'Most Active Day',
        'Gini Coefficient (zone-level)',
        'Avg Cluster Shift (km)'
    ],
    '2018': [
        f'{total_2018:,}',
        f'{uber_count_2018:,}',
        f'{100 * uber_count_2018 / total_2018:.1f}',
        f'{len(df_2018):,}',
        df_2018['zone_id'].nunique(),
        top_zone_2018,
        f'{peak_hour_2018}:00',
        df_2018.groupby('day_name').size().idxmax(),
        f'{gini_18:.4f}',
        '-'
    ],
    '2025': [
        f'{total_2025:,}',
        f'{uber_count_2025:,}',
        f'{100 * uber_count_2025 / total_2025:.1f}',
        f'{len(df_2025):,}',
        df_2025['zone_id'].nunique(),
        top_zone_2025,
        f'{peak_hour_2025}:00',
        df_2025.groupby('day_name').size().idxmax(),
        f'{gini_25:.4f}',
        f'{avg_shift:.2f}'
    ]
})

summary.to_csv(OUTPUT_DIR + '13_comprehensive_summary.csv', index=False)
print(f"\n  ✓ Saved: 13_comprehensive_summary.csv")

zone_comparison.sort_values('count_2025', ascending=False).head(50).to_csv(
    OUTPUT_DIR + '14_top_50_zones.csv', index=False
)
print(f"  ✓ Saved: 14_top_50_zones.csv")

# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("\n" + "=" * 70)
print("ANALYSIS COMPLETE")
print("=" * 70)

print(f"\nGenerated 14 files in {OUTPUT_DIR}")
print(f"  1.  2018 cluster choropleth map")
print(f"  2.  2025 cluster choropleth map")
print(f"  3.  Top 20 zones comparison")
print(f"  4.  Hourly demand patterns")
print(f"  5.  Daily demand patterns")
print(f"  6.  Hour × Day heatmaps")
print(f"  7.  Borough bar chart")
print(f"  8.  Cluster shift map (basic)")
print(f"  9.  Lorenz curve")
print(f"  10. Manhattan vs Airports")
print(f"  11. Cluster shifts (improved)")
print(f"  12. Borough treemaps")
print(f"  13. Summary metrics CSV")
print(f"  14. Top 50 zones CSV")

print(f"\nKey numbers:")
print(f"  Market share: {100 * uber_count_2018 / total_2018:.1f}% → {100 * uber_count_2025 / total_2025:.1f}%")
print(f"  Gini (zone-level): {gini_18:.3f} → {gini_25:.3f}")
print(f"  Avg cluster shift: {avg_shift:.2f} km")
print(f"  Peak hour: {peak_hour_2018}:00 → {peak_hour_2025}:00")
print("=" * 70)


[2.2] Processing temporal and geographic features...

[2.3] Clustering on geographic coordinates...
  Cluster distribution:
    Cluster 0 - Manhattan: Murray Hill: 5,996,328 ( 38.9%)
    Cluster 1 - Bronx: Claremont/Bathgate: 2,713,198 ( 17.6%)
    Cluster 2 - Brooklyn: Prospect-Lefferts Gardens: 3,180,936 ( 20.7%)
    Cluster 3 - Queens: Baisley Park: 1,353,333 (  8.8%)
    Cluster 4 - Queens: Elmhurst: 1,824,961 ( 11.8%)
    Cluster 5 - Staten Island: Grymes Hill/Clifton: 332,575 (  2.2%)

MATCHING CLUSTERS BY GEOGRAPHIC PROXIMITY

  2018 Cluster                                            → 2025 Cluster                                            Shift (km)
  -----------------------------------------------------------------------------------------------------------------------------
  Manhattan: Yorkville East                               → Queens: Elmhurst                                              6.15
  Brooklyn: Borough Park                                  → Staten Island: Gr