In [2]:
"""
UBER GEOGRAPHIC EVOLUTION: COMPREHENSIVE ANALYSIS (PROPERLY FIXED)
- Clusters matched by geographic proximity (not index)
- Consistent naming based on nearest major zone
- Methodologically sound comparisons
"""

import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import gc

# Opening banner for the run.
banner_rule = "=" * 70
print(banner_rule)
print("UBER COMPREHENSIVE ANALYSIS: 2018 ‚Üí 2025 (METHODOLOGICALLY FIXED)")
print(banner_rule)

# ============================================================================
# CONFIGURATION
# ============================================================================
# Upper bound on rows kept per year, and the number of K-Means clusters.
SAMPLE_SIZE = 20_000_000
N_CLUSTERS = 6

# Input/output locations (absolute paths on the author's machine).
OUTPUT_DIR = '/Users/leoss/Desktop/Portfolio/Website-/Uber/outputs/'
PATH_2018 = '/Users/leoss/Desktop/Portfolio/Website-/Uber/data/fhv_tripdata_2018-01.parquet'
PATH_2025 = '/Users/leoss/Desktop/Portfolio/Website-/Uber/data/fhvhv_tripdata_2025-01.parquet'
PATH_CENTROIDS = '/Users/leoss/Desktop/Portfolio/Website-/Uber/data/zone_centroids.csv'

# Identifiers used to pick Uber rows out of each year's file:
# dispatching base numbers for 2018, a single license number for 2025.
UBER_2018_BASES = [
    'B02512', 'B02598', 'B02617', 'B02682',
    'B02764', 'B02765', 'B02835', 'B02836',
]
UBER_2025_LICENSE = 'HV0003'

print("\nConfiguration:")
print(f"  Sample size: {SAMPLE_SIZE:,} trips per year")
print(f"  Clusters: {N_CLUSTERS}")

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def name_cluster_by_location(center_lat, center_lon, zone_centroids):
    """Name a cluster after the zone whose centroid is closest to its center.

    Closeness is the plain Euclidean distance between coordinate pairs, in
    whatever units the ``latitude``/``longitude`` columns use.

    Args:
        center_lat: Latitude of the cluster center.
        center_lon: Longitude of the cluster center.
        zone_centroids: DataFrame with ``latitude``, ``longitude``,
            ``borough`` and ``zone_name`` columns. Any index is accepted
            (default, filtered, non-integer or with duplicate labels).

    Returns:
        A string of the form ``"<borough>: <zone_name>"`` for the nearest zone.

    Raises:
        ValueError: If ``zone_centroids`` has no rows (raised by ``idxmin``).
    """
    zone_distances = np.sqrt(
        (zone_centroids['latitude'] - center_lat) ** 2
        + (zone_centroids['longitude'] - center_lon) ** 2
    )
    # idxmin() yields an index *label*, whereas .iloc expects a *position*.
    # Dropping the index first makes the label equal to the row position, so
    # the lookup is correct whatever index the caller's frame carries.
    nearest_pos = zone_distances.reset_index(drop=True).idxmin()
    nearest_zone = zone_centroids.iloc[nearest_pos]
    return f"{nearest_zone['borough']}: {nearest_zone['zone_name']}"

def match_clusters_by_proximity(centers_2018, centers_2025):
    """Greedily pair 2018 cluster centers with 2025 centers, closest first.

    All (2018, 2025) pairs are ranked by Euclidean distance between their
    coordinates; pairs are then accepted in that order as long as neither
    side has been matched already. This is a greedy one-to-one assignment,
    not a globally optimal one.

    Args:
        centers_2018: 2-D array-like of cluster centers, one row per cluster.
        centers_2025: 2-D array-like of cluster centers, one row per cluster.

    Returns:
        Dict mapping a row index of ``centers_2018`` to the row index of its
        matched center in ``centers_2025``. When the inputs differ in
        length, only ``min(len(centers_2018), len(centers_2025))`` entries
        are returned.
    """
    distances = cdist(centers_2018, centers_2025, metric='euclidean')
    n_2018, n_2025 = distances.shape
    max_matches = min(n_2018, n_2025)

    # sorted() is stable, so equidistant pairs keep their row-major order.
    candidate_pairs = sorted(
        ((i, j) for i in range(n_2018) for j in range(n_2025)),
        key=lambda pair: distances[pair],
    )

    matches = {}
    used_2025 = set()
    for i, j in candidate_pairs:
        if i in matches or j in used_2025:
            continue
        matches[i] = j
        used_2025.add(j)
        # Nothing left to assign once the smaller side is exhausted.
        if len(matches) == max_matches:
            break

    return matches

# ============================================================================
# LOAD ZONE CENTROIDS
# ============================================================================
section_rule = "=" * 70
print("\n" + section_rule)
print("LOADING ZONE CENTROIDS")
print(section_rule)

# Zone lookup table; later sections read its zone_id, zone_name, borough,
# latitude and longitude columns.
zone_centroids = pd.read_csv(PATH_CENTROIDS)
n_zones = len(zone_centroids)
print(f"‚úì Loaded {n_zones} zones")

# ============================================================================
# PART 1: 2018 UBER DATA
# ============================================================================
print("\n" + "="*70)
print("PART 1: 2018 UBER ANALYSIS")
print("="*70)

print("\n[1.1] Loading 2018 data...")
# The total row count is stored in the Parquet footer, so read only the file
# metadata rather than materialising a zero-column table.
total_2018 = pq.read_metadata(PATH_2018).num_rows
print(f"  Total rows: {total_2018:,}")

columns_2018 = ['pickup_datetime', 'PUlocationID', 'dispatching_base_num']
table_2018 = pq.read_table(PATH_2018, columns=columns_2018)

# Keep only the trips dispatched by the bases listed in UBER_2018_BASES.
df_2018_full = table_2018.to_pandas()
df_2018_full = df_2018_full[df_2018_full['dispatching_base_num'].isin(UBER_2018_BASES)].copy()

uber_count_2018 = len(df_2018_full)
print(f"  Uber trips: {uber_count_2018:,} ({100*uber_count_2018/total_2018:.1f}%)")

# Cap the working set at SAMPLE_SIZE rows; the fixed seed keeps it reproducible.
df_2018 = df_2018_full.sample(n=min(SAMPLE_SIZE, uber_count_2018), random_state=42)
# Release the Arrow table and the unsampled frame before the heavier steps.
del df_2018_full, table_2018
gc.collect()

print("\n[1.2] Processing temporal and geographic features...")
df_2018['pickup_datetime'] = pd.to_datetime(df_2018['pickup_datetime'])
df_2018['hour'] = df_2018['pickup_datetime'].dt.hour
df_2018['day_of_week'] = df_2018['pickup_datetime'].dt.dayofweek
df_2018['day_name'] = df_2018['pickup_datetime'].dt.day_name()

# Rows without a pickup zone cannot be located; drop them before the int cast.
df_2018 = df_2018.dropna(subset=['PUlocationID'])
df_2018['PUlocationID'] = df_2018['PUlocationID'].astype(int)
df_2018 = df_2018.merge(
    zone_centroids[['zone_id', 'zone_name', 'borough', 'latitude', 'longitude']],
    left_on='PUlocationID',
    right_on='zone_id',
    how='left'
)
# The left join leaves NaN coordinates for zone ids missing from the lookup.
df_2018 = df_2018.dropna(subset=['latitude', 'longitude'])

print("\n[1.3] Clustering on geographic coordinates...")
coords_2018 = df_2018[['latitude', 'longitude']].values
kmeans_2018 = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df_2018['cluster'] = kmeans_2018.fit_predict(coords_2018)

# Name clusters by nearest major zone
cluster_names_2018 = {}
for i in range(N_CLUSTERS):
    lat, lon = kmeans_2018.cluster_centers_[i]
    cluster_names_2018[i] = name_cluster_by_location(lat, lon, zone_centroids)

df_2018['cluster_name'] = df_2018['cluster'].map(cluster_names_2018)

cluster_counts_2018 = df_2018['cluster'].value_counts().sort_index()
print(f"  Cluster distribution:")
# Iterate (label, count) pairs so each count is printed against its own
# cluster label even if some label is absent from the counts.
for i, count in cluster_counts_2018.items():
    pct = 100 * count / len(df_2018)
    print(f"    Cluster {i} - {cluster_names_2018[i]}: {count:>7,} ({pct:>5.1f}%)")

# ============================================================================
# PART 2: 2025 UBER DATA
# ============================================================================
print("\n" + "="*70)
print("PART 2: 2025 UBER ANALYSIS")
print("="*70)

print("\n[2.1] Loading 2025 data...")
# The total row count is stored in the Parquet footer, so read only the file
# metadata rather than materialising a zero-column table.
total_2025 = pq.read_metadata(PATH_2025).num_rows
print(f"  Total rows: {total_2025:,}")

columns_2025 = ['pickup_datetime', 'PULocationID', 'hvfhs_license_num']
table_2025 = pq.read_table(PATH_2025, columns=columns_2025)

# Keep only the trips carrying the license number in UBER_2025_LICENSE.
df_2025_full = table_2025.to_pandas()
df_2025_full = df_2025_full[df_2025_full['hvfhs_license_num'] == UBER_2025_LICENSE].copy()

uber_count_2025 = len(df_2025_full)
print(f"  Uber trips: {uber_count_2025:,} ({100*uber_count_2025/total_2025:.1f}%)")

# Cap the working set at SAMPLE_SIZE rows; the fixed seed keeps it reproducible.
df_2025 = df_2025_full.sample(n=min(SAMPLE_SIZE, uber_count_2025), random_state=42)
# Release the Arrow table and the unsampled frame before the heavier steps.
del df_2025_full, table_2025
gc.collect()

print("\n[2.2] Processing temporal and geographic features...")
df_2025['pickup_datetime'] = pd.to_datetime(df_2025['pickup_datetime'])
df_2025['hour'] = df_2025['pickup_datetime'].dt.hour
df_2025['day_of_week'] = df_2025['pickup_datetime'].dt.dayofweek
df_2025['day_name'] = df_2025['pickup_datetime'].dt.day_name()

# Rows without a pickup zone cannot be located; drop them before the int cast.
df_2025 = df_2025.dropna(subset=['PULocationID'])
df_2025['PULocationID'] = df_2025['PULocationID'].astype(int)
df_2025 = df_2025.merge(
    zone_centroids[['zone_id', 'zone_name', 'borough', 'latitude', 'longitude']],
    left_on='PULocationID',
    right_on='zone_id',
    how='left'
)
# The left join leaves NaN coordinates for zone ids missing from the lookup.
df_2025 = df_2025.dropna(subset=['latitude', 'longitude'])

print("\n[2.3] Clustering on geographic coordinates...")
coords_2025 = df_2025[['latitude', 'longitude']].values
kmeans_2025 = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df_2025['cluster'] = kmeans_2025.fit_predict(coords_2025)

# Name clusters by nearest major zone
cluster_names_2025 = {}
for i in range(N_CLUSTERS):
    lat, lon = kmeans_2025.cluster_centers_[i]
    cluster_names_2025[i] = name_cluster_by_location(lat, lon, zone_centroids)

df_2025['cluster_name'] = df_2025['cluster'].map(cluster_names_2025)

cluster_counts_2025 = df_2025['cluster'].value_counts().sort_index()
print(f"  Cluster distribution:")
# Iterate (label, count) pairs so each count is printed against its own
# cluster label even if some label is absent from the counts.
for i, count in cluster_counts_2025.items():
    pct = 100 * count / len(df_2025)
    print(f"    Cluster {i} - {cluster_names_2025[i]}: {count:>7,} ({pct:>5.1f}%)")

# ============================================================================
# MATCH CLUSTERS PROPERLY
# ============================================================================
print("\n" + "="*70)
print("MATCHING CLUSTERS BY GEOGRAPHIC PROXIMITY")
print("="*70)

# Each row is a (latitude, longitude) center, in the column order the models
# were fitted on.
centers_2018 = kmeans_2018.cluster_centers_
centers_2025 = kmeans_2025.cluster_centers_

cluster_matches = match_clusters_by_proximity(centers_2018, centers_2025)

print(f"\nüìç Cluster Matching Results:")
print(f"  {'2018 Cluster':<60} ‚Üí {'2025 Cluster':<60} {'Shift (km)':>12}")
print("  " + "="*135)

for idx_2018, idx_2025 in sorted(cluster_matches.items()):
    lat1, lon1 = centers_2018[idx_2018]
    lat2, lon2 = centers_2025[idx_2025]
    # A degree of latitude is ~111 km, but a degree of longitude shrinks with
    # cos(latitude). Scale the longitude delta accordingly (equirectangular
    # approximation) so east-west shifts are not overstated.
    mean_lat_rad = np.radians((lat1 + lat2) / 2)
    distance_km = np.hypot(lat2 - lat1, (lon2 - lon1) * np.cos(mean_lat_rad)) * 111

    name_2018 = cluster_names_2018[idx_2018]
    name_2025 = cluster_names_2025[idx_2025]

    print(f"  {name_2018:<60} ‚Üí {name_2025:<60} {distance_km:>12.2f}")

# ============================================================================
# PART 3: VISUALIZATIONS
# ============================================================================
print("\n" + "=" * 70)
print("PART 3: CREATING VISUALIZATIONS")
print("=" * 70)

# 3.1: Cluster maps
print("\n[3.1] Creating cluster maps...")


def create_cluster_map(df, centers, cluster_names, title):
    """Build a scatter map of sampled pickups coloured by cluster.

    Args:
        df: Trip frame providing ``latitude``, ``longitude``,
            ``cluster_name``, ``zone_name`` and ``borough`` columns.
        centers: Iterable of (latitude, longitude) cluster centers.
        cluster_names: Mapping from cluster index to display name.
        title: Figure title.

    Returns:
        The plotly figure, with one numbered black marker per center drawn
        on top of the sampled points.
    """
    # Plot at most 15,000 points; the seed keeps the picture reproducible.
    n_points = min(15_000, len(df))
    viz_sample = df.sample(n=n_points, random_state=42)

    hover_columns = {'zone_name': True, 'cluster_name': True, 'borough': True}
    fig = px.scatter_map(
        viz_sample,
        lat='latitude',
        lon='longitude',
        color='cluster_name',
        color_discrete_sequence=px.colors.qualitative.Bold,
        zoom=10,
        title=title,
        map_style='carto-positron',
        height=700,
        width=1200,
        hover_data=hover_columns,
    )

    # Overlay each centroid as a black dot labelled with its cluster index.
    for i, center in enumerate(centers):
        lat, lon = center
        label = cluster_names[i]
        centroid_trace = go.Scattermap(
            lat=[lat],
            lon=[lon],
            mode='markers+text',
            marker=dict(size=18, color='black', opacity=0.8),
            text=f"{i}",
            textfont=dict(size=12, color='white', family='Arial Black'),
            textposition='middle center',
            name=label,
            showlegend=False,
            hovertemplate=f'<b>Cluster {i}</b><br>{label}<extra></extra>',
        )
        fig.add_trace(centroid_trace)

    legend_style = dict(
        title="Clusters",
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        bgcolor="rgba(255,255,255,0.9)",
    )
    fig.update_layout(
        font=dict(size=12),
        title_font=dict(size=18),
        legend=legend_style,
    )

    return fig


fig_2018 = create_cluster_map(
    df_2018,
    centers_2018,
    cluster_names_2018,
    '2018 Uber: Geographic Demand Clusters',
)
fig_2018.write_html(OUTPUT_DIR + '1_uber_2018_clusters.html')
print("  ‚úì Saved: 1_uber_2018_clusters.html")

fig_2025 = create_cluster_map(
    df_2025,
    centers_2025,
    cluster_names_2025,
    '2025 Uber: Geographic Demand Clusters',
)
fig_2025.write_html(OUTPUT_DIR + '2_uber_2025_clusters.html')
print("  ‚úì Saved: 2_uber_2025_clusters.html")

# 3.2: Top zones
print("\n[3.2] Creating top zones comparison...")

# Trips per zone for each year, keyed on the same three zone columns.
zone_keys = ['zone_id', 'zone_name', 'borough']
zone_counts_2018 = df_2018.groupby(zone_keys).size().reset_index(name='count_2018')
zone_counts_2025 = df_2025.groupby(zone_keys).size().reset_index(name='count_2025')

# Outer join keeps zones seen in only one year; their missing count becomes 0.
zone_comparison = zone_counts_2018.merge(zone_counts_2025, on=zone_keys, how='outer')
zone_comparison = zone_comparison.fillna(0)
zone_comparison['change'] = zone_comparison['count_2025'] - zone_comparison['count_2018']

top_zones = zone_comparison.nlargest(20, 'count_2025')
zone_labels = top_zones['zone_name'] + ' (' + top_zones['borough'] + ')'

fig_top_zones = go.Figure()
top_zone_series = (
    ('2018', 'count_2018', '#ff6b6b'),
    ('2025', 'count_2025', '#4ecdc4'),
)
for year_label, count_col, bar_color in top_zone_series:
    year_counts = top_zones[count_col]
    fig_top_zones.add_trace(go.Bar(
        name=year_label,
        y=zone_labels,
        x=year_counts,
        orientation='h',
        marker_color=bar_color,
        text=year_counts.astype(int),
        textposition='outside',
    ))

fig_top_zones.update_layout(
    title='Top 20 Pickup Zones: 2018 vs 2025',
    xaxis_title='Number of Trips',
    barmode='group',
    height=800,
    width=1200,
    template='plotly_white',
)

fig_top_zones.write_html(OUTPUT_DIR + '3_top_zones_comparison.html')
print("  ‚úì Saved: 3_top_zones_comparison.html")

# 3.3: Hourly patterns
print("\n[3.3] Creating temporal analysis...")

# Trip counts per pickup hour for each year.
hourly_2018 = df_2018.groupby('hour').size()
hourly_2025 = df_2025.groupby('hour').size()

fig_hourly = go.Figure()
hourly_series = (
    ('2018', hourly_2018, '#ff6b6b'),
    ('2025', hourly_2025, '#4ecdc4'),
)
for year_label, trips_by_hour, line_color in hourly_series:
    fig_hourly.add_trace(go.Scatter(
        x=trips_by_hour.index,
        y=trips_by_hour.values,
        name=year_label,
        mode='lines+markers',
        line=dict(color=line_color, width=3),
    ))

fig_hourly.update_layout(
    title='Hourly Demand Pattern: 2018 vs 2025',
    xaxis_title='Hour of Day',
    yaxis_title='Number of Trips',
    template='plotly_white',
    height=500,
    width=1200,
    hovermode='x unified',
)

fig_hourly.write_html(OUTPUT_DIR + '4_hourly_patterns.html')
print("  ‚úì Saved: 4_hourly_patterns.html")

# 3.4: Daily patterns
print("\n[3.4] Creating day-of-week analysis...")

# Calendar order for the x axis; groupby alone would sort names alphabetically.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_2018 = df_2018.groupby('day_name').size()
daily_2018 = daily_2018.reindex(day_order)
daily_2025 = df_2025.groupby('day_name').size()
daily_2025 = daily_2025.reindex(day_order)

fig_daily = go.Figure()
daily_series = (
    ('2018', daily_2018, '#ff6b6b'),
    ('2025', daily_2025, '#4ecdc4'),
)
for year_label, trips_by_day, bar_color in daily_series:
    fig_daily.add_trace(
        go.Bar(name=year_label, x=day_order, y=trips_by_day.values, marker_color=bar_color)
    )

fig_daily.update_layout(
    title='Weekly Demand Pattern: 2018 vs 2025',
    xaxis_title='Day of Week',
    yaxis_title='Number of Trips',
    barmode='group',
    template='plotly_white',
    height=500,
    width=1200,
)

fig_daily.write_html(OUTPUT_DIR + '5_daily_patterns.html')
print("  ‚úì Saved: 5_daily_patterns.html")

# 3.5: Heatmaps
print("\n[3.5] Creating demand heatmaps...")

fig_heat = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=('2018', '2025'),
    vertical_spacing=0.12,
)

# One day-by-hour heatmap per year, stacked vertically.
yearly_frames = [(df_2018, 1, '2018'), (df_2025, 2, '2025')]
for df, row, year in yearly_frames:
    trips_by_slot = df.groupby(['day_name', 'hour']).size()
    pivot = trips_by_slot.reset_index(name='trips')
    pivot_table = pivot.pivot(index='day_name', columns='hour', values='trips')
    pivot_table = pivot_table.reindex(day_order)

    # Only the bottom subplot draws a colour bar.
    is_bottom_row = (row == 2)
    heatmap = go.Heatmap(
        z=pivot_table.values,
        x=pivot_table.columns,
        y=pivot_table.index,
        colorscale='Viridis',
        showscale=is_bottom_row,
    )
    fig_heat.add_trace(heatmap, row=row, col=1)

fig_heat.update_layout(title='Demand Heatmaps: Hour x Day', height=700, width=1200)
fig_heat.write_html(OUTPUT_DIR + '6_demand_heatmaps.html')
print("  ‚úì Saved: 6_demand_heatmaps.html")

# 3.6: Borough analysis
print("\n[3.6] Creating borough analysis...")

# Trip counts per borough for each year.
borough_2018 = df_2018.groupby('borough').size()
borough_2025 = df_2025.groupby('borough').size()

fig_borough = go.Figure()
borough_series = (
    ('2018', borough_2018, '#ff6b6b'),
    ('2025', borough_2025, '#4ecdc4'),
)
for year_label, trips_by_borough, bar_color in borough_series:
    fig_borough.add_trace(go.Bar(
        name=year_label,
        x=trips_by_borough.index,
        y=trips_by_borough.values,
        marker_color=bar_color,
    ))

fig_borough.update_layout(
    title='Demand by Borough: 2018 vs 2025',
    xaxis_title='Borough',
    yaxis_title='Number of Trips',
    barmode='group',
    template='plotly_white',
    height=500,
    width=1000,
)

fig_borough.write_html(OUTPUT_DIR + '7_borough_analysis.html')
print("  ‚úì Saved: 7_borough_analysis.html")

# 3.7: PROPERLY MATCHED CLUSTER SHIFTS
print("\n[3.7] Creating properly matched cluster shift map...")

fig_shifts = go.Figure()

# Add 2018 centers
for i in range(len(centers_2018)):
    fig_shifts.add_trace(go.Scattermap(
        lat=[centers_2018[i, 0]],
        lon=[centers_2018[i, 1]],
        mode='markers+text',
        marker=dict(size=25, color='#ff6b6b', opacity=0.8),
        text=f"18-{i}",
        textfont=dict(size=10, color='white', family='Arial Black'),
        textposition='middle center',
        name=f'2018: {cluster_names_2018[i]}',
        hovertemplate=f'<b>2018 Cluster {i}</b><br>{cluster_names_2018[i]}<extra></extra>'
    ))

# Add 2025 centers
for i in range(len(centers_2025)):
    fig_shifts.add_trace(go.Scattermap(
        lat=[centers_2025[i, 0]],
        lon=[centers_2025[i, 1]],
        mode='markers+text',
        marker=dict(size=25, color='#4ecdc4', opacity=0.8),
        text=f"25-{i}",
        textfont=dict(size=10, color='white', family='Arial Black'),
        textposition='middle center',
        name=f'2025: {cluster_names_2025[i]}',
        hovertemplate=f'<b>2025 Cluster {i}</b><br>{cluster_names_2025[i]}<extra></extra>'
    ))

# Draw arrows ONLY between properly matched clusters
for idx_2018, idx_2025 in cluster_matches.items():
    lat1, lon1 = centers_2018[idx_2018]
    lat2, lon2 = centers_2025[idx_2025]
    # A degree of latitude is ~111 km, but a degree of longitude shrinks with
    # cos(latitude). Scale the longitude delta accordingly (equirectangular
    # approximation) so east-west shifts are not overstated.
    mean_lat_rad = np.radians((lat1 + lat2) / 2)
    distance_km = np.hypot(lat2 - lat1, (lon2 - lon1) * np.cos(mean_lat_rad)) * 111

    fig_shifts.add_trace(go.Scattermap(
        lat=[lat1, lat2],
        lon=[lon1, lon2],
        mode='lines',
        line=dict(width=3, color='black'),
        showlegend=False,
        hovertemplate=f'<b>Shift: {distance_km:.2f} km</b><br>' +
                     f'From: {cluster_names_2018[idx_2018]}<br>' +
                     f'To: {cluster_names_2025[idx_2025]}<extra></extra>'
    ))

fig_shifts.update_layout(
    title='Cluster Center Shifts: 2018 ‚Üí 2025 (Properly Matched by Geographic Proximity)',
    map_style='carto-positron',
    map_zoom=10,
    map_center=dict(lat=40.75, lon=-73.95),
    height=800,
    width=1400,
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01, bgcolor="rgba(255,255,255,0.9)")
)

fig_shifts.write_html(OUTPUT_DIR + '8_cluster_shifts_PROPER.html')
print(f"  ‚úì Saved: 8_cluster_shifts_PROPER.html")

# ============================================================================
# PART 4: METRICS & SUMMARY
# ============================================================================
print("\n" + "="*70)
print("PART 4: CALCULATING METRICS")
print("="*70)

def gini_coefficient(counts):
    """Return the Gini coefficient of a collection of non-negative counts.

    Uses the rank-weighted form on the ascending-sorted values::

        G = 2 * sum(i * x_i) / (n * sum(x)) - (n + 1) / n,   i = 1..n

    Args:
        counts: 1-D array-like of counts.

    Returns:
        0.0 when every value is equal, approaching ``(n - 1) / n`` when a
        single entry holds everything. Empty input and all-zero input carry
        no inequality and also return 0.0.
    """
    # Work in float so large counts multiplied by ranks cannot overflow ints.
    sorted_counts = np.sort(np.asarray(counts, dtype=float))
    n = sorted_counts.size
    total = sorted_counts.sum()
    if n == 0 or total == 0:
        # Avoid indexing an empty array / dividing by zero.
        return 0.0
    ranks = np.arange(1, n + 1)
    return (2 * np.sum(ranks * sorted_counts)) / (n * total) - (n + 1) / n

# Concentration of trips across the clusters of each year.
gini_2018 = gini_coefficient(cluster_counts_2018.values)
gini_2025 = gini_coefficient(cluster_counts_2025.values)

peak_hour_2018 = df_2018.groupby('hour').size().idxmax()
peak_hour_2025 = df_2025.groupby('hour').size().idxmax()

top_zone_2018 = df_2018.groupby('zone_name').size().idxmax()
top_zone_2025 = df_2025.groupby('zone_name').size().idxmax()

# Calculate average shift for matched clusters
matched_shifts = []
for idx_2018, idx_2025 in cluster_matches.items():
    lat1, lon1 = centers_2018[idx_2018]
    lat2, lon2 = centers_2025[idx_2025]
    # A degree of latitude is ~111 km, but a degree of longitude shrinks with
    # cos(latitude). Scale the longitude delta accordingly (equirectangular
    # approximation) so east-west shifts are not overstated.
    mean_lat_rad = np.radians((lat1 + lat2) / 2)
    matched_shifts.append(
        np.hypot(lat2 - lat1, (lon2 - lon1) * np.cos(mean_lat_rad)) * 111
    )
avg_shift = np.mean(matched_shifts)

# One row per metric, one column per year; the average shift describes the
# 2018 -> 2025 change, so it is reported under 2025 only.
summary = pd.DataFrame({
    'Metric': [
        'Total Trips (All Companies)',
        'Uber Trips',
        'Uber Market Share (%)',
        'Sample Size',
        'Unique Zones',
        'Top Pickup Zone',
        'Peak Hour',
        'Most Active Day',
        'Gini Coefficient',
        'Avg Cluster Shift (km)'
    ],
    '2018': [
        f'{total_2018:,}',
        f'{uber_count_2018:,}',
        f'{100*uber_count_2018/total_2018:.1f}',
        f'{len(df_2018):,}',
        df_2018['zone_id'].nunique(),
        top_zone_2018,
        f'{peak_hour_2018}:00',
        df_2018.groupby('day_name').size().idxmax(),
        f'{gini_2018:.4f}',
        '-'
    ],
    '2025': [
        f'{total_2025:,}',
        f'{uber_count_2025:,}',
        f'{100*uber_count_2025/total_2025:.1f}',
        f'{len(df_2025):,}',
        df_2025['zone_id'].nunique(),
        top_zone_2025,
        f'{peak_hour_2025}:00',
        df_2025.groupby('day_name').size().idxmax(),
        f'{gini_2025:.4f}',
        f'{avg_shift:.2f}'
    ]
})

summary.to_csv(OUTPUT_DIR + '9_comprehensive_summary.csv', index=False)
print(f"\n‚úì Saved: 9_comprehensive_summary.csv")

zone_comparison.sort_values('count_2025', ascending=False).head(50).to_csv(
    OUTPUT_DIR + '10_top_50_zones.csv', index=False
)
print(f"‚úì Saved: 10_top_50_zones.csv")

# ============================================================================
# FINAL SUMMARY
# ============================================================================
closing_rule = "=" * 70
print("\n" + closing_rule)
print("‚úÖ ANALYSIS COMPLETE - METHODOLOGICALLY SOUND")
print(closing_rule)

# Listing of the artefacts written by the earlier sections.
print("\nüìÇ Generated 10 Files:")
generated_files = (
    "  1. uber_2018_clusters.html - 2018 map",
    "  2. uber_2025_clusters.html - 2025 map",
    "  3. top_zones_comparison.html - Top 20 zones",
    "  4. hourly_patterns.html - Hour-by-hour",
    "  5. daily_patterns.html - Day-of-week",
    "  6. demand_heatmaps.html - Hour x Day",
    "  7. borough_analysis.html - By borough",
    "  8. cluster_shifts_PROPER.html - ‚ú® PROPERLY MATCHED SHIFTS",
    "  9. comprehensive_summary.csv - Key metrics",
    "  10. top_50_zones.csv - Detailed zone data",
)
for file_line in generated_files:
    print(file_line)

# Headline numbers, each shown as 2018 value then 2025 value.
share_2018 = 100*uber_count_2018/total_2018
share_2025 = 100*uber_count_2025/total_2025
print("\nüéØ Key Findings:")
print(f"  ‚Ä¢ Market share: {share_2018:.1f}% ‚Üí {share_2025:.1f}%")
print(f"  ‚Ä¢ Concentration: {gini_2018:.4f} ‚Üí {gini_2025:.4f}")
print(f"  ‚Ä¢ Avg cluster shift: {avg_shift:.2f} km (properly matched)")
print(f"  ‚Ä¢ Peak hour: {peak_hour_2018}:00 ‚Üí {peak_hour_2025}:00")

print("\n‚úÖ Methodological Improvements:")
for improvement_line in (
    "  ‚Ä¢ Clusters matched by geographic proximity (not index)",
    "  ‚Ä¢ Consistent naming based on nearest zone",
    "  ‚Ä¢ Arrows only connect properly matched clusters",
):
    print(improvement_line)

print(closing_rule)

UBER COMPREHENSIVE ANALYSIS: 2018 → 2025 (METHODOLOGICALLY FIXED)

Configuration:
  Sample size: 20,000,000 trips per year
  Clusters: 6

LOADING ZONE CENTROIDS
✓ Loaded 263 zones

PART 1: 2018 UBER ANALYSIS

[1.1] Loading 2018 data...
  Total rows: 19,808,094
  Uber trips: 4,502,999 (22.7%)

[1.2] Processing temporal and geographic features...

[1.3] Clustering on geographic coordinates...
  Cluster distribution:
    Cluster 0 - Manhattan: Yorkville East: 765,644 ( 17.0%)
    Cluster 1 - Brooklyn: Borough Park: 388,895 (  8.6%)
    Cluster 2 - Queens: Jamaica: 369,831 (  8.2%)
    Cluster 3 - Manhattan: Gramercy: 1,882,965 ( 41.7%)
    Cluster 4 - Bronx: East Tremont: 477,419 ( 10.6%)
    Cluster 5 - Brooklyn: Ocean Hill: 626,327 ( 13.9%)

PART 2: 2025 UBER ANALYSIS

[2.1] Loading 2025 data...
  Total rows: 20,405,666
  Uber trips: 15,356,455 (75.3%)

[2.2] Processing temporal and geographic features...

[2.3] Clustering on geographic coordinates...
  Cluster distribution:
    Clu