# Which NYC Subway Stations Are at Highest Risk of Platform Overflow?

This analysis uses MTA hourly ridership data for 2025 (with rolling updates from 2026) combined with Additional Platform Time (APT) metrics to identify stations where passenger volume and service stress converge. APT data is published per subway line, not per station; the dashboard bridges this gap using a station-to-route mapping that computes a weighted-average APT for each station complex based on the lines that serve it. 

The Capacity Stress Index (CSI) synthesizes peak-hour concentration and raw volume into a single risk score, calibrated against the full distribution of 2025 ridership — a post-stabilization year that serves as the cleanest available baseline for what "normal" NYC subway load looks like.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
import plotly.express as px
from pathlib import Path

# Directory holding the processed datasets, given relative to the current
# working directory (one level up, then data/processed).
PROCESSED = Path("..") / "data" / "processed"

In [None]:
# Load the transformed ridership table from the processed-data directory.
df = pd.read_parquet(PROCESSED / "ridership_transformed")

# Cast the calendar columns to plain integers so they can be compared and used
# as lookup keys below.
# NOTE(review): presumably these arrive with a non-integer dtype (e.g. as
# partition columns) — confirm against the writer of this dataset.
for calendar_col in ("year", "month"):
    df[calendar_col] = df[calendar_col].astype(int)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Load the Capacity Stress Index table from the processed-data directory.
csi_df = pd.read_parquet(PROCESSED / "csi")

# Normalise both calendar columns to integers in a single cast.
csi_df = csi_df.astype({"year": int, "month": int})

# Preview the first rows as the cell output.
csi_df.head()

In [None]:
# Total ridership per borough over every row in df, largest borough first.
borough_totals = df.groupby("borough")["ridership"].sum().sort_values(ascending=False)

# df is not filtered by year here, so label the chart with the span of years it
# actually contains instead of a hard-coded "2025". When df holds only 2025
# rows the title is unchanged.
first_year, last_year = int(df["year"].min()), int(df["year"].max())
year_label = str(first_year) if first_year == last_year else f"{first_year}–{last_year}"

fig, ax = plt.subplots(figsize=(8, 4))
borough_totals.plot(kind="bar", ax=ax, color=sns.color_palette("muted"))
ax.set_title(f"Total Ridership by Borough — {year_label}")
# Show the y-axis in whole millions of riders.
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{x/1e6:.0f}M"))
ax.set_xlabel("")
ax.set_ylabel("Riders")
plt.tight_layout()
plt.show()

In [None]:
# Lookup from calendar month number (1-12) to its English name; used when
# building plot titles.
month_names = dict(
    enumerate(
        [
            'January',
            'February',
            'March',
            'April',
            'May',
            'June',
            'July',
            'August',
            'September',
            'October',
            'November',
            'December',
        ],
        start=1,
    )
)

In [None]:
# Five highest-CSI rows of csi_df. Rows are ranked individually, so the same
# station can appear more than once if several of its months rank in the top five.
subset = csi_df.nlargest(5, "csi")[["station_complex", "month", "csi"]]
top_5 = list(subset.itertuples(index=False, name=None))

# NOTE(review): assumes day_of_week sorts Sunday-first — confirm against the
# encoding used when the ridership table was built.
day_labels = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]

for station, m, csi in top_5:
    # Mean ridership per (day of week, hour) for this station.
    # NOTE(review): this averages every row df holds for the station; it is not
    # restricted to month m, which only appears in the title — confirm intent.
    pivot = (
        df[df["station_complex"] == station]
        .groupby(["day_of_week", "hour_of_day"])["ridership"]
        .mean()
        .unstack()
    )
    fig, ax = plt.subplots(figsize=(14, 3))
    sns.heatmap(
        pivot,
        ax=ax,
        cmap="YlOrRd",
        linewidths=0.3,
        # Build the hour labels from the columns actually present: a fixed
        # 24-item list is applied by position and would mislabel the axis
        # whenever an hour is missing for the station.
        xticklabels=[f"{int(h)}:00" for h in pivot.columns],
        # Use the day names only when all seven days are present; otherwise
        # fall back to the raw index values rather than shifting the names.
        yticklabels=day_labels if len(pivot.index) == len(day_labels) else True,
    )
    ax.set_title(f"Avg Hourly Ridership: {station}, {month_names[m]}, CSI={csi:.3f}")
    plt.tight_layout()
    plt.show()

In [None]:
# Each station's single highest-CSI row, then the five highest of those.
# idxmax() returns index *labels*, so the rows are fetched with label-based
# .loc; positional .iloc only matches when the index is the default 0..n-1.
# Rows without a CSI value are excluded so that a station with no CSI at all
# does not produce a missing label.
peak_rows = csi_df.loc[
    csi_df.dropna(subset=["csi"]).groupby("station_complex")["csi"].idxmax()
]
peak_rows.nlargest(5, "csi")[["station_complex", "month"]]

In [None]:
# Keep each station's single highest-CSI row, then take the five highest of
# those, so every station appears at most once.
# idxmax() returns index *labels*, so the rows are fetched with label-based
# .loc; positional .iloc only matches when the index is the default 0..n-1.
# Rows without a CSI value are excluded before looking for each station's peak.
peak_idx = csi_df.dropna(subset=["csi"]).groupby("station_complex")["csi"].idxmax()
subset = csi_df.loc[peak_idx].nlargest(5, "csi")[["station_complex", "month", "csi"]]
top_5 = list(subset.itertuples(index=False, name=None))

# NOTE(review): assumes day_of_week sorts Sunday-first — confirm against the
# encoding used when the ridership table was built.
day_labels = ["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"]

for station, m, csi in top_5:
    # Mean ridership per (day of week, hour) for this station.
    # NOTE(review): this averages every row df holds for the station; it is not
    # restricted to month m, which only appears in the title — confirm intent.
    pivot = (
        df[df["station_complex"] == station]
        .groupby(["day_of_week", "hour_of_day"])["ridership"]
        .mean()
        .unstack()
    )
    fig, ax = plt.subplots(figsize=(14, 3))
    sns.heatmap(
        pivot,
        ax=ax,
        cmap="YlOrRd",
        linewidths=0.3,
        # Build the hour labels from the columns actually present: a fixed
        # 24-item list is applied by position and would mislabel the axis
        # whenever an hour is missing for the station.
        xticklabels=[f"{int(h)}:00" for h in pivot.columns],
        # Use the day names only when all seven days are present; otherwise
        # fall back to the raw index values rather than shifting the names.
        yticklabels=day_labels if len(pivot.index) == len(day_labels) else True,
    )
    ax.set_title(f"Avg Hourly Ridership: {station}, {month_names[m]}, CSI={csi:.3f}")
    plt.tight_layout()
    plt.show()

In [None]:
# Build one table for the most recent (year, month) present in csi_df, attach
# each station's monthly ridership from df, and show the 20 busiest stations.
latest_year = csi_df["year"].max()
latest_month = csi_df.loc[csi_df["year"] == latest_year, "month"].max()

is_latest = (csi_df["year"] == latest_year) & (csi_df["month"] == latest_month)
# Drop monthly_ridership from the CSI rows if it is already there. Without this
# the merge below would emit monthly_ridership_x / monthly_ridership_y and the
# later column lookups on "monthly_ridership" would fail.
csi_latest = csi_df[is_latest].drop(columns=["monthly_ridership"], errors="ignore")

# Pull monthly_ridership from df (already computed) and keep one row per
# station/year/month; the first occurrence is the one retained.
df_monthly = (
    df[(df["year"] == latest_year) & (df["month"] == latest_month)]
    [["station_complex", "year", "month", "monthly_ridership"]]
    .drop_duplicates(subset=["station_complex", "year", "month"])
)

# Left merge: every CSI row is kept, and monthly_ridership is NaN for a station
# that has no matching row in df for this month.
monthly_latest = csi_latest.merge(df_monthly, on=["station_complex", "year", "month"], how="left")

top20 = monthly_latest.nlargest(20, "monthly_ridership")[
    ["station_complex", "station_complex_id", "borough", "monthly_ridership", "csi",
     "apt_minutes", "num_lines_serving"]
]

# Styled table as the cell output: blue shading for volume, red for CSI.
(
    top20.style
    .background_gradient(subset=["monthly_ridership"], cmap="Blues")
    .background_gradient(subset=["csi"], cmap="Reds")
    .format({
        "monthly_ridership": "{:,.0f}",
        "csi": "{:.3f}",
        "apt_minutes": "{:.1f}",
        "num_lines_serving": "{:.0f}",
    })
)

In [None]:
# Preview the first five rows of the merged latest-month table.
monthly_latest.head(n=5)

In [None]:
# Scatter of ridership concentration (x) against weighted APT (y), one marker
# per station complex, sized by monthly ridership and coloured by borough.
# A row needs an APT value to be placed on the y-axis and a ridership value to
# size its marker. monthly_ridership comes from a left merge and can be NaN,
# and Plotly rejects NaN marker sizes, so such rows are dropped as well.
ridership_scatter_data = monthly_latest.dropna(subset=["apt_minutes", "monthly_ridership"])

fig = px.scatter(
    ridership_scatter_data,
    x="hhi_monthly_concentration",
    y="apt_minutes",
    size="monthly_ridership",
    color="borough",
    hover_name="station_complex",
    hover_data={"num_lines_serving": True, "csi": ":.3f"},
    title="Capacity Stress Index: Peak Concentration vs. Weighted Platform Wait Time (2025 Baseline)",
    labels={
        "hhi_monthly_concentration": "Ridership Concentration (Herfindahl Index)",
        "apt_minutes": "Weighted Avg Additional Platform Time (min)",
    },
    size_max=50,
)
# Only takes visible effect if a text= column is later passed to px.scatter.
fig.update_traces(textposition="top center")
fig.show()

In [None]:
# Same concentration-vs-APT scatter, but with markers sized by the CSI volume
# component and a title that names the month being shown.
# A row needs an APT value to be placed on the y-axis and a volume_component
# value to size its marker; Plotly rejects NaN marker sizes, so rows missing
# either one are dropped.
volume_scatter_data = monthly_latest.dropna(subset=["apt_minutes", "volume_component"])

fig = px.scatter(
    volume_scatter_data,
    x="hhi_monthly_concentration",
    y="apt_minutes",
    size="volume_component",
    color="borough",
    hover_name="station_complex",
    hover_data={
        "num_lines_serving": True,
        "csi": ":.3f",
        "volume_component": ":.3f",
        "hhi_monthly_concentration": ":.3f",
        "apt_minutes": ":.3f",
    },
    title=f"Capacity Stress Index: Peak Concentration vs. Weighted Platform Wait Time (2025 Baseline), {month_names[latest_month]}",
    labels={
        "hhi_monthly_concentration": "Ridership Concentration (HHI)",
        "apt_minutes": "Weighted Avg Additional Platform Time (min)",
    },
)
# Only takes visible effect if a text= column is later passed to px.scatter.
fig.update_traces(textposition="top center")
fig.show()