In [1]:
import os
import sys
from pathlib import Path
from typing import Optional

from sodapy import Socrata
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape

# Put the directory two levels above the current working directory at the
# front of the import path (presumably the repository root, so that the
# `src` package imported below resolves -- confirm against the repo layout).
sys.path.insert(0, str(Path.cwd().parent.parent))

from src.common.secrets_helper import Secrets

# Secrets store; read further down via `secrets["MTA_X_APP_TOKEN"]`.
secrets = Secrets()


def process_numeric_fields(mta_gdf: gpd.GeoDataFrame, numeric_columns: Optional[list] = None) -> gpd.GeoDataFrame:
    """
    Convert the given columns of ``mta_gdf`` to numeric dtype.

    Values that cannot be parsed are replaced with NaN (``errors='coerce'``).
    The frame is modified in place and also returned, so both
    ``process_numeric_fields(gdf)`` and ``gdf = process_numeric_fields(gdf)`` work.

    :param mta_gdf: frame holding the columns to convert.
    :param numeric_columns: names of the columns to convert. When omitted (or
        empty), ``station_complex_id``, ``sum_ridership`` and ``sum_transfers``
        are converted.
    :return: the same ``mta_gdf`` object, with the columns converted.
    :raises KeyError: if a requested column is not present in ``mta_gdf``.
    """
    if not numeric_columns:
        numeric_columns = ['station_complex_id', 'sum_ridership', 'sum_transfers']
    for col in numeric_columns:
        mta_gdf[col] = pd.to_numeric(mta_gdf[col], errors='coerce')
    return mta_gdf


def assign_one_geometry_per_station_complex(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Give every row of a station complex the same geometry.

    Rows are grouped by ``station_complex_id`` and each row's ``geometry`` is
    replaced by the geometry of the first row in its group. The input frame is
    left untouched; a modified copy is returned.
    """
    def _first_of_group(geometries):
        # Positional first element of the group, whatever its value is.
        return geometries.iloc[0]

    result = gdf.copy()
    geometry_by_complex = result.groupby("station_complex_id")["geometry"]
    result["geometry"] = geometry_by_complex.transform(_first_of_group)
    return result

In [2]:
# HTTP headers requesting JSON and carrying the app token read from `secrets`.
# NOTE(review): nothing in the cells visible here reads `headers`; the Socrata
# client below is handed the token directly -- confirm whether this is still needed.
headers = {
    "Accept": "application/json",
    "X-App-Token": secrets["MTA_X_APP_TOKEN"]
}

In [3]:
# Socrata client for data.ny.gov, authenticated with the app token from `secrets`.
client = Socrata("data.ny.gov", secrets["MTA_X_APP_TOKEN"])

In [4]:
# Raise the client's request timeout to 120 (sodapy's `timeout` setting).
client.timeout = 120
# Month starts from 2024-01-01 through 2025-01-01; each consecutive pair of
# entries bounds one monthly query window [start, end).
month_start_dates = pd.date_range(start="2024-01-01", end="2025-01-01", freq="MS")

# Columns the ridership/transfer sums are grouped by; shared by `select` and `group`.
group_columns = "transit_timestamp, station_complex_id, station_complex, borough, georeference"

total_results = []
# Iterate over consecutive month pairs so the number of queries always follows
# the date range above instead of a separately hard-coded count.
for month_start, next_month_start in zip(month_start_dates[:-1], month_start_dates[1:]):
    start_date = month_start.strftime('%Y-%m-%d')
    end_date = next_month_start.strftime('%Y-%m-%d')
    where_clause = (
        f"transit_timestamp >= '{start_date}' AND transit_timestamp < '{end_date}'"
        " AND transit_mode = 'subway'"
    )
    print(where_clause)
    results = client.get(
        "wujg-7c2s",
        select=f"{group_columns}, sum(ridership), sum(transfers)",
        where=where_clause,
        group=group_columns,
        # At most 100 grouped rows are requested per month.
        # NOTE(review): no `order` is given, so which 100 rows come back is
        # up to the server -- confirm this is acceptable for the subset.
        limit=100,
    )
    total_results.extend(results)

# Displayed as the cell output: total number of rows collected.
len(total_results)

transit_timestamp >= '2024-01-01' AND transit_timestamp < '2024-02-01' AND transit_mode = 'subway'
transit_timestamp >= '2024-02-01' AND transit_timestamp < '2024-03-01' AND transit_mode = 'subway'
transit_timestamp >= '2024-03-01' AND transit_timestamp < '2024-04-01' AND transit_mode = 'subway'
transit_timestamp >= '2024-04-01' AND transit_timestamp < '2024-05-01' AND transit_mode = 'subway'
transit_timestamp >= '2024-05-01' AND transit_timestamp < '2024-06-01' AND transit_mode = 'subway'
transit_timestamp >= '2024-06-01' AND transit_timestamp < '2024-07-01' AND transit_mode = 'subway'
transit_timestamp >= '2024-07-01' AND transit_timestamp < '2024-08-01' AND transit_mode = 'subway'
transit_timestamp >= '2024-08-01' AND transit_timestamp < '2024-09-01' AND transit_mode = 'subway'
transit_timestamp >= '2024-09-01' AND transit_timestamp < '2024-10-01' AND transit_mode = 'subway'
transit_timestamp >= '2024-10-01' AND transit_timestamp < '2024-11-01' AND transit_mode = 'subway'
transit_ti

1200

In [5]:
df = pd.DataFrame.from_records(total_results)

# Build a shapely geometry from each `georeference` mapping. Records that lack
# the field show up as NaN in the DataFrame; NaN is truthy, so a plain
# truthiness check would pass it to `shape()` and raise. Only non-empty dicts
# are converted; everything else becomes a missing geometry.
df['geometry'] = df['georeference'].apply(lambda x: shape(x) if isinstance(x, dict) and x else None)
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

# Coerce the id/ridership/transfer columns to numbers, then give every row of
# a station complex the same geometry.
gdf = process_numeric_fields(gdf)
gdf = assign_one_geometry_per_station_complex(gdf)
# Parse timestamps and keep the calendar month number as its own column.
gdf["transit_timestamp"] = pd.to_datetime(gdf["transit_timestamp"])
gdf["month"] = gdf["transit_timestamp"].dt.month

In [6]:
# Roll timestamps up to calendar days (freq="D") and sum ridership and
# transfers for each day / station complex / borough / geometry / month.
gdf = (
    gdf
    .groupby(
        by=[
            pd.Grouper(key="transit_timestamp", freq="D"),
            "station_complex_id",
            "station_complex",
            "borough",
            "geometry",
            "month",
        ],
        as_index=False,
    )
    .agg({"sum_ridership": "sum", "sum_transfers": "sum"})
)

In [7]:
# After the daily aggregation the sums are per-day totals; name them accordingly.
gdf = gdf.rename(
    {
        "sum_ridership": "total_daily_ridership",
        "sum_transfers": "total_daily_transfers",
    },
    axis="columns",
)

In [8]:
# Wrap the aggregated frame as a GeoDataFrame again, using its `geometry`
# column and the EPSG:4326 coordinate reference system.
gdf = gpd.GeoDataFrame(gdf, geometry=gdf["geometry"], crs="EPSG:4326")

In [9]:
# Write the daily per-station totals to a GeoJSON file under ../data.
gdf.to_file(
    filename="../data/mta_subway_total_daily_ridership_by_station_2024-01-01_to_2025-01-01_1200_subset.geojson",
    driver="GeoJSON",
)