In [1]:
import requests
from datetime import datetime, timedelta
import polars as pl
import re

In [17]:
def clean_column_name(name):
    """Convert a column label into a lowercase, underscore-separated name.

    Spaces, hyphens and ampersands are treated as separators. Any run of
    separators (including adjacent underscores) collapses to a single
    underscore, e.g. ``"Metrocard - Seniors & Disability"`` becomes
    ``"metrocard_seniors_disability"``.

    Args:
        name: Original column label.

    Returns:
        The normalised column name as a ``str``.
    """
    # A single regex pass collapses runs of any length. Chained fixed-width
    # ``str.replace`` calls leave a double underscore behind for some run
    # lengths (for example five consecutive separators).
    return re.sub(r"[ \-&_]+", "_", name).lower()

In [18]:
def get_data(ses):
    """Download the most recent two weeks of rows from the data.ny.gov endpoint.

    Two requests are issued through ``ses``: the first fetches the single row
    with the latest ``transit_timestamp``; the second fetches every row from
    midnight fourteen days before that timestamp up to the timestamp itself.

    Args:
        ses: Object exposing a ``requests``-style ``get(url, params=...)``
            method, e.g. a ``requests.Session``.

    Returns:
        A polars DataFrame with the columns ``transit_timestamp``,
        ``station_complex_id``, ``fare_class_category`` and ``ridership``,
        all read as strings.

    Raises:
        requests.HTTPError: If either request returns a 4xx/5xx status.
    """
    url = "https://data.ny.gov/resource/wujg-7c2s.csv"

    # Step 1: ask for the single newest row to learn the latest timestamp.
    params = {
        "$limit":1,
        "$order":"transit_timestamp DESC"
    }
    response = ses.get(url, params=params)
    # Fail loudly on an HTTP error instead of handing an error body to the
    # CSV parser, where it would surface as a confusing parse/column error.
    response.raise_for_status()

    last_update_string = pl.read_csv(response.content)["transit_timestamp"][0]
    last_update_datetime = datetime.fromisoformat(last_update_string)

    # Step 2: the window starts at midnight, 14 days before the newest row.
    two_weeks_prior = last_update_datetime - timedelta(days=14)
    rounded_two_weeks = datetime(year=two_weeks_prior.year, month=two_weeks_prior.month, day=two_weeks_prior.day)

    print(f"Getting data from {rounded_two_weeks.isoformat()} and {last_update_datetime.isoformat()}...")
    params = {
        "$where": f"transit_timestamp between'{rounded_two_weeks.isoformat()}' and '{last_update_datetime.isoformat()}'",
        # NOTE(review): rows beyond this cap would be silently dropped by the
        # API -- confirm it stays above the row count of a two-week window.
        "$limit": 1_500_000
    }
    response = ses.get(url, params=params)
    response.raise_for_status()

    columns_to_keep = ["transit_timestamp", "station_complex_id", "fare_class_category", "ridership"]
    # infer_schema_length=0 makes polars read every column as a string;
    # type conversion is left to the caller.
    df = pl.read_csv(response.content, infer_schema_length=0, columns=columns_to_keep)
    return df

In [19]:
def format_df(df):
    """Reshape long-format ridership rows into one row per timestamp and station.

    Args:
        df: polars DataFrame with string columns ``transit_timestamp``
            (ISO-8601 text), ``station_complex_id``, ``fare_class_category``
            and ``ridership``.

    Returns:
        A polars DataFrame sorted by timestamp and station id, with one summed
        ridership column per fare class category (nulls replaced by 0), a
        ``total_ridership`` column summing the Metrocard/OMNY columns, and
        every column name passed through ``clean_column_name``.
    """
    index_columns = ["transit_timestamp", "station_complex_id"]

    ridership_wide = df.with_columns(
        [
            # Native vectorised parser instead of a per-row Python lambda;
            # time_unit is pinned so the column stays datetime[us].
            pl.col("transit_timestamp").str.to_datetime(time_unit="us"),
            # Ridership arrives as text that may hold decimals: parse as float,
            # drop the fraction, then store as Int64 so large counts cannot
            # overflow the cast.
            pl.col("ridership").cast(pl.Float64).floor().cast(pl.Int64),
        ]
    ).pivot(
        index=index_columns,
        columns="fare_class_category",
        values="ridership",
        aggregate_function="sum",
        sort_columns=True
    ).sort(
        index_columns
    ).fill_null(0)  # a fare class absent for a timestamp/station counts as 0

    # Only the fare-class columns produced by the pivot feed the total.
    ridership_columns = [col for col in ridership_wide.columns if "Metrocard" in col or "OMNY" in col]
    ridership = ridership_wide.with_columns(
        total_ridership=pl.sum_horizontal(ridership_columns)
    )

    rename_mapping = {col: clean_column_name(col) for col in ridership.columns}
    return ridership.rename(rename_mapping)

In [20]:
# Fetch the raw rows over one HTTP session; the context manager closes the
# session once the download is finished.
with requests.Session() as ses:
    df = get_data(ses)
# Pivot to one row per timestamp/station, then display the result.
ridership = format_df(df)
ridership

Getting data from 2024-03-09T00:00:00 and 2024-03-23T23:00:00...


transit_timestamp,station_complex_id,metrocard_fair_fare,metrocard_full_fare,metrocard_other,metrocard_seniors_disability,metrocard_students,metrocard_unlimited_30_day,metrocard_unlimited_7_day,omny_full_fare,omny_other,omny_seniors_disability,total_ridership
datetime[μs],str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
2024-03-09 00:00:00,"""1""",1,9,9,1,0,4,11,25,0,0,60
2024-03-09 00:00:00,"""10""",12,50,15,6,0,31,42,264,0,1,421
2024-03-09 00:00:00,"""100""",1,2,0,1,0,0,2,18,0,0,24
2024-03-09 00:00:00,"""101""",0,10,4,0,0,9,13,78,0,0,114
2024-03-09 00:00:00,"""103""",2,14,5,0,0,13,12,100,0,0,146
2024-03-09 00:00:00,"""107""",4,4,0,4,0,6,11,22,0,1,52
2024-03-09 00:00:00,"""108""",1,0,3,0,0,1,1,1,0,0,7
2024-03-09 00:00:00,"""109""",0,1,4,0,0,2,2,4,0,0,13
2024-03-09 00:00:00,"""110""",0,0,0,0,0,0,0,10,0,0,10
2024-03-09 00:00:00,"""111""",2,0,0,0,0,1,0,3,0,0,6
