In [3]:
import requests
from datetime import datetime, timedelta
import polars as pl
import re

In [4]:
def clean_column_name(name):
    """Convert a display-style column label into a lowercase snake_case name.

    Spaces, hyphens and ampersands are treated as separators. Every run of
    separators (together with any underscores touching it) collapses to a
    single underscore, no matter how long the run is, and the result is
    lower-cased.

    Example:
        "Metrocard - Seniors & Disability" -> "metrocard_seniors_disability"

    Args:
        name: The original column label.

    Returns:
        The normalised column name as a new string.
    """
    # A single regex pass replaces the earlier chain of str.replace calls,
    # which could leave a double underscore behind for longer runs
    # (for example five consecutive separators).
    return re.sub(r"[ \-&_]+", "_", name).lower()

In [5]:
def get_data(ses, days_of_history=30):
    """Download the most recent window of ridership rows, all as strings.

    Two requests are made against the data.ny.gov endpoint. The first asks
    for the single newest row to learn the latest ``transit_timestamp``. The
    second asks for every row between midnight ``days_of_history`` days
    before that timestamp and the timestamp itself.

    Args:
        ses: A ``requests``-style session; only ``ses.get(url, params=...)``
            is called on it.
        days_of_history: How many days before the newest timestamp the
            window starts. Defaults to 30.

    Returns:
        A polars DataFrame with the columns ``transit_timestamp``,
        ``station_complex_id``, ``fare_class_category`` and ``ridership``.
        Every column is read as a string (``infer_schema_length=0``).

    Raises:
        Whatever ``response.raise_for_status()`` raises for an error status
        (``requests.HTTPError`` when ``ses`` is a ``requests`` session).
    """
    import warnings

    url = "https://data.ny.gov/resource/wujg-7c2s.csv"

    # Step 1: fetch only the newest row to find out where the data ends.
    latest_params = {
        "$limit": 1,
        "$order": "transit_timestamp DESC",
    }
    response = ses.get(url, params=latest_params)
    # Stop on an HTTP error instead of trying to parse an error body as CSV.
    response.raise_for_status()

    last_update_string = pl.read_csv(response.content)["transit_timestamp"][0]
    last_update_datetime = datetime.fromisoformat(last_update_string)

    # Step 2: start the window at midnight so the first day is not partial.
    window_start = last_update_datetime - timedelta(days=days_of_history)
    window_start = datetime(
        year=window_start.year, month=window_start.month, day=window_start.day
    )

    print(f"Getting data from {window_start.isoformat()} and {last_update_datetime.isoformat()}...")

    # The row cap scales with the window: 100,000 rows per requested day.
    row_limit = days_of_history * 100_000
    window_params = {
        "$where": (
            f"transit_timestamp between '{window_start.isoformat()}' "
            f"and '{last_update_datetime.isoformat()}'"
        ),
        "$limit": row_limit,
    }
    response = ses.get(url, params=window_params)
    response.raise_for_status()

    columns_to_keep = ["transit_timestamp", "station_complex_id", "fare_class_category", "ridership"]
    # infer_schema_length=0 keeps every column as a string; callers cast later.
    df = pl.read_csv(response.content, infer_schema_length=0, columns=columns_to_keep)

    # Reaching the cap means rows may have been cut off; say so explicitly
    # rather than returning a silently incomplete window.
    if df.height >= row_limit:
        warnings.warn(
            f"Received {df.height} rows, which reaches the $limit of {row_limit}; "
            "the result may be truncated.",
            stacklevel=2,
        )
    return df

In [8]:
def format_df(df):
    """Reshape long-format ridership rows into one row per hour and station.

    Args:
        df: polars DataFrame with string columns ``transit_timestamp``
            (ISO 8601 text), ``station_complex_id``, ``fare_class_category``
            and ``ridership`` (numeric text), such as the frame returned by
            ``get_data``.

    Returns:
        A polars DataFrame sorted by ``transit_timestamp`` and then
        ``station_complex_id``. It has one summed ridership column per fare
        class (nulls filled with 0) and a ``total_ridership`` column that
        adds up the Metrocard and OMNY columns. All column names are
        normalised with ``clean_column_name``. Rows whose station id
        contains "TRAM" and rows for station id "141" are removed.
    """
    ridership_wide = df.with_columns(
        [
            # Native parser instead of a per-row Python lambda; the time unit
            # is pinned so the column is always datetime[us].
            pl.col("transit_timestamp").str.to_datetime(time_unit="us"),
            # Counts arrive as text that may carry a decimal part, so go via
            # float and floor. Int64 avoids the strict-cast failure that
            # Int16 produces for any value above 32,767.
            pl.col("ridership").cast(pl.Float64).floor().cast(pl.Int64),
        ]
    ).pivot(
        index=["transit_timestamp", "station_complex_id"],
        columns="fare_class_category",
        values="ridership",
        aggregate_function="sum",
        sort_columns=True
    ).sort(
        ["transit_timestamp", "station_complex_id"], descending=[False, False]
    ).fill_null(0)

    # Only the fare-class columns feed the total, not the two index columns.
    ridership_columns = [col for col in ridership_wide.columns if "Metrocard" in col or "OMNY" in col]
    ridership = ridership_wide.with_columns(
        total_ridership=pl.sum_horizontal(ridership_columns)
    )

    rename_mapping = {col: clean_column_name(col) for col in ridership.columns}
    ridership = ridership.rename(rename_mapping)

    # Since we got rid of the shuttle and TRAM lines, we filter them out here too.
    # NOTE(review): "141" is compared exactly on the assumption that it is a
    # complete station_complex_id; a substring match would also drop any
    # other id that merely contains those digits. Confirm against the data.
    station_id = pl.col("station_complex_id")
    ridership = ridership.filter(
        ~station_id.str.contains("TRAM") & (station_id != "141")
    )
    return ridership

In [9]:
# Download the raw rows inside a single HTTP session; the session is closed
# as soon as the with-block exits, before any reshaping is done.
with requests.Session() as ses:
    df = get_data(ses)

# Reshape to one row per hour and station, then show the frame as the
# cell's output.
ridership = format_df(df)
ridership

Getting data from 2024-03-12T00:00:00 and 2024-04-11T23:00:00...


transit_timestamp,station_complex_id,metrocard_fair_fare,metrocard_full_fare,metrocard_other,metrocard_seniors_disability,metrocard_students,metrocard_unlimited_30_day,metrocard_unlimited_7_day,omny_full_fare,omny_other,omny_seniors_disability,total_ridership
datetime[μs],str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
2024-03-12 00:00:00,"""1""",1,4,2,0,0,4,6,17,0,0,34
2024-03-12 00:00:00,"""10""",8,25,5,5,0,29,35,116,0,0,223
2024-03-12 00:00:00,"""100""",1,3,0,0,0,1,4,5,0,0,14
2024-03-12 00:00:00,"""101""",2,6,0,0,0,7,7,21,0,0,43
2024-03-12 00:00:00,"""103""",0,7,2,0,0,1,2,35,0,0,47
…,…,…,…,…,…,…,…,…,…,…,…,…
2024-04-11 23:00:00,"""95""",1,2,2,0,0,3,3,29,0,0,40
2024-04-11 23:00:00,"""96""",0,3,1,0,0,2,2,26,0,0,34
2024-04-11 23:00:00,"""97""",4,17,4,1,0,10,11,76,0,0,123
2024-04-11 23:00:00,"""98""",4,2,4,0,0,3,10,18,0,0,41
