In [1]:
import requests
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
from urllib.parse import quote

import os
import sys
from pathlib import Path

sys.path.insert(0, str(Path(os.getcwd()).parent.parent))

from src.common.secrets_helper import Secrets
secrets = Secrets()

# Full URL with SoQL query
url = "https://data.ny.gov/resource/wujg-7c2s.json?$query=SELECT%20transit_timestamp%2C%20transit_mode%2C%20station_complex_id%2C%20station_complex%2C%20borough%2C%20payment_method%2C%20fare_class_category%2C%20ridership%2C%20transfers%2C%20latitude%2C%20longitude%2C%20georeference%20WHERE%20%60transit_timestamp%60%20%3E%3D%20'2024-06-01'%20AND%20%60transit_timestamp%60%20%3C%20'2024-07-01'%20LIMIT%2050000"

In [2]:
# Optional: Add your App Token here if you have one
headers = {
    "Accept": "application/json",
    "X-App-Token": secrets["MTA_X_APP_TOKEN"]
}

In [None]:
# offset = 0
# limit = 50000
# all_data = []

# while True:
#     paged_query = f"""
#     SELECT transit_timestamp, transit_mode, station_complex_id, ...
#     WHERE transit_timestamp >= '2024-06-01' AND transit_timestamp < '2024-07-01'
#     LIMIT {limit} OFFSET {offset}
#     """
#     url = "https://data.ny.gov/resource/wujg-7c2s.json?$query=" + quote(paged_query)
#     response = requests.get(url, headers=headers)
#     chunk = response.json()
    
#     if not chunk:
#         break
#     all_data.extend(chunk)
#     offset += limit


In [None]:
# # Send the request
# response = requests.get(url, headers=headers)

# # Check for successful response
# if response.status_code == 200:
#     data = response.json()
#     df = pd.DataFrame(data)
#     print("Success!")
# else:
#     print(f"Request failed with status code {response.status_code}")

# # Convert the 'georeference' column to Shapely geometries
# df['geometry'] = df['georeference'].apply(lambda x: shape(x) if x else None)

# gdf = gpd.GeoDataFrame(df, geometry='geometry')

Success!


In [None]:
def api_query_string_to_sql_query(
    api_query_string: str
 )-> str:
    pass



def sql_query_to_api_query_string():
    """
    Converts a SQL query string to an API query string.
    """
    # Example SQL query
    sql_query = """
    SELECT
        transit_timestamp,
        transit_mode,
        station_complex_id,
        station_complex,
        borough,
        payment_method,
        fare_class_category,
        ridership,
        transfers,
        latitude,
        longitude,
        georeference
    WHERE transit_timestamp >= '2024-06-01' AND transit_timestamp < '2024-07-01'
    """

    # Convert SQL to API query string
    api_query_string = sql_query.replace("SELECT", "$query=SELECT").replace("WHERE", "WHERE").replace(" ", "%20").replace("'", "%27")
    
    return api_query_string

In [3]:
from sodapy import Socrata
import json

In [4]:
client = Socrata(domain="data.ny.gov", app_token=secrets["MTA_X_APP_TOKEN"])  # Or include your app token

In [5]:
client.timeout = 3600
results = client.get(
    "wujg-7c2s",
    select="station_complex_id, station_complex, borough, payment_method, fare_class_category, georeference, AVG(ridership) as average_ridership, AVG(transfers) as average_transfers, date_extract_hh(transit_timestamp) as hour_of_day, date_extract_m(transit_timestamp) as month",
    where="transit_timestamp >= '2024-01-01' AND transit_timestamp < '2025-01-01'",
    group="station_complex_id, station_complex, borough, payment_method, fare_class_category, georeference, month, hour_of_day",
    # where="transit_timestamp >= '2024-06-01' AND transit_timestamp < '2024-07-01' AND borough = 'Manhattan'",
    limit=1000000,
)
len(results)

KeyboardInterrupt: 

In [21]:
results

[{'average_ridership': '13.3586215369494441',
  'hour_of_day': '0',
  'month': '5'},
 {'average_ridership': '7.0782216152649748', 'hour_of_day': '1', 'month': '5'},
 {'average_ridership': '4.5831639215035473', 'hour_of_day': '2', 'month': '5'},
 {'average_ridership': '3.9172327816723278', 'hour_of_day': '3', 'month': '5'},
 {'average_ridership': '6.9491539551554555', 'hour_of_day': '4', 'month': '5'},
 {'average_ridership': '18.0003220611916264',
  'hour_of_day': '5',
  'month': '5'},
 {'average_ridership': '36.4235625627675237',
  'hour_of_day': '6',
  'month': '5'},
 {'average_ridership': '67.5132643884892086',
  'hour_of_day': '7',
  'month': '5'},
 {'average_ridership': '80.2495942755975214',
  'hour_of_day': '8',
  'month': '5'},
 {'average_ridership': '56.1611616625390730',
  'hour_of_day': '9',
  'month': '5'},
 {'average_ridership': '42.5097568688728910',
  'hour_of_day': '10',
  'month': '5'},
 {'average_ridership': '41.7065790820361566',
  'hour_of_day': '11',
  'month': '5'}

In [None]:
with open("mta_ridership_by_month_by_hour_of_day_2024.json", "w") as file:
    json.dump(results, file)

# You can convert to DataFrame
import pandas as pd
df = pd.DataFrame.from_records(results)

(2152106, 12)

In [None]:
from shapely.geometry import shape
import geopandas as gpd

df['geometry'] = df['georeference'].apply(lambda x: shape(x) if x else None)

gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:4326")

In [25]:
gdf = gdf.set_crs("EPSG:4326")

In [26]:
gdf.to_file("mta_ridership_all_boroughs_june_2024.geojson", driver="GeoJSON")