In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime, timedelta
import time
import calendar
import requests
import json
import os

In [29]:
year = '2019'
month = '01'

base_url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json'
query = f"$where=date_trunc_ym(crash_date) = '{year}-{month}'"
query += "&$limit=10000"
query += "&$offset={offset}"
#query += "&$order=crash_date"

In [30]:
url = f'{base_url}?{query}'
print(url)

https://data.cityofnewyork.us/resource/h9gi-nx95.json?$where=date_trunc_ym(crash_date) = '2019-01'&$limit=10000&$offset={offset}


In [23]:
url = f'{base_url}?{query}'
print(url)
#url = "https://data.cityofnewyork.us/resource/h9gi-nx95.json?$where=date_trunc_ym(crash_date) = '2016-01'"
response = requests.get(url)

if not response.ok:
    raise RuntimeError(f"Failed to fetch data from API: {response.status_code}")

# Normalize the json response 
df = pd.json_normalize(response.json())

https://data.cityofnewyork.us/resource/h9gi-nx95.json?$where=date_trunc_ym(crash_date) = '2019-01'&$limit=100000


In [32]:
df.shape

(16929, 31)

In [51]:
def extract_collisions_from_api_to_df(year, month, batch_size=10000):
    """
    Extracts one month of data from the NYC Open Data API into a df.
    
    args:
        year (str): the year to extract data for
        month (str): the month to extract data for
        batch_size (int): the number of records to fetch in each API call
    
    returns:
        df (pd.DataFrame): a dataframe containing the extracted data
    """
    
    # Initialize the data structures and variables
    collisions_df = pd.DataFrame()
    dfs = []
    
    year = year
    month_str = str(month).zfill(2)
    
    batch_size = batch_size
    offset = 0
    batch_num = 0
    
    # Define the base URL and query
    base_url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json'
    query = f"$where=date_trunc_ym(crash_date) = '{year}-{month_str}'"
    query += f"&$limit={batch_size}"
        
    # Fetch JSON data from the API in reasonable sized batches
    while True:
        # Define the URL to fetch
        offset_query = f"&$offset={offset}"
        url = f'{base_url}?{query}{offset_query}'
        print(url)
        
        # Fetch the data
        response = requests.get(url)
        if not response.ok:
            raise RuntimeError(f"Failed to fetch data from API: {response.status_code}")

        # Normalize the json response 
        df = pd.json_normalize(response.json())
        print(df.shape)

        # Append the data to the list of dataframes
        dfs.append(df)
        
        # Check if there are more records to fetch
        if len(df) < batch_size:
            break
        
        offset += batch_size
        batch_num += 1

    collisions_df = pd.concat(dfs)
    collisions_df.reset_index(drop=True, inplace=True)    

    return collisions_df

In [52]:
df2 = extract_collisions_from_api_to_df(2019, 1, 10000)
df2.shape

https://data.cityofnewyork.us/resource/h9gi-nx95.json?$where=date_trunc_ym(crash_date) = '2019-01'&$limit=10000&$offset=0
(10000, 31)
https://data.cityofnewyork.us/resource/h9gi-nx95.json?$where=date_trunc_ym(crash_date) = '2019-01'&$limit=10000&$offset=10000
(6929, 30)


(16929, 31)