In [22]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime, timedelta
import calendar
import requests
import json
import os

In [6]:
def get_month_start_end_dates(start_year, end_year):
    """
    Yield one (first_day, last_day) pair of datetimes per calendar month,
    walking from January of start_year through December of end_year.

    Both datetimes carry a midnight (00:00:00) time component. Nothing is
    yielded when start_year is later than end_year.
    """
    # The cursor always sits on the first day of the month being emitted
    cursor = datetime(start_year, 1, 1)
    final_day = datetime(end_year, 12, 31)
    one_day = timedelta(days=1)

    while not cursor > final_day:
        # monthrange returns (weekday of the 1st, number of days); only the latter matters
        days_in_month = calendar.monthrange(cursor.year, cursor.month)[1]
        month_first = cursor.replace(day=1)
        month_last = cursor.replace(day=days_in_month)

        yield month_first, month_last

        # Stepping one day past the month's last day lands on the next month's first day
        cursor = month_last + one_day
        


In [None]:
# Sanity check of the month generator: print the boundaries of every month in 2023
for start_date, end_date in get_month_start_end_dates(start_year=2023, end_year=2023):
    print(f"Start Date: {start_date}, End Date: {end_date}")


In [24]:
def extract_collisions_from_api_to_local(start_year, end_year, page_size=50000, timeout=60):
    """
    Extracts data from the NYC Open Data API for each month from Jan Start Year to Dec End Year
    and writes one parquet file per month into ../data/crash_records.

    Each month is downloaded page by page ($limit / $offset) so that months with
    more rows than a single API response can hold are saved completely instead of
    being cut off at the API's default page size.

    Parameters
    ----------
    start_year : int
        First year to extract (2012-2023, inclusive).
    end_year : int
        Last year to extract (2012-2023, inclusive); must not be before start_year.
    page_size : int, optional
        Number of rows requested per API call.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Raises
    ------
    ValueError
        If the years are not integers, are outside 2012-2023, are in reverse
        order, or if page_size is not a positive integer.
    RuntimeError
        If the API answers with a non-success HTTP status.
    """
    ##### INPUT VALIDATION #####
    # Check if start_year and end_year are integers
    if not isinstance(start_year, int) or not isinstance(end_year, int):
        raise ValueError("Start year and end year must be integers")

    # Check if start_year and end_year are within the specified range
    if start_year < 2012 or start_year > 2023 or end_year < 2012 or end_year > 2023:
        raise ValueError("Start year and end year must be between 2012 and 2023.")

    # A reversed range would otherwise silently extract nothing
    if start_year > end_year:
        raise ValueError("Start year must not be greater than end year.")

    if not isinstance(page_size, int) or page_size < 1:
        raise ValueError("Page size must be a positive integer.")

    ##### DATA EXTRACTION #####
    # Define the base URL (query parameters are passed separately so they get URL-encoded)
    base_url = "https://data.cityofnewyork.us/resource/h9gi-nx95.json"

    # Define the target folder
    target_folder = "../data/crash_records"
    os.makedirs(target_folder, exist_ok=True)

    for start_date, end_of_month in get_month_start_end_dates(start_year, end_year):

        # Extract month and year
        month_year = start_date.strftime("%m-%Y")

        # Filter for the current month
        where_clause = (
            f"crash_date between '{start_date.isoformat()}' and '{end_of_month.isoformat()}'"
        )

        # Page through the month's rows until a short (or empty) page signals the end
        records = []
        offset = 0
        while True:
            response = requests.get(
                base_url,
                params={
                    "$where": where_clause,
                    # A fixed ordering keeps pages from overlapping or skipping rows
                    "$order": ":id",
                    "$limit": page_size,
                    "$offset": offset,
                },
                timeout=timeout,
            )

            if not response.ok:
                raise RuntimeError(f"Failed to fetch data from API: {response.status_code}")

            page = response.json()
            records.extend(page)

            if len(page) < page_size:
                break
            offset += page_size

        # Normalize the json records
        df = pd.json_normalize(records)

        # define the pyarrow table and read the df into it
        pa_table = pa.Table.from_pandas(df)

        output_file = f'{target_folder}/nyc_collisions_{month_year}.parquet'
        # Save the data to a parquet file
        pq.write_table(pa_table, output_file, compression='None')

        print(f'Month: {month_year} done')

In [None]:
# Download the monthly collision files for 2021 through 2022
extract_collisions_from_api_to_local(start_year=2021, end_year=2022)