In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Render floats without decimal places in pandas output (display only; data is unchanged)
pd.set_option("display.float_format", "{:.0f}".format)

In [3]:
# Root of the monthly TLC trip-record parquet files.
_TLC_BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data"


def _fetch_and_store(dataset: str, pickup_col: str, year: str, month: str) -> pd.DataFrame:
    """
    Downloads one monthly trip file, cleans it, saves it under 'data/' and returns it

    Params:
        dataset (str): File-name prefix, e.g. 'yellow_tripdata'
        pickup_col (str): Name of the pickup timestamp column used for filtering
        year (str): Four-digit year, e.g. '2020'
        month (str): Zero-padded month, e.g. '01'

    Returns:
        pd.DataFrame: The filtered, de-duplicated data that was written to disk
    """
    file_name = f"{dataset}_{year}-{month}.parquet"
    trips = pd.read_parquet(f"{_TLC_BASE_URL}/{file_name}", use_threads=False)

    # Keep only records whose pickup timestamp falls in the requested year and month
    pickup = trips[pickup_col]
    in_period = (pickup.dt.year == int(year)) & (pickup.dt.month == int(month))

    # Chained (non-inplace) calls avoid mutating a filtered slice, which can
    # raise SettingWithCopyWarning.
    # NOTE(review): reset_index() without drop=True keeps the pre-filter row
    # labels as an extra 'index' column in the saved file. Kept as-is because
    # downstream readers may rely on it; confirm whether drop=True is wanted.
    trips = trips[in_period].drop_duplicates(keep="first").reset_index()

    # Create the folder (if needed) and export the data in the 'data' folder
    out_dir = f"data/{dataset}/{year}"
    os.makedirs(out_dir, exist_ok=True)
    trips.to_parquet(f"{out_dir}/{file_name}")
    return trips


def tlc_data(month: str, year: str) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Gets the parquet trip data for Yellow & Green Taxi and For-Hire Vehicles

    Each dataset is downloaded from the [TLC page](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
    file host, restricted to pickups in the requested year and month,
    de-duplicated, re-indexed and written to
    'data/<dataset>/<year>/<dataset>_<year>-<month>.parquet'.
    The datasets are processed one at a time, in the order listed below.

    Params:
        month (str): Trip data for a given month ('01'-'12'; non-padded values such as '1' are zero-padded)
        year (str): Trip data for a given year (e.g. '2020')

    Returns:
        tuple:
        pandas DataFrames in the following order:
        1. Yellow Taxi
        2. Green Taxi
        3. For-Hire Vehicles ('fhvhv' files)

    Raises:
        ValueError: If month or year is not numeric, or month is outside 1-12
    """
    # Normalise up front so the URL and the output file name always use the
    # same zero-padded month, and bad input fails before any download starts
    month_number = int(month)
    if not 1 <= month_number <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month!r}")
    month = f"{month_number:02d}"
    year = str(int(year))

    yellow = _fetch_and_store("yellow_tripdata", "tpep_pickup_datetime", year, month)
    green = _fetch_and_store("green_tripdata", "lpep_pickup_datetime", year, month)
    # For-Hire (Uber, Lyft, etc)
    for_hire = _fetch_and_store("fhvhv_tripdata", "pickup_datetime", year, month)
    return yellow, green, for_hire

In [4]:
# Year and zero-padded month labels used to request each monthly file
years = list(map(str, range(2020, 2025)))
months = [f"{m:02d}" for m in range(1, 13)]

print(f"Years: {years}")
print(f"Months: {months}")

Years: ['2020', '2021', '2022', '2023', '2024']
Months: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']


In [5]:
# Download and export every (year, month) combination, years outermost.
# df1/df2/df3 are rebound on each pass, so they end up holding the last month fetched.
for year, month in ((y, m) for y in years for m in months):
    print(f"Year: {year} Month: {month}")
    df1, df2, df3 = tlc_data(month=month, year=year)

Year: 2020 Month: 01


KeyboardInterrupt: 