# The Data School

[Data Analyst Consultant - DSNY9 (April 7, 2025)](https://jobs.lever.co/theinformationlab/2956d3ed-13be-4835-8fdb-b3eddbfa62ae)

In [8]:
import os
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
pd.options.display.float_format = '{:.0f}'.format

In [10]:
def tlc_data(inp_month: str, inp_year: str) -> tuple[np.array, np.array, np.array]:
    """
    Gets the parquet trip data for Yellow & Green Taxi and For-Hire Vehicles

    Params:
        url (str): URL containing the data of interest on the [TLC page](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
        month (str): Trip data for a given month
        year (str): Trip data for a given year
    
    Returns:
        Numpy arrays in the following order -> Yellow Taxi, Green Taxi, and For-Hire Vehicles
    """
    # Yellow Taxi
    url1 = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{inp_year}-{inp_month}.parquet"
    # Green Taxi
    url2 = f"https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_{inp_year}-{inp_month}.parquet"
    # For-Hire (Uber, Lyft, etc)
    url3 = f"https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_{inp_year}-{inp_month}.parquet"

    # Dataframes
    df1 = pd.read_parquet(url1, use_threads=False)
    df2 = pd.read_parquet(url2, use_threads=False)
    df3 = pd.read_parquet(url3, use_threads=False)

    # Filter the data to only include records that are in the specific year and month
    df1 = df1[(df1["tpep_pickup_datetime"].dt.year == int(inp_year)) & (df1["tpep_pickup_datetime"].dt.month == int(inp_month))]
    df2 = df2[(df2["lpep_pickup_datetime"].dt.year == int(inp_year)) & (df2["lpep_pickup_datetime"].dt.month == int(inp_month))]
    df3 = df3[(df3["pickup_datetime"].dt.year == int(inp_year)) & (df3["pickup_datetime"].dt.month == int(inp_month))]
    
    # Drop duplicates
    df1.drop_duplicates(keep="first", inplace=True)
    df2.drop_duplicates(keep="first", inplace=True)
    df3.drop_duplicates(keep="first", inplace=True)

    # Reset indexes
    df1.reset_index(inplace=True)
    df2.reset_index(inplace=True)
    df3.reset_index(inplace=True)

    # Create the folder and then export the data in the 'data' folder
    df1_path = f"data/yellow_tripdata/{inp_year}"
    df2_path = f"data/green_tripdata/{inp_year}"
    df3_path = f"data/fhvhv_tripdata/{inp_year}"
    if not os.path.exists(df1_path):
        os.makedirs(df1_path)
    if not os.path.exists(df2_path):
        os.makedirs(df2_path)
    if not os.path.exists(df3_path):
        os.makedirs(df3_path)
    
    df1.to_parquet(df1_path + f"/yellow_tripdata_{inp_year}-{inp_month}.parquet")
    df2.to_parquet(df2_path + f"/green_tripdata_{inp_year}-{inp_month}.parquet")
    df3.to_parquet(df3_path + f"/fhvhv_tripdata_{inp_year}-{inp_month}.parquet")
    return df1, df2, df3

In [11]:
# Create a loop and store the data
years = [str(year) for year in range(2020, 2025)]
months = ["0"+ str(month) for month in range(1, 10)] + ["10"]

In [None]:
for inp_year in years:
    for inp_month in months:
        print(f"Year: {inp_year} Month: {inp_month}")
        df1, df2, df3 = tlc_data(inp_month=inp_month, inp_year=inp_year)

In [12]:
# Get samples from all of the parquet files
max_num = 250_000
tlc_type = "yellow"
parquet_list = []
for year in years:
    parquets = os.listdir(f"data/{tlc_type}_tripdata/{year}/")
    for parquet in parquets:
        parquet_path = os.path.join(f"data/{tlc_type}_tripdata/{year}/", parquet)
        df = pd.read_parquet(parquet_path).drop(columns="index")
        max_num = min(max_num, len(df))
        sample_array = df.sample(n=max_num).to_numpy()
        parquet_list.append(sample_array)
        print(f"{parquet_path} ({len(parquet_list[-1])})")

# Concatenate all data at once
parquet_array = np.concatenate(parquet_list, axis=0)
print(f"Total rows: {len(parquet_array)}")
sample_data = pd.DataFrame(parquet_array, columns=df.columns.str.lower())
sample_data.to_csv(f"data/{tlc_type.upper()}_Trip_Data.csv")

data/yellow_tripdata/2020/yellow_tripdata_2020-01.parquet (250000)
data/yellow_tripdata/2020/yellow_tripdata_2020-02.parquet (250000)
data/yellow_tripdata/2020/yellow_tripdata_2020-03.parquet (250000)
data/yellow_tripdata/2020/yellow_tripdata_2020-04.parquet (237941)
data/yellow_tripdata/2020/yellow_tripdata_2020-05.parquet (237941)
data/yellow_tripdata/2020/yellow_tripdata_2020-06.parquet (237941)
data/yellow_tripdata/2020/yellow_tripdata_2020-07.parquet (237941)
data/yellow_tripdata/2020/yellow_tripdata_2020-08.parquet (237941)
data/yellow_tripdata/2020/yellow_tripdata_2020-09.parquet (237941)
data/yellow_tripdata/2020/yellow_tripdata_2020-10.parquet (237941)
data/yellow_tripdata/2020/yellow_tripdata_2020-11.parquet (237941)
data/yellow_tripdata/2020/yellow_tripdata_2020-12.parquet (237941)
data/yellow_tripdata/2021/yellow_tripdata_2021-01.parquet (237941)
data/yellow_tripdata/2021/yellow_tripdata_2021-02.parquet (237941)
data/yellow_tripdata/2021/yellow_tripdata_2021-03.parquet (237