In [1]:
import os
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Constants
TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
YELLOW_TAXI_DIR = "./yellow_taxi_data"
UBER_DATA_DIR = "./uber_data"
TAXI_ZONES_DIR = "./taxi_zones"
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.parquet"

# Ensure directories exist
os.makedirs(YELLOW_TAXI_DIR, exist_ok=True)
os.makedirs(UBER_DATA_DIR, exist_ok=True)
os.makedirs(TAXI_ZONES_DIR, exist_ok=True)

In [5]:
# Downloading
def fetch_and_download_data(base_url, regex_pattern, save_dir):
    """
    Downloads Parquet files based on a regex pattern.
    """
    os.makedirs(save_dir, exist_ok=True)
    response = requests.get(base_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Filter links by regex for 2020-2024 data
    links = soup.find_all("a", href=re.compile(regex_pattern))
    for link in links:
        file_url = link["href"]
        if not file_url.startswith("http"):
            file_url = f"https:{file_url}"
        file_name = file_url.split("/")[-1]
        save_path = os.path.join(save_dir, file_name)

        if not os.path.exists(save_path):
            print(f"Downloading {file_name}...")
            file_response = requests.get(file_url, stream=True)
            with open(save_path, "wb") as f:
                for chunk in file_response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
            print(f"Saved {file_name} to {save_dir}.")
        else:
            print(f"{file_name} already exists. Skipping download.")

In [7]:
# Sampling
def calculate_sample_size(population, confidence_level=0.95, margin_of_error=0.05):
    """
    Calculate sample size using Cochran's formula.
    """
    z = (2 * (1 - confidence_level)) ** -0.5
    p = 0.5
    numerator = (z ** 2) * p * (1 - p)
    denominator = margin_of_error ** 2
    initial_sample_size = numerator / denominator

    if population:
        adjusted_sample_size = initial_sample_size / (1 + (initial_sample_size - 1) / population)
        return int(adjusted_sample_size + 0.5)
    return int(initial_sample_size + 0.5)

In [9]:
# Cleaning
def get_and_clean_taxi_month(url, sample_size=None):
    """
    Load, clean, and optionally sample a single month's taxi data.
    """
    import pyarrow.parquet as pq

    # Load the Parquet file
    try:
        df = pd.read_parquet(url, engine='pyarrow', use_legacy_dataset=True)
    except Exception as e:
        print(f"Error reading Parquet file: {url}. Error: {e}")
        return pd.DataFrame()  # Return empty DataFrame on failure

    # Cleaning logic
    df = df[df["PULocationID"].notnull() & df["DOLocationID"].notnull()]
    df = df[df["trip_distance"] > 0]

    # Sampling logic
    if sample_size and sample_size < len(df):
        df = df.sample(n=sample_size)

    return df

In [11]:
def get_and_clean_taxi_data(parquet_urls, sample_size=None):
    """
    Process multiple Parquet files, clean, and optionally sample the data.
    """
    all_taxi_data = []

    for url in parquet_urls:
        if not os.path.exists(url):
            print(f"File not found: {url}. Skipping...")
            continue

        df = get_and_clean_taxi_month(url, sample_size)
        if not df.empty:
            all_taxi_data.append(df)

    if all_taxi_data:
        taxi_data = pd.concat(all_taxi_data, ignore_index=True)
    else:
        taxi_data = pd.DataFrame()

    return taxi_data

In [13]:
# Taxi Zone Shapefile
def load_taxi_zones(shapefile):
    """
    Load taxi zone shapefile data as a GeoDataFrame.
    """
    import geopandas as gpd
    try:
        taxi_zones = gpd.read_file(shapefile)
        print("Taxi Zones loaded successfully.")
        return taxi_zones
    except Exception as e:
        print(f"Error loading Taxi Zones: {e}")
        raise

In [15]:
def main():
    # Download data for 2020-2024
    fetch_and_download_data(TLC_URL, r"yellow_tripdata_(202[0-4]-\d{2})\.parquet", YELLOW_TAXI_DIR)
    fetch_and_download_data(TLC_URL, r"fhvhv_tripdata_(202[0-4]-\d{2})\.parquet", UBER_DATA_DIR)

    # Calculate sample sizes
    yellow_sample_size = calculate_sample_size(population=1000000)
    uber_sample_size = calculate_sample_size(population=500000)

    print(f"Yellow Taxi sample size: {yellow_sample_size}")
    print(f"Uber sample size: {uber_sample_size}")

    # Process Yellow Taxi data
    yellow_data_files = [os.path.join(YELLOW_TAXI_DIR, f) for f in os.listdir(YELLOW_TAXI_DIR) if f.endswith(".parquet")]
    print(f"Yellow Taxi files: {yellow_data_files}")

    yellow_data = get_and_clean_taxi_data(yellow_data_files, sample_size=yellow_sample_size)
    print(f"Processed Yellow Taxi data shape: {yellow_data.shape}")

    # Process Uber data
    uber_data_files = [os.path.join(UBER_DATA_DIR, f) for f in os.listdir(UBER_DATA_DIR) if f.endswith(".parquet")]
    print(f"Uber files: {uber_data_files}")

    uber_data = get_and_clean_taxi_data(uber_data_files, sample_size=uber_sample_size)
    print(f"Processed Uber data shape: {uber_data.shape}")

In [17]:
if __name__ == "__main__":
    main()

yellow_tripdata_2024-01.parquet  already exists. Skipping download.
yellow_tripdata_2024-02.parquet  already exists. Skipping download.
yellow_tripdata_2024-03.parquet  already exists. Skipping download.
yellow_tripdata_2024-04.parquet already exists. Skipping download.
yellow_tripdata_2024-05.parquet already exists. Skipping download.
yellow_tripdata_2024-06.parquet already exists. Skipping download.
yellow_tripdata_2024-07.parquet already exists. Skipping download.
yellow_tripdata_2024-08.parquet already exists. Skipping download.
yellow_tripdata_2023-01.parquet already exists. Skipping download.
yellow_tripdata_2023-02.parquet already exists. Skipping download.
yellow_tripdata_2023-03.parquet already exists. Skipping download.
yellow_tripdata_2023-04.parquet already exists. Skipping download.
yellow_tripdata_2023-05.parquet  already exists. Skipping download.
yellow_tripdata_2023-06.parquet already exists. Skipping download.
yellow_tripdata_2023-07.parquet  already exists. Skipping 

  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read

Error reading Parquet file: ./yellow_taxi_data/yellow_tripdata_2019-01.parquet. Error: Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.


  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  pa_table = self.api.parquet.read_table(
  taxi_data = pd.concat(all_taxi_data, ignore_index=True)
  pa_table = self.api.parquet.read_table(


Processed Yellow Taxi data shape: (46953, 20)
Uber files: ['./uber_data/fhvhv_tripdata_2021-03.parquet', './uber_data/fhvhv_tripdata_2020-05.parquet', './uber_data/fhvhv_tripdata_2022-06.parquet', './uber_data/fhvhv_tripdata_2022-07.parquet', './uber_data/fhvhv_tripdata_2023-01.parquet', './uber_data/fhvhv_tripdata_2021-12.parquet', './uber_data/fhvhv_tripdata_2021-02.parquet', './uber_data/fhvhv_tripdata_2020-04.parquet', './uber_data/fhvhv_tripdata_2021-09.parquet', './uber_data/fhvhv_tripdata_2022-05.parquet', './uber_data/fhvhv_tripdata_2024-08.parquet', './uber_data/fhvhv_tripdata_2021-10.parquet', './uber_data/fhvhv_tripdata_2020-06.parquet', './uber_data/fhvhv_tripdata_2021-11.parquet', './uber_data/fhvhv_tripdata_2021-01.parquet', './uber_data/fhvhv_tripdata_2020-07.parquet', './uber_data/fhvhv_tripdata_2021-08.parquet', './uber_data/fhvhv_tripdata_2022-04.parquet', './uber_data/fhvhv_tripdata_2023-02.parquet', './uber_data/fhvhv_tripdata_2022-11.parquet', './uber_data/fhvhv_tr

KeyError: 'trip_distance'