# Understanding Hired Rides in NYC

_[Project prompt](https://docs.google.com/document/d/1VERPjEZcC1XSs4-02aM-DbkNr_yaJVbFjLJxaYQswqA/edit#)_

_This scaffolding notebook may be used to help setup your final project. It's **totally optional** whether you make use of this or not._

_If you do use this notebook, everything provided is optional as well - you may remove or add prose and code as you wish._

_Anything in italics (prose) or comments (in code) is meant to provide you with guidance. **Remove the italic lines and provided comments** before submitting the project, if you choose to use this scaffolding. We don't need the guidance when grading._

_**All code below should be consider "pseudo-code" - not functional by itself, and only a suggestion at the approach.**_

## Project Setup

In [57]:
# all import statements needed for the project, for example:

import os

import bs4
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import requests
import sqlalchemy as db
import re
from datetime import datetime, timedelta
import numpy as np
import fiona
import math

In [3]:
# any constants you might need; some have been added for you, and 
# some you need to fill in

TLC_URL = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

TAXI_ZONES_DIR = ""
TAXI_ZONES_SHAPEFILE = f"{TAXI_ZONES_DIR}/taxi_zones.shp"
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# (lat, lon)
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [5]:
# Make sure the QUERY_DIRECTORY exists
try:
    os.mkdir(QUERY_DIRECTORY)
except Exception as e:
    if e.errno == 17:
        # the directory already exists
        pass
    else:
        raise

## Part 1: Data Preprocessing

In [4]:
# Generate the date range for Yellow Taxi and HVFHV datasets
start_date = "2020-01"
end_date = "2024-08"

dates = []
current_date = datetime.strptime(start_date, "%Y-%m")
end_date_obj = datetime.strptime(end_date, "%Y-%m")

while current_date <= end_date_obj:
    dates.append(current_date.strftime("%Y-%m"))
    current_date += timedelta(days=31)  # Move to the next month
    current_date = current_date.replace(day=1)

# Base URLs for Yellow Taxi and HVFHV datasets
yellow_taxi_base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"
hvhf_base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_"

# Save directories for Yellow Taxi and HVFHV data
yellow_taxi_save_dir = "data/yellow_taxi"
hvhf_save_dir = "data/hvhf"

# Ensure the directories exist
os.makedirs(yellow_taxi_save_dir, exist_ok=True)
os.makedirs(hvhf_save_dir, exist_ok=True)

# Download Yellow Taxi data
for date in dates:
    file_url = f"{yellow_taxi_base_url}{date}.parquet"
    file_name = f"{date}.parquet"
    local_file_path = os.path.join(yellow_taxi_save_dir, file_name)

    print(f"Downloading Yellow Taxi file: {file_url} ...")
    try:
        response = requests.get(file_url, stream=True)
        response.raise_for_status()
        with open(local_file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        print(f"File saved to: {local_file_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {file_url}: {e}")


Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-01.parquet ...
File saved to: data/yellow_taxi/2020-01.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-02.parquet ...
File saved to: data/yellow_taxi/2020-02.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-03.parquet ...
File saved to: data/yellow_taxi/2020-03.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-04.parquet ...
File saved to: data/yellow_taxi/2020-04.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-05.parquet ...
File saved to: data/yellow_taxi/2020-05.parquet
Downloading Yellow Taxi file: https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2020-06.parquet ...
File saved to: data/yellow_taxi/2020-06.parquet
Downloading Yellow Taxi file

In [6]:
# Download HVFHV data
for date in dates:
    file_url = f"{hvhf_base_url}{date}.parquet"
    file_name = f"{date}.parquet"
    local_file_path = os.path.join(hvhf_save_dir, file_name)

    print(f"Downloading HVFHV file: {file_url} ...")
    try:
        response = requests.get(file_url, stream=True)
        response.raise_for_status()
        with open(local_file_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        print(f"File saved to: {local_file_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {file_url}: {e}")

Downloading HVFHV file: https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-01.parquet ...
File saved to: data/hvhf/2020-01.parquet
Downloading HVFHV file: https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-02.parquet ...
File saved to: data/hvhf/2020-02.parquet
Downloading HVFHV file: https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-03.parquet ...
File saved to: data/hvhf/2020-03.parquet
Downloading HVFHV file: https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-04.parquet ...
File saved to: data/hvhf/2020-04.parquet
Downloading HVFHV file: https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-05.parquet ...
File saved to: data/hvhf/2020-05.parquet
Downloading HVFHV file: https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-06.parquet ...
File saved to: data/hvhf/2020-06.parquet
Downloading HVFHV file: https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2020-07.parquet ...
File 

In [8]:
def filter_uber_rides(hvhf_data: pd.DataFrame) -> pd.DataFrame:
    """
    Filters out non-Uber rides from the HVFHV dataset.

    Args:
        hvhf_data (pd.DataFrame): The raw HVFHV dataset.

    Returns:
        pd.DataFrame: Filtered dataset containing only Uber rides.
    """
    # Ensure 'Hvfhs_license_num' is treated as a string
    hvhf_data['hvfhs_license_num'] = hvhf_data['hvfhs_license_num'].astype(str)
    
    # Filter rows where 'hvfhs_license_num' is 'HV0003' (Uber)
    uber_only_data = hvhf_data[hvhf_data['hvfhs_license_num'] == 'HV0003'].copy()

    print(f"Filtered Uber rides: {len(uber_only_data)} out of {len(hvhf_data)} total rides.")
    
    # Warning if no Uber rides are found
    if uber_only_data.empty:
        print("Warning: No Uber rides found after filtering.")
    
    return uber_only_data


def cochran_sample_size(population_size: int, confidence_level: float = 0.95, margin_of_error: float = 0.05, p: float = 0.5) -> int:
    """
    Calculate the sample size using Cochran's formula.

    Args:
        population_size (int): The total number of data points in the population.
        confidence_level (float): The confidence level (default is 0.95).
        margin_of_error (float): The margin of error (default is 0.05).
        p (float): The estimated proportion of the population.

    Returns:
        int: The calculated sample size.
    """
    z = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}.get(confidence_level, 1.96)
    numerator = (z ** 2) * p * (1 - p)
    denominator = margin_of_error ** 2
    sample_size = numerator / denominator

    if population_size > 0:
        adjusted_sample_size = sample_size / (1 + (sample_size - 1) / population_size)
    else:
        adjusted_sample_size = sample_size
    
    return int(np.ceil(adjusted_sample_size))


def process_dataset(file_path: str, output_dir: str, filter_uber: bool = False, p: float = 0.5) -> None:
    """
    Processes a dataset: loads, filters, samples, and saves the result.

    Args:
        file_path (str): Path to the dataset file.
        output_dir (str): Directory to save the processed file.
        filter_uber (bool): Whether to filter for Uber rides (default is False).
        p (float): The proportion of variability for sampling (default is 0.5).
    """
    print(f"Processing file: {file_path}")
    
    # Load the dataset
    data = pd.read_parquet(file_path)
    
    # Apply filtering for Uber rides if needed
    if filter_uber:
        data = filter_uber_rides(data)
    
    # Determine population size
    population_size = len(data)
    print(f"Population size: {population_size}")
    
    # Calculate sample size
    sample_size = cochran_sample_size(population_size, confidence_level=0.95, margin_of_error=0.05, p=p)
    print(f"Calculated sample size: {sample_size}")
    
    # Sample the dataset
    sampled_data = data.sample(n=sample_size, random_state=42)
    
    # Save the sampled dataset
    output_file = os.path.join(output_dir, os.path.basename(file_path))
    sampled_data.to_parquet(output_file)
    print(f"Processed file saved to: {output_file}")


# Example Workflow

# Input directories (where raw datasets are stored)
yellow_taxi_dir = "data/yellow_taxi"
hvhf_dir = "data/hvhf"

# Output directories (where processed datasets will be saved)
processed_yellow_taxi_dir = "processed_data/yellow_taxi"
processed_hvhf_dir = "processed_data/hvhf"

# Ensure output directories exist
os.makedirs(processed_yellow_taxi_dir, exist_ok=True)
os.makedirs(processed_hvhf_dir, exist_ok=True)

# Process Yellow Taxi datasets (using p = 0.5)
for file in os.listdir(yellow_taxi_dir):
    if file.endswith(".parquet"):
        process_dataset(
            file_path=os.path.join(yellow_taxi_dir, file),
            output_dir=processed_yellow_taxi_dir,
            filter_uber=False,
            p=0.5  # Higher variability for Yellow Taxi
        )

# Process HVFHV datasets (filter for Uber rides, using p = 0.4)
for file in os.listdir(hvhf_dir):
    if file.endswith(".parquet"):
        process_dataset(
            file_path=os.path.join(hvhf_dir, file),
            output_dir=processed_hvhf_dir,
            filter_uber=True,
            p=0.4  # Lower variability for Uber rides
        )


Processing file: data/yellow_taxi/2022-04.parquet
Population size: 3599920
Calculated sample size: 385
Processed file saved to: processed_data/yellow_taxi/2022-04.parquet
Processing file: data/yellow_taxi/2021-08.parquet
Population size: 2788757
Calculated sample size: 385
Processed file saved to: processed_data/yellow_taxi/2021-08.parquet
Processing file: data/yellow_taxi/2023-12.parquet
Population size: 3376567
Calculated sample size: 385
Processed file saved to: processed_data/yellow_taxi/2023-12.parquet
Processing file: data/yellow_taxi/2023-02.parquet
Population size: 2913955
Calculated sample size: 385
Processed file saved to: processed_data/yellow_taxi/2023-02.parquet
Processing file: data/yellow_taxi/2021-01.parquet
Population size: 1369769
Calculated sample size: 385
Processed file saved to: processed_data/yellow_taxi/2021-01.parquet
Processing file: data/yellow_taxi/2021-11.parquet
Population size: 3472949
Calculated sample size: 385
Processed file saved to: processed_data/ye

In [56]:
def combine_monthly_files(data_dir: str) -> pd.DataFrame:
    """
    Combines all Parquet files in the specified directory into a single DataFrame.

    Args:
        data_dir (str): Directory containing the monthly Parquet files.

    Returns:
        pd.DataFrame: A combined DataFrame with data from all monthly files.
    """
    # List to store DataFrames for each file
    data_frames = []

    # Iterate through all Parquet files in the directory
    for file in os.listdir(data_dir):
        if file.endswith(".parquet"):
            file_path = os.path.join(data_dir, file)
            monthly_data = pd.read_parquet(file_path)
            data_frames.append(monthly_data)

    # Combine all DataFrames into one
    combined_data = pd.concat(data_frames, ignore_index=True)
    
    return combined_data

# Directories for processed data
processed_yellow_taxi_dir = "processed_data/yellow_taxi"
processed_hvhf_dir = "processed_data/hvhf"

# Combine Yellow Taxi data
yellow_taxi_data = combine_monthly_files(processed_yellow_taxi_dir)

# Combine Uber (HVFHV) data
uber_data = combine_monthly_files(processed_hvhf_dir)

In [64]:
yellow_taxi_data

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Airport_fee
0,2,2022-04-29 21:40:49,2022-04-29 21:46:33,1.0,0.61,1.0,N,229,161,2,5.00,0.5,0.5,0.00,0.00,0.3,8.80,2.5,0.00,
1,2,2022-04-04 14:27:06,2022-04-04 15:13:35,6.0,10.16,1.0,N,138,65,1,35.50,0.0,0.5,7.51,0.00,0.3,45.06,0.0,1.25,
2,1,2022-04-10 09:32:27,2022-04-10 09:43:42,1.0,2.00,1.0,N,262,43,1,9.50,2.5,0.5,5.00,0.00,0.3,17.80,2.5,0.00,
3,2,2022-04-19 09:35:44,2022-04-19 09:55:06,1.0,3.20,1.0,N,48,75,2,15.00,0.0,0.5,0.00,0.00,0.3,18.30,2.5,0.00,
4,2,2022-04-04 12:50:04,2022-04-04 12:50:07,2.0,0.00,5.0,N,7,138,1,45.00,0.0,0.0,9.06,0.00,0.3,54.36,0.0,0.00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21551,2,2022-11-08 13:33:25,2022-11-08 13:51:41,1.0,1.03,1.0,N,43,43,1,11.00,0.0,0.5,0.00,0.00,0.3,14.30,2.5,0.00,
21552,2,2022-11-05 02:10:01,2022-11-05 02:18:51,1.0,2.50,1.0,N,114,13,1,9.50,0.5,0.5,2.66,0.00,0.3,15.96,2.5,0.00,
21553,2,2022-11-08 21:35:31,2022-11-08 21:43:23,1.0,1.23,1.0,N,148,231,1,7.00,0.5,0.5,1.00,0.00,0.3,11.80,2.5,0.00,
21554,1,2022-11-12 04:34:04,2022-11-12 04:55:38,1.0,10.30,1.0,N,48,138,1,30.50,3.0,0.5,8.15,6.55,0.3,49.00,2.5,0.00,


### Load Taxi Zones

In [70]:
def load_taxi_zones(file_path):
    geofile = gpd.read_file(file_path)
    return geofile
    
taxi_zones = load_taxi_zones("taxi_zones.shp")

In [72]:
def lookup_coords_for_taxi_zone_id(zone_loc_id, loaded_taxi_zones):
    zone = loaded_taxi_zones[loaded_taxi_zones['LocationID'] == zone_loc_id]
    
    # If no match is found, return None
    if zone.empty:
        return None
    
    # Get the centroid of the zone's geometry
    centroid = zone.geometry.centroid.iloc[0]
    
    # Return the latitude and longitude as a tuple
    return (centroid.y, centroid.x)

### Calculate Sample Size

In [59]:
def calculate_sample_size(population, p = 0.5) -> int:
    """
    Calculates the required sample size using Cochran's formula.

    Args:
        population (int): The total population size.
        confidence_level (float): Confidence level as a proportion (default is 0.95 for 95% confidence).
        margin_of_error (float): Desired margin of error as a proportion (default is 0.05 for 5%).

    Returns:
        int: Calculated sample size.
    """
    # Z-value for confidence level (default: 1.96 for 95%)
    z = 1.96
    margin_of_error = 0.05
    q = 1 - p  # Complementary proportion
    
    # Cochran's sample size formula for infinite population
    n_0 = (z**2 * p * q) / (margin_of_error**2)
    
    # Adjust for finite population size
    sample_size = n_0 / (1 + (n_0 - 1) / population)
    
    return math.ceil(sample_size)

### Common Functions

In [79]:
def get_all_urls_from_page(page_url):
    """
    Fetches all URLs from a given webpage.

    Args:
        page_url (str): URL of the webpage to scrape.

    Returns:
        list: List of all URLs found on the webpage.
    """
    try:
        # Send a GET request to the page
        response = requests.get(page_url)
        response.raise_for_status()  # Raise an HTTPError for bad responses
    except requests.exceptions.RequestException as e:
        raise Exception(f"Failed to access the URL: {page_url}. Error: {e}")
    
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all anchor tags with href attributes
    links = soup.find_all("a", href=True)
    
    # Extract and return all href attributes
    all_urls = [link["href"] for link in links]
    
    return all_urls

In [81]:
def filter_parquet_urls(links):
    parquet_urls = []
    for url in links:
        # Normalize the URL (strip whitespace, handle cases like trailing slashes)
        url = url.strip()
        # Use regex to ensure matching even with query parameters
        if re.search(r"\.parquet(\?.*)?$", url):
            parquet_urls.append(url)
    return parquet_urls

### Process Taxi Data

In [83]:
def get_and_clean_taxi_month(parquet_url: str) -> pd.DataFrame:
    """
    Downloads, processes, and saves Yellow Taxi dataset for a given month.

    Args:
        parquet_url (str): URL of the Yellow Taxi Parquet file.

    Returns:
        pd.DataFrame: Sampled and processed DataFrame.
    """
    # Default directory for processed Yellow Taxi data
    save_dir = "processed_data/yellow_taxi"

    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Extract file name and define local path
    file_name = parquet_url.split("/")[-1]
    local_file_path = os.path.join(save_dir, file_name)

    # Download the file if not already downloaded
    if not os.path.exists(local_file_path):
        print(f"Downloading Yellow Taxi file: {parquet_url} ...")
        try:
            response = requests.get(parquet_url, stream=True)
            response.raise_for_status()
            with open(local_file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                    if chunk:
                        f.write(chunk)
            print(f"File saved to: {local_file_path}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {parquet_url}: {e}")
            return pd.DataFrame()  # Return an empty DataFrame if download fails
    else:
        print(f"Loading file from local storage: {local_file_path}")

    # Load the dataset
    try:
        data = pd.read_parquet(local_file_path)
    except Exception as e:
        print(f"Error reading Parquet file {local_file_path}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame if reading fails

    # Determine population size
    population = len(data)
    print(f"Population size: {population}")

    # Calculate sample size (using p = 0.5 for Yellow Taxi data)
    sample_size = calculate_sample_size(population, p = 0.5)
    print(f"Calculated sample size: {sample_size}")

    # Sample the dataset
    sampled_data = data.sample(n=sample_size, random_state=42) if population > sample_size else data

    # Save the sampled dataset
    processed_file_path = os.path.join(save_dir, f"sampled_{file_name}")
    sampled_data.to_parquet(processed_file_path)
    print(f"Processed file saved to: {processed_file_path}")

    return sampled_data


In [125]:
def get_and_clean_taxi_data(parquet_urls):
    all_taxi_dataframes = []
    yellow_taxi_pattern = re.compile(r"yellow_tripdata_(2020-(0[1-9]|1[0-2])|202[1-3]-(0[1-9]|1[0-2])|2024-(0[1-8]))\.parquet")

    # Filter URLs matching the pattern
    yellow_taxi_urls = [url for url in parquet_urls if yellow_taxi_pattern.search(url)]
    
    for url in yellow_taxi_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_taxi_month(url)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        
        all_taxi_dataframes.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    taxi_data = pd.concat(all_taxi_dataframes)
    return taxi_data

In [127]:
def get_taxi_data():
    all_urls = get_all_urls_from_page(TLC_URL)
    all_parquet_urls = filter_parquet_urls(all_urls)
    taxi_data = get_and_clean_taxi_data(all_parquet_urls)
    return taxi_data

In [129]:
taxi_data = get_taxi_data()

Loading file from local storage: processed_data/yellow_taxi/yellow_tripdata_2024-01.parquet
Population size: 2964624
Calculated sample size: 385
Processed file saved to: processed_data/yellow_taxi/sampled_yellow_tripdata_2024-01.parquet
Loading file from local storage: processed_data/yellow_taxi/yellow_tripdata_2024-02.parquet
Population size: 3007526
Calculated sample size: 385
Processed file saved to: processed_data/yellow_taxi/sampled_yellow_tripdata_2024-02.parquet
Loading file from local storage: processed_data/yellow_taxi/yellow_tripdata_2024-03.parquet
Population size: 3582628
Calculated sample size: 385
Processed file saved to: processed_data/yellow_taxi/sampled_yellow_tripdata_2024-03.parquet
Loading file from local storage: processed_data/yellow_taxi/yellow_tripdata_2024-04.parquet
Population size: 3514289
Calculated sample size: 385
Processed file saved to: processed_data/yellow_taxi/sampled_yellow_tripdata_2024-04.parquet
Loading file from local storage: processed_data/yell

In [131]:
taxi_data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee
1725696,2,2024-01-20 13:31:30,2024-01-20 14:03:25,2.0,17.14,2.0,N,132,233,1,70.0,0.0,0.5,8.27,6.94,1.0,90.96,2.5,1.75,
1581136,2,2024-01-18 21:52:46,2024-01-18 22:03:21,1.0,2.49,1.0,N,163,75,1,13.5,1.0,0.5,4.0,0.0,1.0,22.5,2.5,0.0,
19137,2,2024-01-01 03:43:58,2024-01-01 03:50:47,2.0,1.84,1.0,N,127,20,2,10.0,1.0,0.5,0.0,0.0,1.0,12.5,0.0,0.0,
1682810,1,2024-01-19 22:20:12,2024-01-19 22:50:12,1.0,3.6,1.0,N,186,263,1,23.3,3.5,0.5,5.65,0.0,1.0,33.95,2.5,0.0,
511035,2,2024-01-06 22:41:50,2024-01-06 22:43:24,1.0,0.04,1.0,N,238,238,2,3.7,1.0,0.5,0.0,0.0,1.0,6.2,0.0,0.0,


In [133]:
taxi_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21556 entries, 1725696 to 701728
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               21556 non-null  int64         
 1   tpep_pickup_datetime   21556 non-null  datetime64[ns]
 2   tpep_dropoff_datetime  21556 non-null  datetime64[ns]
 3   passenger_count        20381 non-null  float64       
 4   trip_distance          21556 non-null  float64       
 5   RatecodeID             20381 non-null  float64       
 6   store_and_fwd_flag     20381 non-null  object        
 7   PULocationID           21556 non-null  int64         
 8   DOLocationID           21556 non-null  int64         
 9   payment_type           21556 non-null  int64         
 10  fare_amount            21556 non-null  float64       
 11  extra                  21556 non-null  float64       
 12  mta_tax                21556 non-null  float64       
 13 

In [135]:
taxi_data.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,airport_fee
count,21556.0,21556,21556,20381.0,21556.0,20381.0,21556.0,21556.0,21556.0,21556.0,21556.0,21556.0,21556.0,21556.0,21556.0,21556.0,20381.0,6853.0,8201.0
mean,1.719197,2022-05-01 19:16:50.909073920,2022-05-01 19:33:07.142048512,1.39998,3.292046,1.468132,165.086472,161.552282,1.184682,15.582768,1.192449,0.488194,2.69241,0.451578,0.542666,22.609807,2.265958,0.140668,0.09069
min,1.0,2009-01-01 01:11:17,2009-01-01 01:11:20,0.0,0.0,1.0,4.0,1.0,0.0,-250.0,-7.5,-0.5,-0.13,-34.2,-1.0,-251.0,-2.5,-1.75,-1.25
25%,1.0,2021-03-01 12:26:32.750000128,2021-03-01 12:34:07,1.0,1.05,1.0,132.0,107.0,1.0,7.2,0.0,0.5,0.0,0.0,0.3,12.6,2.5,0.0,0.0
50%,2.0,2022-05-01 13:35:11.500000,2022-05-01 14:01:36.500000,1.0,1.8,1.0,162.0,162.0,1.0,10.7,0.5,0.5,2.15,0.0,0.3,17.02,2.5,0.0,0.0
75%,2.0,2023-07-01 00:03:05,2023-07-01 00:12:55.500000,1.0,3.31,1.0,234.0,234.0,1.0,17.5,2.5,0.5,3.44,0.0,1.0,24.8,2.5,0.0,0.0
max,6.0,2024-08-31 23:00:33,2024-08-31 23:34:12,6.0,67.9,99.0,265.0,265.0,4.0,278.8,11.75,0.5,51.15,57.05,1.0,289.35,2.5,1.75,1.25
std,0.493676,,,0.981261,4.272702,6.286294,65.699489,70.905294,0.568066,15.408238,1.511804,0.09758,3.282479,1.909912,0.353155,19.130045,0.780689,0.479079,0.330094


### Processing Uber Data

In [None]:
def get_and_clean_uber_month(parquet_url):
    save_dir = "processed_data/hvhf"

    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)

    # Extract file name and define local path
    file_name = parquet_url.split("/")[-1]
    local_file_path = os.path.join(save_dir, file_name)

    # Download the file if not already downloaded
    if not os.path.exists(local_file_path):
        print(f"Downloading HVHF file: {parquet_url} ...")
        try:
            response = requests.get(parquet_url, stream=True)
            response.raise_for_status()
            with open(local_file_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):  # 1MB chunks
                    if chunk:
                        f.write(chunk)
            print(f"File saved to: {local_file_path}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {parquet_url}: {e}")
            return pd.DataFrame()  # Return an empty DataFrame if download fails
    else:
        print(f"Loading file from local storage: {local_file_path}")

    # Load the dataset
    try:
        data = pd.read_parquet(local_file_path)
    except Exception as e:
        print(f"Error reading Parquet file {local_file_path}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame if reading fails

    # Determine population size
    population = len(data)
    print(f"Population size: {population}")

    # Calculate sample size (using p = 0.5 for Yellow Taxi data)
    sample_size = calculate_sample_size(population, p = 0.4)
    print(f"Calculated sample size: {sample_size}")

    # Sample the dataset
    sampled_data = data.sample(n=sample_size, random_state=42) if population > sample_size else data

    # Save the sampled dataset
    processed_file_path = os.path.join(save_dir, f"sampled_{file_name}")
    sampled_data.to_parquet(processed_file_path)
    print(f"Processed file saved to: {processed_file_path}")

    return sampled_data

In [None]:
def get_and_clean_uber_data(parquet_urls):
    all_uber_dataframes = []
    hvfhv_pattern = re.compile(r"fhvhv_tripdata_\d{4}-\d{2}\.parquet")
    hvfhv_urls = [url for url in links if hvfhv_pattern.search(url)]
    for url in hvfhv_urls:
        # maybe: first try to see if you've downloaded this exact
        # file already and saved it before trying again
        dataframe = get_and_clean_uber_month(url)
        # maybe: if the file hasn't been saved, save it so you can
        # avoid re-downloading it if you re-run the function
        
        all_uber_dataframes.append(dataframe)
        
    # create one gigantic dataframe with data from every month needed
    uber_data = pd.contact(all_uber_dataframes)
    return uber_data

In [None]:
def load_and_clean_uber_data(dataframe):
    uber_data['hvfhs_license_num'] = uber_data['hvfhs_license_num'].astype(str)
    
    # Filter rows where 'hvfhs_license_num' is 'HV0003' (Uber)
    uber_only_data = uber_data[uber['hvfhs_license_num'] == 'HV0003'].copy()
    return uber_only_data

In [None]:
def get_uber_data():
    all_urls = get_all_urls_from_tlc_page(TLC_URL)
    all_parquet_urls = find_parquet_urls(all_urls)
    uber_data = get_and_clean_uber_data(all_parquet_urls)
    taxi_data = load_and_clean_uber_data(uber_data)
    return taxi_data

In [None]:
uber_data = get_uber_data()

In [None]:
uber_data.head()

In [None]:
uber_data.info()

In [None]:
uber_data.describe()

### Processing Weather Data

In [None]:
def get_all_weather_csvs(directory):
    raise NotImplementedError()

In [None]:
def clean_month_weather_data_hourly(csv_file):
    raise NotImplementedError()

In [None]:
def clean_month_weather_data_daily(csv_file):
    raise NotImplementedError()

In [None]:
def load_and_clean_weather_data():
    weather_csv_files = get_all_weather_csvs(WEATHER_CSV_DIR)
    
    hourly_dataframes = []
    daily_dataframes = []
        
    for csv_file in weather_csv_files:
        hourly_dataframe = clean_month_weather_data_hourly(csv_file)
        daily_dataframe = clean_month_weather_data_daily(csv_file)
        hourly_dataframes.append(hourly_dataframe)
        daily_dataframes.append(daily_dataframe)
        
    # create two dataframes with hourly & daily data from every month
    hourly_data = pd.concat(hourly_dataframes)
    daily_data = pd.concat(daily_dataframes)
    
    return hourly_data, daily_data

In [None]:
hourly_weather_data, daily_weather_data = load_and_clean_weather_data()

In [None]:
hourly_weather_data.head()

In [None]:
hourly_weather_data.info()

In [None]:
hourly_weather_data.describe()

In [None]:
daily_weather_data.head()

In [None]:
daily_weather_data.info()

In [None]:
daily_weather_data.describe()

## Part 2: Storing Cleaned Data

In [None]:
engine = db.create_engine(DATABASE_URL)

In [None]:
# if using SQL (as opposed to SQLAlchemy), define the commands 
# to create your 4 tables/dataframes
HOURLY_WEATHER_SCHEMA = """
TODO
"""

DAILY_WEATHER_SCHEMA = """
TODO
"""

TAXI_TRIPS_SCHEMA = """
TODO
"""

UBER_TRIPS_SCHEMA = """
TODO
"""

In [None]:
# create that required schema.sql file
with open(DATABASE_SCHEMA_FILE, "w") as f:
    f.write(HOURLY_WEATHER_SCHEMA)
    f.write(DAILY_WEATHER_SCHEMA)
    f.write(TAXI_TRIPS_SCHEMA)
    f.write(UBER_TRIPS_SCHEMA)

In [None]:
# create the tables with the schema files
with engine.connect() as connection:
    pass

### Add Data to Database

In [None]:
def write_dataframes_to_table(table_to_df_dict):
    raise NotImplemented()

In [None]:
map_table_name_to_dataframe = {
    "taxi_trips": taxi_data,
    "uber_trips": uber_data,
    "hourly_weather": hourly_data,
    "daily_weather": daily_data,
}

In [None]:
write_dataframes_to_table(map_table_name_to_dataframe)

## Part 3: Understanding the Data

In [None]:
# Helper function to write the queries to file
def write_query_to_file(query, outfile):
    raise NotImplementedError()

### Query 1

In [None]:
QUERY_1_FILENAME = ""

QUERY_1 = """
TODO
"""

In [None]:
# execute query either via sqlalchemy
with engine.connect() as con:
    results = con.execute(db.text(QUERY_1)).fetchall()
results

# or via pandas
pd.read_sql(QUERY_1, con=engine)

In [None]:
write_query_to_file(QUERY_1, QUERY_1_FILENAME)

## Part 4: Visualizing the Data

### Visualization 1

In [None]:
# use a more descriptive name for your function
def plot_visual_1(dataframe):
    figure, axes = plt.subplots(figsize=(20, 10))
    
    values = "..."  # use the dataframe to pull out values needed to plot
    
    # you may want to use matplotlib to plot your visualizations;
    # there are also many other plot types (other 
    # than axes.plot) you can use
    axes.plot(values, "...")
    # there are other methods to use to label your axes, to style 
    # and set up axes labels, etc
    axes.set_title("Some Descriptive Title")
    
    plt.show()

In [None]:
def get_data_for_visual_1():
    # Query SQL database for the data needed.
    # You can put the data queried into a pandas dataframe, if you wish
    raise NotImplementedError()

In [None]:
some_dataframe = get_data_for_visual_1()
plot_visual_1(some_dataframe)