# COMP 3610 Assignment 1   

# Part 1 Data Ingestion and Storage

In [7]:
import os
import requests 
import pandas as pd
from datetime import datetime
from tqdm import tqdm

In [8]:
TRIP_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"
ZONE_URL = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"

RAW_DIR = "data/raw"
os.makedirs(RAW_DIR, exist_ok=True)

def download_with_progress(url, save_path, expected_min_size_mb=100):
    print(f"Downloading {url}...")
    try:
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()
        
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024 * 1024 
        
        with open(save_path, 'wb') as f, tqdm(
            desc=save_path,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for chunk in response.iter_content(chunk_size=block_size):
                if chunk:
                    f.write(chunk)
                    bar.update(len(chunk))
        
        actual_size_mb = os.path.getsize(save_path) / (1024 * 1024)
        print(f"Downloaded {actual_size_mb:.1f} MB to {save_path}")
        
    except Exception as e:
        print(f"Download failed: {e}")
        if os.path.exists(save_path):
            os.remove(save_path)  
        raise

In [9]:

trip_path = os.path.join(RAW_DIR, "yellow_tripdata_2024-01.parquet")
zone_path = os.path.join(RAW_DIR, "taxi_zone_lookup.csv")

if os.path.exists(trip_path):
    os.remove(trip_path)
if os.path.exists(zone_path):
    os.remove(zone_path)

# Download again
download_with_progress(TRIP_URL, trip_path,)
download_with_progress(ZONE_URL, zone_path,)

print("\nDownloads finished.")

Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet...


data/raw\yellow_tripdata_2024-01.parquet: 100%|██████████| 47.6M/47.6M [00:03<00:00, 15.7MiB/s]


Downloaded 47.6 MB to data/raw\yellow_tripdata_2024-01.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv...


data/raw\taxi_zone_lookup.csv: 12.0kiB [00:00, 12.1MiB/s]

Downloaded 0.0 MB to data/raw\taxi_zone_lookup.csv

Downloads finished.





In [22]:
print("\n=== Part 1 Data Validation ===")

trip_path = "data/raw/yellow_tripdata_2024-01.parquet"
zone_path = "data/raw/taxi_zone_lookup.csv"

print (f"Reading trip data from {trip_path}...")
df = pd.read_parquet(trip_path)

print (f"Reading zone data from {zone_path}...")
zones_df = pd.read_csv(zone_path)

expected_trip_columns = [
    'tpep_pickup_datetime', 'tpep_dropoff_datetime',
    'PULocationID', 'DOLocationID', 'passenger_count',
    'trip_distance', 'fare_amount', 'tip_amount',
    'total_amount', 'payment_type'
]

missing_trip_columns = [col for col in expected_trip_columns if col not in df.columns]
if missing_trip_columns:
    raise ValueError(f"Missing columns in trip data: {missing_trip_columns}")
print("All expected columns are present in the trip data.")

expected_zone_columns = ['LocationID', 'Borough', 'Zone', 'service_zone']
missing_zone_columns = [col for col in expected_zone_columns if col not in zones_df.columns]
if missing_zone_columns:
    raise ValueError(f"Missing columns in zone data: {missing_zone_columns}")
print("All expected columns are present in the zone data.")

df ['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], errors='coerce')
df ['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'], errors='coerce')

invalid_dates = df['tpep_pickup_datetime'].isna().sum() + df['tpep_dropoff_datetime'].isna().sum()
if invalid_dates > 0:
    print(f"Warning: Found {invalid_dates} invalid datetime entries in the trip data.")
print("Datetime columns have been validated.")

# Summary
print("\n=== Part 1 Data Summary ===")
print(f"Total number of trips: {len(df)}")
print(f"Zone rows : {len(zones_df)}")


print("\nSample trip data:")
print(df.head(1))

print("\nSample zone data:")
print(zones_df.head(1))

print ("\nValidation and summary completed successfully.")


=== Part 1 Data Validation ===
Reading trip data from data/raw/yellow_tripdata_2024-01.parquet...
Reading zone data from data/raw/taxi_zone_lookup.csv...
All expected columns are present in the trip data.
All expected columns are present in the zone data.
Datetime columns have been validated.

=== Part 1 Data Summary ===
Total number of trips: 2964624
Zone rows : 265

Sample trip data:
   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2024-01-01 00:57:55   2024-01-01 01:17:43              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           1.72         1.0                  N           186            79   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \
0             2         17.7    1.0      0.5         0.0           0.0   

   improvement_surcharge  total_amount  congestion_surcharge  Airport_fee  
0                    1.0          22.7                   2.5          0.0  

# Part 1 Complete add more after done 