# COMP3610 Assignment 1

- **Name:** Sonali Maharaj
- **Student ID:** 816034459
- **Course:** COMP3610  
- **Assignment:** Assignment 1  

---

# Part 1: Data Ingestion

In [2]:

# Programmatically download the files

from pathlib import Path
import requests

# Folder for the unmodified source files; created (with parents) on first run.
RAW_DIR = Path("data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Local filename -> remote URL for every file this notebook needs.
_REMOTE_SOURCES = {
    "yellow_tripdata_2024-01.parquet": "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet",
    "taxi_zone_lookup.csv": "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv",
}

# (url, destination path) pairs, in the order the files are listed above.
FILES = [(url, RAW_DIR / filename) for filename, url in _REMOTE_SOURCES.items()]

def download_file(url, output_path, chunk_size=1024 * 1024, timeout=60):
    """
    Download ``url`` and save it to ``output_path``.

    The response body is streamed in ``chunk_size``-byte pieces into a
    temporary ``<name>.part`` file next to the destination. Only after the
    transfer finishes and the file is non-empty is it moved onto
    ``output_path``. An interrupted or empty download therefore never leaves
    a truncated file at the final location, and never overwrites a good copy
    that was already there.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    output_path : pathlib.Path or str
        Where the downloaded file should end up. Missing parent directories
        are created.
    chunk_size : int, optional
        Number of bytes requested per streamed chunk (default 1 MiB).
    timeout : float, optional
        Timeout in seconds passed to ``requests.get`` (default 60).

    Raises
    ------
    RuntimeError
        If the transfer completed but produced an empty file.
    Exception
        Anything raised by ``requests.get``, ``response.raise_for_status()``
        or by streaming/writing is propagated unchanged.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write next to the destination so the final move stays on one filesystem.
    tmp_path = output_path.with_name(output_path.name + ".part")

    print(f"\nDownloading: {url}")

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip empty chunks
                        f.write(chunk)

        # Validation check (on the temporary file, before it becomes visible)
        if tmp_path.stat().st_size == 0:
            raise RuntimeError(f"Download failed or file is empty: {output_path}")

        # Path.replace overwrites an existing destination in a single step.
        tmp_path.replace(output_path)
    finally:
        # No-op after a successful move; removes the partial file on failure.
        tmp_path.unlink(missing_ok=True)

    print(f"Saved to: {output_path}")
    print(f"File size: {output_path.stat().st_size / 1e6:.2f} MB")


# Fetch each required file unless a non-empty local copy is already present.
for url, path in FILES:
    needs_download = not path.exists() or path.stat().st_size <= 0
    if needs_download:
        download_file(url, path)
        continue
    print(f"File already exists, skipping: {path}")


File already exists, skipping: data\raw\yellow_tripdata_2024-01.parquet
File already exists, skipping: data\raw\taxi_zone_lookup.csv


In [3]:

# Data Validation

import polars as pl
from pathlib import Path 

# Local copy of the trip data that the validation steps read.
PARQUET_PATH = Path("data/raw/yellow_tripdata_2024-01.parquet")

# Stop early with an actionable message if the file has not been downloaded.
if PARQUET_PATH.exists() is False:
    raise FileNotFoundError(
        f"Missing file: {PARQUET_PATH}. Run the download step first."
    )

# Open the file as a lazy frame, then keep its schema and the column names.
lf = pl.scan_parquet(str(PARQUET_PATH))
schema = lf.schema
actual_columns = [*schema.keys()]

# Timestamp columns whose dtype is checked by the validation step.
DATETIME_COLUMNS = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
]

# Columns that must be present: the two timestamps first, then the rest.
EXPECTED_COLUMNS = [
    *DATETIME_COLUMNS,
    "PULocationID",
    "DOLocationID",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "tip_amount",
    "total_amount",
    "payment_type",
]

# a) Every expected column must appear in the file; collect all that do not,
#    in the order they are listed in EXPECTED_COLUMNS.
missing_columns = list(
    filter(lambda name: name not in actual_columns, EXPECTED_COLUMNS)
)
if len(missing_columns) > 0:
    raise ValueError(
        f"Validation failed: Missing required columns: {missing_columns}"
    )

# b) Each timestamp column must have a Datetime dtype; stop at the first
#    column that does not.
for col in DATETIME_COLUMNS:
    column_dtype = schema[col]
    if column_dtype != pl.Datetime:
        raise TypeError(
            f"Validation failed: Column '{col}' is not datetime type."
        )

# c) Row count comes from a single-value query on the lazy frame.
row_count = lf.select(pl.len()).collect().item()

# Headline figures, one per line.
print(
    "Validation Passed",
    f"Total rows: {row_count:,}",
    f"Total columns: {len(actual_columns)}",
    sep="\n",
)

# Materialise the full table, then print its descriptive statistics.
df = lf.collect()
summary_header = "\n--- Dataset Summary ---"
print(summary_header)
print(df.describe())


Validation Passed
Total rows: 2,964,624
Total columns: 19

--- Dataset Summary ---
shape: (9, 20)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ VendorID  ┆ tpep_pick ┆ tpep_drop ┆ … ┆ improveme ┆ total_amo ┆ congestio ┆ Airport_ │
│ ---       ┆ ---       ┆ up_dateti ┆ off_datet ┆   ┆ nt_surcha ┆ unt       ┆ n_surchar ┆ fee      │
│ str       ┆ f64       ┆ me        ┆ ime       ┆   ┆ rge       ┆ ---       ┆ ge        ┆ ---      │
│           ┆           ┆ ---       ┆ ---       ┆   ┆ ---       ┆ f64       ┆ ---       ┆ f64      │
│           ┆           ┆ str       ┆ str       ┆   ┆ f64       ┆           ┆ f64       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 2.964624e ┆ 2964624   ┆ 2964624   ┆ … ┆ 2.964624e ┆ 2.964624e ┆ 2.824462e ┆ 2.824462 │
│           ┆ 6         ┆           ┆           ┆   ┆ 6         ┆ 6         ┆ 6         ┆ e6  