# 02 – Prepare NYC Taxi Data
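This notebook reads the raw NYC yellow-taxi trip data for January 2020, keeps the columns relevant for modeling, drops rows with nulls in key fields, adds a `tip_pct` feature (tip relative to fare), and saves the result in Delta format.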

%pip install -e ../src

In [None]:
from pyspark.sql.functions import col
from project.common import get_spark_session, get_logger
from project.data_ingestion import read_csv, select_columns, filter_nulls
from project.feature_engineering import add_ratio_column, drop_null_rows

# Logger for this notebook's progress messages; "prepare_data" is the name
# handed to the project's get_logger helper.
logger = get_logger("prepare_data")

In [None]:
# Get the Spark session from the project's shared helper; `spark` is passed to
# the CSV reader further down.
spark = get_spark_session()

In [None]:
# Read the raw yellow-taxi trip CSV for January 2020 from the
# `databricks-datasets` location on DBFS.
# NOTE(review): header and schema handling are decided inside read_csv —
# confirm the numeric columns are not loaded as strings.
raw_path = "dbfs:/databricks-datasets/nyctaxi/tripdata/yellow/yellow_tripdata_2020-01.csv"
logger.info(f"Reading raw data from {raw_path}")
df = read_csv(spark, raw_path)

In [None]:
# Keep only the fields used by the later steps, grouped by what they describe.
# The concatenation preserves the original column order.
timestamp_columns = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
trip_columns = ["passenger_count", "trip_distance"]
amount_columns = ["fare_amount", "tip_amount", "total_amount"]
columns = timestamp_columns + trip_columns + amount_columns
df = select_columns(df, columns)

In [None]:
# Remove rows with nulls in the key fields listed below
required_fields = ["trip_distance", "fare_amount", "total_amount"]
df_clean = drop_null_rows(df, subset=required_fields)

In [None]:
# Feature: tip relative to fare, stored in the new "tip_pct" column.
# NOTE(review): the null filter applied earlier does not include tip_amount,
# and fare_amount could be zero — confirm how add_ratio_column handles
# null/zero inputs, and whether it scales the ratio by 100 as the "_pct"
# suffix suggests.
df_features = add_ratio_column(df_clean, "tip_amount", "fare_amount", "tip_pct")

In [None]:
# Persist the prepared data in Delta format; "overwrite" mode replaces
# whatever is already stored at the output path.
output_path = "dbfs:/tmp/nyc_taxi/prepared_data"
logger.info(f"Saving cleaned data to {output_path}")
delta_writer = df_features.write.format("delta").mode("overwrite")
delta_writer.save(output_path)

# ✅ Data preparation completed. Output saved in Delta format at `dbfs:/tmp/nyc_taxi/prepared_data`.