In [2]:
import pandas as pd
import numpy as np
# Build a reproducible synthetic taxi-trip table, deliberately corrupt a few
# rows, and write it to disk as the input of the cleaning pipeline.
sample_size = 10_000
np.random.seed(42)

# NOTE: the random draws below must stay in this order; with a fixed seed,
# reordering them would change every generated column.
sample_data = dict(
    pickup_datetime=pd.date_range(
        "2022-01-01", periods=sample_size, freq="5min"
    ),
    pickup_longitude=np.random.uniform(
        low=-74.03, high=-73.77, size=sample_size
    ),
    pickup_latitude=np.random.uniform(
        low=40.63, high=40.85, size=sample_size
    ),
    dropoff_longitude=np.random.uniform(
        low=-74.03, high=-73.77, size=sample_size
    ),
    dropoff_latitude=np.random.uniform(
        low=40.63, high=40.85, size=sample_size
    ),
    passenger_count=np.random.randint(low=1, high=7, size=sample_size),
    fare_amount=np.random.uniform(low=2.5, high=100, size=sample_size),
    trip_distance=np.random.uniform(low=0.1, high=30, size=sample_size),
)

df = pd.DataFrame(sample_data)

# Deliberately plant bad records (label slices on .loc are end-inclusive)
# so the cleaning step has something to remove.
df.loc[:10, "fare_amount"] = 600            # rows 0-10: fare outliers
df.loc[11:20, "passenger_count"] = 0        # rows 11-20: invalid passenger count
df.loc[21:30, "pickup_longitude"] = -76     # rows 21-30: invalid GPS

# File names used for the raw sample and for the cleaned result.
INPUT_FILE = "sample_nyc_taxi_trips.csv"
OUTPUT_FILE = "cleaned_sample_nyc_taxi_trips.csv"

df.to_csv(INPUT_FILE, index=False)
print("✅ Sample input file created")
def clean_taxi_chunk(df):
    """Return a cleaned copy of one chunk of taxi-trip records.

    The input frame is never modified; all work happens on a copy.

    Cleaning steps:
      1. Column names are lower-cased and stripped of surrounding whitespace.
      2. ``passenger_count``, ``fare_amount``, ``trip_distance`` and the four
         GPS columns are coerced to numeric; unparseable values become NaN
         and therefore fail the range checks below.
      3. Rows are kept only if
         ``0 < passenger_count <= 6``, all four coordinates lie inside the
         bounding box defined below (inclusive), ``0 < fare_amount < 500``
         and ``0 < trip_distance < 100``.
      4. ``pickup_datetime`` is parsed; rows that fail to parse are dropped.
      5. Exact duplicate rows are removed (within this chunk only).

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk containing (after name standardization) the columns
        ``pickup_datetime``, ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude``, ``dropoff_latitude``, ``passenger_count``,
        ``fare_amount`` and ``trip_distance``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows; the original index labels
        of those rows are preserved.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # NYC valid boundaries
    LON_MIN, LON_MAX = -74.03, -73.77
    LAT_MIN, LAT_MAX = 40.63, 40.85

    # Work on a private copy: renaming columns or assigning coerced values
    # directly on the argument would silently alter the caller's frame.
    df = df.copy()

    # -------------------------------
    # Standardize column names
    # -------------------------------
    df.columns = df.columns.str.lower().str.strip()

    # -------------------------------
    # Numeric coercion
    # -------------------------------
    # Done before any filtering so that no column is ever assigned on a
    # filtered slice (the source of SettingWithCopyWarning). Coordinates are
    # coerced as well, so a stray non-numeric value drops the row instead of
    # raising inside ``between``.
    numeric_columns = (
        "passenger_count",
        "pickup_latitude",
        "dropoff_latitude",
        "pickup_longitude",
        "dropoff_longitude",
        "fare_amount",
        "trip_distance",
    )
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column], errors="coerce")

    # -------------------------------
    # Range validation
    # -------------------------------
    # NaN compares False everywhere, so coerced-to-NaN values are rejected.
    passengers_ok = (df["passenger_count"] > 0) & (df["passenger_count"] <= 6)
    gps_ok = (
        df["pickup_latitude"].between(LAT_MIN, LAT_MAX)
        & df["dropoff_latitude"].between(LAT_MIN, LAT_MAX)
        & df["pickup_longitude"].between(LON_MIN, LON_MAX)
        & df["dropoff_longitude"].between(LON_MIN, LON_MAX)
    )
    fare_ok = (df["fare_amount"] > 0) & (df["fare_amount"] < 500)
    distance_ok = (df["trip_distance"] > 0) & (df["trip_distance"] < 100)

    df = df.loc[passengers_ok & gps_ok & fare_ok & distance_ok].copy()

    # -------------------------------
    # Datetime correction
    # -------------------------------
    # Parsed only for rows that survived the numeric checks.
    df["pickup_datetime"] = pd.to_datetime(
        df["pickup_datetime"], errors="coerce"
    )
    df = df.dropna(subset=["pickup_datetime"])

    # -------------------------------
    # Remove duplicates
    # -------------------------------
    return df.drop_duplicates()
# Stream the input CSV in bounded-size pieces, clean each piece, and write the
# results to one output file. Only the first piece truncates the file and
# emits the header row; every later piece is appended without a header.
# NOTE: duplicates are removed per piece by clean_taxi_chunk, so identical
# rows that land in different pieces are not de-duplicated against each other.
chunk_size = 1_000_000
first_chunk = True

for chunk in pd.read_csv(INPUT_FILE, chunksize=chunk_size):
    cleaned_chunk = clean_taxi_chunk(chunk)

    if first_chunk:
        cleaned_chunk.to_csv(OUTPUT_FILE, mode="w", header=True, index=False)
        first_chunk = False
    else:
        cleaned_chunk.to_csv(OUTPUT_FILE, mode="a", header=False, index=False)

    print("✅ Cleaned chunk saved:", cleaned_chunk.shape)
# Reload the cleaned file from disk and run a few sanity checks on it.
final_df = pd.read_csv(OUTPUT_FILE)

print("\nFinal Shape:", final_df.shape)

print("\nMissing Values:")
missing_per_column = final_df.isna().sum()
print(missing_per_column)

# Both minima must be strictly positive. An empty result yields NaN minima,
# and NaN > 0 is False, so an empty output also fails these checks.
lowest_fare = final_df["fare_amount"].min()
lowest_passenger_count = final_df["passenger_count"].min()
assert lowest_fare > 0
assert lowest_passenger_count > 0

print("\n✅ Data validation passed successfully")


✅ Sample input file created
✅ Cleaned chunk saved: (9969, 8)

Final Shape: (9969, 8)

Missing Values:
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
fare_amount          0
trip_distance        0
dtype: int64

✅ Data validation passed successfully
