In [1]:
# =========================
# IMPORTS
# =========================
import numpy as np
import pandas as pd
from pathlib import Path

# =========================
# PATHS
# =========================
# The project root is taken to be the parent of the current working
# directory (e.g. when this runs from a sub-folder of the project).
PROJECT_ROOT = Path.cwd().parent
RAW_PATH = PROJECT_ROOT.joinpath("Data/Raw/yellow_tripdata_2025-01.parquet")
PROCESSED_DIR = PROJECT_ROOT.joinpath("Data/processed")
REPORTS_DIR = PROJECT_ROOT.joinpath("reports")

# Create the output folders up front; existing folders are left untouched.
for _output_dir in (PROCESSED_DIR, REPORTS_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

# =========================
# DATA LOADING
# =========================
# Only the columns used further down are read from the parquet file.
cols = [
    # pickup / drop-off timestamps
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    # trip size and charges
    "trip_distance",
    "fare_amount",
    "total_amount",
    # trip metadata
    "passenger_count",
    "payment_type",
]

df = pd.read_parquet(path=RAW_PATH, columns=cols)
print("Dataset cargado:", df.shape)

# =========================
# BASIC CLEANING
# =========================
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna().copy()

# Make sure both timestamp columns are datetime-typed.
for _ts_col in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    df[_ts_col] = pd.to_datetime(df[_ts_col])

# Trip duration in minutes (drop-off minus pickup).
_elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
df["trip_duration_min"] = _elapsed.dt.total_seconds() / 60

# Keep only trips with strictly positive distance, total amount and duration.
_is_valid_trip = (
    df["trip_distance"].gt(0)
    & df["total_amount"].gt(0)
    & df["trip_duration_min"].gt(0)
)
df = df.loc[_is_valid_trip].copy()

print("Después de limpieza:", df.shape)

# =========================
# KEY FEATURES
# =========================
# Hour of day of the pickup and a 0/1 weekend flag (dayofweek 5 = Saturday,
# 6 = Sunday).
df["pickup_hour"] = df["tpep_pickup_datetime"].dt.hour
df["is_weekend"] = df["tpep_pickup_datetime"].dt.dayofweek.isin([5, 6]).astype(int)

# The TLC data dictionary defines trip_distance in miles, so the distance is
# converted to kilometres before dividing; otherwise "cost_per_km" would
# actually hold the cost per mile.
MILES_TO_KM = 1.609344
df["cost_per_km"] = df["total_amount"] / (df["trip_distance"] * MILES_TO_KM)

# The cleaning step keeps only rows with distance > 0 and duration > 0, so
# neither ratio divides by zero.
df["cost_per_min"] = df["total_amount"] / df["trip_duration_min"]

# =========================
# SIMPLE RISK SCORE
# =========================
# One point for each flag a trip raises: distance above 30, duration above
# 120 minutes, total amount above the 99th percentile of the cleaned data.
_amount_p99 = df["total_amount"].quantile(0.99)
_flag_long_distance = df["trip_distance"].gt(30).astype(int)
_flag_long_duration = df["trip_duration_min"].gt(120).astype(int)
_flag_high_amount = df["total_amount"].gt(_amount_p99).astype(int)

df["risk_score"] = _flag_long_distance + _flag_long_duration + _flag_high_amount

# Right-closed bins: score 0 -> "bajo", 1 -> "medio", 2 or 3 -> "alto".
df["risk_level"] = pd.cut(
    df["risk_score"],
    bins=[-1, 0, 1, 3],
    labels=["bajo", "medio", "alto"],
)

# =========================
# SAVE OUTPUTS
# =========================
# Write the feature table to parquet, leaving the DataFrame index out.
OUT_PATH = PROCESSED_DIR / "trips_features_2025-01.parquet"
df.to_parquet(OUT_PATH, index=False)

# One-row summary of the run. "pct_risk_high" is the share of rows labelled
# "alto" as a 0-1 fraction (mean of a boolean), not multiplied by 100.
_summary_row = {
    "rows_final": df.shape[0],
    "avg_total_amount": df["total_amount"].mean(),
    "avg_distance": df["trip_distance"].mean(),
    "pct_risk_high": (df["risk_level"] == "alto").mean(),
}
summary = pd.DataFrame([_summary_row])

summary.to_csv(REPORTS_DIR / "summary.csv", index=False)

print("Pipeline finalizado ✔")


Dataset cargado: (3475226, 7)
Después de limpieza: (2839560, 8)
Pipeline finalizado ✔
