In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

In [None]:
DATA_PATH = "Data/processed/trips_features_2025-01.parquet"

if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"No existe el dataset procesado en {DATA_PATH}")

df = pd.read_parquet(DATA_PATH)

print("Dataset cargado ✅")
print("Shape:", df.shape)
df.head()

FileNotFoundError: No existe el dataset procesado en Data/processed/trips_features_2025-01.parquet

In [3]:
from pathlib import Path
import pandas as pd

# ===============================
# 1. Definir raíz del proyecto
#    (notebook está en /Notebooks)
# ===============================
PROJECT_ROOT = Path.cwd().parent

# ===============================
# 2. Ruta correcta al dataset
# ===============================
DATA_PATH = PROJECT_ROOT / "Data" / "processed" / "trips_features_2025-01.parquet"

print(" Buscando dataset en:")
print(DATA_PATH)

# ===============================
# 3. Validación de existencia
# ===============================
if not DATA_PATH.exists():
    raise FileNotFoundError(
        f" No existe el dataset procesado en la ruta esperada:\n{DATA_PATH}"
    )

# ===============================
# 4. Carga del dataset
# ===============================
df = pd.read_parquet(DATA_PATH)

print(" Dataset cargado correctamente")
print("Shape:", df.shape)

df.head()


 Buscando dataset en:
d:\end_to_end_customer_risk_system\Data\processed\trips_features_2025-01.parquet
 Dataset cargado correctamente
Shape: (3328229, 27)


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,total_amount,passenger_count,payment_type,trip_duration_min,pickup_hour,pickup_dayofweek,...,flag_very_long_trip,flag_long_duration,flag_short_duration,flag_short_but_expensive,flag_cash_payment,flag_outlier_total,flag_outlier_distance,flag_outlier_duration,risk_score,risk_level
0,2025-01-01 00:18:38,2025-01-01 00:26:59,1.6,10.0,18.0,1.0,1,8.35,0,2,...,0,0,0,0,0,0,0,0,0,bajo
1,2025-01-01 00:32:40,2025-01-01 00:35:13,0.5,5.1,12.12,1.0,1,2.55,0,2,...,0,0,0,0,0,0,0,0,0,bajo
2,2025-01-01 00:44:04,2025-01-01 00:46:01,0.6,5.1,12.1,1.0,1,1.95,0,2,...,0,0,1,0,0,0,0,0,1,bajo
3,2025-01-01 00:14:27,2025-01-01 00:20:01,0.52,7.2,9.7,3.0,2,5.566667,0,2,...,0,0,0,0,1,0,0,0,1,bajo
4,2025-01-01 00:21:34,2025-01-01 00:25:06,0.66,5.8,8.3,3.0,2,3.533333,0,2,...,0,0,0,0,1,1,0,0,2,medio


In [4]:
features_ml = [
    "trip_distance",
    "trip_duration_min",
    "total_amount",
    "cost_per_km",
    "cost_per_min",
    "fare_to_total_ratio",
    "passenger_count",
    "risk_score"
]

X = df[features_ml].replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())

print("Features usadas para ML:")
X.head()

Features usadas para ML:


Unnamed: 0,trip_distance,trip_duration_min,total_amount,cost_per_km,cost_per_min,fare_to_total_ratio,passenger_count,risk_score
0,1.6,8.35,18.0,11.25,2.155689,0.555556,1.0,0
1,0.5,2.55,12.12,24.24,4.752941,0.420792,1.0,0
2,0.6,1.95,12.1,20.166667,6.205128,0.421488,1.0,1
3,0.52,5.566667,9.7,18.653846,1.742515,0.742268,3.0,1
4,0.66,3.533333,8.3,12.575758,2.349057,0.698795,3.0,2


In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled[:5]

array([[-0.00745536, -0.24545806, -0.01927036, -0.0323049 , -0.02760215,
        -0.60763151, -0.36308969, -0.39216935],
       [-0.00961911, -0.45754737, -0.03168322,  0.00475045, -0.0027939 ,
        -1.64407093, -0.36308969, -0.39216935],
       [-0.00942241, -0.47948764, -0.03172544, -0.00686917,  0.011077  ,
        -1.63872179, -0.36308969,  1.04180514],
       [-0.00957977, -0.34723655, -0.03679191, -0.01118465, -0.03154867,
         0.82833754,  2.4751762 ,  1.04180514],
       [-0.00930438, -0.4215897 , -0.03974736, -0.02852304, -0.02575515,
         0.49399631,  2.4751762 ,  2.47577963]])

In [6]:
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.03,  # asumimos ~3% anomalías
    random_state=42,
    n_jobs=-1
)

iso_forest.fit(X_scaled)

print("Modelo entrenado ")


Modelo entrenado 


In [7]:
df["anomaly_score"] = iso_forest.decision_function(X_scaled)
df["anomaly_flag"] = iso_forest.predict(X_scaled)

# Isolation Forest:
# -1 = anomalía, 1 = normal
df["anomaly_flag"] = df["anomaly_flag"].map({1: "normal", -1: "anomalía"})

df["anomaly_flag"].value_counts()

anomaly_flag
normal      3228382
anomalía      99847
Name: count, dtype: int64

In [8]:
comparison = pd.crosstab(
    df["risk_level"],
    df["anomaly_flag"],
    normalize="index"
)

comparison

anomaly_flag,anomalía,normal
risk_level,Unnamed: 1_level_1,Unnamed: 2_level_1
alto,0.88057,0.11943
bajo,0.011903,0.988097
medio,0.257748,0.742252


In [9]:
impact = df.groupby("anomaly_flag").agg(
    trips=("total_amount", "count"),
    avg_amount=("total_amount", "mean"),
    total_revenue=("total_amount", "sum")
)

impact

Unnamed: 0_level_0,trips,avg_amount,total_revenue
anomaly_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
anomalía,99847,88.286468,8815138.98
normal,3228382,25.236923,81474428.88


In [10]:
time_analysis = pd.crosstab(
    df["time_bucket"],
    df["anomaly_flag"],
    normalize="index"
)

time_analysis

anomaly_flag,anomalía,normal
time_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1
madrugada,0.034127,0.965873
mañana,0.030118,0.969882
noche,0.021973,0.978027
tarde,0.036693,0.963307


In [11]:
df["final_risk_score"] = (
    df["risk_score"] +
    (df["anomaly_flag"] == "anomalía").astype(int) * 3
)

df["final_risk_level"] = pd.cut(
    df["final_risk_score"],
    bins=[-1, 2, 5, 100],
    labels=["bajo", "medio", "alto"]
)

df["final_risk_level"].value_counts(normalize=True)

final_risk_level
bajo     0.966055
medio    0.021871
alto     0.012074
Name: proportion, dtype: float64

In [12]:
OUT_PATH = "Data/processed/trips_with_ml_risk.parquet"
df.to_parquet(OUT_PATH, index=False)

print(" Dataset con ML guardado en:", OUT_PATH)


 Dataset con ML guardado en: Data/processed/trips_with_ml_risk.parquet
