# 05 - Evaluación del Modelo y Análisis de Impacto

Objetivo:
Evaluar el desempeño del modelo de detección de anomalías y traducir sus resultados
a impacto financiero, operativo y de seguridad.

Este notebook responde a:
- ¿Qué detectó el modelo?
- ¿Qué tan relevante es?
- ¿Cuánto dinero y riesgo está involucrado?
- ¿Qué decisiones se pueden tomar?


In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
DATA_PATH = "Data/processed/trips_with_ml_risk.parquet"

if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"No existe el archivo {DATA_PATH}")

df = pd.read_parquet(DATA_PATH)

print("Dataset cargado correctamente")
print("Shape:", df.shape)
df.head()

Dataset cargado correctamente
Shape: (3328229, 31)


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,total_amount,passenger_count,payment_type,trip_duration_min,pickup_hour,pickup_dayofweek,...,flag_cash_payment,flag_outlier_total,flag_outlier_distance,flag_outlier_duration,risk_score,risk_level,anomaly_score,anomaly_flag,final_risk_score,final_risk_level
0,2025-01-01 00:18:38,2025-01-01 00:26:59,1.6,10.0,18.0,1.0,1,8.35,0,2,...,0,0,0,0,0,bajo,0.248968,normal,0,bajo
1,2025-01-01 00:32:40,2025-01-01 00:35:13,0.5,5.1,12.12,1.0,1,2.55,0,2,...,0,0,0,0,0,bajo,0.150257,normal,0,bajo
2,2025-01-01 00:44:04,2025-01-01 00:46:01,0.6,5.1,12.1,1.0,1,1.95,0,2,...,0,0,0,0,1,bajo,0.121694,normal,1,bajo
3,2025-01-01 00:14:27,2025-01-01 00:20:01,0.52,7.2,9.7,3.0,2,5.566667,0,2,...,1,0,0,0,1,bajo,0.135841,normal,1,bajo
4,2025-01-01 00:21:34,2025-01-01 00:25:06,0.66,5.8,8.3,3.0,2,3.533333,0,2,...,1,1,0,0,2,medio,0.075245,normal,2,bajo


In [3]:
required_cols = [
    "total_amount",
    "trip_distance",
    "trip_duration_min",
    "risk_level",
    "risk_score",
    "anomaly_flag",
    "final_risk_level",
    "time_bucket",
    "payment_type"
]

missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Faltan columnas críticas: {missing}")

print("Todas las columnas críticas están presentes ")


Todas las columnas críticas están presentes 


In [4]:
risk_dist = df["final_risk_level"].value_counts(normalize=True).rename("percentage") * 100
risk_dist_df = risk_dist.reset_index().rename(columns={"index": "risk_level"})

risk_dist_df

Unnamed: 0,final_risk_level,percentage
0,bajo,96.605462
1,medio,2.187109
2,alto,1.207429


In [5]:
anomaly_dist = df["anomaly_flag"].value_counts(normalize=True) * 100
anomaly_dist_df = anomaly_dist.reset_index().rename(columns={"index": "anomaly_flag", 0: "percentage"})

anomaly_dist_df

Unnamed: 0,anomaly_flag,proportion
0,normal,96.999996
1,anomalía,3.000004


In [6]:
consistency = pd.crosstab(
    df["risk_level"],
    df["anomaly_flag"],
    normalize="index"
)

consistency

anomaly_flag,anomalía,normal
risk_level,Unnamed: 1_level_1,Unnamed: 2_level_1
alto,0.88057,0.11943
bajo,0.011903,0.988097
medio,0.257748,0.742252


In [7]:
financial_impact = df.groupby("final_risk_level").agg(
    trips=("total_amount", "count"),
    avg_amount=("total_amount", "mean"),
    total_revenue=("total_amount", "sum")
).sort_values("total_revenue", ascending=False)

financial_impact

  financial_impact = df.groupby("final_risk_level").agg(


Unnamed: 0_level_0,trips,avg_amount,total_revenue
final_risk_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bajo,3215251,25.288471,81308782.67
medio,72792,90.377119,6578731.27
alto,40186,59.773402,2402053.92


In [8]:
anomaly_money = df.groupby("anomaly_flag").agg(
    trips=("total_amount", "count"),
    avg_amount=("total_amount", "mean"),
    total_revenue=("total_amount", "sum")
)

anomaly_money

Unnamed: 0_level_0,trips,avg_amount,total_revenue
anomaly_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
anomalía,99847,88.286468,8815138.98
normal,3228382,25.236923,81474428.88


In [9]:
time_risk = df.pivot_table(
    values="total_amount",
    index="time_bucket",
    columns="final_risk_level",
    aggfunc="sum",
    fill_value=0
)

time_risk

  time_risk = df.pivot_table(


final_risk_level,bajo,medio,alto
time_bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
madrugada,6302660.09,344929.02,266719.79
mañana,16953450.11,1259425.73,455083.55
noche,28765743.33,1381045.31,673706.96
tarde,29286929.14,3593331.21,1006543.62


In [10]:
payment_map = {
    1: "tarjeta",
    2: "efectivo"
}

df["payment_label"] = df["payment_type"].map(payment_map).fillna("otro")

payment_risk = df.groupby(["payment_label", "final_risk_level"]).agg(
    trips=("total_amount", "count"),
    total_revenue=("total_amount", "sum")
)

payment_risk

  payment_risk = df.groupby(["payment_label", "final_risk_level"]).agg(


Unnamed: 0_level_0,Unnamed: 1_level_0,trips,total_revenue
payment_label,final_risk_level,Unnamed: 2_level_1,Unnamed: 3_level_1
efectivo,bajo,347911,7537352.09
efectivo,medio,15861,782383.44
efectivo,alto,11990,554765.96
otro,bajo,492863,11343854.82
otro,medio,6669,1185192.02
otro,alto,9865,279373.64
tarjeta,bajo,2374477,62427575.76
tarjeta,medio,50262,4611155.81
tarjeta,alto,18331,1567914.32


In [11]:
top_risk_cases = df[df["final_risk_level"] == "alto"] \
    .sort_values("final_risk_score", ascending=False) \
    .head(10)

top_risk_cases[
    [
        "trip_distance",
        "trip_duration_min",
        "total_amount",
        "cost_per_km",
        "cost_per_min",
        "risk_score",
        "anomaly_flag",
        "final_risk_score"
    ]
]

Unnamed: 0,trip_distance,trip_duration_min,total_amount,cost_per_km,cost_per_min,risk_score,anomaly_flag,final_risk_score
1548706,0.0,0.15,171.0,,1140.0,7,anomalía,10
2131716,0.0,0.333333,201.0,,603.0,7,anomalía,10
2308663,0.0,0.166667,903.5,,5421.0,7,anomalía,10
464105,0.0,0.116667,163.5,,1401.428571,7,anomalía,10
1792148,0.0,0.283333,702.75,,2480.294118,7,anomalía,10
2037850,0.0,0.716667,201.0,,280.465116,7,anomalía,10
2319214,0.0,0.583333,379.25,,650.142857,7,anomalía,10
33368,0.0,0.3,104.0,,346.666667,7,anomalía,10
2303826,0.0,0.2,503.5,,2517.5,7,anomalía,10
2215741,0.0,0.083333,121.0,,1452.0,7,anomalía,10


In [12]:
conclusions = {
    "total_trips_analyzed": len(df),
    "pct_high_risk": float((df["final_risk_level"] == "alto").mean() * 100),
    "pct_anomalies": float((df["anomaly_flag"] == "anomalía").mean() * 100),
    "revenue_high_risk": float(df[df["final_risk_level"] == "alto"]["total_amount"].sum()),
    "revenue_anomalies": float(df[df["anomaly_flag"] == "anomalía"]["total_amount"].sum()),
}

conclusions_df = pd.DataFrame([conclusions])
conclusions_df

Unnamed: 0,total_trips_analyzed,pct_high_risk,pct_anomalies,revenue_high_risk,revenue_anomalies
0,3328229,1.207429,3.000004,2402053.92,8815138.98


In [13]:
OUT_PATH = "reports/05_evaluation_business_impact_summary.csv"
conclusions_df.to_csv(OUT_PATH, index=False)

print("Resumen ejecutivo guardado en:", OUT_PATH)

Resumen ejecutivo guardado en: reports/05_evaluation_business_impact_summary.csv
