<!-- # 06 - Model Explainability (Seguridad + Negocio)

Objetivo:
Explicar por qué el modelo marcó viajes como anómalos y convertir esa explicación
en hallazgos defendibles ante auditoría, seguridad y stakeholders de negocio.

Este notebook responde a:
- ¿Qué variables empujan más la anomalía?
- ¿Qué caracteriza a los casos de alto riesgo?
- ¿Cómo justifico "por qué" sin cajas negras? -->

import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.inspection import permutation_importance

In [2]:
import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.inspection import permutation_importance

In [3]:
# Path of the parquet file this notebook reads its trips from.
DATA_PATH = "Data/processed/trips_with_ml_risk.parquet"

# Stop early with an explicit message when the input file is not on disk.
dataset_available = os.path.exists(DATA_PATH)
if not dataset_available:
    raise FileNotFoundError(f"No existe el archivo {DATA_PATH}")

df = pd.read_parquet(DATA_PATH)

# Quick sanity output: confirmation, dimensions and a preview of the first rows.
print("Dataset cargado ")
print("Shape:", df.shape)
df.head()


Dataset cargado 
Shape: (3328229, 31)


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,total_amount,passenger_count,payment_type,trip_duration_min,pickup_hour,pickup_dayofweek,...,flag_cash_payment,flag_outlier_total,flag_outlier_distance,flag_outlier_duration,risk_score,risk_level,anomaly_score,anomaly_flag,final_risk_score,final_risk_level
0,2025-01-01 00:18:38,2025-01-01 00:26:59,1.6,10.0,18.0,1.0,1,8.35,0,2,...,0,0,0,0,0,bajo,0.248968,normal,0,bajo
1,2025-01-01 00:32:40,2025-01-01 00:35:13,0.5,5.1,12.12,1.0,1,2.55,0,2,...,0,0,0,0,0,bajo,0.150257,normal,0,bajo
2,2025-01-01 00:44:04,2025-01-01 00:46:01,0.6,5.1,12.1,1.0,1,1.95,0,2,...,0,0,0,0,1,bajo,0.121694,normal,1,bajo
3,2025-01-01 00:14:27,2025-01-01 00:20:01,0.52,7.2,9.7,3.0,2,5.566667,0,2,...,1,0,0,0,1,bajo,0.135841,normal,1,bajo
4,2025-01-01 00:21:34,2025-01-01 00:25:06,0.66,5.8,8.3,3.0,2,3.533333,0,2,...,1,1,0,0,2,medio,0.075245,normal,2,bajo


In [4]:
# Columns that the following cells read from `df`. `anomaly_score` is included
# because the rebuilt score is displayed side by side with it after the model
# is refitted; without it here a missing column would only surface later as a
# bare KeyError instead of this cell's explicit error.
required = [
    "trip_distance",
    "trip_duration_min",
    "total_amount",
    "cost_per_km",
    "cost_per_min",
    "fare_to_total_ratio",
    "passenger_count",
    "risk_score",
    "anomaly_score",
    "anomaly_flag",
    "final_risk_level",
]

# Keep the order of `required` so the error message is deterministic.
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Faltan columnas necesarias para explicabilidad: {missing}")

print("Columnas OK ")


Columnas OK 


In [5]:
# Numeric inputs fed to the anomaly model (order matters: it defines the
# column order of the feature matrix).
features_ml = [
    "trip_distance",
    "trip_duration_min",
    "total_amount",
    "cost_per_km",
    "cost_per_min",
    "fare_to_total_ratio",
    "passenger_count",
    "risk_score",
]

# Build the feature matrix: +/-inf becomes NaN, then every NaN is filled with
# the median of its own column (computed after the inf replacement).
X = (
    df[features_ml]
    .replace([np.inf, -np.inf], np.nan)
    .pipe(lambda frame: frame.fillna(frame.median()))
)

print("X listo ")
X.head()

X listo 


Unnamed: 0,trip_distance,trip_duration_min,total_amount,cost_per_km,cost_per_min,fare_to_total_ratio,passenger_count,risk_score
0,1.6,8.35,18.0,11.25,2.155689,0.555556,1.0,0
1,0.5,2.55,12.12,24.24,4.752941,0.420792,1.0,0
2,0.6,1.95,12.1,20.166667,6.205128,0.421488,1.0,1
3,0.52,5.566667,9.7,18.653846,1.742515,0.742268,3.0,1
4,0.66,3.533333,8.3,12.575758,2.349057,0.698795,3.0,2


In [6]:
# Standardise the feature matrix before fitting the model.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Refit the Isolation Forest with a fixed seed; `fit` returns the estimator
# itself, so construction and fitting are chained.
iso = IsolationForest(
    n_estimators=100,
    contamination=0.03,
    random_state=42,
    n_jobs=-1,
).fit(X_scaled)

# Score: the lower the value, the more anomalous the trip.
score = iso.decision_function(X_scaled)
df["anomaly_score_rebuilt"] = score

print("Modelo reconstruido ")
# Show the stored score next to the rebuilt one for a visual comparison.
df[["anomaly_flag", "anomaly_score", "anomaly_score_rebuilt"]].head()


Modelo reconstruido 


Unnamed: 0,anomaly_flag,anomaly_score,anomaly_score_rebuilt
0,normal,0.248968,0.248968
1,normal,0.150257,0.150257
2,normal,0.121694,0.121694
3,normal,0.135841,0.135841
4,normal,0.075245,0.075245


In [7]:
# Summary statistics of the rebuilt score for each value of `anomaly_flag`.
# Output column name -> aggregation applied to `anomaly_score_rebuilt`.
score_stats = {
    "mean_score": "mean",
    "median_score": "median",
    "min_score": "min",
    "max_score": "max",
    "n": "count",
}

check = df.groupby("anomaly_flag").agg(
    **{label: ("anomaly_score_rebuilt", func) for label, func in score_stats.items()}
)

check

Unnamed: 0_level_0,mean_score,median_score,min_score,max_score,n
anomaly_flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
anomalía,-0.048921,-0.038159,-0.2107069,-1e-06,99847
normal,0.197802,0.216366,2.040474e-07,0.26294,3228382


In [8]:
# Scoring helper intended for permutation_importance: the model's own
# decision_function output is used as the "prediction".
def model_predict(X_scaled_local):
    """Return the Isolation Forest decision-function values for the given rows.

    Relies on the module-level fitted estimator ``iso``.
    """
    scores = iso.decision_function(X_scaled_local)
    return scores

# permutation_importance needs an estimator it can score, so instead of
# wrapping the Isolation Forest directly we use a surrogate model:
# a linear proxy fitted on X_scaled to predict the rebuilt anomaly score.
# NOTE(review): the importances below describe the surrogate, not the
# Isolation Forest itself; the surrogate's fit quality is not checked here.
from sklearn.linear_model import Ridge

# Target of the surrogate: the rebuilt anomaly score of every trip.
y = df["anomaly_score_rebuilt"].to_numpy()

surrogate = Ridge(alpha=1.0, random_state=42).fit(X_scaled, y)

perm = permutation_importance(
    surrogate,
    X_scaled,
    y,
    n_repeats=5,
    random_state=42,
    n_jobs=-1,
)

# One row per feature, most important first.
imp = pd.DataFrame(
    dict(
        feature=features_ml,
        importance_mean=perm.importances_mean,
        importance_std=perm.importances_std,
    )
)
imp = imp.sort_values(by="importance_mean", ascending=False)

imp

Unnamed: 0,feature,importance_mean,importance_std
7,risk_score,0.653502,0.000312
6,passenger_count,0.143732,0.000102
1,trip_duration_min,0.067047,0.000132
5,fare_to_total_ratio,0.03877,9.6e-05
3,cost_per_km,0.009703,5.2e-05
4,cost_per_min,0.002432,3.1e-05
0,trip_distance,0.000168,7e-06
2,total_amount,0.000103,4e-06


In [9]:
def profile_by_group(df_local, group_col, features):
    """Summarise each feature per group with its median and quartiles.

    Args:
        df_local: DataFrame containing ``group_col`` and every column in
            ``features``.
        group_col: Column whose values define the groups.
        features: Names of the columns to profile.

    Returns:
        A DataFrame with one row per group holding ``group``, ``n`` (row
        count) and, for every feature ``f``, the columns ``f_median``,
        ``f_p25`` and ``f_p75`` as plain floats.
    """

    def _summarize(label, chunk):
        # Build the flat record for a single group.
        summary = {"group": label, "n": len(chunk)}
        for name in features:
            column = chunk[name]
            summary.update(
                {
                    f"{name}_median": float(column.median()),
                    f"{name}_p25": float(column.quantile(0.25)),
                    f"{name}_p75": float(column.quantile(0.75)),
                }
            )
        return summary

    return pd.DataFrame(
        [_summarize(label, chunk) for label, chunk in df_local.groupby(group_col)]
    )

# Median / quartile profile of every model feature, split by anomaly flag.
profile_anom = profile_by_group(df, "anomaly_flag", features_ml)
# Bare expression so the notebook renders the resulting table.
profile_anom

Unnamed: 0,group,n,trip_distance_median,trip_distance_p25,trip_distance_p75,trip_duration_min_median,trip_duration_min_p25,trip_duration_min_p75,total_amount_median,total_amount_p25,...,cost_per_min_p75,fare_to_total_ratio_median,fare_to_total_ratio_p25,fare_to_total_ratio_p75,passenger_count_median,passenger_count_p25,passenger_count_p75,risk_score_median,risk_score_p25,risk_score_p75
0,anomalía,99847,16.4,0.01,19.53,38.933333,0.566667,60.266667,88.0,40.0,...,22.105263,0.762974,0.666667,0.871876,1.0,1.0,2.0,2.0,1.0,4.0
1,normal,3228382,1.65,1.0,2.97,11.583333,7.333333,17.9,20.21,15.81,...,2.379707,0.622711,0.542299,0.710355,1.0,1.0,1.0,0.0,0.0,0.0


In [13]:
# "Normal" baseline: per-feature median and population standard deviation
# (ddof=0), computed only on the trips flagged as normal.
is_normal = df["anomaly_flag"] == "normal"
normal = df.loc[is_normal].copy()

baseline_median = normal.loc[:, features_ml].median()
# A zero spread is mapped to NaN so that dividing by it yields NaN, not inf.
baseline_std = normal.loc[:, features_ml].std(ddof=0).replace(0, np.nan)

def local_explain(row):
    """Rank the features of one trip by their distance from the normal baseline.

    Every feature listed in the module-level ``features_ml`` is turned into a
    z-score using the module-level ``baseline_median`` and ``baseline_std``.
    Non-finite z-scores (e.g. from a NaN baseline spread) are treated as 0.

    Args:
        row: One row of the trips data (for example as yielded by
            ``DataFrame.iterrows``) containing every column in ``features_ml``.

    Returns:
        A Series indexed by feature name with the (at most) five largest
        absolute z-scores, sorted in descending order.
    """
    # A row taken from a mixed-dtype frame is an object-dtype Series. Casting
    # to float keeps the arithmetic numeric and avoids pandas' deprecated
    # object downcasting in the replace/fillna chain below.
    values = row[features_ml].astype(float)
    z = (values - baseline_median) / baseline_std
    z = z.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return z.abs().sort_values(ascending=False).head(5)

# Take the 5 most extreme anomalies (lowest rebuilt score first).
anomalies_only = df[df["anomaly_flag"] == "anomalía"]
top_anoms = anomalies_only.sort_values("anomaly_score_rebuilt").head(5)

# One record per selected trip: its index, score, final risk level and the
# features with the largest |z| as a single readable string.
explanations = [
    {
        "row_index": int(idx),
        "anomaly_score": float(row["anomaly_score_rebuilt"]),
        "final_risk_level": str(row["final_risk_level"]),
        "top_drivers": ", ".join(
            f"{k} (|z|={v:.2f})" for k, v in local_explain(row).items()
        ),
    }
    for idx, row in top_anoms.iterrows()
]

pd.DataFrame(explanations)

  z = z.replace([np.inf, -np.inf], np.nan).fillna(0.0)
  z = z.replace([np.inf, -np.inf], np.nan).fillna(0.0)
  z = z.replace([np.inf, -np.inf], np.nan).fillna(0.0)
  z = z.replace([np.inf, -np.inf], np.nan).fillna(0.0)
  z = z.replace([np.inf, -np.inf], np.nan).fillna(0.0)


Unnamed: 0,row_index,anomaly_score,final_risk_level,top_drivers
0,862299,-0.210707,alto,"cost_per_min (|z|=1712.24), cost_per_km (|z|=3..."
1,765810,-0.209611,alto,"cost_per_min (|z|=3822.78), cost_per_km (|z|=4..."
2,360498,-0.207971,alto,"cost_per_min (|z|=3055.31), cost_per_km (|z|=4..."
3,459217,-0.207971,alto,"cost_per_min (|z|=807.78), cost_per_km (|z|=39..."
4,332610,-0.207971,alto,"cost_per_min (|z|=1492.10), cost_per_km (|z|=3..."


In [14]:
# Trip count and revenue per (anomaly flag, final risk level) combination,
# largest revenue first.
# `observed=False` is passed explicitly: it is the behaviour this cell already
# relied on (combinations with no trips still appear as empty rows), and
# stating it avoids depending on the deprecated implicit default for
# categorical groupers.
impact_by_anom = (
    df.groupby(["anomaly_flag", "final_risk_level"], observed=False)
    .agg(
        trips=("total_amount", "count"),
        total_revenue=("total_amount", "sum"),
        avg_amount=("total_amount", "mean"),
    )
    .sort_values("total_revenue", ascending=False)
)

impact_by_anom

  impact_by_anom = df.groupby(["anomaly_flag", "final_risk_level"]).agg(


Unnamed: 0_level_0,Unnamed: 1_level_0,trips,total_revenue,avg_amount
anomaly_flag,final_risk_level,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
normal,bajo,3215251,81308782.67,25.288471
anomalía,medio,59947,6414410.31,107.001356
anomalía,alto,39900,2400728.67,60.168638
normal,medio,12845,164320.96,12.792601
normal,alto,286,1325.25,4.633741
anomalía,bajo,0,0.0,


In [15]:
# Derive monitoring-rule thresholds from percentiles of the anomalous trips.
anom = df[df["anomaly_flag"] == "anomalía"]

# Rule name -> (source column, quantile taken over the anomalies).
threshold_specs = {
    "cost_per_km_p95_anom": ("cost_per_km", 0.95),
    "cost_per_min_p95_anom": ("cost_per_min", 0.95),
    "total_amount_p99_anom": ("total_amount", 0.99),
    "duration_p99_anom": ("trip_duration_min", 0.99),
}

rules = {
    rule_name: float(anom[column].quantile(q))
    for rule_name, (column, q) in threshold_specs.items()
}

# Single-row table so the thresholds can be displayed and exported.
rules_df = pd.DataFrame([rules])
rules_df

Unnamed: 0,cost_per_km_p95_anom,cost_per_min_p95_anom,total_amount_p99_anom,duration_p99_anom
0,875.0,683.571429,245.45,135.919333


In [16]:
# Destination files for the four explainability reports.
OUT_IMPORTANCE = "reports/06_feature_importance_surrogate.csv"
OUT_PROFILE = "reports/06_profile_normal_vs_anomaly.csv"
OUT_RULES = "reports/06_rules_thresholds_from_anomalies.csv"
OUT_LOCAL = "reports/06_top_anomalies_local_explanations.csv"

# (path, table) pairs, written in this order.
report_outputs = [
    (OUT_IMPORTANCE, imp),
    (OUT_PROFILE, profile_anom),
    (OUT_RULES, rules_df),
    (OUT_LOCAL, pd.DataFrame(explanations)),
]

for out_path, report in report_outputs:
    # to_csv does not create missing folders, so make sure the target
    # directory exists before writing (no-op when it is already there).
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    report.to_csv(out_path, index=False)

print(" Reportes guardados:")
print("-", OUT_IMPORTANCE)
print("-", OUT_PROFILE)
print("-", OUT_RULES)
print("-", OUT_LOCAL)


 Reportes guardados:
- reports/06_feature_importance_surrogate.csv
- reports/06_profile_normal_vs_anomaly.csv
- reports/06_rules_thresholds_from_anomalies.csv
- reports/06_top_anomalies_local_explanations.csv
