# 02 · Ingeniería de variables

Se generan y validan las variables que alimentarán los
modelos de detección de fraude. Las funciones se encapsulan en
`src/features.py` para facilitar mantenimiento y pruebas unitarias.


In [None]:
# C.0 · Setup y carga de datos
import time
from pathlib import Path
import pandas as pd
from src.utils    import log_step
from src.features import build_features
import numpy as np

# Directory holding the cached intermediate artifacts of the pipeline.
CACHE_DIR = Path("./cache")

# Read the dataframe stored as df_clean.parquet and log how many rows it has.
clean_path = CACHE_DIR / "df_clean.parquet"
df = pd.read_parquet(clean_path)
n_rows = len(df)
log_step(f"df_clean cargado: {n_rows:,} filas")

2025-06-29 02:08:38 | INFO | df_clean cargado: 10,758,402 filas


In [None]:
# C.1 · Feature generation
# Build the feature dataframe from `df` and log how long it took and how many
# columns the result has.
# The duration is measured with time.perf_counter(), a monotonic clock meant
# for interval timing; time.time() follows the wall clock and can jump if the
# system time is adjusted during this long-running step.
t0 = time.perf_counter()
df_feat = build_features(df)
elapsed = time.perf_counter() - t0
log_step(f"Features generadas en {elapsed:.1f}s | "
         f"columnas totales: {df_feat.shape[1]}")

  grp = df.set_index("transaction_date").groupby("user_id", sort=False)
  .groupby("user_id", sort=False)[amount_col]
  pct = df.groupby(type_col)[amount_col].quantile([0.95, 0.99]).unstack(level=1)
2025-06-29 02:05:01 | INFO | Features generadas en 829.9s | columnas totales: 29


In [None]:
# C.2 · Quick summary: fraction of nulls per column and highly correlated pairs
# `pct_null` is the mean of the null indicator per column (a fraction in 0..1),
# shown next to each column's dtype and sorted from most to least nulls.
null_share = df_feat.isna().mean().to_frame("pct_null")
col_dtypes = df_feat.dtypes.rename("dtype")
summary = null_share.join(col_dtypes).sort_values("pct_null", ascending=False)
display(summary.head(12))

# Pairs whose absolute correlation is > 0.95
num_cols = df_feat.select_dtypes(include="number").columns
corr = df_feat[num_cols].corr().abs()
# Keep only the strict upper triangle (k=1) so the diagonal is excluded and
# each pair of columns is considered once.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
pair_corr = corr.where(upper_mask).stack()
high_corr = pair_corr[pair_corr > 0.95]
display(high_corr.head())

Unnamed: 0,pct_null,dtype
tx_cnt_1h,0.89558,Int32
ratio_cnt_1h_24h,0.89558,float32
ratio_sum_1h_24h,0.89558,float32
tx_sum_1h,0.89558,float32
tx_sum_6h,0.868804,float32
tx_cnt_6h,0.868804,Int32
ratio_sum_24h_7d,0.804762,float32
tx_cnt_24h,0.804762,Int32
tx_sum_24h,0.804762,float32
ratio_cnt_24h_7d,0.804762,float32


tx_sum_1h         tx_sum_6h           0.965192
ratio_sum_1h_24h  ratio_cnt_1h_24h    0.982861
dtype: float64

In [None]:
# C.3 · Persistence and final log
# Write the feature dataframe to the cache directory as snappy-compressed parquet.
features_path = CACHE_DIR / "df_features.parquet"
df_feat.to_parquet(features_path, compression="snappy")
log_step("df_features.parquet guardado")

# Save a metadata CSV: one row per column with selected descriptive
# statistics plus the `pct_null` column taken from `summary`.
stat_cols = ["count", "mean", "std", "min", "max"]
described = df_feat.describe(include="all").T
meta = described[stat_cols].join(summary["pct_null"])
summary_path = CACHE_DIR / "features_summary.csv"
meta.to_csv(summary_path)
log_step("features_summary.csv guardado")

2025-06-29 02:10:23 | INFO | df_features.parquet guardado
2025-06-29 02:11:23 | INFO | features_summary.csv guardado
