In [2]:
import pickle
from collections import Counter
from pathlib import Path

import catboost
import great_tables
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from catboost import CatBoostClassifier
from great_tables import GT, from_column, style, loc
from IPython.display import display, Markdown, Latex
from matplotlib.colors import ListedColormap, to_rgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler

sklearn.set_config(transform_output="pandas", display='diagram')

In [3]:
# Dark green used as the column-label background of the great_tables outputs in this notebook.
color_verde = "#255255"
# Lighter companion shade (not referenced in the cells shown here).
color_verde_claro = "#BDCBCC"
# Name of the label column that is split off from the features before modelling.
target = 'is_fraud'

In [4]:
# Report the versions of the main libraries so the results can be reproduced.
# The modules themselves are imported in the notebook's first (imports) cell.
Markdown(f"""
📦 pandas=={pd.__version__}\n
📦 numpy=={np.__version__}\n
📦 scikit-learn=={sklearn.__version__}\n
📦 catboost=={catboost.__version__}\n
📦 matplotlib=={matplotlib.__version__}\n
📦 seaborn=={sns.__version__}\n
📦 great_tables=={great_tables.__version__}\n
""")


📦 pandas==2.3.2

📦 numpy==2.2.0

📦 scikit-learn==1.7.1

📦 catboost==1.2.8

📦 matplotlib==3.10.6

📦 seaborn==0.13.2

📦 great_tables==0.18.0



# Planteo del caso

## Prevención del fraude transaccional

In [None]:
# import kagglehub
# path = kagglehub.dataset_download("kartik2112/fraud-detection")
# df_train = pd.read_csv(f"{path}/fraudTrain.csv")
# df_test = pd.read_csv(f"{path}/fraudTest.csv")
# df = pd.concat([df_train, df_test], axis=0, ignore_index=True).reset_index(drop=True)

# target = 'is_fraud'
# cols_selected = [
#     "trans_date_trans_time",
#     "merchant",
#     "category",
#     "amt",
#     "city_pop",
#     "job",
#     "dob",
#     "lat",
#     "long",
#     "merch_lat",
#     "merch_long",
#     "is_fraud",
# ]
# df = df[cols_selected]

In [6]:
# Load the transactions dataset from a local parquet file.
# NOTE(review): presumably the result of the commented-out Kaggle download/column-selection cell above — confirm.
df = pd.read_parquet('../data/df_fraud.parquet')

In [7]:
## Valores faltantes
def add_random_nans(values, fraction=0.2, seed=42):
    """
    Return a copy of ``values`` in which a random subset of entries is NaN.

    Args:
        values: pandas Series (or float NumPy array) to corrupt. It is copied,
            never modified in place.
        fraction (float): probability of each entry being replaced by NaN.
        seed (int): seed of the private random generator. Calls that share the
            same seed and length share the same underlying draws, so the entries
            blanked with a smaller ``fraction`` are a subset of those blanked
            with a larger one. Pass different seeds for independent patterns.

    Returns:
        A copy of ``values`` with NaN at the randomly selected positions.
    """
    # A private RandomState yields exactly the numbers that
    # ``np.random.seed(seed); np.random.rand(n)`` would, but it does not reset
    # the global NumPy RNG that the rest of the notebook may rely on.
    rng = np.random.RandomState(seed)
    mask = rng.rand(len(values)) < fraction
    new_values = values.copy()
    new_values[mask] = np.nan
    return new_values

# Clean the merchant name and inject artificial missing values into several columns.
# Every add_random_nans call uses the same fixed seed, so columns of equal length share
# the same random draws (e.g. merch_lat and merch_long are blanked on exactly the same rows).
df = df.assign(
    # Vectorised literal replacement; unlike a Python loop over the values it also tolerates NaN.
    merchant = lambda x: x['merchant'].str.replace('fraud_', '', regex=False),
    dob = lambda x: add_random_nans(x['dob'], fraction=0.05),
    job = lambda x: add_random_nans(x['job'], fraction=0.1),
    city_pop = lambda x: add_random_nans(x['city_pop'], fraction=0.03),
    merch_lat = lambda x: add_random_nans(x['merch_lat'], fraction=0.02),
    merch_long = lambda x: add_random_nans(x['merch_long'], fraction=0.02),
)

In [8]:
# Separate the label from the features.
y = df[target]
X = df.drop([target], axis=1)

# Stratified 70/30 split so both partitions keep the same fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, stratify=y, random_state=42
)

# Use a clean 0..n-1 index in every partition so features and labels line up by position.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Summarise each partition. The loop variables are called X_part / y_part so they do not
# overwrite the full X / y defined above (which would otherwise end up pointing at the test set).
for particion, (color, X_part, y_part) in {
    "entrenamiento": ("#4D9900", X_train, y_train),
    "evaluación": ("#FF9933", X_test, y_test),
}.items():
    particion_string = (
        f'<span style="color:{color}; font-weight:bold;">{particion}</span>'
    )
    display(
        Markdown(
            f"N observaciones para {particion_string}: {X_part.shape[0]} "
            f"({y_part.sum() / len(y_part):.2%} de transacciones fraudulentas)"
        )
    )

N observaciones para <span style="color:#4D9900; font-weight:bold;">entrenamiento</span>: 1296675 (0.52% de transacciones fraudulentas)

N observaciones para <span style="color:#FF9933; font-weight:bold;">evaluación</span>: 555719 (0.52% de transacciones fraudulentas)

# Preprocesamiento

## Tipos de transformaciones

> Ciertos tipos de transformaciones requieren `“aprender”` algunos aspectos de los [datos de entrenamiento]{style="color:#4D9900"} mientras que otras no.

Ejemplos de transformaciones que dependen de los datos de entrenamiento :

-   Imputación de valores faltantes con la mediana → La mediana depende de los datos

-   Escalado → Se deben calcular estadísticos de los datos (p. ej., media y desvío estándar para estandarizar, o mínimo y máximo para `MinMaxScaler`)

Ejemplos de transformaciones que no dependen de los datos de entrenamiento:

-   Construcción de una nueva variable mediante un cálculo simple → $x^2$

-   Combinaciones de variables en una nueva variable → x/y


## Transformaciones iniciales

-   Cálculo de la edad

-   Cálculo de la distancia entre el comercio y el usuario

-   Generación de variables vinculadas a la fecha y hora de la transacción


In [9]:
def categorizar_hora(hora: int) -> str:
    """
    Map an hour of the day to a part-of-day label.

    Hours in [0, 6) give "madrugada", [6, 12) give "mañana" and [12, 19) give
    "tarde"; any other value falls through to "noche".
    """
    tramos = (
        (0, 6, "madrugada"),
        (6, 12, "mañana"),
        (12, 19, "tarde"),
    )
    for inicio, fin, etiqueta in tramos:
        if inicio <= hora < fin:
            return etiqueta
    # Everything outside the three ranges above.
    return "noche"


def calcular_distancia_haversine(lat1, lon1, lat2, lon2, radio_tierra=6371):
    """
    Haversine (great-circle) distance between two (lat, lon) coordinate pairs.

    Args:
        lat1, lon1: latitude / longitude of the first point, in degrees.
        lat2, lon2: latitude / longitude of the second point, in degrees.
        radio_tierra: sphere radius; the result is expressed in the same unit
            (default 6371, i.e. kilometres).

    Returns:
        The distance, with the same shape as the inputs (scalars, NumPy arrays
        or pandas Series all work because only NumPy ufuncs are used). Missing
        coordinates propagate as NaN.
    """
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push sqrt(a) marginally above 1 for near-antipodal points,
    # which would make arcsin return NaN; cap it at 1.
    c = 2 * np.arcsin(np.minimum(np.sqrt(a), 1.0))

    # Honour the caller-supplied radius instead of a hard-coded constant.
    return radio_tierra * c

In [10]:
class TransformacionesIniciales(BaseEstimator, TransformerMixin):
    """
    Row-level feature derivations that need nothing learned from the training data.

    Always computes ``age``; optionally adds date/time features and the
    customer-to-merchant distance. The raw ``trans_date_trans_time`` and ``dob``
    columns are dropped from the output.
    """

    def __init__(self, timestamp_features=True, distance_features=True):
        """
        Args:
            timestamp_features (bool): generate features based on the transaction date/time.
            distance_features (bool): generate features based on the distance to the merchant.
        """
        # Each flag is stored under its own name (the timestamp flag used to be
        # overwritten with the distance flag).
        self.timestamp_features = timestamp_features
        self.distance_features = distance_features

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the transformer can be used in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """
        Return a copy of ``X`` with the derived features added.

        Expects the columns ``dob`` and ``trans_date_trans_time`` (plus ``lat``,
        ``long``, ``merch_lat``, ``merch_long`` when distance features are on).
        """
        X_ = X.copy()  # Work on a copy so the caller's frame is untouched

        # Cast to datetime; unparseable values become NaT
        X_ = X_.assign(
            dob = lambda x: pd.to_datetime(x['dob'], errors='coerce'),
            trans_date_trans_time = lambda x: pd.to_datetime(
                x["trans_date_trans_time"], errors="coerce"
            ),
        )

        # Age in years at transaction time (NaN when either date is missing)
        X_ = X_.assign(
            age = lambda x: round((x['trans_date_trans_time']-x['dob']).dt.days / 365.25,2)
        )

        # Features based on the transaction date and time
        if self.timestamp_features:
            X_ = X_.assign(
                trans_date__year = lambda x: x["trans_date_trans_time"].dt.year,
                trans_date__month = lambda x: x["trans_date_trans_time"].dt.month,
                trans_date__day = lambda x: x["trans_date_trans_time"].dt.day,
                trans_date__dow = lambda x: x["trans_date_trans_time"].dt.dayofweek,
                trans_date__hour = lambda x: x["trans_date_trans_time"].dt.hour,
                # na_action="ignore" keeps a missing hour missing instead of
                # letting it fall through to the "noche" default label.
                trans_date__partofday = lambda x: x['trans_date__hour'].map(
                    categorizar_hora, na_action="ignore"
                ),
            )

        if self.distance_features:
            # Customer-to-merchant distance (kilometres with the default radius)
            X_["distance_to_merch"] = calcular_distancia_haversine(
                X_["lat"], X_["long"], X_["merch_lat"], X_["merch_long"]
            )

        X_ = X_.drop(['trans_date_trans_time','dob'], axis=1)
        return X_

In [11]:
# fit() returns the transformer itself, so construction and fitting can be chained.
transformaciones = TransformacionesIniciales().fit(X_train)
X_test_transformed = transformaciones.transform(X_test)

In [12]:
# Preview four randomly sampled test rows after the initial transformations, rendered
# with great_tables: `amt` is shown as currency and the header uses the notebook's green.
(GT(X_test_transformed.sample(4, random_state=42).round(2))
    .fmt_currency(columns="amt")
    .tab_options(
        column_labels_background_color=color_verde,
        table_font_names="Times New Roman"
    )
)

merchant,category,amt,city_pop,job,lat,long,merch_lat,merch_long,age,trans_date__year,trans_date__month,trans_date__day,trans_date__dow,trans_date__hour,trans_date__partofday,distance_to_merch
Harris Group,food_dining,$40.23,3451.0,Financial trader,33.92,-89.68,33.1,-89.25,35.3,2019,9,8,6,14,tarde,99.84
Ledner-Pfannerstill,gas_transport,$61.23,3807.0,Surgeon,43.97,-71.15,43.21,-71.55,19.23,2019,1,16,2,5,madrugada,91.2
Wilkinson LLC,personal_care,$64.80,2258.0,Building surveyor,41.46,-74.17,42.27,-74.9,82.96,2020,3,2,0,13,tarde,108.48
Fisher-Schowalter,shopping_net,$8.61,3096.0,"Social research officer, government",44.86,-85.81,45.85,-85.69,44.15,2019,12,4,2,22,noche,110.39


## Feature engineering

Scikit-learn cuenta con múltiples transformers ya definidos (`SimpleImputer`, `MinMaxScaler`, etc.). Además, se definen 4 transformers custom adicionales (`MeanEncoder, RareCategoryGrouper, ColumnCapper, IsolationForestTransformer`):


In [13]:
class MeanEncoder(BaseEstimator, TransformerMixin):
    """
    Encode categorical variables with the mean of the target within each category.

    Parameters:
        variables (list or None): columns to encode. When None, every
            ``object`` / ``category`` column seen in ``fit`` is encoded.

    Attributes set by ``fit``:
        variables_ (list): columns actually encoded.
        encoding_dict_ (dict): ``{column: {category: target mean}}``.
        global_mean_ (float): overall target mean, used for unseen categories.
    """

    def __init__(self, variables=None):
        # Only the hyper-parameter is stored here; learned state is created in
        # fit(), so a refit never inherits encodings from a previous fit.
        self.variables = variables

    def fit(self, X, y):
        """Learn the per-category target means from ``X`` and ``y``."""
        # Pair y with X by position (the scikit-learn convention), whatever
        # index y carries; raises if the lengths differ.
        y_ = pd.Series(np.asarray(y), index=X.index)

        # Resolve the columns without mutating the ``variables`` hyper-parameter.
        if self.variables is None:
            self.variables_ = X.select_dtypes(
                include=["object", "category"]
            ).columns.tolist()
        else:
            self.variables_ = list(self.variables)

        self.global_mean_ = y_.mean()
        self.encoding_dict_ = {
            var: y_.groupby(X[var]).mean().to_dict() for var in self.variables_
        }

        return self

    def transform(self, X):
        """Return a copy of ``X`` with the encoded columns replaced by target means."""
        X_ = X.copy()
        for var in self.variables_:
            # Categories not seen in fit (and missing values) get the global mean
            X_[var] = X_[var].map(self.encoding_dict_[var]).fillna(self.global_mean_)
        return X_

In [14]:
class RareCategoryGrouper(BaseEstimator, TransformerMixin):
    """
    Group infrequent categories under the single label "infrequent".

    Attributes set by ``fit``:
        variables_ (list): columns actually processed.
        frequent_categories_ (dict): ``{column: [categories kept as they are]}``.
    """

    def __init__(self, variables=None, min_freq=0.05):
        """
        Args:
            variables (list): categorical variables to group. When None, every
                ``object`` column seen in ``fit`` is used.
            min_freq (float or int):
                If float (0 < min_freq < 1), minimum share of the (non-missing) observations.
                If int (>= 1), minimum number of observations.
        """
        # Only hyper-parameters are stored here; learned state is created in fit().
        self.variables = variables
        self.min_freq = min_freq

    def fit(self, X, y=None):
        """Learn, for each column, which categories are frequent enough to keep."""
        # Resolve the columns without mutating the ``variables`` hyper-parameter.
        if self.variables is None:
            self.variables_ = X.select_dtypes(include="object").columns.tolist()
        else:
            self.variables_ = list(self.variables)

        # A float threshold is compared against proportions, an int against counts.
        as_proportion = isinstance(self.min_freq, float)

        self.frequent_categories_ = {}
        for var in self.variables_:
            freqs = X[var].value_counts(normalize=as_proportion)
            self.frequent_categories_[var] = freqs[
                freqs >= self.min_freq
            ].index.tolist()

        return self

    def transform(self, X):
        """Return a copy of ``X`` where non-frequent categories are replaced by "infrequent"."""
        X_ = X.copy()
        for var in self.variables_:
            # Missing values are left missing: they are not a category, and
            # relabelling them here would make any later imputation step a no-op.
            keep = X_[var].isin(self.frequent_categories_[var]) | X_[var].isna()
            X_[var] = X_[var].where(keep, "infrequent")
        return X_

In [84]:
class IsolationForestTransformer(BaseEstimator, TransformerMixin):
    """
    Fit an IsolationForest for anomaly detection and expose its score as a feature.

    ``transform`` returns a single-column DataFrame named ``anomaly_score``
    holding ``IsolationForest.decision_function`` for each row.

    Any keyword argument is forwarded to ``IsolationForest``; pass
    ``random_state=...`` to make the scores reproducible between runs.
    """

    def __init__(self, **kwargs):
        self.iforest_kwargs = kwargs

    def get_params(self, deep=True):
        """
        Expose the forwarded keyword arguments as this estimator's parameters.

        The default implementation only looks at named ``__init__`` arguments and
        ignores ``**kwargs``, so ``sklearn.base.clone`` (used by cross-validation
        and grid search) would silently drop every setting.
        """
        return dict(self.iforest_kwargs)

    def set_params(self, **params):
        """Update the keyword arguments forwarded to ``IsolationForest``; returns ``self``."""
        self.iforest_kwargs.update(params)
        return self

    def fit(self, X, y=None):
        """Fit a fresh IsolationForest on ``X``; ``y`` is ignored."""
        self.iforest_ = IsolationForest(**self.iforest_kwargs)
        self.iforest_.fit(X)
        return self

    def transform(self, X):
        """Return the anomaly score of every row of ``X``, indexed like ``X``."""
        scores = self.iforest_.decision_function(X)
        return pd.DataFrame({"anomaly_score": scores}, index=X.index)

In [85]:
class ColumnCapper(BaseEstimator, TransformerMixin):
    """
    Cap (clip) variables to the IQR fences learned during ``fit``.

    ``transform`` returns only the capped columns; any other column of the
    input is not part of the output.

    Attributes set by ``fit``:
        variables_ (list): columns actually capped.
        lower_bound, upper_bound (Series): per-column fences,
            ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``.
    """

    def __init__(self, variables=None, factor=1.5):
        """
        Args:
            variables (list): variables to cap. When None, every numeric column
                seen in ``fit`` is used.
            factor (float): IQR multiplier.
        """
        self.variables = variables
        self.factor = factor

    def fit(self, X, y=None):
        """Learn the lower and upper fence of each column from ``X``."""
        # Resolve the columns without mutating the ``variables`` hyper-parameter.
        if self.variables is None:
            self.variables_ = X.select_dtypes(include="number").columns.tolist()
        else:
            self.variables_ = list(self.variables)

        # IQR fences
        q1 = X[self.variables_].quantile(0.25)
        q3 = X[self.variables_].quantile(0.75)
        iqr = q3 - q1
        self.lower_bound = q1 - self.factor * iqr
        self.upper_bound = q3 + self.factor * iqr
        return self

    def transform(self, X, y=None):
        """Return the capped columns of ``X`` as a new DataFrame."""
        X_ = X[self.variables_].copy()
        for col in self.variables_:
            X_[col] = X_[col].clip(
                lower=self.lower_bound[col],
                upper=self.upper_bound[col]
            )
        return X_

In [86]:
# Tiny hand-made dataset with obvious outliers (age -100, amount 10000) used to
# demonstrate the custom transformers.
df_example = pd.DataFrame(
    {
        "age": [-100, 70, 20, 30, 20, 40, 110, 20],
        "amount": [10000, 70, 20, 30, 20, 40, 110, 20],
        "category": [
            "shopping", "shopping", "shopping", "shopping",
            "travel", "travel", "travel",
            "home",
        ],
        "is_fraud": [1, 1, 0, 0, 0, 0, 1, 0],
    }
)
y_example = df_example["is_fraud"]
X_example = df_example.drop(columns=["is_fraud"])

df_example

Unnamed: 0,age,amount,category,is_fraud
0,-100,10000,shopping,1
1,70,70,shopping,1
2,20,20,shopping,0
3,30,30,shopping,0
4,20,20,travel,0
5,40,40,travel,0
6,110,110,travel,1
7,20,20,home,0


In [83]:
# Demo of the numeric branch on the toy data. The objects carry an `_example` suffix so
# that running this cell (in any order) never replaces the `preproc_numericas`,
# `feature_eng` and `features_preprocessing` objects that the model pipeline is built from.
preproc_numericas_example = Pipeline(steps=[
    ('imputar_nulos', SimpleImputer(strategy='median')),
    ('scale', MinMaxScaler())
])

# Only numeric columns are kept; `category` is dropped by remainder='drop'.
feature_eng_example = ColumnTransformer([
    ('num', preproc_numericas_example, make_column_selector(dtype_include=['float','int']))
], verbose_feature_names_out=False, remainder='drop', verbose=True)

# The scaled features are then capped (IQR) and an anomaly score is appended.
features_preprocessing_example = Pipeline([
    ('feature_eng', feature_eng_example),
    ('anomalies', FeatureUnion([
            ('outliers', ColumnCapper()),
            ('anomaly', IsolationForestTransformer())
        ])
    )
], verbose=True)

features_preprocessing_example.fit(X_example, y_example)
features_preprocessing_example.transform(X_example)

[ColumnTransformer] ........... (1 of 1) Processing num, total=   0.0s
[Pipeline] ....... (step 1 of 2) Processing feature_eng, total=   0.0s
fitting capper
fitting capper
[Pipeline] ......... (step 2 of 2) Processing anomalies, total=   0.1s




Unnamed: 0,age,amount,anomaly_score
0,0.375,0.01503,-0.260527
1,0.809524,0.00501,-0.000759
2,0.571429,0.0,0.113449
3,0.619048,0.001002,0.102471
4,0.571429,0.0,0.113449
5,0.666667,0.002004,0.070972
6,0.89881,0.009018,-0.094618
7,0.571429,0.0,0.113449


Se crea un Pipeline de preprocesamiento general:

1. Transformaciones iniciales

2. Transformer que según el tipo de variable (numérica o categórica) aplica ciertas transformaciones:

    - Pipeline para procesamiento de variables categóricas
    - Pipeline para procesamiento de variables numéricas

3. Identificación de observaciones anómalas y cappeo de variables según el IQR


In [23]:
# Categorical branch: group categories below a 1% share, impute missing values with the
# most frequent category, then replace each category by the mean of the target.
preproc_categoricas = Pipeline(steps=[
    ('rare_labels', RareCategoryGrouper(min_freq=0.01)),
    ('imputar_nulos', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('mean_encoder', MeanEncoder())
])

# Numeric branch: impute missing values with the median, then rescale with MinMaxScaler.
preproc_numericas = Pipeline(steps=[
    ('imputar_nulos', SimpleImputer(strategy='median')),
    ('scale', MinMaxScaler())
])

# Route columns by dtype: float/int columns go to the numeric branch, everything else to
# the categorical branch. Output column names are not prefixed with the branch name.
feature_eng = ColumnTransformer([
    ('cat', preproc_categoricas, make_column_selector(dtype_exclude=['float','int'])),
    ('num', preproc_numericas, make_column_selector(dtype_include=['float','int']))
], verbose_feature_names_out=False, remainder='drop', verbose=True)

# Full preprocessing: initial derivations -> per-type feature engineering -> IQR capping of
# the engineered features side by side with an IsolationForest anomaly score.
features_preprocessing = Pipeline([
    ('data_cleaning', TransformacionesIniciales()),
    ('feature_eng', feature_eng),
    ('anomalies', FeatureUnion([
            ('outliers', ColumnCapper()),
            ('anomaly', IsolationForestTransformer())
        ])
    )
], verbose=True)

In [24]:
# Learn every data-dependent transformation from the training partition only, then apply
# the fitted pipeline to the test partition.
# Note: this reassigns X_test_transformed, which earlier held only the initial transformations.
features_preprocessing.fit(X_train, y_train)
X_test_transformed = features_preprocessing.transform(X_test)

[Pipeline] ..... (step 1 of 3) Processing data_cleaning, total=   1.9s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   2.4s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   2.6s
[Pipeline] ....... (step 2 of 3) Processing feature_eng, total=   5.2s
[Pipeline] ......... (step 3 of 3) Processing anomalies, total=   1.9s




In [19]:
# Preview four randomly sampled test rows after the full preprocessing pipeline
# (values rounded to 3 decimals), rendered with great_tables.
(GT(X_test_transformed.sample(4, random_state=42).round(3))
    .tab_options(
        column_labels_background_color=color_verde,
        table_font_names="Times New Roman"
    )
)

merchant,category,job,trans_date__partofday,amt,city_pop,lat,long,merch_lat,merch_long,age,trans_date__year,trans_date__month,trans_date__day,trans_date__dow,trans_date__hour,distance_to_merch,anomaly_score
0.005,0.001,0.005,0.001,0.001,0.001,0.298,0.778,0.291,0.776,0.26,0.0,0.727,0.233,1.0,0.609,0.656,0.058
0.005,0.004,0.005,0.009,0.002,0.001,0.513,0.967,0.5,0.954,0.065,0.0,0.0,0.5,0.333,0.217,0.599,0.008
0.005,0.002,0.005,0.001,0.002,0.001,0.459,0.936,0.48,0.92,0.839,1.0,0.182,0.033,0.0,0.565,0.713,0.002
0.005,0.013,0.005,0.011,0.0,0.001,0.532,0.817,0.555,0.812,0.368,0.0,1.0,0.1,0.333,0.957,0.726,-0.032


# Modelo

## Pipeline de modelado

Se crea un nuevo Pipeline que incluye un primer paso de preprocesamiento y un segundo paso de modelado (un clasificador).

In [20]:
# Gradient-boosting classifier with a fixed seed; training progress is logged every 100 iterations.
# class_weights gives the second class a weight of 20 versus 1 for the first —
# presumably to offset the ~0.5% fraud rate reported for the partitions; confirm.
clf = CatBoostClassifier(
    iterations=500,
    loss_function="Logloss",
    class_weights=[1, 20],
    random_seed=42,
    verbose=100,
)

# End-to-end pipeline: preprocessing followed by the classifier, so raw transactions can be
# passed straight to fit / predict.
pipe = Pipeline([
    ("preproc", features_preprocessing), 
    ("model", clf)
], verbose=True)

In [21]:
# Fit preprocessing and classifier together on the training partition.
# `pipe` wraps the same `features_preprocessing` object, so this refits it as well.
pipe.fit(X_train, y_train)

[Pipeline] ..... (step 1 of 3) Processing data_cleaning, total=   1.9s
[ColumnTransformer] ........... (1 of 2) Processing cat, total=   2.4s
[ColumnTransformer] ........... (2 of 2) Processing num, total=   2.6s
[Pipeline] ....... (step 2 of 3) Processing feature_eng, total=   5.2s
[Pipeline] ......... (step 3 of 3) Processing anomalies, total=  10.3s
[Pipeline] ........... (step 1 of 2) Processing preproc, total=  17.4s
Learning rate set to 0.415104
0:	learn: 0.1600901	total: 152ms	remaining: 1m 15s
100:	learn: 0.0287264	total: 13.7s	remaining: 54s
200:	learn: 0.0178672	total: 27s	remaining: 40.2s
300:	learn: 0.0117532	total: 42s	remaining: 27.7s
400:	learn: 0.0081597	total: 59.5s	remaining: 14.7s
499:	learn: 0.0059064	total: 1m 14s	remaining: 0us
[Pipeline] ............. (step 2 of 2) Processing model, total= 1.3min


0,1,2
,steps,"[('preproc', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,True

0,1,2
,steps,"[('data_cleaning', ...), ('feature_eng', ...), ...]"
,transform_input,
,memory,
,verbose,True

0,1,2
,timestamp_features,True
,distance_features,True

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,True
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,variables,
,min_freq,0.01

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,variables,

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,transformer_list,"[('outliers', ...), ('anomaly', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,numeric_features,"['merchant', 'category', ...]"
,factor,1.5


In [25]:
# Persist the fitted pipeline (preprocessing + model) so it can be reloaded for deployment.
model_path = Path('../artifacts/pipe_model_fraud.pkl')
# Create the artifacts directory on a first run; opening the file would otherwise
# raise FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as file:
    pickle.dump(pipe, file)

In [26]:
# For every test row: true label, predicted class and predicted fraud probability
# (rounded to 4 decimals). Note that predict and predict_proba each run the whole
# preprocessing pipeline over X_test.
df_preds_test = pd.DataFrame({
    'y_true': y_test.reset_index(drop=True),
    'y_pred_class': pipe.predict(X_test),
    'y_pred_prob': round(pd.Series(pipe.predict_proba(X_test)[:,1]),4)
})
# Row with the highest predicted probability. df_preds_test has a default 0..n-1 index,
# so the label obtained here is also a position and can be used with .iloc below.
index_fraud_prob = df_preds_test.sort_values('y_pred_prob', ascending=False).head(1).index

# Show the raw features of that transaction.
X_test.iloc[index_fraud_prob].to_dict(orient='index')



{61947: {'trans_date_trans_time': '2019-01-14 23:17:22',
  'merchant': 'Reichert, Huels and Hoppe',
  'category': 'shopping_net',
  'amt': 1034.21,
  'city_pop': 73.0,
  'job': 'Product designer',
  'dob': '1935-02-10',
  'lat': 41.4193,
  'long': -99.3844,
  'merch_lat': 41.12487,
  'merch_long': -100.349332}}

Almacenar el modelo para luego utilizarlo (despliegue):


# Predicciones

## Predicciones sobre datos nuevos (despliegue de modelos)

Cargar el archivo .pkl para utilizarlo:


In [None]:
# Reload the fitted pipeline saved earlier in the notebook.
# SECURITY: unpickling executes arbitrary code — only load pickle files from a trusted source.
# The custom transformer classes must be defined/importable before loading.
with open('../artifacts/pipe_model_fraud.pkl', 'rb') as file:
    pipe = pickle.load(file)

🆕 Datos de una nueva transacción ([datos en producción]{style="color:#3399FF"}):


In [None]:
# A single new transaction in raw form, with `merchant` and `job` left missing so the
# pipeline's handling of missing values is exercised.
# NOTE(review): "zip" does not appear among the training columns shown earlier in the
# notebook — confirm it is intentionally included here.
nueva_trx = pd.DataFrame({
    "trans_date_trans_time": "2019-10-09 20:38:49",
    "merchant": np.nan,
    "category": "gas_transport",
    "amt": 9.66,
    "city_pop": 10000,
    "job": np.nan,
    "dob": "1995-08-16",
    "zip": np.nan,
    "lat": 45.8433,
    "long": -113.1948,
    "merch_lat": 45.837213,
    "merch_long": -113.191425,
}, index=["nueva_trx"])

In [None]:
# Render the new transaction with great_tables (`amt` shown as currency).
(GT(nueva_trx.round(2))
    .fmt_currency(columns="amt")
    .tab_options(
        column_labels_background_color=color_verde,
        table_font_names="Times New Roman"
    )
)

Utilizar el modelo para estimar la `probabilidad` de que la nueva transacción sea fraudulenta:


In [None]:
# Second column of predict_proba, used below as the probability of fraud.
y_pred = pipe.predict_proba(nueva_trx)[:,1]

In [None]:
# Display the predicted probability of the single new transaction as a percentage.
Markdown(f"La probabilidad de que la transacción sea fraudulenta es: {y_pred[0]:.2%}")