In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
import category_encoders as ce


def load_raw(path: str, sep: str = '|') -> pd.DataFrame:
    """
    Read the raw dataset from a delimited text file.

    Args:
        path: Location of the CSV file to read.
        sep: Field delimiter used in the file; defaults to the pipe character.

    Returns:
        The file contents as a DataFrame, parsed by ``pandas.read_csv``.
    """
    raw_frame = pd.read_csv(path, sep=sep)
    return raw_frame


def filter_and_map_target(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only reviewed cases and derive the binary target column.

    Rows whose ``FRAUD`` value is neither 'Fraude' nor 'Descartado' are
    discarded. The input frame is left untouched; the result is a copy.

    Args:
        df: Frame containing a ``FRAUD`` column.

    Returns:
        A new DataFrame restricted to the reviewed rows, with an added
        integer column ``fraude_bin`` (1 for 'Fraude', 0 for 'Descartado').
    """
    reviewed_labels = ['Fraude', 'Descartado']
    is_reviewed = df['FRAUD'].isin(reviewed_labels)
    reviewed = df.loc[is_reviewed].copy()
    reviewed['fraude_bin'] = reviewed['FRAUD'].eq('Fraude').astype(int)
    return reviewed


def compute_weights(y: pd.Series) -> dict:
    """
    Compute balanced class weights for a label vector.

    Args:
        y: Class labels, one per sample.

    Returns:
        A dict mapping each distinct label in ``y`` to the weight returned by
        scikit-learn's ``compute_class_weight`` with ``class_weight='balanced'``.
    """
    labels = np.unique(y)
    balanced = compute_class_weight(class_weight='balanced', classes=labels, y=y)
    return {label: weight for label, weight in zip(labels, balanced)}


def prepare_dates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Parse the date columns and compute the reporting lag in days.

    ``FEC_DENUNCIO`` and ``FEC_SINIESTRO`` are converted to datetimes, with
    unparseable values becoming ``NaT``. A missing ``FEC_SINIESTRO`` is
    replaced by the same row's ``FEC_DENUNCIO``, which yields a lag of 0 for
    that row. ``lag_dias`` is ``FEC_DENUNCIO - FEC_SINIESTRO`` in whole days
    and stays missing when ``FEC_DENUNCIO`` itself is missing.

    Args:
        df: Frame containing ``FEC_DENUNCIO`` and ``FEC_SINIESTRO`` columns.
            It is modified in place.

    Returns:
        The same DataFrame object, with both date columns converted and a new
        ``lag_dias`` column.
    """
    df['FEC_DENUNCIO'] = pd.to_datetime(df['FEC_DENUNCIO'], errors='coerce')
    siniestro = pd.to_datetime(df['FEC_SINIESTRO'], errors='coerce')
    # Assign the filled Series back explicitly: a chained
    # ``df[col].fillna(..., inplace=True)`` acts on a temporary object and
    # does not update ``df`` under pandas Copy-on-Write (default in 3.0).
    df['FEC_SINIESTRO'] = siniestro.fillna(df['FEC_DENUNCIO'])
    df['lag_dias'] = (df['FEC_DENUNCIO'] - df['FEC_SINIESTRO']).dt.days
    return df


def _target_encode(df: pd.DataFrame, column: str, target: str = 'fraude_bin',
                   smoothing: float = 0.3) -> pd.Series:
    """Return the smoothed target encoding of ``column``, fitted on ``df`` itself."""
    encoder = ce.TargetEncoder(cols=[column], smoothing=smoothing)
    return encoder.fit_transform(df[[column]], df[target])[column]


def _append_one_hot(df: pd.DataFrame, column: str) -> None:
    """Add the one-hot indicator columns for ``column`` to ``df`` in place."""
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded = encoder.fit_transform(df[[column]])
    df[encoder.get_feature_names_out([column])] = encoded


def handle_missing_and_encode(df: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values, encode categorical variables and scale DEDUCIBLE.

    Steps, in order:
      * drop ``CANTIDAD_AUTOS``;
      * ``CANTIDAD_HIJOS``: treat 999 as missing, fill with the mode, cast to int;
      * ``ANIO_VEHICULO``: fill with the median, cast to int;
      * ``PRODUCTO``: fill missing with 'Desconocido', group categories with
        relative frequency below 1% into 'Otros', target-encode as
        ``PRODUCTO_te``;
      * ``ESTADO_CIVIL``: unify spelling variants, fill missing, one-hot encode;
      * ``MARCA_VEHICULO``: fill missing, target-encode as ``MARCA_VEHICULO_te``;
      * ``PRIMA_MENSUAL_UF``: fill with the median;
      * ``ROBO``: cast to ``category``;
      * ``DEDUCIBLE``: one-hot encode its quartile bucket and add a
        standardized copy as ``DEDUCIBLE_scaled``;
      * ``CANAL_CONTRATACION``: one-hot encode.

    The source columns of every encoded variable are dropped afterwards.

    NOTE(review): the target encoders are fitted on the full frame, target
    included. If a train/test split happens downstream, consider fitting on
    the training split only to avoid target leakage.

    Args:
        df: Frame holding the columns listed above plus the binary target
            ``fraude_bin``. It is modified in place.

    Returns:
        The same DataFrame object after all transformations.

    Raises:
        KeyError: If one of the expected columns is absent.
    """
    # Drop the column with a high share of missing values.
    df.drop(columns=['CANTIDAD_AUTOS'], inplace=True)

    # All imputations below assign the filled Series back explicitly: a
    # chained ``df[col].fillna(..., inplace=True)`` acts on a temporary object
    # and does not update ``df`` under pandas Copy-on-Write (default in 3.0).

    # CANTIDAD_HIJOS: 999 is a missing-value sentinel; impute with the mode.
    hijos = df['CANTIDAD_HIJOS'].replace(999, np.nan)
    df['CANTIDAD_HIJOS'] = hijos.fillna(hijos.mode()[0]).astype(int)

    # ANIO_VEHICULO: impute with the median.
    anio = df['ANIO_VEHICULO']
    df['ANIO_VEHICULO'] = anio.fillna(anio.median()).astype(int)

    # PRODUCTO: fill BEFORE casting to str. ``astype(str)`` turns NaN into the
    # literal string 'nan', after which ``fillna`` has nothing left to fill.
    df['PRODUCTO'] = df['PRODUCTO'].fillna('Desconocido').astype(str)
    freqs = df['PRODUCTO'].value_counts(normalize=True)
    rares = freqs[freqs < 0.01].index
    # Collapse rare categories into a single 'Otros' bucket, then target-encode.
    df['PRODUCTO_grp'] = df['PRODUCTO'].mask(df['PRODUCTO'].isin(rares), 'Otros')
    df['PRODUCTO_te'] = _target_encode(df, 'PRODUCTO_grp')
    df.drop(columns=['PRODUCTO', 'PRODUCTO_grp'], inplace=True)

    # ESTADO_CIVIL: unify category spellings and one-hot encode
    # (optional; these columns are discarded by the later feature selection).
    ec_map = {
        'Casada/o': 'Casado/a', 'Casada': 'Casado/a',
        'Divorciado': 'Divorciado/a', 'Divorciado/a': 'Divorciado/a'
    }
    df['ESTADO_CIVIL'] = df['ESTADO_CIVIL'].replace(ec_map).fillna('Desconocido')
    _append_one_hot(df, 'ESTADO_CIVIL')
    df.drop(columns=['ESTADO_CIVIL'], inplace=True)

    # MARCA_VEHICULO: target encoding.
    df['MARCA_VEHICULO'] = df['MARCA_VEHICULO'].fillna('Desconocido')
    df['MARCA_VEHICULO_te'] = _target_encode(df, 'MARCA_VEHICULO')
    df.drop(columns=['MARCA_VEHICULO'], inplace=True)

    # PRIMA_MENSUAL_UF: impute with the median.
    prima = df['PRIMA_MENSUAL_UF']
    df['PRIMA_MENSUAL_UF'] = prima.fillna(prima.median())

    # ROBO: ensure categorical dtype (optional; discarded later).
    df['ROBO'] = df['ROBO'].astype('category')

    # DEDUCIBLE: quartile buckets one-hot encoded, plus a standardized copy
    # (optional; discarded later).
    df['ded_q'] = pd.qcut(df['DEDUCIBLE'], q=4, labels=[f'Q{i}' for i in range(1, 5)])
    _append_one_hot(df, 'ded_q')
    df['DEDUCIBLE_scaled'] = StandardScaler().fit_transform(df[['DEDUCIBLE']])
    df.drop(columns=['DEDUCIBLE', 'ded_q'], inplace=True)

    # CANAL_CONTRATACION: one-hot encode (optional; discarded later).
    _append_one_hot(df, 'CANAL_CONTRATACION')
    df.drop(columns=['CANAL_CONTRATACION'], inplace=True)

    return df


def finalize_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce the frame to the six main features plus the target column.

    Args:
        df: Frame that contains every selected column.

    Returns:
        A copy holding, in this order, ``MARCA_VEHICULO_te``, ``lag_dias``,
        ``PRIMA_MENSUAL_UF``, ``PRODUCTO_te``, ``ANIO_VEHICULO``,
        ``CANTIDAD_HIJOS`` and ``fraude_bin``.

    Raises:
        KeyError: If any of the selected columns is missing from ``df``.
    """
    feature_names = (
        'MARCA_VEHICULO_te',
        'lag_dias',
        'PRIMA_MENSUAL_UF',
        'PRODUCTO_te',
        'ANIO_VEHICULO',
        'CANTIDAD_HIJOS',
    )
    keep = [*feature_names, 'fraude_bin']
    return df.loc[:, keep].copy()


def save_processed(df: pd.DataFrame, path: str) -> None:
    """
    Write the final DataFrame to a CSV file.

    Args:
        df: Frame to persist.
        path: Destination file path; the row index is not written.
    """
    output_path = path
    df.to_csv(output_path, index=False)


if __name__ == '__main__':
    # Input (raw CSV) and output (prepared CSV) locations.
    raw_path, proc_path = (
        '../data/raw/dataset.csv',
        '../data/processed/fraud_prepared.csv',
    )

    # Load the raw data and keep only the reviewed, labelled rows.
    df_raw = load_raw(raw_path)
    df = filter_and_map_target(df_raw)

    # Report the balanced class weights of the binary target.
    weights = compute_weights(df['fraude_bin'])
    print('Class weights:', weights)

    # Feature pipeline, innermost call first:
    # dates -> imputation/encoding -> final column selection.
    df = finalize_features(handle_missing_and_encode(prepare_dates(df)))

    save_processed(df, proc_path)
    print('Data preparation completed. Only top 6 features saved.')


Class weights: {np.int64(0): np.float64(0.5154820884146342), np.int64(1): np.float64(16.647692307692306)}
Data preparation completed. Only top 6 features saved.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['FEC_SINIESTRO'].fillna(df['FEC_DENUNCIO'], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CANTIDAD_HIJOS'].fillna(df['CANTIDAD_HIJOS'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedi