In [16]:
import pandas as pd
import os
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import warnings
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np
import matplotlib.pyplot as plt
import optuna

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('float_format', '{:f}'.format)

In [5]:
data_train = pd.read_parquet('../data/processed/train.gzip').dropna(how='any')
data_sub = pd.read_parquet('../data/processed/test.gzip')

data_train.columns = [col.replace('-', '_') for col in data_train.columns]
data_sub.columns = [col.replace('-', '_') for col in data_sub.columns]

# list(data_train.columns)

In [6]:
print(data_train.isna().sum().sum())
print(data_sub.isna().sum())

0
ID                              0
TARGET                      60789
CO_TIPO_SEXO                    0
EDAD                            0
NO_DEPARTAMENTO                 0
                            ...  
PROXY_MOROSIDAD_2_median        0
PROXY_MOROSIDAD_3_mean          0
PROXY_MOROSIDAD_3_median        0
REGION                          0
NO_PROVINCIA                    0
Length: 200, dtype: int64


In [7]:
data_sub_id = data_sub[['ID']]

categorical_feats = ['CO_TIPO_SEXO', 'NO_DEPARTAMENTO', 'REGION', 'NO_PROVINCIA',
                     'SALDO_PEQ_EMP_FLAG_any', 'SALDO_MED_EMP_FLAG_any', 'SALDO_MIC_EMP_FLAG_any',
                     'CANT_EMP_NEG_FLAG_any','CANT_EMP_CONS_FLAG_any', 'CANT_EMP_HIPOT_FLAG_any',
                     ]
for col in categorical_feats:
    data_train[col] = data_train[col].astype('category')
    data_sub[col] = data_sub[col].astype('category')

TARGET_VAR = 'TARGET'
N_FOLDS = 5
N_JOBS = os.cpu_count() // 2.5

In [8]:
selected_cols = [
 TARGET_VAR,
 'ANIO_BANCARIZACION', 'MES_BANCARIZACION',
 'NO_PROVINCIA',
 'CO_TIPO_SEXO',
 'EDAD',
 'NO_DEPARTAMENTO',
 'MESES_HASTA_ACTUAL',
 'CANT_EMP_NEG_max',
 'CANT_EMP_NEG_last',
 'CANT_EMP_NEG_min',
 'CANT_EMP_CONS_max',
 'CANT_EMP_CONS_last',
 'CANT_EMP_CONS_min',
 'CANT_EMP_HIPOT_max',
 'CANT_EMP_HIPOT_last',
 'CANT_EMP_HIPOT_min',
 'SALDO_MED_EMP_mean',
 'SALDO_MED_EMP_median',
 'SALDO_MED_EMP_last',
 'SALDO_MED_EMP_min',
 'SALDO_MED_EMP_max',
 'SALDO_MED_EMP_first',
 'SALDO_PEQ_EMP_mean',
 'SALDO_PEQ_EMP_median',
 'SALDO_PEQ_EMP_last',
 'SALDO_PEQ_EMP_min',
 'SALDO_PEQ_EMP_max',
 'SALDO_PEQ_EMP_first',
 'SALDO_MIC_EMP_mean',
 'SALDO_MIC_EMP_median',
 'SALDO_MIC_EMP_last',
 'SALDO_MIC_EMP_min',
 'SALDO_MIC_EMP_max',
 'SALDO_MIC_EMP_first',
 'SALDO_CONS_REV_mean',
 'SALDO_CONS_REV_median',
 'SALDO_CONS_REV_last',
 'SALDO_CONS_REV_min',
 'SALDO_CONS_REV_max',
 'SALDO_CONS_REV_first',
 'SALDO_CONS_NO_REV_mean',
 'SALDO_CONS_NO_REV_median',
 'SALDO_CONS_NO_REV_last',
 'SALDO_CONS_NO_REV_min',
 'SALDO_CONS_NO_REV_max',
 'SALDO_CONS_NO_REV_first',
 'SALDO_HIPOT_mean',
 'SALDO_HIPOT_median',
 'SALDO_HIPOT_last',
 'SALDO_HIPOT_min',
 'SALDO_HIPOT_max',
 'SALDO_HIPOT_first',
 'SALDO_VENCIDO_mean',
 'SALDO_VENCIDO_median',
 'SALDO_VENCIDO_last',
 'SALDO_VENCIDO_min',
 'SALDO_VENCIDO_max',
 'SALDO_VENCIDO_first',
 'CANT_EMP_DOL_NEG_mean',
 'CANT_EMP_DOL_NEG_median',
 'CANT_EMP_DOL_NEG_last',
 'CANT_EMP_DOL_NEG_min',
 'CANT_EMP_DOL_NEG_max',
 'CANT_EMP_DOL_NEG_first',
 'SALDO_DOLA_NEG_mean',
 'SALDO_DOLA_NEG_median',
 'SALDO_DOLA_NEG_last',
 'SALDO_DOLA_NEG_min',
 'SALDO_DOLA_NEG_max',
 'SALDO_DOLA_NEG_first',
 'CANT_EMP_DOL_CONS_mean',
 'CANT_EMP_DOL_CONS_median',
 'CANT_EMP_DOL_CONS_last',
 'CANT_EMP_DOL_CONS_min',
 'CANT_EMP_DOL_CONS_max',
 'CANT_EMP_DOL_CONS_first',
 'SALDO_DOLA_CONS_mean',
 'SALDO_DOLA_CONS_median',
 'SALDO_DOLA_CONS_last',
 'SALDO_DOLA_CONS_min',
 'SALDO_DOLA_CONS_max',
 'SALDO_DOLA_CONS_first',
 'CANT_EMP_DOL_HIPOT_mean',
 'CANT_EMP_DOL_HIPOT_median',
 'CANT_EMP_DOL_HIPOT_last',
 'CANT_EMP_DOL_HIPOT_min',
 'CANT_EMP_DOL_HIPOT_max',
 'CANT_EMP_DOL_HIPOT_first',
 'SALDO_DOLA_HIPOT_mean',
 'SALDO_DOLA_HIPOT_median',
 'SALDO_DOLA_HIPOT_last',
 'SALDO_DOLA_HIPOT_min',
 'SALDO_DOLA_HIPOT_max',
 'SALDO_DOLA_HIPOT_first',
 'MAX_LINEA_DISP_U6M_mean',
 'MAX_LINEA_DISP_U6M_median',
 'MAX_LINEA_DISP_U6M_last',
 'MAX_LINEA_DISP_U6M_min',
 'MAX_LINEA_DISP_U6M_max',
 'MAX_LINEA_DISP_U6M_first',
 'SALDO_EMP_TOTAL_mean',
 'SALDO_EMP_TOTAL_last',
 'SALDO_EMP_TOTAL_max',
 'SALDO_EMP_TOTAL_min',
 'SALDO_EMP_TOTAL_median',
 'VAR_SALDO_EMP_TOTAL_1_mean',
 'VAR_SALDO_EMP_TOTAL_1_median',
 'VAR_SALDO_EMP_TOTAL_MA3_mean',
 'VAR_SALDO_EMP_TOTAL_MA3_median',
 'VAR_SALDO_EMP_TOTAL_MA6_mean',
 'VAR_SALDO_EMP_TOTAL_MA6_median',
 'VAR_SALDO_EMP_TOTAL_MA8_mean',
 'VAR_SALDO_EMP_TOTAL_MA8_median',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_1_mean',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_1_median',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA3_mean',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA3_median',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA6_mean',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA6_median',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA8_mean',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA8_median',
 'DIFF_SALDO_EMP_TOTAL_1_mean',
 'DIFF_SALDO_EMP_TOTAL_1_median',
 'DIFF_SALDO_EMP_TOTAL_MA3_mean',
 'DIFF_SALDO_EMP_TOTAL_MA3_median',
 'DIFF_SALDO_EMP_TOTAL_MA6_mean',
 'DIFF_SALDO_EMP_TOTAL_MA6_median',
 'DIFF_SALDO_EMP_TOTAL_MA8_mean',
 'DIFF_SALDO_EMP_TOTAL_MA8_median',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_1_mean',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_1_median',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA3_mean',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA3_median',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA6_mean',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA6_median',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA8_mean',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA8_median',
 'DIFF_MAX_LINEA_DISP_U6M_1_mean',
 'DIFF_MAX_LINEA_DISP_U6M_1_median',
 'DIFF_MAX_LINEA_DISP_U6M_MA3_mean',
 'DIFF_MAX_LINEA_DISP_U6M_MA3_median',
 'DIFF_MAX_LINEA_DISP_U6M_MA6_mean',
 'DIFF_MAX_LINEA_DISP_U6M_MA6_median',
 'DIFF_MAX_LINEA_DISP_U6M_MA8_mean',
 'DIFF_MAX_LINEA_DISP_U6M_MA8_median',
 'VAR_SALDO_REV_NO_REV_VENCIDO_1_mean',
 'VAR_SALDO_REV_NO_REV_VENCIDO_1_median',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA3_mean',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA3_median',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA6_mean',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA6_median',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA8_mean',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA8_median',
 'VAR_SALDO_TOTAL_VENCIDO_1_mean',
 'VAR_SALDO_TOTAL_VENCIDO_1_median',
 'VAR_SALDO_TOTAL_VENCIDO_MA3_mean',
 'VAR_SALDO_TOTAL_VENCIDO_MA3_median',
 'VAR_SALDO_TOTAL_VENCIDO_MA6_mean',
 'VAR_SALDO_TOTAL_VENCIDO_MA6_median',
 'VAR_SALDO_TOTAL_VENCIDO_MA8_mean',
 'VAR_SALDO_TOTAL_VENCIDO_MA8_median',
 'VAR_MAX_LINEA_DISP_U6M_1_mean',
 'VAR_MAX_LINEA_DISP_U6M_1_median',
 'VAR_MAX_LINEA_DISP_U6M_MA3_mean',
 'VAR_MAX_LINEA_DISP_U6M_MA3_median',
 'VAR_MAX_LINEA_DISP_U6M_MA6_mean',
 'VAR_MAX_LINEA_DISP_U6M_MA6_median',
 'VAR_MAX_LINEA_DISP_U6M_MA8_mean',
 'VAR_MAX_LINEA_DISP_U6M_MA8_median',
 'VAR_SALDO_DOLA_TOTAL_1_mean',
 'VAR_SALDO_DOLA_TOTAL_1_median',
 'VAR_SALDO_DOLA_TOTAL_MA3_mean',
 'VAR_SALDO_DOLA_TOTAL_MA3_median',
 'VAR_SALDO_DOLA_TOTAL_MA6_mean',
 'VAR_SALDO_DOLA_TOTAL_MA6_median',
 'VAR_SALDO_DOLA_TOTAL_MA8_mean',
 'VAR_SALDO_DOLA_TOTAL_MA8_median',
 'DIFF_SALDO_DOLA_TOTAL_1_mean',
 'DIFF_SALDO_DOLA_TOTAL_1_median',
 'DIFF_SALDO_DOLA_TOTAL_MA3_mean',
 'DIFF_SALDO_DOLA_TOTAL_MA3_median',
 'DIFF_SALDO_DOLA_TOTAL_MA6_mean',
 'DIFF_SALDO_DOLA_TOTAL_MA6_median',
 'DIFF_SALDO_DOLA_TOTAL_MA8_mean',
 'DIFF_SALDO_DOLA_TOTAL_MA8_median',
 'SALDO_PEQ_EMP_FLAG_any',
 'SALDO_MED_EMP_FLAG_any',
 'SALDO_MIC_EMP_FLAG_any',
 'CANT_EMP_NEG_FLAG_any',
 'CANT_EMP_CONS_FLAG_any',
 'CANT_EMP_HIPOT_FLAG_any',
 'PROXY_MOROSIDAD_1_mean',
 'PROXY_MOROSIDAD_1_median',
 'PROXY_MOROSIDAD_2_mean',
 'PROXY_MOROSIDAD_2_median',
 'PROXY_MOROSIDAD_3_mean',
 'PROXY_MOROSIDAD_3_median',

    # 'PROXY_MOROSIDAD_1_mean', 'PROXY_MOROSIDAD_2_mean', 'PROXY_MOROSIDAD_3_mean',
    # 'DIFF_MAX_LINEA_DISP_U6M_MA8_mean',
    # 'VAR_MAX_LINEA_DISP_U6M_MA8_mean',
    # 'SALDO_PEQ_EMP_FLAG_any', 'SALDO_MED_EMP_FLAG_any', 'SALDO_MIC_EMP_FLAG_any',
    # 'CANT_EMP_NEG_FLAG_any','CANT_EMP_CONS_FLAG_any', 'CANT_EMP_HIPOT_FLAG_any',
    # 'VAR_SALDO_DOLA_TOTAL_MA8_mean',
    # 'DIFF_SALDO_EMP_TOTAL_VENCIDO_1_mean',
    # 'DIFF_SALDO_EMP_TOTAL_MA3_mean',
    # 'SALDO_EMP_TOTAL_min', 'SALDO_EMP_TOTAL_max', 'DIFF_SALDO_EMP_TOTAL_MA8_mean', 
    # 'SALDO_DOLA_CONS_mean',
    # 'CANT_EMP_NEG_min',
    # 'SALDO_CONS_REV_mean',
    # 'SALDO_VENCIDO_mean', 'SALDO_VENCIDO_max', 'SALDO_VENCIDO_min', 'SALDO_VENCIDO_last', 'SALDO_VENCIDO_first',
    # 'MAX_LINEA_DISP_U6M_max', 
    # 'CO_TIPO_SEXO',
    # 'MESES_HASTA_ACTUAL',
    # 'EDAD',
    # 'NO_DEPARTAMENTO',
    # 'REGION', TARGET_VAR
]

data_train =  data_train.select_dtypes(include=['int64', 'float64', 'category'])
data_train = data_train[selected_cols]

data_sub = data_sub.select_dtypes(include=['int64', 'float64', 'category'])
data_sub = data_sub[selected_cols]

list(data_train.columns)
print(data_train.dtypes)

TARGET                         int64
ANIO_BANCARIZACION           float64
MES_BANCARIZACION            float64
NO_PROVINCIA                category
CO_TIPO_SEXO                category
                              ...   
PROXY_MOROSIDAD_1_median     float64
PROXY_MOROSIDAD_2_mean       float64
PROXY_MOROSIDAD_2_median     float64
PROXY_MOROSIDAD_3_mean       float64
PROXY_MOROSIDAD_3_median     float64
Length: 198, dtype: object


In [9]:
data_train[TARGET_VAR].value_counts(normalize=True)

0   0.849992
1   0.150008
Name: TARGET, dtype: float64

In [10]:
import xgboost as xgb

def get_xgboost_model():
    return xgb.XGBClassifier(
        n_estimators=200, 
        max_depth=8, 
        learning_rate=0.1, 
        objective='binary:logistic', 
        booster='gbtree', 
        tree_method='auto', 
        n_jobs=-1, 
        verbosity=0, 
        enable_categorical=True,
        random_state=42
    )

def get_catboost_model(cat_features):
    return CatBoostClassifier(
        iterations=100, 
        depth=6, 
        learning_rate=0.1, 
        loss_function='Logloss', 
        eval_metric='F1', 
        random_seed=42, 
        verbose=0, 
        cat_features=cat_features,
        thread_count=N_JOBS, 
        task_type='GPU', 
        devices='0:1',
        # class_weights=[1, 1.25]
    )


In [11]:
data_train[TARGET_VAR].value_counts()

0    77504
1    13678
Name: TARGET, dtype: int64

In [12]:
# random undersampling
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(sampling_strategy={0: 70000}, random_state=42)
X_resampled, y_resampled = rus.fit_resample(data_train.drop(TARGET_VAR, axis=1), data_train[TARGET_VAR])

data_train_resampled = pd.concat([X_resampled, y_resampled], axis=1)

In [13]:
data_train_resampled[TARGET_VAR].value_counts(normalize=True)

0   0.836540
1   0.163460
Name: TARGET, dtype: float64

In [14]:
data_copy = data_train.copy()

X = data_train_resampled.drop(columns=[TARGET_VAR])
y = data_train_resampled[TARGET_VAR]
FACTOR = 0.3

cat_features = X.select_dtypes(include=['category']).columns.tolist()

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

f1_macros = []
for train_idx, test_idx in skf.split(X, y):
    cat_model = get_catboost_model(cat_features)
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    cat_model.fit(X_train, y_train)
    y_pred = cat_model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred > FACTOR).astype(int)
    print(classification_report(y_test, y_pred))
    
    f1_macros.append(f1_score(y_test, y_pred, average='macro'))

print(f'Mean F1 macro: {np.mean(f1_macros):.3f}')

X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)
cat_model = get_catboost_model(cat_features)
cat_model.fit(X_train, y_train)

# cat_model feature importance
importances = cat_model.feature_importances_
indices = np.argsort(importances)[::-1]

# # print feature ranking
# print("Feature ranking:")
# for f in range(X.shape[1]):
#     print("%d. feature %s (%f)" % (f + 1, X.columns[indices[f]], importances[indices[f]]))
# # plot feature importance
# plt.figure(figsize=(20, 10))
# plt.title("Feature Importance")
# plt.bar(range(X.shape[1]), importances[indices])
# plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)

data_copy['PRED'] = cat_model.predict_proba(data_copy.drop(columns=[TARGET_VAR]))[:, 1]
data_copy['PRED'] = (data_copy['PRED'] > FACTOR).astype(int)

data_sub['TARGET'] = cat_model.predict_proba(data_sub.drop(columns=[TARGET_VAR]))[:, 1]
data_sub['TARGET'] = (data_sub['TARGET'] > FACTOR).astype(int)

print(classification_report(data_copy[TARGET_VAR], data_copy['PRED']))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92     14000
           1       0.59      0.51      0.54      2736

    accuracy                           0.86     16736
   macro avg       0.75      0.72      0.73     16736
weighted avg       0.85      0.86      0.86     16736

              precision    recall  f1-score   support

           0       0.90      0.93      0.92     14000
           1       0.57      0.50      0.54      2736

    accuracy                           0.86     16736
   macro avg       0.74      0.71      0.73     16736
weighted avg       0.85      0.86      0.85     16736

              precision    recall  f1-score   support

           0       0.91      0.93      0.92     14000
           1       0.58      0.52      0.55      2736

    accuracy                           0.86     16736
   macro avg       0.74      0.72      0.73     16736
weighted avg       0.85      0.86      0.86     16736

              preci

## Optuna version

In [23]:
from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer

In [24]:

def objective_only_model_params(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-9, 100),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10),
    }

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    f1_macros = []

    for train_idx, test_idx in skf.split(X, y):
        cat_model = CatBoostClassifier(**params, cat_features=cat_features, task_type='GPU', devices='0:1')
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        cat_model.fit(X_train, y_train)
        y_pred = cat_model.predict_proba(X_test)[:, 1]
        y_pred = (y_pred > FACTOR).astype(int)
        f1_macros.append(f1_score(y_test, y_pred, average='macro'))

    return np.mean(f1_macros)

def objective(trial):
    """
    params = {
        'iterations': trial.suggest_int('iterations', 998, 1000),
        'depth': trial.suggest_int('depth', 8, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.06, 0.1),
        'random_strength': trial.suggest_float('random_strength', 7, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.4, 1.0),
        'border_count': trial.suggest_int('border_count', 34, 255),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 47, 100),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
    }

    factor = trial.suggest_float('factor', 0.2, 0.9)  # Uso de FACTOR como parámetro a optimizar
    """
    
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-9, 100),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10),
    }

    factor = trial.suggest_float('factor', 0.1, 0.9)  # Uso de FACTOR como parámetro a optimizar
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    f1_macros = []

    for train_idx, test_idx in skf.split(X, y):
        cat_model = CatBoostClassifier(**params, cat_features=cat_features)
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        cat_model.fit(X_train, y_train)
        y_pred = cat_model.predict_proba(X_test)[:, 1]
        y_pred = (y_pred > factor).astype(int)
        f1_macros.append(f1_score(y_test, y_pred, average='macro'))

    return np.mean(f1_macros)

def objective_prec_recall(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'random_strength': trial.suggest_float('random_strength', 0.1, 10),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-9, 100),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10),
    }

    factor = trial.suggest_float('factor', 0.1, 0.9)  # Uso de FACTOR como parámetro a optimizar

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    f1_macros = []
    precision_scores = []
    recall_scores = []

    for train_idx, test_idx in skf.split(X, y):
        cat_model = CatBoostClassifier(**params, cat_features=cat_features)
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        cat_model.fit(X_train, y_train)
        y_pred = cat_model.predict_proba(X_test)[:, 1]
        y_pred_binary = (y_pred > factor).astype(int)
        
        f1_macros.append(f1_score(y_test, y_pred_binary, average='macro'))
        precision_scores.append(precision_score(y_test, y_pred_binary))
        recall_scores.append(recall_score(y_test, y_pred_binary))

    alpha = 0.5 
    weighted_precision_recall = alpha * np.mean(precision_scores) + (1 - alpha) * np.mean(recall_scores)

    weighted_f1 = np.mean(f1_macros) + weighted_precision_recall

    return weighted_f1

X = data_train_resampled.drop(columns=[TARGET_VAR])
y = data_train_resampled[TARGET_VAR]
cat_features = X.select_dtypes(include=['category']).columns.tolist()

study = optuna.create_study(direction='maximize')
study.optimize(objective_prec_recall, n_trials=50) 

best_params = study.best_params

best_cat_model = CatBoostClassifier(**best_params, cat_features=cat_features)
best_cat_model.fit(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
y_pred_test = best_cat_model.predict_proba(X_test)[:, 1]
y_pred_test = (y_pred_test > FACTOR).astype(int)

print("Classification Report (Test Data):")
print(classification_report(y_test, y_pred_test))

data_copy['PRED'] = best_cat_model.predict_proba(data_copy.drop(columns=[TARGET_VAR]))[:, 1]
data_copy['PRED'] = (data_copy['PRED'] > FACTOR).astype(int)

data_sub['TARGET'] = best_cat_model.predict_proba(data_sub.drop(columns=[TARGET_VAR]))[:, 1]
data_sub['TARGET'] = (data_sub['TARGET'] > FACTOR).astype(int)

print("Classification Report (Data Copy):")
print(classification_report(data_copy[TARGET_VAR], data_copy['PRED']))

[I 2023-11-19 19:28:28,876] A new study created in memory with name: no-name-80ebc78e-4a3a-439a-aa11-40ec2e897d0e


0:	learn: 0.6861917	total: 81.3ms	remaining: 16.3s
1:	learn: 0.6793551	total: 155ms	remaining: 15.5s
2:	learn: 0.6717647	total: 225ms	remaining: 14.9s
3:	learn: 0.6650262	total: 301ms	remaining: 14.9s
4:	learn: 0.6580187	total: 388ms	remaining: 15.3s
5:	learn: 0.6524527	total: 460ms	remaining: 15s
6:	learn: 0.6465861	total: 570ms	remaining: 15.9s
7:	learn: 0.6404096	total: 661ms	remaining: 16s
8:	learn: 0.6346628	total: 751ms	remaining: 16.1s
9:	learn: 0.6288707	total: 831ms	remaining: 16s
10:	learn: 0.6239914	total: 901ms	remaining: 15.6s
11:	learn: 0.6200483	total: 938ms	remaining: 14.8s
12:	learn: 0.6155742	total: 1.02s	remaining: 14.8s
13:	learn: 0.6108829	total: 1.09s	remaining: 14.7s
14:	learn: 0.6058456	total: 1.16s	remaining: 14.5s
15:	learn: 0.6019626	total: 1.24s	remaining: 14.4s
16:	learn: 0.5977393	total: 1.31s	remaining: 14.3s
17:	learn: 0.5940438	total: 1.4s	remaining: 14.3s
18:	learn: 0.5905355	total: 1.47s	remaining: 14.1s
19:	learn: 0.5870049	total: 1.54s	remaining: 14

[I 2023-11-19 19:29:47,634] Trial 0 finished with value: 0.9013375105603507 and parameters: {'iterations': 202, 'depth': 8, 'learning_rate': 0.01884927892042129, 'random_strength': 1.8680860603604046, 'bagging_temperature': 0.5048950946297968, 'border_count': 74, 'l2_leaf_reg': 65.38376430613232, 'scale_pos_weight': 9.777298359734086, 'factor': 0.27040524614238093}. Best is trial 0 with value: 0.9013375105603507.


0:	learn: 0.6324282	total: 35.4ms	remaining: 18.7s
1:	learn: 0.5751701	total: 78.1ms	remaining: 20.6s
2:	learn: 0.5286506	total: 117ms	remaining: 20.5s
3:	learn: 0.4882614	total: 151ms	remaining: 19.8s
4:	learn: 0.4555183	total: 187ms	remaining: 19.6s
5:	learn: 0.4319830	total: 224ms	remaining: 19.6s
6:	learn: 0.4107055	total: 269ms	remaining: 20.1s
7:	learn: 0.3913878	total: 308ms	remaining: 20.1s
8:	learn: 0.3745424	total: 345ms	remaining: 20s
9:	learn: 0.3614779	total: 383ms	remaining: 19.9s
10:	learn: 0.3502234	total: 417ms	remaining: 19.7s
11:	learn: 0.3409897	total: 452ms	remaining: 19.5s
12:	learn: 0.3345951	total: 498ms	remaining: 19.8s
13:	learn: 0.3277850	total: 542ms	remaining: 20s
14:	learn: 0.3217184	total: 581ms	remaining: 19.9s
15:	learn: 0.3159995	total: 626ms	remaining: 20.1s
16:	learn: 0.3116142	total: 673ms	remaining: 20.3s
17:	learn: 0.3075094	total: 710ms	remaining: 20.2s
18:	learn: 0.3038217	total: 754ms	remaining: 20.2s
19:	learn: 0.3015438	total: 792ms	remaining

[I 2023-11-19 19:31:35,520] Trial 1 finished with value: 1.2546016677842524 and parameters: {'iterations': 529, 'depth': 5, 'learning_rate': 0.07177428835430373, 'random_strength': 1.9404270471365817, 'bagging_temperature': 0.3802005682911742, 'border_count': 140, 'l2_leaf_reg': 62.900622550766954, 'scale_pos_weight': 0.6818926363805199, 'factor': 0.11622970154245885}. Best is trial 1 with value: 1.2546016677842524.


0:	learn: 0.6919753	total: 97.1ms	remaining: 1m 35s
1:	learn: 0.6907913	total: 139ms	remaining: 1m 8s
2:	learn: 0.6896191	total: 172ms	remaining: 56.2s
3:	learn: 0.6884707	total: 203ms	remaining: 49.8s
4:	learn: 0.6873383	total: 238ms	remaining: 46.6s
5:	learn: 0.6862119	total: 270ms	remaining: 44.1s
6:	learn: 0.6850964	total: 305ms	remaining: 42.6s
7:	learn: 0.6839545	total: 348ms	remaining: 42.5s
8:	learn: 0.6828491	total: 383ms	remaining: 41.6s
9:	learn: 0.6817297	total: 416ms	remaining: 40.5s
10:	learn: 0.6806557	total: 449ms	remaining: 39.8s
11:	learn: 0.6796643	total: 482ms	remaining: 39.1s
12:	learn: 0.6786861	total: 518ms	remaining: 38.7s
13:	learn: 0.6776827	total: 562ms	remaining: 39s
14:	learn: 0.6766230	total: 599ms	remaining: 38.8s
15:	learn: 0.6755823	total: 635ms	remaining: 38.5s
16:	learn: 0.6745335	total: 671ms	remaining: 38.2s
17:	learn: 0.6734842	total: 709ms	remaining: 38.1s
18:	learn: 0.6725181	total: 757ms	remaining: 38.5s
19:	learn: 0.6715837	total: 802ms	remaini

[I 2023-11-19 19:35:09,482] Trial 2 finished with value: 1.2188116780392728 and parameters: {'iterations': 985, 'depth': 5, 'learning_rate': 0.0031578646848027057, 'random_strength': 0.3869934552443395, 'bagging_temperature': 0.5615046514084372, 'border_count': 171, 'l2_leaf_reg': 66.81280536292701, 'scale_pos_weight': 5.438416366597219, 'factor': 0.7917776647763577}. Best is trial 1 with value: 1.2546016677842524.


0:	learn: 0.6663018	total: 186ms	remaining: 43.8s
1:	learn: 0.6431183	total: 357ms	remaining: 41.8s
2:	learn: 0.6233252	total: 569ms	remaining: 44.2s
3:	learn: 0.6069455	total: 763ms	remaining: 44.3s
4:	learn: 0.5924528	total: 945ms	remaining: 43.7s
5:	learn: 0.5802423	total: 1.11s	remaining: 42.7s
6:	learn: 0.5695927	total: 1.29s	remaining: 42.1s
7:	learn: 0.5606233	total: 1.45s	remaining: 41.4s
8:	learn: 0.5523527	total: 1.63s	remaining: 41.1s
9:	learn: 0.5448639	total: 1.8s	remaining: 40.7s
10:	learn: 0.5387847	total: 1.98s	remaining: 40.4s
11:	learn: 0.5329347	total: 2.14s	remaining: 39.9s
12:	learn: 0.5278041	total: 2.32s	remaining: 39.8s
13:	learn: 0.5232042	total: 2.48s	remaining: 39.4s
14:	learn: 0.5191671	total: 2.67s	remaining: 39.3s
15:	learn: 0.5148547	total: 2.85s	remaining: 39.2s
16:	learn: 0.5116937	total: 3.04s	remaining: 39.1s
17:	learn: 0.5087892	total: 3.22s	remaining: 39s
18:	learn: 0.5057757	total: 3.41s	remaining: 38.9s
19:	learn: 0.5030331	total: 3.59s	remaining:

[I 2023-11-19 19:38:33,369] Trial 3 finished with value: 1.2521722267660833 and parameters: {'iterations': 236, 'depth': 10, 'learning_rate': 0.06645786310408068, 'random_strength': 0.38308895165653933, 'bagging_temperature': 0.0817130592354689, 'border_count': 99, 'l2_leaf_reg': 10.463725930600587, 'scale_pos_weight': 5.157537008255448, 'factor': 0.45444262867366114}. Best is trial 1 with value: 1.2546016677842524.


0:	learn: 0.6660688	total: 39.7ms	remaining: 25.7s
1:	learn: 0.6347930	total: 80.8ms	remaining: 26.1s
2:	learn: 0.6096699	total: 125ms	remaining: 27s
3:	learn: 0.5877827	total: 172ms	remaining: 27.7s
4:	learn: 0.5716503	total: 213ms	remaining: 27.4s
5:	learn: 0.5591717	total: 255ms	remaining: 27.3s
6:	learn: 0.5470195	total: 315ms	remaining: 28.8s
7:	learn: 0.5381802	total: 358ms	remaining: 28.6s
8:	learn: 0.5300020	total: 399ms	remaining: 28.4s
9:	learn: 0.5212488	total: 445ms	remaining: 28.4s
10:	learn: 0.5145355	total: 495ms	remaining: 28.7s
11:	learn: 0.5067065	total: 544ms	remaining: 28.9s
12:	learn: 0.5007281	total: 598ms	remaining: 29.2s
13:	learn: 0.4932599	total: 645ms	remaining: 29.2s
14:	learn: 0.4895908	total: 699ms	remaining: 29.6s
15:	learn: 0.4860046	total: 773ms	remaining: 30.6s
16:	learn: 0.4843574	total: 818ms	remaining: 30.4s
17:	learn: 0.4804952	total: 867ms	remaining: 30.4s
18:	learn: 0.4775921	total: 916ms	remaining: 30.4s
19:	learn: 0.4743880	total: 969ms	remaini

[I 2023-11-19 19:41:22,189] Trial 4 finished with value: 1.1208743123843938 and parameters: {'iterations': 649, 'depth': 6, 'learning_rate': 0.08341034246358614, 'random_strength': 7.3521469992909845, 'bagging_temperature': 0.7860172525573607, 'border_count': 184, 'l2_leaf_reg': 72.66398087134911, 'scale_pos_weight': 2.0079220907220012, 'factor': 0.8902072886542983}. Best is trial 1 with value: 1.2546016677842524.


0:	learn: 0.6924433	total: 23.3ms	remaining: 14.1s
1:	learn: 0.6916381	total: 113ms	remaining: 34.1s
2:	learn: 0.6909303	total: 206ms	remaining: 41.3s
3:	learn: 0.6901332	total: 297ms	remaining: 44.6s
4:	learn: 0.6894251	total: 406ms	remaining: 48.7s
5:	learn: 0.6889073	total: 434ms	remaining: 43.2s
6:	learn: 0.6880016	total: 552ms	remaining: 47.1s
7:	learn: 0.6873590	total: 648ms	remaining: 48.3s
8:	learn: 0.6867097	total: 755ms	remaining: 49.9s
9:	learn: 0.6859575	total: 857ms	remaining: 50.9s
10:	learn: 0.6851131	total: 966ms	remaining: 52.1s
11:	learn: 0.6843145	total: 1.06s	remaining: 52.5s
12:	learn: 0.6835511	total: 1.17s	remaining: 53.3s
13:	learn: 0.6826588	total: 1.27s	remaining: 53.7s
14:	learn: 0.6818934	total: 1.38s	remaining: 54s
15:	learn: 0.6810402	total: 1.48s	remaining: 54.5s
16:	learn: 0.6802014	total: 1.58s	remaining: 54.6s
17:	learn: 0.6796518	total: 1.66s	remaining: 54s
18:	learn: 0.6789843	total: 1.75s	remaining: 54s
19:	learn: 0.6784837	total: 1.8s	remaining: 52

[W 2023-11-19 19:46:07,539] Trial 5 failed with parameters: {'iterations': 604, 'depth': 9, 'learning_rate': 0.0025077841021020186, 'random_strength': 6.907770169511329, 'bagging_temperature': 0.9468239510596529, 'border_count': 79, 'l2_leaf_reg': 91.33831876120588, 'scale_pos_weight': 3.243404278393965, 'factor': 0.15039155757998293} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/home/renatoseb/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_12307/2069272644.py", line 95, in objective_prec_recall
    cat_model.fit(X_train, y_train)
  File "/home/renatoseb/.local/lib/python3.10/site-packages/catboost/core.py", line 5131, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
  File "/home/renatoseb/.local/lib/python3.10/site-packages/catboost/core.p

KeyboardInterrupt: 

## Subsmission

In [133]:
submission = pd.concat([data_sub_id, data_sub['TARGET']], axis=1)
print(submission['TARGET'].value_counts(normalize=True))

submission.to_csv('../data/results/catboost_all_resampled.csv', index=False)

0   0.862475
1   0.137525
Name: TARGET, dtype: float64
