In [43]:
import os
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Notebook-wide configuration: silence every warning and widen pandas' display.
warnings.filterwarnings(action='ignore')

# Fully-qualified pandas option names mapped to the values used in this notebook.
_display_options = {
    'display.max_columns': None,              # show every column
    'display.max_rows': 100,
    'display.float_format': '{:f}'.format,    # fixed-point instead of scientific notation
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [44]:
# Load the processed training set and the submission (test) set.
data_train = pd.read_parquet('../data/processed/train.gzip')
data_sub = pd.read_parquet('../data/processed/test.gzip')

# Replace hyphens in column names with underscores, identically in both frames.
for frame in (data_train, data_sub):
    frame.columns = [name.replace('-', '_') for name in frame.columns]

In [45]:
# Keep the submission IDs as a one-column frame; 'ID' is not among the
# modelling columns selected later, so it would otherwise be dropped.
data_sub_id = data_sub[['ID']]

# Columns to be stored with pandas' category dtype.
categorical_feats = [
    'CO_TIPO_SEXO',
    'NO_DEPARTAMENTO',
    'REGION',
    'SALDO_PEQ_EMP_FLAG_any',
    'SALDO_MED_EMP_FLAG_any',
    'SALDO_MIC_EMP_FLAG_any',
    'CANT_EMP_NEG_FLAG_any',
    'CANT_EMP_CONS_FLAG_any',
    'CANT_EMP_HIPOT_FLAG_any',
]

# Cast in place, frame by frame. Each frame derives its own category set.
for frame in (data_train, data_sub):
    for feature in categorical_feats:
        frame[feature] = frame[feature].astype('category')

# Name of the label column in the training frame.
TARGET_VAR = 'TARGET'
# Number of splits used by the stratified k-fold cross-validation.
N_FOLDS = 5
# Worker threads handed to the estimators: the logical core count floor-divided
# by 2.5, but never below one. os.cpu_count() may return None, and a count of
# 1 or 2 would floor to 0, which scikit-learn rejects as an n_jobs value.
N_JOBS = max(1, int((os.cpu_count() or 1) // 2.5))

In [46]:
# Columns kept for modelling, in the order they will appear in the frames.
# The first entry is the target; the same list is used to subset both
# data_train and data_sub in the statements that follow this list.
# NOTE(review): 'NO_PROVINCIA' is not in categorical_feats, yet the printed
# dtypes show it as category — presumably it is already categorical in the
# parquet file; confirm.
selected_cols = [
 TARGET_VAR,
 'ANIO_BANCARIZACION', 'MES_BANCARIZACION',
 'NO_PROVINCIA',
 'REGION',
 'CO_TIPO_SEXO',
 'EDAD',
 'NO_DEPARTAMENTO',
 'MESES_HASTA_ACTUAL',
 # Columns named <base>_<suffix>; suffixes used below are _mean, _median,
 # _last, _min, _max and _first (presumably per-client summary statistics).
 'CANT_EMP_NEG_max',
 'CANT_EMP_NEG_last',
 'CANT_EMP_NEG_min',
 'CANT_EMP_CONS_max',
 'CANT_EMP_CONS_last',
 'CANT_EMP_CONS_min',
 'CANT_EMP_HIPOT_max',
 'CANT_EMP_HIPOT_last',
 'CANT_EMP_HIPOT_min',
 'SALDO_MED_EMP_mean',
 'SALDO_MED_EMP_median',
 'SALDO_MED_EMP_last',
 'SALDO_MED_EMP_min',
 'SALDO_MED_EMP_max',
 'SALDO_MED_EMP_first',
 'SALDO_PEQ_EMP_mean',
 'SALDO_PEQ_EMP_median',
 'SALDO_PEQ_EMP_last',
 'SALDO_PEQ_EMP_min',
 'SALDO_PEQ_EMP_max',
 'SALDO_PEQ_EMP_first',
 'SALDO_MIC_EMP_mean',
 'SALDO_MIC_EMP_median',
 'SALDO_MIC_EMP_last',
 'SALDO_MIC_EMP_min',
 'SALDO_MIC_EMP_max',
 'SALDO_MIC_EMP_first',
 'SALDO_CONS_REV_mean',
 'SALDO_CONS_REV_median',
 'SALDO_CONS_REV_last',
 'SALDO_CONS_REV_min',
 'SALDO_CONS_REV_max',
 'SALDO_CONS_REV_first',
 'SALDO_CONS_NO_REV_mean',
 'SALDO_CONS_NO_REV_median',
 'SALDO_CONS_NO_REV_last',
 'SALDO_CONS_NO_REV_min',
 'SALDO_CONS_NO_REV_max',
 'SALDO_CONS_NO_REV_first',
 'SALDO_HIPOT_mean',
 'SALDO_HIPOT_median',
 'SALDO_HIPOT_last',
 'SALDO_HIPOT_min',
 'SALDO_HIPOT_max',
 'SALDO_HIPOT_first',
 'SALDO_VENCIDO_mean',
 'SALDO_VENCIDO_median',
 'SALDO_VENCIDO_last',
 'SALDO_VENCIDO_min',
 'SALDO_VENCIDO_max',
 'SALDO_VENCIDO_first',
 'CANT_EMP_DOL_NEG_mean',
 'CANT_EMP_DOL_NEG_median',
 'CANT_EMP_DOL_NEG_last',
 'CANT_EMP_DOL_NEG_min',
 'CANT_EMP_DOL_NEG_max',
 'CANT_EMP_DOL_NEG_first',
 'SALDO_DOLA_NEG_mean',
 'SALDO_DOLA_NEG_median',
 'SALDO_DOLA_NEG_last',
 'SALDO_DOLA_NEG_min',
 'SALDO_DOLA_NEG_max',
 'SALDO_DOLA_NEG_first',
 'CANT_EMP_DOL_CONS_mean',
 'CANT_EMP_DOL_CONS_median',
 'CANT_EMP_DOL_CONS_last',
 'CANT_EMP_DOL_CONS_min',
 'CANT_EMP_DOL_CONS_max',
 'CANT_EMP_DOL_CONS_first',
 'SALDO_DOLA_CONS_mean',
 'SALDO_DOLA_CONS_median',
 'SALDO_DOLA_CONS_last',
 'SALDO_DOLA_CONS_min',
 'SALDO_DOLA_CONS_max',
 'SALDO_DOLA_CONS_first',
 'CANT_EMP_DOL_HIPOT_mean',
 'CANT_EMP_DOL_HIPOT_median',
 'CANT_EMP_DOL_HIPOT_last',
 'CANT_EMP_DOL_HIPOT_min',
 'CANT_EMP_DOL_HIPOT_max',
 'CANT_EMP_DOL_HIPOT_first',
 'SALDO_DOLA_HIPOT_mean',
 'SALDO_DOLA_HIPOT_median',
 'SALDO_DOLA_HIPOT_last',
 'SALDO_DOLA_HIPOT_min',
 'SALDO_DOLA_HIPOT_max',
 'SALDO_DOLA_HIPOT_first',
 'MAX_LINEA_DISP_U6M_mean',
 'MAX_LINEA_DISP_U6M_median',
 'MAX_LINEA_DISP_U6M_last',
 'MAX_LINEA_DISP_U6M_min',
 'MAX_LINEA_DISP_U6M_max',
 'MAX_LINEA_DISP_U6M_first',
 'SALDO_EMP_TOTAL_mean',
 'SALDO_EMP_TOTAL_last',
 'SALDO_EMP_TOTAL_max',
 'SALDO_EMP_TOTAL_min',
 'SALDO_EMP_TOTAL_median',
 # VAR_* and DIFF_* columns: only the _mean and _median variants are kept,
 # for the window tags _1, _MA3, _MA6 and _MA8.
 'VAR_SALDO_EMP_TOTAL_1_mean',
 'VAR_SALDO_EMP_TOTAL_1_median',
 'VAR_SALDO_EMP_TOTAL_MA3_mean',
 'VAR_SALDO_EMP_TOTAL_MA3_median',
 'VAR_SALDO_EMP_TOTAL_MA6_mean',
 'VAR_SALDO_EMP_TOTAL_MA6_median',
 'VAR_SALDO_EMP_TOTAL_MA8_mean',
 'VAR_SALDO_EMP_TOTAL_MA8_median',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_1_mean',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_1_median',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA3_mean',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA3_median',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA6_mean',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA6_median',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA8_mean',
 'VAR_SALDO_EMP_TOTAL_VENCIDO_MA8_median',
 'DIFF_SALDO_EMP_TOTAL_1_mean',
 'DIFF_SALDO_EMP_TOTAL_1_median',
 'DIFF_SALDO_EMP_TOTAL_MA3_mean',
 'DIFF_SALDO_EMP_TOTAL_MA3_median',
 'DIFF_SALDO_EMP_TOTAL_MA6_mean',
 'DIFF_SALDO_EMP_TOTAL_MA6_median',
 'DIFF_SALDO_EMP_TOTAL_MA8_mean',
 'DIFF_SALDO_EMP_TOTAL_MA8_median',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_1_mean',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_1_median',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA3_mean',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA3_median',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA6_mean',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA6_median',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA8_mean',
 'DIFF_SALDO_EMP_TOTAL_VENCIDO_MA8_median',
 'DIFF_MAX_LINEA_DISP_U6M_1_mean',
 'DIFF_MAX_LINEA_DISP_U6M_1_median',
 'DIFF_MAX_LINEA_DISP_U6M_MA3_mean',
 'DIFF_MAX_LINEA_DISP_U6M_MA3_median',
 'DIFF_MAX_LINEA_DISP_U6M_MA6_mean',
 'DIFF_MAX_LINEA_DISP_U6M_MA6_median',
 'DIFF_MAX_LINEA_DISP_U6M_MA8_mean',
 'DIFF_MAX_LINEA_DISP_U6M_MA8_median',
 'VAR_SALDO_REV_NO_REV_VENCIDO_1_mean',
 'VAR_SALDO_REV_NO_REV_VENCIDO_1_median',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA3_mean',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA3_median',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA6_mean',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA6_median',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA8_mean',
 'VAR_SALDO_REV_NO_REV_VENCIDO_MA8_median',
 'VAR_SALDO_TOTAL_VENCIDO_1_mean',
 'VAR_SALDO_TOTAL_VENCIDO_1_median',
 'VAR_SALDO_TOTAL_VENCIDO_MA3_mean',
 'VAR_SALDO_TOTAL_VENCIDO_MA3_median',
 'VAR_SALDO_TOTAL_VENCIDO_MA6_mean',
 'VAR_SALDO_TOTAL_VENCIDO_MA6_median',
 'VAR_SALDO_TOTAL_VENCIDO_MA8_mean',
 'VAR_SALDO_TOTAL_VENCIDO_MA8_median',
 'VAR_MAX_LINEA_DISP_U6M_1_mean',
 'VAR_MAX_LINEA_DISP_U6M_1_median',
 'VAR_MAX_LINEA_DISP_U6M_MA3_mean',
 'VAR_MAX_LINEA_DISP_U6M_MA3_median',
 'VAR_MAX_LINEA_DISP_U6M_MA6_mean',
 'VAR_MAX_LINEA_DISP_U6M_MA6_median',
 'VAR_MAX_LINEA_DISP_U6M_MA8_mean',
 'VAR_MAX_LINEA_DISP_U6M_MA8_median',
 'VAR_SALDO_DOLA_TOTAL_1_mean',
 'VAR_SALDO_DOLA_TOTAL_1_median',
 'VAR_SALDO_DOLA_TOTAL_MA3_mean',
 'VAR_SALDO_DOLA_TOTAL_MA3_median',
 'VAR_SALDO_DOLA_TOTAL_MA6_mean',
 'VAR_SALDO_DOLA_TOTAL_MA6_median',
 'VAR_SALDO_DOLA_TOTAL_MA8_mean',
 'VAR_SALDO_DOLA_TOTAL_MA8_median',
 'DIFF_SALDO_DOLA_TOTAL_1_mean',
 'DIFF_SALDO_DOLA_TOTAL_1_median',
 'DIFF_SALDO_DOLA_TOTAL_MA3_mean',
 'DIFF_SALDO_DOLA_TOTAL_MA3_median',
 'DIFF_SALDO_DOLA_TOTAL_MA6_mean',
 'DIFF_SALDO_DOLA_TOTAL_MA6_median',
 'DIFF_SALDO_DOLA_TOTAL_MA8_mean',
 'DIFF_SALDO_DOLA_TOTAL_MA8_median',
 # *_FLAG_any columns: these are the ones cast to category dtype through
 # categorical_feats in an earlier cell.
 'SALDO_PEQ_EMP_FLAG_any',
 'SALDO_MED_EMP_FLAG_any',
 'SALDO_MIC_EMP_FLAG_any',
 'CANT_EMP_NEG_FLAG_any',
 'CANT_EMP_CONS_FLAG_any',
 'CANT_EMP_HIPOT_FLAG_any',
 'PROXY_MOROSIDAD_1_mean',
 'PROXY_MOROSIDAD_1_median',
 'PROXY_MOROSIDAD_2_mean',
 'PROXY_MOROSIDAD_2_median',
 'PROXY_MOROSIDAD_3_mean',
 'PROXY_MOROSIDAD_3_median',
]

# Restrict both frames to the dtypes accepted downstream, then to the curated
# column list (indexing fails with KeyError if a listed column was filtered out).
# NOTE(review): selected_cols starts with TARGET_VAR, so data_sub must carry a
# TARGET column of one of these dtypes as well — confirm that is intended.
model_dtypes = ['int64', 'float64', 'category']
data_train = data_train.select_dtypes(include=model_dtypes)[selected_cols]
data_sub = data_sub.select_dtypes(include=model_dtypes)[selected_cols]

print(data_train.dtypes)

TARGET                         int64
ANIO_BANCARIZACION           float64
MES_BANCARIZACION            float64
NO_PROVINCIA                category
REGION                      category
                              ...   
PROXY_MOROSIDAD_1_median     float64
PROXY_MOROSIDAD_2_mean       float64
PROXY_MOROSIDAD_2_median     float64
PROXY_MOROSIDAD_3_mean       float64
PROXY_MOROSIDAD_3_median     float64
Length: 199, dtype: object


In [47]:
# Share of each target class among all training rows.
target_share = data_train[TARGET_VAR].value_counts(normalize=True)
target_share

0   0.849992
1   0.150008
Name: TARGET, dtype: float64

In [48]:
def get_xgboost_model():
    """Build the XGBoost member of the ensemble (unfitted).

    Returns:
        xgb.XGBClassifier: 200 depth-8 trees, learning rate 0.1, binary
        logistic objective, native handling of pandas ``category`` columns.
    """
    return xgb.XGBClassifier(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        objective='binary:logistic',
        booster='gbtree',
        # Native categorical support needs a histogram-based tree method.
        # 'auto' only resolves to 'hist' on XGBoost >= 2.0; on 1.x it may pick
        # a method without categorical support, so name it explicitly.
        tree_method='hist',
        # Same thread budget as the other model factories in this notebook.
        n_jobs=N_JOBS,
        verbosity=0,
        enable_categorical=True,
        random_state=42
    )

def get_catboost_model(cat_features, task_type='GPU', devices='0:1'):
    """Build the CatBoost member of the ensemble (unfitted).

    Args:
        cat_features: Names (or indices) of the categorical columns, forwarded
            to CatBoost unchanged.
        task_type: 'GPU' (default, as before) or 'CPU'. Lets the notebook run
            on a machine without a CUDA device.
        devices: CatBoost device string; only forwarded when training on GPU.

    Returns:
        CatBoostClassifier: 200 depth-8 iterations, learning rate 0.1, Logloss
        objective with F1 as the evaluation metric.
    """
    params = dict(
        iterations=200,
        depth=8,
        learning_rate=0.1,
        loss_function='Logloss',
        eval_metric='F1',
        random_seed=42,
        verbose=0,
        cat_features=cat_features,
        thread_count=N_JOBS,
        task_type=task_type,
    )
    if task_type == 'GPU':
        # The device list is meaningless for CPU training, so pass it only here.
        params['devices'] = devices
    return CatBoostClassifier(**params)


def get_lgbm_model():
    """Build the LightGBM member of the ensemble (unfitted).

    Returns:
        lgb.LGBMClassifier: 200 GBDT rounds with 31 leaves, unlimited depth,
        learning rate 0.1, no row/column subsampling and no regularisation.
    """
    tree_shape = {
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'max_depth': -1,
        'min_split_gain': 0.0,
        'min_child_weight': 0.001,
        'min_child_samples': 20,
    }
    boosting = {
        'learning_rate': 0.1,
        'n_estimators': 200,
        'objective': 'binary',
    }
    sampling_and_penalties = {
        'subsample': 1.0,
        'subsample_freq': 0,
        'colsample_bytree': 1.0,
        'reg_alpha': 0.0,
        'reg_lambda': 0.0,
    }
    runtime = {
        'random_state': 42,
        'n_jobs': N_JOBS,
        'importance_type': 'split',
    }
    return lgb.LGBMClassifier(
        **tree_shape, **boosting, **sampling_and_penalties, **runtime
    )

def get_random_forest_model():
    """Build the random-forest member of the ensemble (unfitted).

    Returns:
        RandomForestClassifier: 200 bootstrap trees of depth at most 8, with
        sqrt(n_features) candidate features per split and no class weighting.
    """
    return RandomForestClassifier(
        n_estimators=200,
        max_depth=8,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        # 'auto' meant sqrt(n_features) for classifiers; it was deprecated in
        # scikit-learn 1.1 and removed in 1.3, so spell the same setting out.
        max_features='sqrt',
        max_leaf_nodes=None,
        min_impurity_decrease=0.0,
        bootstrap=True,
        oob_score=False,
        n_jobs=N_JOBS,
        random_state=42,
        verbose=0,
        warm_start=False,
        class_weight=None,
        ccp_alpha=0.0,
        max_samples=None
    )

In [49]:
# Random undersampling of the majority class (label 0); class 1 is left as is.
# NOTE(review): resampling happens before the train/test split and the CV
# folds, so held-out metrics are measured on the resampled class balance
# rather than on the original one.
N_MAJORITY_SAMPLES = 70000  # rows of class 0 to keep

features_all = data_train.drop(TARGET_VAR, axis=1)
target_all = data_train[TARGET_VAR]

# RandomUnderSampler raises if asked for more rows than the class contains,
# so cap the request at what is actually available.
n_majority_available = int((target_all == 0).sum())
rus = RandomUnderSampler(
    sampling_strategy={0: min(N_MAJORITY_SAMPLES, n_majority_available)},
    random_state=42,
)
X_resampled, y_resampled = rus.fit_resample(features_all, target_all)

data_train_resampled = pd.concat([X_resampled, y_resampled], axis=1)

In [50]:
# Snapshot of the full (non-resampled) training frame.
data_copy = data_train.copy()

# Feature matrix and label vector taken from the undersampled frame.
X = data_train_resampled.drop(TARGET_VAR, axis=1)
y = data_train_resampled.loc[:, TARGET_VAR]

In [51]:
# Hold out 30% of the resampled rows, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Categorical columns are identified by dtype and re-cast on both splits.
cat_features = X.select_dtypes(include=['category']).columns.tolist()
for split_frame in (X_train, X_test):
    split_frame[cat_features] = split_frame[cat_features].astype('category')


# Candidate ensemble members, each tagged with a short name.
models = [
    ('XGB', get_xgboost_model()),
    ('CAT', get_catboost_model(cat_features)),
    ('LGBM', get_lgbm_model()),
    ('RF', get_random_forest_model()),
]

# Fit every candidate; the call signature depends on the estimator class.
for _, estimator in models:
    estimator_kind = type(estimator).__name__
    if estimator_kind == 'RandomForestClassifier':
        # The forest is trained on a one-hot encoded copy of the features.
        X_train_aux = pd.get_dummies(X_train, drop_first=True)
        estimator.fit(X_train_aux, y_train)
    elif estimator_kind == 'LGBMClassifier':
        estimator.fit(X_train, y_train, categorical_feature=cat_features)
    elif estimator_kind in ('CatBoostClassifier', 'XGBClassifier'):
        estimator.fit(X_train, y_train, verbose=0)

[LightGBM] [Info] Number of positive: 9575, number of negative: 48999
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 36927
[LightGBM] [Info] Number of data points in the train set: 58574, number of used features: 196
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.163468 -> initscore=-1.632644
[LightGBM] [Info] Start training from score -1.632644


In [52]:
# Average the positive-class probabilities of the selected models, threshold
# the average at FACTOR, and report on the train and the held-out split.
FACTOR = 0.3
selected_models = ['CAT', 'LGBM']  # 'XGB', 'RF'

stacked_by_split = {}
for split_label, split_features, split_target in (
    ('TRAIN', X_train, y_train),
    ('TEST', X_test, y_test),
):
    column_suffix = split_label.lower()
    stacked = pd.DataFrame()
    for name, model in models:
        if name in selected_models:
            model_name = model.__class__.__name__
            if model_name == 'RandomForestClassifier':
                # The forest was fitted on one-hot encoded features.
                model_inputs = pd.get_dummies(split_features, drop_first=True)
            else:
                model_inputs = split_features
            stacked[f'{model_name}_{column_suffix}'] = model.predict_proba(model_inputs)[:, 1]
    stacked[split_label] = stacked.mean(axis=1)
    stacked[split_label] = np.where(stacked[split_label] > FACTOR, 1, 0)
    print(split_label)
    print(classification_report(split_target, stacked[split_label]))
    stacked_by_split[split_label] = stacked

stacked_predictions_train = stacked_by_split['TRAIN']
stacked_predictions_test = stacked_by_split['TEST']

TRAIN
              precision    recall  f1-score   support

           0       0.93      0.95      0.94     48999
           1       0.70      0.66      0.68      9575

    accuracy                           0.90     58574
   macro avg       0.82      0.80      0.81     58574
weighted avg       0.90      0.90      0.90     58574

TEST
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     21001
           1       0.59      0.52      0.55      4103

    accuracy                           0.86     25104
   macro avg       0.75      0.72      0.73     25104
weighted avg       0.86      0.86      0.86     25104



In [53]:
# Stratified k-fold cross-validation of the probability-averaging ensemble.
#
# Per fold: fit every selected candidate, average their positive-class
# probabilities, threshold the average at FACTOR, then report train/test
# classification metrics and the test macro-F1.


def _model_inputs(model, features):
    """Return the design matrix `model` expects for `features`.

    The random forest is given a one-hot encoded copy; the encoding is produced
    here for both fitting and prediction so the two always agree. Every other
    model receives the frame unchanged.
    """
    if model.__class__.__name__ == 'RandomForestClassifier':
        return pd.get_dummies(features, drop_first=True)
    return features


def _fit_member(model, features, target, cat_features):
    """Fit one ensemble member with the call signature its class expects."""
    model_name = model.__class__.__name__
    inputs = _model_inputs(model, features)
    if model_name == 'LGBMClassifier':
        model.fit(inputs, target, categorical_feature=cat_features)
    elif model_name in ('CatBoostClassifier', 'XGBClassifier'):
        model.fit(inputs, target, verbose=0)
    else:
        model.fit(inputs, target)


def _stacked_predictions(fitted_models, features, split_label, threshold):
    """Return per-model probabilities plus the thresholded ensemble label.

    Columns are '<ClassName>_<split>' for every model and `split_label`
    (e.g. 'TRAIN') for the 0/1 decision on the mean probability.
    """
    stacked = pd.DataFrame()
    suffix = split_label.lower()
    for _, model in fitted_models:
        probabilities = model.predict_proba(_model_inputs(model, features))[:, 1]
        stacked[f'{model.__class__.__name__}_{suffix}'] = probabilities
    stacked[split_label] = np.where(stacked.mean(axis=1) > threshold, 1, 0)
    return stacked


skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

FACTOR = 0.3  # decision threshold on the averaged positive-class probability
selected_models = ['CAT', 'LGBM']

# Derived here rather than inherited from an earlier cell's state.
cat_features = X.select_dtypes(include=['category']).columns.tolist()

# Candidates available to this experiment. Add e.g. 'XGB': get_xgboost_model or
# 'LGBM': get_lgbm_model to make them selectable.
candidate_factories = {
    'CAT': lambda: get_catboost_model(cat_features),
    'RF': get_random_forest_model,
}

# Only candidates that are selected are built and fitted; a candidate that is
# not selected never contributes to the ensemble, so training it is wasted work.
active_names = [name for name in candidate_factories if name in selected_models]
missing_names = [name for name in selected_models if name not in candidate_factories]
if missing_names:
    print(f'Selected models without a candidate in this cell (ignored): {missing_names}')
if not active_names:
    raise ValueError('None of the selected models is available as a candidate.')

f1_macros = []
for train_idx, test_idx in skf.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fresh, unfitted estimators for every fold.
    models = [(name, candidate_factories[name]()) for name in active_names]
    for _, model in models:
        _fit_member(model, X_train, y_train, cat_features)

    stacked_predictions_train = _stacked_predictions(models, X_train, 'TRAIN', FACTOR)
    print('TRAIN')
    print(classification_report(y_train, stacked_predictions_train['TRAIN']))

    stacked_predictions_test = _stacked_predictions(models, X_test, 'TEST', FACTOR)
    print('TEST')
    print(classification_report(y_test, stacked_predictions_test['TEST']))

    # Macro-F1 on the held-out fold, computed once and reused.
    fold_f1_macro = f1_score(y_test, stacked_predictions_test['TEST'], average='macro')
    print('F1 macro score')
    print(fold_f1_macro)
    f1_macros.append(fold_f1_macro)

print(f'F1 macro mean: {np.mean(f1_macros)}')

TRAIN
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     56000
           1       0.66      0.60      0.63     10942

    accuracy                           0.88     66942
   macro avg       0.79      0.77      0.78     66942
weighted avg       0.88      0.88      0.88     66942

TEST
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     14000
           1       0.59      0.51      0.55      2736

    accuracy                           0.86     16736
   macro avg       0.75      0.72      0.73     16736
weighted avg       0.85      0.86      0.86     16736

F1 macro score
0.7327331522040901
TRAIN
              precision    recall  f1-score   support

           0       0.92      0.94      0.93     56000
           1       0.66      0.61      0.63     10942

    accuracy                           0.89     66942
   macro avg       0.79      0.77      0.78     66942
weighted avg       0.88