In [6]:
import pandas as pd
import os
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('float_format', '{:f}'.format)

In [None]:
data_train = pd.read_parquet('../data/processed/train.gzip')
data_sub = pd.read_parquet('../data/processed/test.gzip')

data_train.columns = [col.replace('-', '_') for col in data_train.columns]
data_sub.columns = [col.replace('-', '_') for col in data_sub.columns]

# list(data_train.columns)

In [None]:
data_sub_id = data_sub[['ID']]

categorical_feats = ['CO_TIPO_SEXO', 'NO_DEPARTAMENTO', 'REGION']
for col in categorical_feats:
    data_train[col] = data_train[col].astype('category')
    data_sub[col] = data_sub[col].astype('category')

TARGET_VAR = 'TARGET'
N_FOLDS = 5
N_JOBS = os.cpu_count() // 2.5

In [None]:
exclude_substr_list = [
    # 'CANT_', 'VAR_', 'DIFF_', 
    # 'min', 'max', 'first', 'last', 'median', 
    # 'MED_EMP', 'PEQ_EMP', 'MIC_EMP', 
    # 'REV', 'HIPOT', 'VENCIDO', 'DOLA', 
    # 'ANIO_BANCARIZACION', 'MES_BANCARIZACION', 'NO_DEPARTAMENTO', 'SALDO_EMP_TOTAL_mean'
    ]

selected_cols = [
    'DIFF_SALDO_EMP_TOTAL_VENCIDO_1_mean',
    'DIFF_SALDO_EMP_TOTAL_MA3_mean',
    'SALDO_EMP_TOTAL_min',
    'SALDO_EMP_TOTAL_max',
    'SALDO_DOLA_CONS_mean',
    'CANT_EMP_NEG_min',
    'SALDO_CONS_REV_mean',
    # 'SALDO_HIPOT_mean',
    'SALDO_VENCIDO_mean',
    'DIFF_SALDO_EMP_TOTAL_MA8_mean', 
    'MAX_LINEA_DISP_U6M_max', 
    'CO_TIPO_SEXO',
    'MESES_HASTA_ACTUAL',
    'EDAD',
    'REGION', TARGET_VAR
]
data_train =  data_train.select_dtypes(include=['int64', 'float64', 'category'])
# data_train =  data_train[[col for col in data_train.columns if all([sub not in col for sub in exclude_substr_list])]]
data_train = data_train[selected_cols]

data_sub = data_sub.select_dtypes(include=['int64', 'float64', 'category'])
# data_sub = data_sub[[col for col in data_sub.columns if all([sub not in col for sub in exclude_substr_list])]]
data_sub = data_sub[selected_cols]



list(data_train.columns)
print(data_train.dtypes)

In [23]:
models = {}

data_copy = data_train.copy()

for region in data_train['REGION'].unique():
    print(f'\t\t{region.upper()}')
    sub_df = data_train[data_train['REGION'] == region].drop(columns=['REGION'])
    X_train, X_test, y_train, y_test = train_test_split(sub_df.drop(columns=[TARGET_VAR]), sub_df[TARGET_VAR], test_size=0.2, random_state=42, stratify=sub_df[TARGET_VAR])

    df_train = pd.concat([X_train, y_train], axis=1)
    df_test = pd.concat([X_test, y_test], axis=1)
    
    cat_features = X_train.select_dtypes(include=['category']).columns.tolist()

    # Initialize and train CatBoost model
    cat_model = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='Logloss', cat_features=cat_features, verbose=0)
    cat_model.fit(X_train, y_train)

    FACTOR = 0.25
    data_copy.loc[data_copy['REGION'] == region, 'PRED'] = cat_model.predict_proba(sub_df.drop(columns=[TARGET_VAR]))[:, 1]
    data_copy.loc[data_copy['REGION'] == region, 'PRED'] = (data_copy.loc[data_copy['REGION'] == region, 'PRED'] > FACTOR).astype(int)

    data_sub.loc[data_sub['REGION'] == region, 'TARGET'] = cat_model.predict_proba(data_sub[data_sub['REGION'] == region].drop(columns=[TARGET_VAR, 'REGION']))[:, 1]
    data_sub.loc[data_sub['REGION'] == region, 'TARGET'] = (data_sub.loc[data_sub['REGION'] == region, 'TARGET'] > FACTOR).astype(int)

print(classification_report(data_copy[TARGET_VAR], data_copy['PRED']))

		CENTRO
		NORTE
		SUR
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     77505
           1       0.55      0.48      0.52     13678

    accuracy                           0.86     91183
   macro avg       0.73      0.71      0.72     91183
weighted avg       0.86      0.86      0.86     91183



In [20]:
submission = pd.concat([data_sub_id, data_sub['TARGET']], axis=1)
print(submission['TARGET'].value_counts(normalize=True))
submission.to_csv('../data/results/catboost.csv', index=False)

0.000000   0.836697
1.000000   0.163303
Name: TARGET, dtype: float64
