In [1]:
import pandas as pd
import numpy as np
import seaborn as sns             #visualisation
import matplotlib.pyplot as plt   #visualisation
import os

%matplotlib inline 
sns.set(color_codes=True)

from dotenv import load_dotenv
from core_ds4a_project import cleaning, columns as project_columns, datasets

%load_ext autoreload
%autoreload 1
%aimport core_ds4a_project, core_ds4a_project.cleaning, core_ds4a_project.columns, core_ds4a_project.datasets

pd.set_option("display.max_columns", None)

In [2]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [3]:
load_dotenv('envvars')

ROOT_DATA_PATH = os.environ.get('ROOT_DATA_PATH')
RAW_DATA_PATH = os.environ.get('RAW_DATA_PATH') or f'{ROOT_DATA_PATH}/raw'

In [4]:
(cartera_df, clientes_df, colocacion_df) = datasets.read_joining_datasets(dir_path=RAW_DATA_PATH)

In [5]:
cartera_df = cartera_df.sort_values(['FECHA_CIERRE', 'OBLIGACION'], ascending=True)
ind_first = ~(cartera_df.duplicated(keep='first', subset='OBLIGACION'))
ind_last = ~(cartera_df.duplicated(keep='last', subset='OBLIGACION'))

In [502]:
lasts_records_cartera_df = cartera_df[ind_last].copy()
idx = lasts_records_cartera_df['CALIFICACION_CIERRE'] != 'A'
model1_df = pd.merge(lasts_records_cartera_df, colocacion_df, how='left', on="OBLIGACION", suffixes=('_left', '_right'))
model1_df = model1_df.rename(columns={'CLIENTE_left':'CLIENTE',
                                      'VALOR_CUOTA_left':'VALOR_CUOTA'
                                     })
model_df = pd.merge(model1_df, clientes_df, how='left', on="CLIENTE", suffixes=('_left_model', '_right_client'))
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38641 entries, 0 to 38640
Data columns (total 83 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   OBLIGACION               38641 non-null  int64         
 1   CLIENTE                  38641 non-null  object        
 2   TIPO_CLIENTE_COD         38641 non-null  float64       
 3   TIPO_CLIENTE             38641 non-null  object        
 4   SUCURSAL_REAL            38629 non-null  object        
 5   REGION                   38641 non-null  object        
 6   MUNICIPIO_CLIENTE        38641 non-null  object        
 7   VALOR_CUOTA              38641 non-null  object        
 8   CUOTAS_PACTADAS          38641 non-null  int64         
 9   CUOTAS_PENDIENTES        38641 non-null  int64         
 10  TASA_PERIODICA           38641 non-null  float64       
 11  PERIODICIDAD_PAGO_left   38641 non-null  category      
 12  CALIFICACION_CIERRE      38641 n

In [503]:
idx = model_df["DEFAULT"] != True
model_df.loc[idx,'DEFAULT'] = 0
idx = model_df["DEFAULT"] == True
model_df.loc[idx,'DEFAULT'] = 1
model_df['DEFAULT'] = model_df['DEFAULT'].astype('object')

In [504]:
num_to_cat = ['MUNICIPIO_CLIENTE', 'MUJER_CABEZA', 'RESPONSABLE_DE_HOGAR', 'OCUPACION', 'ESTADO_CIVIL_COD', 'GENERO_COD', 
              'NIVEL_ESTUDIOS_COD', 'TIPO_VIVIENDA', 'ESTRATO']
model_df[num_to_cat] = model_df[num_to_cat].astype('object')

In [505]:
numeric_cols = model_df.select_dtypes(include = np.number).columns.tolist()
numeric_cols

['OBLIGACION',
 'TIPO_CLIENTE_COD',
 'CUOTAS_PACTADAS',
 'CUOTAS_PENDIENTES',
 'TASA_PERIODICA',
 'SALDO',
 'DIAS_VENCIDO',
 'PORCENTAJE_PAGO',
 'MUNICIPIO_LAT',
 'MUNICIPIO_LON',
 'TASA_ANUAL',
 'MONTO',
 'VALOR_REFINANCIADO',
 'VALOR_DESEMBOLSADO',
 'NRO_CUOTAS',
 'CREDITOS_VIGENTES',
 'EDAD',
 'SUELDO_BASICO']

In [506]:
num_selected = ['TASA_ANUAL',
 'MONTO',
 'NRO_CUOTAS',
 #'EDAD'
]

In [507]:
model_df['SUELDO_BASICO'].value_counts()

0.0           34740
2000000.0       211
3000000.0       193
1800000.0       148
2500000.0       145
              ...  
13120000.0        1
1045200.0         1
998300.0          1
3946800.0         1
3220000.0         1
Name: SUELDO_BASICO, Length: 622, dtype: int64

In [508]:
categorical_cols2 = model_df.select_dtypes('category').columns.tolist()
model_df[categorical_cols2]=model_df[categorical_cols2].astype('object')

In [509]:
categorical_cols = model_df.select_dtypes('object').columns.tolist()
categorical_cols

['CLIENTE',
 'TIPO_CLIENTE',
 'SUCURSAL_REAL',
 'REGION',
 'MUNICIPIO_CLIENTE',
 'VALOR_CUOTA',
 'PERIODICIDAD_PAGO_left',
 'CALIFICACION_CIERRE',
 'MODALIDAD_left',
 'VENCIDA',
 'CAPITAL_VEN',
 'INTERES_VEN',
 'MORA',
 'SEGURO_VIDA',
 'COMISION',
 'OTROS',
 'TIPO_CREDITO_left',
 'SUCURSAL_COD',
 'COD_LINEA',
 'LINEA',
 'COD_DESTINACION',
 'DESTINACION',
 'OBSERVACIONES',
 'VALOR_CUOTA_right',
 'PERIODICIDAD_PAGO_right',
 'COD_MODALIDAD',
 'MODALIDAD_right',
 'ANO_CONTABILIZA',
 'CODEUDOR',
 'DIAS_CICLO_CREDITO',
 'TIPO_CREDITO_right',
 'CLIENTE_right',
 'DEFAULT',
 'ESTRATO',
 'SUCURSAL',
 'TIPO_DE_CLIENTE',
 'ACTIVIDAD_ECONOMICA',
 'TIPO_DE_IDENTIFICACION',
 'OFICIO',
 'MUJER_CABEZA',
 'RESPONSABLE_DE_HOGAR',
 'OCUPACION',
 'ACTIVIDAD_CIIU_PRIMARIA',
 'ESTADO_CIVIL_COD',
 'GENERO_COD',
 'NIVEL_ESTUDIOS_COD',
 'PROFESION_COD',
 'TIPO_UBICACION_COD',
 'TIPO_VIVIENDA_COD',
 'PROFESION',
 'TIPO_UBICACION',
 'GENERO',
 'NIVEL_ESTUDIOS',
 'ESTADO_CIVIL',
 'TIPO_VIVIENDA',
 'ACTIVIDAD']

In [510]:
model_df[categorical_cols] = model_df[categorical_cols].astype('str')

In [511]:
cat_selected = [ 'TIPO_CLIENTE',
 #'MUNICIPIO_CLIENTE',
 'PERIODICIDAD_PAGO_left',
 'TIPO_CREDITO_left',
 #'SUCURSAL_COD',
 'COD_LINEA',
 #'COD_DESTINACION',
 #'COD_MODALIDAD',
 'CODEUDOR',
 #'ACTIVIDAD_ECONOMICA',
 'MUJER_CABEZA',
 'RESPONSABLE_DE_HOGAR',
 'ESTADO_CIVIL_COD',
 'GENERO_COD',
 'NIVEL_ESTUDIOS_COD',
 #'PROFESION_COD',
 'TIPO_UBICACION_COD',
 'ESTRATO'
 #'TIPO_VIVIENDA_COD'
               ]

In [512]:
model_df[cat_selected]

Unnamed: 0,TIPO_CLIENTE,PERIODICIDAD_PAGO_left,TIPO_CREDITO_left,COD_LINEA,CODEUDOR,MUJER_CABEZA,RESPONSABLE_DE_HOGAR,ESTADO_CIVIL_COD,GENERO_COD,NIVEL_ESTUDIOS_COD,TIPO_UBICACION_COD,ESTRATO
0,Microfinanciero,Mensual,NUEVO,GER,COUDEUDOR_1,No |N,No |N,U,F,P,2,1.0
1,Microfinanciero,Mensual,RETANQUEADO,CRE,CODEUDOR_B,No |N,No |N,C,M,V,1,2.0
2,Microfinanciero,Mensual,NUEVO,CRE,SIN_CODEUDOR,Si |Y,Si |Y,U,M,S,1,2.0
3,Microfinanciero,Mensual,NUEVO,CRE,CODEUDOR_B,No |N,Si |Y,S,F,P,1,2.0
4,Microfinanciero,Mensual,NUEVO,CRE,SIN_CODEUDOR,Si |Y,Si |Y,C,F,P,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
38636,Microfinanciero,Trimestral,SIN_PERFIL,FID,SIN_CODEUDOR,No |N,Si |Y,S,M,S,1,1.0
38637,Microfinanciero,Mensual,PARALELO,CRE,SIN_CODEUDOR,No |N,Si |Y,U,M,S,1,1.0
38638,Microfinanciero,Trimestral,SIN_PERFIL,FID,SIN_CODEUDOR,No |N,Si |Y,,F,S,1,1.0
38639,Microfinanciero,Trimestral,SIN_PERFIL,GER,COUDEUDOR_1,No |N,No |N,S,M,S,2,1.0


## Regularización

In [513]:
imputer = SimpleImputer(strategy = 'mean').fit(model_df[num_selected])
model_df[num_selected] = imputer.transform(model_df[num_selected])

In [514]:
model_df[num_selected]

Unnamed: 0,TASA_ANUAL,MONTO,NRO_CUOTAS
0,31.67,2000000.0,12.0
1,35.29,6000000.0,36.0
2,35.29,2950000.0,24.0
3,35.29,2900000.0,24.0
4,35.29,1000000.0,12.0
...,...,...,...
38636,35.28,1000000.0,1.0
38637,38.40,6000000.0,24.0
38638,35.28,1000000.0,1.0
38639,31.68,4000000.0,6.0


In [515]:
model_df[num_selected].isna().sum()

TASA_ANUAL    0
MONTO         0
NRO_CUOTAS    0
dtype: int64

In [516]:
model_df[num_selected].describe().loc[['min','max']]

Unnamed: 0,TASA_ANUAL,MONTO,NRO_CUOTAS
min,12.0,300000.0,1.0
max,39.6,80000000.0,72.0


In [517]:
scaler = MinMaxScaler().fit(model_df[num_selected])
model_df[num_selected] = scaler.transform(model_df[num_selected])
model_df[num_selected].describe().loc[['min','max']]

Unnamed: 0,TASA_ANUAL,MONTO,NRO_CUOTAS
min,0.0,0.0,0.0
max,1.0,1.0,1.0


In [518]:
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(model_df[cat_selected])
encoded_cols = list(encoder.get_feature_names_out(cat_selected))

In [519]:
model_df[encoded_cols] = encoder.transform(model_df[cat_selected]);
model_df[encoded_cols].head()

Unnamed: 0,TIPO_CLIENTE_Codeudor no cliente,TIPO_CLIENTE_Fondeador,TIPO_CLIENTE_Gestion social,TIPO_CLIENTE_Microfinanciero,TIPO_CLIENTE_Mixto,TIPO_CLIENTE_Proveedor,PERIODICIDAD_PAGO_left_Bimensual,PERIODICIDAD_PAGO_left_Cuatrimestral,PERIODICIDAD_PAGO_left_Mensual,PERIODICIDAD_PAGO_left_Trimestral,TIPO_CREDITO_left_NUEVO,TIPO_CREDITO_left_PARALELO,TIPO_CREDITO_left_RENOVADO,TIPO_CREDITO_left_RETANQUEADO,TIPO_CREDITO_left_SIN_PERFIL,COD_LINEA_CRD,COD_LINEA_CRE,COD_LINEA_CRN,COD_LINEA_FID,COD_LINEA_GER,COD_LINEA_GRR,COD_LINEA_MCA,COD_LINEA_UNE,CODEUDOR_CODEUDOR_A,CODEUDOR_CODEUDOR_B,CODEUDOR_CODEUDOR_E,CODEUDOR_COUDEUDOR_1,CODEUDOR_COUDEUDOR_2,CODEUDOR_SIN_CODEUDOR,MUJER_CABEZA_No |N,MUJER_CABEZA_Si |Y,RESPONSABLE_DE_HOGAR_No |N,RESPONSABLE_DE_HOGAR_Si |Y,RESPONSABLE_DE_HOGAR_nan,ESTADO_CIVIL_COD_C,ESTADO_CIVIL_COD_D,ESTADO_CIVIL_COD_O,ESTADO_CIVIL_COD_S,ESTADO_CIVIL_COD_U,ESTADO_CIVIL_COD_V,ESTADO_CIVIL_COD_nan,GENERO_COD_F,GENERO_COD_M,NIVEL_ESTUDIOS_COD_A,NIVEL_ESTUDIOS_COD_C,NIVEL_ESTUDIOS_COD_E,NIVEL_ESTUDIOS_COD_I,NIVEL_ESTUDIOS_COD_M,NIVEL_ESTUDIOS_COD_P,NIVEL_ESTUDIOS_COD_S,NIVEL_ESTUDIOS_COD_T,NIVEL_ESTUDIOS_COD_U,NIVEL_ESTUDIOS_COD_V,NIVEL_ESTUDIOS_COD_X,NIVEL_ESTUDIOS_COD_nan,TIPO_UBICACION_COD_1,TIPO_UBICACION_COD_2,ESTRATO_1.0,ESTRATO_2.0,ESTRATO_3.0,ESTRATO_4.0,ESTRATO_5.0,ESTRATO_6.0,ESTRATO_nan
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [520]:
model_df['SUCURSAL_COD'].value_counts()

1      6166
8      5598
2      4561
9      3727
3      3386
6      3240
11     2478
4      2462
12     1857
5      1405
7      1295
10      967
13      967
14      524
nan       8
Name: SUCURSAL_COD, dtype: int64

In [521]:
sel_cols = encoded_cols+num_selected

In [522]:
model_df_def = model_df[sel_cols+['DEFAULT']]

In [523]:
model_df_def.head()

Unnamed: 0,TIPO_CLIENTE_Codeudor no cliente,TIPO_CLIENTE_Fondeador,TIPO_CLIENTE_Gestion social,TIPO_CLIENTE_Microfinanciero,TIPO_CLIENTE_Mixto,TIPO_CLIENTE_Proveedor,PERIODICIDAD_PAGO_left_Bimensual,PERIODICIDAD_PAGO_left_Cuatrimestral,PERIODICIDAD_PAGO_left_Mensual,PERIODICIDAD_PAGO_left_Trimestral,TIPO_CREDITO_left_NUEVO,TIPO_CREDITO_left_PARALELO,TIPO_CREDITO_left_RENOVADO,TIPO_CREDITO_left_RETANQUEADO,TIPO_CREDITO_left_SIN_PERFIL,COD_LINEA_CRD,COD_LINEA_CRE,COD_LINEA_CRN,COD_LINEA_FID,COD_LINEA_GER,COD_LINEA_GRR,COD_LINEA_MCA,COD_LINEA_UNE,CODEUDOR_CODEUDOR_A,CODEUDOR_CODEUDOR_B,CODEUDOR_CODEUDOR_E,CODEUDOR_COUDEUDOR_1,CODEUDOR_COUDEUDOR_2,CODEUDOR_SIN_CODEUDOR,MUJER_CABEZA_No |N,MUJER_CABEZA_Si |Y,RESPONSABLE_DE_HOGAR_No |N,RESPONSABLE_DE_HOGAR_Si |Y,RESPONSABLE_DE_HOGAR_nan,ESTADO_CIVIL_COD_C,ESTADO_CIVIL_COD_D,ESTADO_CIVIL_COD_O,ESTADO_CIVIL_COD_S,ESTADO_CIVIL_COD_U,ESTADO_CIVIL_COD_V,ESTADO_CIVIL_COD_nan,GENERO_COD_F,GENERO_COD_M,NIVEL_ESTUDIOS_COD_A,NIVEL_ESTUDIOS_COD_C,NIVEL_ESTUDIOS_COD_E,NIVEL_ESTUDIOS_COD_I,NIVEL_ESTUDIOS_COD_M,NIVEL_ESTUDIOS_COD_P,NIVEL_ESTUDIOS_COD_S,NIVEL_ESTUDIOS_COD_T,NIVEL_ESTUDIOS_COD_U,NIVEL_ESTUDIOS_COD_V,NIVEL_ESTUDIOS_COD_X,NIVEL_ESTUDIOS_COD_nan,TIPO_UBICACION_COD_1,TIPO_UBICACION_COD_2,ESTRATO_1.0,ESTRATO_2.0,ESTRATO_3.0,ESTRATO_4.0,ESTRATO_5.0,ESTRATO_6.0,ESTRATO_nan,DEFAULT
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0


# MODELING

## Decision tree

In [524]:
X_train, X_test, y_train, y_test = train_test_split(model_df_def[sel_cols], model_df_def['DEFAULT'].astype('category').cat.codes,
                                                    test_size=0.3,
                                                    random_state=1234)
model_dt = DecisionTreeClassifier(max_depth=7, random_state=4321)
model_dt = model_dt.fit(X_train, y_train)

y_pred_train = model_dt.predict(X_train)
y_pred_test = model_dt.predict(X_test)

print(f"Accuracy in training set: {accuracy_score(y_train, y_pred_train)}")
print(f"Accuracy in the other samples: {accuracy_score(y_test, y_pred_test)}")

Accuracy in training set: 0.9756728778467909
Accuracy in the other samples: 0.973863538342103


In [525]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_test)

array([[11290,     4],
       [  299,     0]])

### Apliying SMOTE

In [526]:
from imblearn.over_sampling import SMOTE

In [527]:
SMOTE =  SMOTE()

In [528]:
X_train_SMOTE, y_train_SMOTE = SMOTE.fit_resample(X_train, y_train)

In [529]:
model_dt_SMOTE = DecisionTreeClassifier(max_depth=8, random_state=4321)
model_dt_SMOTE = model_dt_SMOTE.fit(X_train_SMOTE, y_train_SMOTE)

y_pred_train_SMOTE = model_dt_SMOTE.predict(X_train_SMOTE)
y_pred_test_SMOTE = model_dt_SMOTE.predict(X_test)

print(f"Accuracy in training set: {accuracy_score(y_train_SMOTE, y_pred_train_SMOTE)}")
print(f"Accuracy in the other samples: {accuracy_score(y_test, y_pred_test_SMOTE)}")

Accuracy in training set: 0.7724757428744694
Accuracy in the other samples: 0.7501940826360735


In [530]:
confusion_matrix(y_test, y_pred_test_SMOTE)

array([[8609, 2685],
       [ 211,   88]])

### OneClassSVM

In [531]:
from sklearn.svm import OneClassSVM

In [532]:
model_SVM = OneClassSVM(gamma='scale', nu=0.025)

In [533]:
X_train = X_train[y_train==0]
model_SVM.fit(X_train)

In [534]:
model_SVM.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1])

In [535]:
y_test[y_test == 1] = -1
y_test[y_test == 0] = 1

In [536]:
print(f"Accuracy in the other samples: {accuracy_score(y_test, model_SVM.predict(X_test))}")

Accuracy in the other samples: 0.947554558785474


In [537]:
confusion_matrix(y_test, model_SVM.predict(X_test))

array([[    2,   297],
       [  311, 10983]])

### iForest

In [538]:
from sklearn.ensemble import IsolationForest

In [539]:
model_iF = IsolationForest(contamination=0.025)

In [540]:
model_iF.fit(X_test)



In [541]:
print(f"Accuracy in the other samples: {accuracy_score(y_test, model_iF.predict(X_test))}")

Accuracy in the other samples: 0.949193478823428


In [542]:
confusion_matrix(y_test, model_SVM.predict(X_test))

array([[    2,   297],
       [  311, 10983]])

## Random Forest

In [559]:
X_train, X_test, y_train, y_test = train_test_split(model_df_def[sel_cols], model_df_def['DEFAULT'].astype('category').cat.codes,
                                                    test_size=0.3,
                                                    random_state=123)
model_rf = RandomForestClassifier(n_estimators=50, max_depth=8, random_state=321)
model_rf = model_rf.fit(X_train, y_train)

y_pred_train = model_rf.predict(X_train)
y_pred_test = model_rf.predict(X_test)

print(f"Accuracy in training set: {accuracy_score(y_train, y_pred_train)}")
print(f"Accuracy in the other samples: {accuracy_score(y_test, y_pred_test)}")

Accuracy in training set: 0.9754880212954747
Accuracy in the other samples: 0.974122315190201


In [560]:
confusion_matrix(y_test, model_rf.predict(X_test))

array([[11293,     0],
       [  300,     0]])

In [604]:
model_rf_SMOTE = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=321)
model_rf_SMOTE = model_rf_SMOTE.fit(X_train_SMOTE, y_train_SMOTE)

y_pred_train_SMOTE = model_rf_SMOTE.predict(X_train_SMOTE)
y_pred_test_SMOTE = model_rf_SMOTE.predict(X_test)

print(f"Accuracy in training set: {accuracy_score(y_train_SMOTE, y_pred_train_SMOTE)}")
print(f"Accuracy in the other samples: {accuracy_score(y_test, y_pred_test_SMOTE)}")

Accuracy in training set: 0.9657747119466343
Accuracy in the other samples: 0.9387561459501423


In [605]:
confusion_matrix(y_test, model_rf_SMOTE.predict(X_test))

array([[10779,   514],
       [  196,   104]])

## XGBOOST

In [606]:
from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier

In [616]:
model_xgb = XGBClassifier(random_state=321, n_jobs=-1, n_estimators=30, max_depth=20, scale_pos_weight=40)
model_xgb = model_xgb.fit(X_train, y_train)

In [617]:
accuracy_score(y_test, model_xgb.predict(X_test))

0.8144569999137411

In [618]:
confusion_matrix(y_test, model_xgb.predict(X_test))

array([[9373, 1920],
       [ 231,   69]])

In [652]:
model_xgb_SMOTE = XGBClassifier(random_state=321, n_jobs=-1, n_estimators=100, max_depth=22)
model_xgb_SMOTE = model_xgb_SMOTE.fit(X_train_SMOTE, y_train_SMOTE)

In [653]:
accuracy_score(y_test, model_xgb_SMOTE.predict(X_test))

0.9566117484689036

In [654]:
confusion_matrix(y_test, model_xgb_SMOTE.predict(X_test))

array([[11012,   281],
       [  222,    78]])