<a href="https://colab.research.google.com/github/jpantojaj/Backtesting_Stresstesting/blob/main/Sesi%C3%B3n_2_4_Construccion_Credit_Scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Desarrollo de un Modelo de Credit Scoring**

### **1. Carga Inicial de Librerías**

In [None]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
import warnings
warnings.filterwarnings('ignore')

### **2. Entendimiento y Análisis Exploratorio de datos**

In [None]:
df_clase = pd.read_csv('Base_SolicitudesCreditoEfectivo_201307_201505.csv', sep = ";")
df_clase.head()

In [None]:
df_clase.shape

In [None]:
df_clase.columns

In [None]:
df_clase.info()

In [None]:
# Cast identifier, period and flag columns to text so they are handled as
# labels (object dtype) instead of numeric quantities in the steps that follow.
# NOTE(review): on pandas < 3, `astype(str)` renders missing values as the
# literal text 'nan', which hides them from `isnull()` — confirm this is intended.
label_like_cols = [
    'CODMES',
    'CODSOLICITUD',
    'MIN_MES_DE_DEFAULT',
    'FLG_GARANTIA',
    'TARJETA_RELACIONADA',
    'VEHICULAR_RELACIONADA',
    'HIPOTECARIO_RELACIONADA',
    'CLASIF_SISTEMA_ULT_12M',
    'FLG_PDH',
    'FLG_TC_VISA',
    'FLG_TC_MC',
]
for col_name in label_like_cols:
    df_clase[col_name] = df_clase[col_name].astype(str)

In [None]:
df_clase.head()

In [None]:
df_clase.info()

In [None]:
target_count = df_clase['FLG_DEFAULT_12M'].value_counts()
target_count

In [None]:
sns.countplot(data = df_clase, x = "FLG_DEFAULT_12M", hue="FLG_DEFAULT_12M")
target_count = df_clase.FLG_DEFAULT_12M.value_counts()
print('# Buen_Pagador:', target_count[0])
print('# 1 Mora_12M:', target_count[1])
print('Bad rate:', target_count[1]/(target_count[0]+target_count[1]))

In [None]:
a1=df_clase.pivot_table(values="CODSOLICITUD", index="CODMES", aggfunc="count", sort=True)
a1.plot(kind = 'bar',
       #stacked = 'True',          # Muestra las barras apiladas
       alpha = 0.4,               # nivel de transparencia
       width = 0.9,               # Grosor de las barras para dejar espacio entre ellas
       figsize=(9,4));            # Cambiamos el tamaño de la figura

In [None]:
a2=df_clase.pivot_table(values="FLG_DEFAULT_12M", index="CODMES", aggfunc="mean", sort=True)
a2.plot(alpha = 0.4, figsize=(9,4), ylim=(0.05,0.08))

### ***Hagamos el Análisis Univariado***

Revisemos la cantidad de nulos y sus proporciones por variable

In [None]:
# Count the missing values per column once, then derive the share of rows they represent.
null_counts = df_clase.isnull().sum()
null_values = pd.DataFrame(
    {
        'number_null_values': null_counts,
        'ratio_null_values': null_counts / len(df_clase),
    }
)
null_values

Revisemos sus estadísticos básicos

In [None]:
df_clase.select_dtypes(include=['number']).describe().transpose()

In [None]:
df_clase.select_dtypes(include=['object']).describe().transpose()

Revisemos como se distribuye cada variable

In [None]:
df_clase.drop(columns = ['FLG_DEFAULT_12M']).hist(figsize = (12, 12))
plt.show()

In [None]:
df_clase.drop(columns = ['FLG_DEFAULT_12M']).boxplot(figsize = (20, 12))
plt.yscale('log')
plt.xticks(rotation = 45)
plt.show()

In [None]:
def outliers_col(df):
  """Print how many values of each numeric column lie outside the Tukey fences.

  For every numeric column the fences are ``Q1 - 1.5*IQR`` and
  ``Q3 + 1.5*IQR``. One line is printed per column in the form
  ``name | n_below_lower_fence | n_above_upper_fence``; nothing is returned.

  Non-numeric (and boolean) columns are skipped. Quartiles are computed with
  ``Series.quantile``, which ignores missing values, so columns that still
  contain NaN get fences based on their observed values only.
  """
  for columna in df:
    serie = df[columna]
    # Only real numeric data has meaningful quartiles; this also skips
    # text, categorical and datetime columns regardless of their dtype name.
    es_numerica = pd.api.types.is_numeric_dtype(serie) and not pd.api.types.is_bool_dtype(serie)
    if not es_numerica:
      continue
    q1 = serie.quantile(0.25)
    q3 = serie.quantile(0.75)
    iqr = q3 - q1
    lim_inf = q1 - 1.5 * iqr
    lim_sup = q3 + 1.5 * iqr
    # Comparisons against NaN are False, so missing values are never counted as outliers.
    n_outliers_inf = int((serie < lim_inf).sum())
    n_outliers_sup = int((serie > lim_sup).sum())
    print("{} | {} | {}".format(
        serie.name,
        n_outliers_inf,
        n_outliers_sup
        ))

In [None]:
outliers_col(df_clase)

# **3. Feature Engineering**

#### 3.1 Tratamiento de Missing:
#### Según el caso, elegiremos rellenar los valores faltantes con un valor usualmente conocido (dado el tipo de variable que estemos analizando), o imputarlos con la mediana o con el valor más frecuente, según la variable sea numérica o categórica, respectivamente.

In [None]:
# Para partir las bases
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
pip install feature_engine

### Partición Train y test (considerando estratificación de la Y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    df_clase.drop("FLG_DEFAULT_12M", axis=1),
    df_clase["FLG_DEFAULT_12M"],
    test_size=0.3,
    random_state=0,
    stratify=df_clase["FLG_DEFAULT_12M"] #este punto es importante para asegurar un adecuado muestreo de la variable objetivo
)

X_train.shape, X_test.shape

In [None]:
# Comprobación de la proporción de Y en train
y_train.value_counts()[1]/(y_train.value_counts()[0]+y_train.value_counts()[1])

In [None]:
# Comprobación de la proporción de Y en test
y_test.value_counts()[1]/(y_test.value_counts()[0]+y_test.value_counts()[1])

In [None]:
# Revisión de la proporción de nulos por variable
X_train.isnull().mean().where(X_train.isnull().mean()>0)

### Variables numéricas

In [None]:
var_num = X_train.select_dtypes(include = ["number"])
var_num.isnull().mean().where(var_num.isnull().mean()>0)

### Variables categóricas

In [None]:
var_cat = X_train.select_dtypes(exclude = ["number"])
var_cat.isnull().mean().where(var_cat.isnull().mean()>0)

### Generamos un pipeline para tratamiento de Nulos

In [None]:
# Llamemos a la librería feature engine
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer

In [None]:
# Construyamos alternativamente un nuevo pipeline con todos los métodos de imputación en uno solo
pipe_2 = Pipeline(
    [
        (
            "median_imputer",
            MeanMedianImputer(imputation_method="median", variables=['LINEA_DE_TC','EDAD_T','INGRESO_CLIENTE']),
        ),
        (
            "arbitrary_imputer",
            ArbitraryNumberImputer(arbitrary_number=0, variables=['CUOTA', 'DEUDA_TOTAL_SISTEMA', 'MEDIANA_AHORROS_ULT_6M', 'MESES_AHORROS_ULT_6M', 'ATRASO_MAXIMO_ULT_24M','ATRASO_MAXIMO_ULT_12M','MONTO_TC_MEMBRESIA']),
        ),
        (   "mode_imputer",
           CategoricalImputer(imputation_method="frequent", variables=['PROFESION','ZONA_DEL_DESEMBOLSO','ESTADO_CIVIL'])
        ),
    ]
)

In [None]:
pipe_2.fit(X_train)

In [None]:
pipe_2.named_steps["median_imputer"].imputer_dict_

In [None]:
pipe_2.named_steps["arbitrary_imputer"].imputer_dict_

In [None]:
pipe_2.named_steps['mode_imputer'].imputer_dict_

In [None]:
X_train_t = pipe_2.transform(X_train)
X_test_t = pipe_2.transform(X_test)

In [None]:
X_train_t.isnull().mean()

In [None]:
X_test_t.isnull().mean()

### 3.2 Tratamiento de Valores Raros o Poco frecuentes
En este punto, nos detenemos para revisar problemas de cardinalidad y si existen valores raros o poco frecuentes en nuestras variables categóricas

In [None]:
cat_cols=['FLG_GARANTIA','SEGMENTOCLIENTE','TARJETA_RELACIONADA','VEHICULAR_RELACIONADA','HIPOTECARIO_RELACIONADA','CLASIF_SISTEMA_ULT_12M',
          'FLG_PDH','PROFESION','ZONA_DEL_DESEMBOLSO','ESTADO_CIVIL','FLG_TC_VISA','FLG_TC_MC']

In [None]:
# Examinemos esto en la muestra de train
for col in cat_cols:
    print('variable: ', col, ' nro de categorias: ', X_train_t[col].nunique())

In [None]:
#Examinemos esto en la muestra de test
for col in cat_cols:
    print('variable: ', col, ' nro de categorias: ', X_test_t[col].nunique())

In [None]:
# Analicemos la variable Profesión
unique_to_train_set = [x for x in X_train_t.PROFESION.unique() if x not in X_test_t.PROFESION.unique()]
print('El nro de categorias que aparecen en el train y no en el test es', len(unique_to_train_set))
unique_to_test_set = [x for x in X_test_t.PROFESION.unique() if x not in X_train_t.PROFESION.unique()]
print('El nro de categorias que aparecen en el test y no en el train es', len(unique_to_test_set))

In [None]:
# Analicemos la variable SEGMENTOCLIENTE
unique_to_train_set = [x for x in X_train_t.SEGMENTOCLIENTE.unique() if x not in X_test_t.SEGMENTOCLIENTE.unique()]
print('El nro de categorias que aparecen en el train y no en el test es', len(unique_to_train_set))
unique_to_test_set = [x for x in X_test_t.SEGMENTOCLIENTE.unique() if x not in X_train_t.SEGMENTOCLIENTE.unique()]
print('El nro de categorias que aparecen en el test y no en el train es', len(unique_to_test_set))

#### Generamos un pipeline para tratamiento de valores raros

In [None]:
from feature_engine.encoding import RareLabelEncoder

In [None]:
# Construyamos alternativamente un nuevo pipeline con todos los métodos de imputación y codificación de valores raros en uno solo
pipe_3 = Pipeline(
    [
        (
            "median_imputer",
            MeanMedianImputer(imputation_method="median", variables=['LINEA_DE_TC','EDAD_T','INGRESO_CLIENTE']),
        ),
        (
            "arbitrary_imputer",
            ArbitraryNumberImputer(arbitrary_number=0, variables=['CUOTA', 'DEUDA_TOTAL_SISTEMA', 'MEDIANA_AHORROS_ULT_6M', 'MESES_AHORROS_ULT_6M', 'ATRASO_MAXIMO_ULT_24M','ATRASO_MAXIMO_ULT_12M','MONTO_TC_MEMBRESIA']),
        ),
        (   "mode_imputer",
           CategoricalImputer(imputation_method="frequent", variables=['PROFESION','ZONA_DEL_DESEMBOLSO','ESTADO_CIVIL'])
        ),
        (
            "rare_encoder",
            RareLabelEncoder(tol=0.01,n_categories=5,variables=["PROFESION","SEGMENTOCLIENTE",])
        ),
    ]
)

In [None]:
pipe_3.fit(X_train)

In [None]:
X_train_t = pipe_3.transform(X_train)
X_test_t = pipe_3.transform(X_test)

In [None]:
# Comprobando el funcionamiento del pipeline
# Analicemos la variable PROFESION
unique_to_train_set = [x for x in X_train_t.PROFESION.unique() if x not in X_test_t.PROFESION.unique()]
print('El nro de categorias que aparecen en el train y no en el test es', len(unique_to_train_set))
unique_to_test_set = [x for x in X_test_t.PROFESION.unique() if x not in X_train_t.PROFESION.unique()]
print('El nro de categorias que aparecen en el test y no en el train es', len(unique_to_test_set))

In [None]:
# Analicemos la variable SEGMENTOCLIENTE
unique_to_train_set = [x for x in X_train_t.SEGMENTOCLIENTE.unique() if x not in X_test_t.SEGMENTOCLIENTE.unique()]
print('El nro de categorias que aparecen en el train y no en el test es', len(unique_to_train_set))
unique_to_test_set = [x for x in X_test_t.SEGMENTOCLIENTE.unique() if x not in X_train_t.SEGMENTOCLIENTE.unique()]
print('El nro de categorias que aparecen en el test y no en el train es', len(unique_to_test_set))

In [None]:
X_train_t.PROFESION.value_counts()

In [None]:
X_train_t.SEGMENTOCLIENTE.value_counts()

### IMPORTANTE: Hasta aqui tenemos un pipeline para imputar nuestras variables numéricas y categóricas en caso de missing, además de generar un grupo de casos "Raros" para las variables categóricas donde encontramos problemas.

### 3.3 Tratamiento de Outliers y Escalamiento

In [None]:
X_train_t.describe().transpose()

In [None]:
outliers_col(X_train_t)

In [None]:
num_cols=['DEUDA', 'ATRASO_MAXIMO_ULT_6M','ATRASO_MAXIMO_ULT_12M', 'ATRASO_MAXIMO_ULT_24M',
          'MEDIANA_AHORROS_ULT_6M', 'DEUDA_TOTAL_SISTEMA', 'MONTO_TC_SISTEMA', 'INGRESO_CLIENTE','EDAD_T','CUOTA',
          'LINEA_DE_TC', 'MONTO_TC_MEMBRESIA']

In [None]:
from feature_engine.outliers import Winsorizer
capper = Winsorizer(
    variables=num_cols,
    capping_method="quantiles",
    tail="right",
    fold=0.01,
)
capper.fit(X_train_t)

In [None]:
#capper.right_tail_caps_

In [None]:
X_train_t = capper.transform(X_train_t)
X_test_t = capper.transform(X_test_t)

In [None]:
#plot_boxplot_and_hist(X_train_t2, "var")
X_train_t.describe().transpose()

### Ahora hagamos el escalado de variables de las variables numéricas

In [None]:
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler().set_output(transform="pandas")
#scaler.fit(X_train)
#X_train_scaled = scaler.transform(X_train)
#X_test_scaled = scaler.transform(X_test)

In [None]:
num_cols=['DEUDA', 'PLAZO_CREDITO', 'ATRASO_MAXIMO_ULT_6M','ATRASO_MAXIMO_ULT_12M', 'ATRASO_MAXIMO_ULT_24M','MESES_AHORROS_ULT_6M',
          'MEDIANA_AHORROS_ULT_6M', 'DEUDA_TOTAL_SISTEMA','NUMERO_DE_PAGOS_PDH', 'MONTO_TC_SISTEMA', 'INGRESO_CLIENTE', 'EDAD_T','CUOTA',
          'LINEA_DE_TC', 'MONTO_TC_MEMBRESIA']

In [None]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler().set_output(transform="pandas")
scaler.fit(X_train_t[num_cols])
X_train_t_numoutscal = scaler.transform(X_train_t[num_cols])
X_test_t_numoutscal = scaler.transform(X_test_t[num_cols])

In [None]:
X_train_t_numoutscal.describe().transpose()

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Actualicemos nuestro pipeline
pipe_4 = Pipeline(
    [
        (
            "median_imputer",
            MeanMedianImputer(imputation_method="median", variables=['LINEA_DE_TC','EDAD_T','INGRESO_CLIENTE'])
        ),
        (
            "arbitrary_imputer",
            ArbitraryNumberImputer(arbitrary_number=0, variables=['CUOTA', 'DEUDA_TOTAL_SISTEMA', 'MEDIANA_AHORROS_ULT_6M', 'MESES_AHORROS_ULT_6M', 'ATRASO_MAXIMO_ULT_24M','ATRASO_MAXIMO_ULT_12M','MONTO_TC_MEMBRESIA']),
        ),
        (   "mode_imputer",
           CategoricalImputer(imputation_method="frequent", variables=['PROFESION','ZONA_DEL_DESEMBOLSO','ESTADO_CIVIL'])
        ),
        (
            "rare_encoder",
            RareLabelEncoder(tol=0.01,n_categories=5,variables=["PROFESION","SEGMENTOCLIENTE",])
        ),
        (   "capper",
            Winsorizer(variables=['DEUDA', 'ATRASO_MAXIMO_ULT_6M','ATRASO_MAXIMO_ULT_12M', 'ATRASO_MAXIMO_ULT_24M',
          'MEDIANA_AHORROS_ULT_6M', 'DEUDA_TOTAL_SISTEMA', 'MONTO_TC_SISTEMA', 'INGRESO_CLIENTE','EDAD_T','CUOTA',
          'LINEA_DE_TC', 'MONTO_TC_MEMBRESIA'], capping_method="quantiles", tail="right", fold=0.01)
        ),
        (   "scaler",
            ColumnTransformer(transformers=[('e', RobustScaler(), ['DEUDA', 'PLAZO_CREDITO', 'ATRASO_MAXIMO_ULT_6M','ATRASO_MAXIMO_ULT_12M', 'ATRASO_MAXIMO_ULT_24M','MESES_AHORROS_ULT_6M',
          'MEDIANA_AHORROS_ULT_6M', 'DEUDA_TOTAL_SISTEMA','NUMERO_DE_PAGOS_PDH', 'MONTO_TC_SISTEMA', 'INGRESO_CLIENTE', 'EDAD_T','CUOTA',
          'LINEA_DE_TC', 'MONTO_TC_MEMBRESIA'])],remainder='passthrough',verbose_feature_names_out=False).set_output(transform="pandas")

        )
    ]
)

In [None]:
pipe_4.fit(X_train)

In [None]:
X_train_t = pipe_4.transform(X_train)
X_test_t = pipe_4.transform(X_test)

In [None]:
X_train_t.describe().transpose()

## **4. Feature Selection**
Ahora, ¿podemos determinar cuánto aportan estas variables? Para esto, calculemos su IV (Information Value).

In [None]:
tot_train_t=pd.concat([X_train_t[cat_cols], y_train], axis=1)
tot_train_t.head()

In [None]:
def calculate_woe_iv(dataset, feature_cat, target):
    """Compute the Weight of Evidence table and Information Value of one feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding both the feature and the target column.
    feature_cat : str
        Name of the (categorical or discretised) feature column.
    target : str
        Name of the target column; rows with value 0 are counted as "Good"
        and rows with value 1 as "Bad".

    Returns
    -------
    iv : float
        Sum of the per-category IV contributions.
    dset : pandas.DataFrame
        One row per observed category with the columns ``Value``, ``All``,
        ``Good``, ``Bad``, ``Distr_Good``, ``Distr_Bad``, ``WoE`` and ``IV``,
        sorted by ``WoE``.

    Notes
    -----
    Rows whose feature value is missing are left out of the table. Categories
    with no goods or no bads produce an infinite WoE, which is replaced by 0 so
    that they contribute nothing to the IV.
    """
    feature = feature_cat
    # One indicator column per count so a single grouped sum yields All/Good/Bad.
    flags = pd.DataFrame(
        {
            'All': 1,
            'Good': (dataset[target] == 0).astype(int),
            'Bad': (dataset[target] == 1).astype(int),
        },
        index=dataset.index,
    )
    # sort=False keeps the categories in order of first appearance;
    # observed=True avoids empty rows for unused categorical levels.
    dset = (
        flags.groupby(dataset[feature], sort=False, observed=True)
        .sum()
        .rename_axis('Value')
        .reset_index()
    )
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    # log(0) and x/0 are expected for one-sided categories; they are neutralised right below.
    with np.errstate(divide='ignore', invalid='ignore'):
        dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset['WoE'] = dset['WoE'].replace([np.inf, -np.inf], 0)
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()
    dset = dset.sort_values(by='WoE')
    return iv, dset

In [None]:
def plot_by_woe(df_WoE, rotation_of_x_axis_labels = 0):
    """Draw the WoE of each category as a dashed line with round markers.

    ``df_WoE`` must have the category values in its first column and a ``WoE``
    column; the name of the first column is used for the x-axis label and the
    title. ``rotation_of_x_axis_labels`` is the tick-label rotation in degrees.
    """
    category_col = df_WoE.columns[0]
    # Categories are plotted as text so they are spaced evenly along the axis.
    category_labels = np.array(df_WoE.iloc[:, 0].map(str))
    woe_values = df_WoE['WoE']

    plt.figure(figsize=(18, 6))
    plt.plot(category_labels, woe_values, marker = 'o', linestyle = '--', color = 'k')
    plt.title('WOE por ' + category_col)
    plt.xlabel(category_col)
    plt.ylabel('WOE')
    plt.xticks(rotation = rotation_of_x_axis_labels)

### Generemos un reporte con todos los IVs

In [None]:
def getFeatureIV_Importance(df,features,target):
    """Display a table of the Information Value of each feature, highest first.

    ``features`` is the list of column names of ``df`` to evaluate against the
    ``target`` column. The table is shown with ``display``; nothing is returned.
    """
    # Only the IV is needed here; the per-category WoE table is discarded.
    iv_values = [calculate_woe_iv(df, feature, target)[0] for feature in features]
    ranking = pd.DataFrame({"Feature": features, "IV": iv_values})
    display(ranking.sort_values("IV", ascending=False))

### 4.1 Variables Categóricas

In [None]:
getFeatureIV_Importance(tot_train_t,cat_cols,"FLG_DEFAULT_12M")

### En este punto, quedémonos solo con las variables cuyo IV es mayor al 2% y menor a 50%

In [None]:
cat_cols_2=['SEGMENTOCLIENTE','CLASIF_SISTEMA_ULT_12M','FLG_PDH','PROFESION','ZONA_DEL_DESEMBOLSO','ESTADO_CIVIL','FLG_GARANTIA','HIPOTECARIO_RELACIONADA']

### Codificación de variables categóricas:

In [None]:
from feature_engine.encoding import WoEEncoder

In [None]:
encoder_2 = WoEEncoder(variables=['SEGMENTOCLIENTE','CLASIF_SISTEMA_ULT_12M','FLG_PDH','PROFESION','ZONA_DEL_DESEMBOLSO','ESTADO_CIVIL',
                                  'FLG_GARANTIA','HIPOTECARIO_RELACIONADA'],
                       fill_value=0) #ignore_format=False
encoder_2.fit(X_train_t[cat_cols_2], y_train)

In [None]:
# encoder_2.encoder_dict_
# encoder_2.variables_

In [None]:
X_train_woe_enc = encoder_2.transform(X_train_t[cat_cols_2])
X_test_woe_enc = encoder_2.transform(X_test_t[cat_cols_2])

In [None]:
X_train_woe_enc.head()

### 4.2 Variables numéricas

In [None]:
X_train_t.select_dtypes(include=['number']).columns

In [None]:
num_cols=['DEUDA', 'PLAZO_CREDITO', 'ATRASO_MAXIMO_ULT_6M','ATRASO_MAXIMO_ULT_12M', 'ATRASO_MAXIMO_ULT_24M','MESES_AHORROS_ULT_6M',
          'MEDIANA_AHORROS_ULT_6M', 'DEUDA_TOTAL_SISTEMA','NUMERO_DE_PAGOS_PDH', 'MONTO_TC_SISTEMA', 'INGRESO_CLIENTE', 'EDAD_T','CUOTA',
          'LINEA_DE_TC', 'MONTO_TC_MEMBRESIA']

#### En este punto discretizo para luego encontrar el IV de las variables

In [None]:
from feature_engine.discretisation import EqualFrequencyDiscretiser

In [None]:
disc1 = EqualFrequencyDiscretiser(q=10,
                                  variables=num_cols,
                                  return_boundaries=True,)
disc1.fit(X_train_t[num_cols])

In [None]:
#disc1.binner_dict_

In [None]:
X_train_t_numdisc=disc1.transform(X_train_t[num_cols])
X_test_t_numdisc=disc1.transform(X_test_t[num_cols])

In [None]:
X_train_t_numdisc.head()

### Veamos cuales son las variables numéricas más importantes, basado en su IV

In [None]:
tot_train_t2=pd.concat([X_train_t_numdisc, y_train], axis=1)
tot_train_t2.head()

In [None]:
getFeatureIV_Importance(tot_train_t2,num_cols,"FLG_DEFAULT_12M")

### Nuevamente, en este punto también solo quedémonos con las variables numéricas con IV mayor a 2% y menor a 50%

In [None]:
num_cols_2=['MEDIANA_AHORROS_ULT_6M','INGRESO_CLIENTE','MESES_AHORROS_ULT_6M','NUMERO_DE_PAGOS_PDH','PLAZO_CREDITO','LINEA_DE_TC','EDAD_T',
            'MONTO_TC_MEMBRESIA','DEUDA','ATRASO_MAXIMO_ULT_24M','DEUDA_TOTAL_SISTEMA','ATRASO_MAXIMO_ULT_12M']

### Actualizamos nuestro pipeline con todo el flujo de tratamiento de datos

In [None]:
# Variable groups handled by each preprocessing step of the full pipeline.
median_imputed_vars = ['LINEA_DE_TC', 'EDAD_T', 'INGRESO_CLIENTE']
zero_imputed_vars = [
    'CUOTA', 'DEUDA_TOTAL_SISTEMA', 'MEDIANA_AHORROS_ULT_6M', 'MESES_AHORROS_ULT_6M',
    'ATRASO_MAXIMO_ULT_24M', 'ATRASO_MAXIMO_ULT_12M', 'MONTO_TC_MEMBRESIA',
]
mode_imputed_vars = ['PROFESION', 'ZONA_DEL_DESEMBOLSO', 'ESTADO_CIVIL']
rare_grouped_vars = ["PROFESION", "SEGMENTOCLIENTE"]
capped_vars = [
    'DEUDA', 'ATRASO_MAXIMO_ULT_6M', 'ATRASO_MAXIMO_ULT_12M', 'ATRASO_MAXIMO_ULT_24M',
    'MEDIANA_AHORROS_ULT_6M', 'DEUDA_TOTAL_SISTEMA', 'MONTO_TC_SISTEMA', 'INGRESO_CLIENTE',
    'EDAD_T', 'CUOTA', 'LINEA_DE_TC', 'MONTO_TC_MEMBRESIA',
]
scaled_vars = [
    'DEUDA', 'PLAZO_CREDITO', 'ATRASO_MAXIMO_ULT_6M', 'ATRASO_MAXIMO_ULT_12M',
    'ATRASO_MAXIMO_ULT_24M', 'MESES_AHORROS_ULT_6M', 'MEDIANA_AHORROS_ULT_6M',
    'DEUDA_TOTAL_SISTEMA', 'NUMERO_DE_PAGOS_PDH', 'MONTO_TC_SISTEMA', 'INGRESO_CLIENTE',
    'EDAD_T', 'CUOTA', 'LINEA_DE_TC', 'MONTO_TC_MEMBRESIA',
]
woe_encoded_vars = [
    'SEGMENTOCLIENTE', 'CLASIF_SISTEMA_ULT_12M', 'FLG_PDH', 'PROFESION',
    'ZONA_DEL_DESEMBOLSO', 'ESTADO_CIVIL', 'FLG_GARANTIA', 'HIPOTECARIO_RELACIONADA',
]

# Scales only the numeric columns listed above and passes every other column
# through unchanged, returning a DataFrame with the original column names.
numeric_scaler = ColumnTransformer(
    transformers=[('e', RobustScaler(), scaled_vars)],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Full data-treatment flow: imputation -> rare-label grouping -> right-tail
# capping -> robust scaling -> WoE encoding of the selected categorical variables.
pipe_5 = Pipeline(
    [
        ("median_imputer", MeanMedianImputer(imputation_method="median", variables=median_imputed_vars)),
        ("arbitrary_imputer", ArbitraryNumberImputer(arbitrary_number=0, variables=zero_imputed_vars)),
        ("mode_imputer", CategoricalImputer(imputation_method="frequent", variables=mode_imputed_vars)),
        ("rare_encoder", RareLabelEncoder(tol=0.01, n_categories=5, variables=rare_grouped_vars)),
        ("capper", Winsorizer(variables=capped_vars, capping_method="quantiles", tail="right", fold=0.01)),
        ("scaler", numeric_scaler),
        ("encoder_2", WoEEncoder(variables=woe_encoded_vars, fill_value=0)),
    ]
)

In [None]:
pipe_5.fit(X_train,y_train)

In [None]:
X_train_t_p = pipe_5.transform(X_train)
X_test_t_p = pipe_5.transform(X_test)

### 4.3 Análisis de Correlaciones
En este punto, verifiquemos si existe correlación en las variables predictoras que hemos elegido, con el fin de depurarlas

In [None]:
# Creemos una función para identificar de manera masiva a las variables correlacionadas, en caso de tener un mayor volumen de variables
def correlation(dataset, threshold):
    """Return the columns involved in pairwise correlations above ``threshold``.

    Every pair of columns of ``dataset`` is checked once (lower triangle of the
    correlation matrix). When the absolute correlation of a pair exceeds
    ``threshold`` the pair is printed and the column that appears earlier in
    the frame is added to the returned set.
    """
    abs_corr = dataset.corr().abs()
    names = abs_corr.columns
    flagged = set()
    for row_pos, row_name in enumerate(names):
        # Only columns to the left of the diagonal, so each pair is visited once.
        for col_pos in range(row_pos):
            strength = abs_corr.iloc[row_pos, col_pos]
            if strength > threshold:
                earlier_name = names[col_pos]
                print(strength, row_name, earlier_name)
                flagged.add(earlier_name)
    return flagged

In [None]:
X_train_t2=pd.concat([X_train_t_p[cat_cols_2], X_train_t_p[num_cols_2]], axis=1)
X_test_t2=pd.concat([X_test_t_p[cat_cols_2], X_test_t_p[num_cols_2]], axis=1)
X_train_t2.head()

In [None]:
corr_features = correlation(X_train_t2, 0.8)
len(set(corr_features))

In [None]:
# Si hubiesen variables correlacionadas deberían ser extraidas, de la siguiente forma
# X_train.drop(labels=corr_features, axis=1, inplace=True)
# X_test.drop(labels=corr_features, axis=1, inplace=True)
# X_train.shape, X_test.shape
# Alternativamente, podemos hacer lo siguiente
#from feature_engine.selection import DropCorrelatedFeatures
#sel = DropCorrelatedFeatures(
#    threshold=0.8,
#    method='pearson',
#    missing_values='ignore'
#)
#sel.fit(X_train)
#X_train = sel.transform(X_train)
#X_test = sel.transform(X_test)
#X_train.shape, X_test.shape


### Alternativamente usaremos las variables numéricas discretizadas y codificadas

In [None]:
X_train_t_numdisc.shape

In [None]:
encoder_4 = WoEEncoder(variables=num_cols_2,
                       fill_value=0) #ignore_format=False
encoder_4.fit(X_train_t_numdisc[num_cols_2], y_train)

In [None]:
X_train_woe_enc2 = encoder_4.transform(X_train_t_numdisc[num_cols_2])
X_test_woe_enc2 = encoder_4.transform(X_test_t_numdisc[num_cols_2])

In [None]:
X_train_woe_enc2.head()

## **5. Entrenamiento del Modelo**
En esta sección construiremos tres tipos de modelos y los evaluaremos para quedarnos con el mejor de ellos

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

In [None]:
logit = LogisticRegression()
logit.fit(X_train_t2, y_train)
pred_train_logit = logit.predict_proba(X_train_t2)
pred_test_logit = logit.predict_proba(X_test_t2)
print('Train set')
print('Logistic regression roc-auc: {}'.format(roc_auc_score(y_train, pred_train_logit[:,1])))
print('Test set')
print('Logistic regression roc-auc: {}'.format(roc_auc_score(y_test, pred_test_logit[:,1])))

In [None]:
rf = RandomForestClassifier(n_estimators=300, random_state=39)
rf.fit(X_train_t2, y_train)
pred_train_rf = rf.predict_proba(X_train_t2)
pred_test_rf = rf.predict_proba(X_test_t2)
print('Train set')
print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred_train_rf[:,1])))
print('Test set')
print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred_test_rf[:,1])))

In [None]:
gbc = GradientBoostingClassifier(n_estimators=300, random_state=44)
gbc.fit(X_train_t2, y_train)
pred_train_gbc = gbc.predict_proba(X_train_t2)
pred_test_gbc = gbc.predict_proba(X_test_t2)
print('Train set')
print('Gradient Boosted Trees roc-auc: {}'.format(roc_auc_score(y_train, pred_train_gbc[:,1])))
print('Test set')
print('Gradient Boosted Trees roc-auc: {}'.format(roc_auc_score(y_test, pred_test_gbc[:,1])))

#### IMPORTANTE: Hasta aquí tenemos un modelo ganador por estabilidad entre train y test; entonces, revisemos cómo podemos optimizar sus hiperparámetros para reducir este efecto

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

## **Random Forest**

#### Evaluemos la sensibilidad de cada hiperparámetro

#### Nro de árboles




In [None]:
# random forests
rf = RandomForestClassifier(random_state=39)

# hyperparameter space
rf_param_grid = dict(
    n_estimators=[10, 20, 50, 100, 200],
#     max_depth=[1, 2, 3, 4, 5],
#     min_samples_split=[0.01, 0.05, 0.1, 0.2, 0.3, 0.5]
)

# search
reg = GridSearchCV(rf, rf_param_grid,scoring='roc_auc', return_train_score=True, cv=5, n_jobs=4)
search = reg.fit(pd.concat([X_train_woe_enc,X_train_t_numoutscal],axis=1), y_train)

# best hyperparameters
search.best_params_

In [None]:
results = pd.DataFrame(search.cv_results_)[['params','mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']]
results

In [None]:
# plot results
results.index = rf_param_grid['n_estimators']
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
plt.ylim(0.6, 1)
plt.ylabel('roc_auc')
plt.xlabel('n_estimators')

#### Profundidad

In [None]:
# random forests
rf = RandomForestClassifier(random_state=39)

# hyperparameter space
rf_param_grid = dict(
#     n_estimators=[10, 20, 50, 100, 200],
      max_depth=[1, 2, 3, 4, 5],
#     min_samples_split=[0.01, 0.05, 0.1, 0.2, 0.3, 0.5]
)

# search
reg = GridSearchCV(rf, rf_param_grid,scoring='roc_auc', return_train_score=True, cv=5, n_jobs=4)
search = reg.fit(pd.concat([X_train_woe_enc,X_train_t_numoutscal],axis=1), y_train)

# best hyperparameters
search.best_params_

In [None]:
results = pd.DataFrame(search.cv_results_)[['params', 'mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']]
results

In [None]:
# plot results
results.index = rf_param_grid['max_depth']
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
plt.ylim(0.6, 1)
plt.ylabel('roc_auc')
plt.xlabel('max_depth')

#### Nro mínimo de observaciones para partir el nodo

In [None]:
# random forests
rf = RandomForestClassifier(random_state=39)

# hyperparameter space
rf_param_grid = dict(
#     n_estimators=[10, 20, 50, 100, 200],
#     max_depth=[1, 2, 3, 4, 5],
      min_samples_split=[0.01, 0.05, 0.1, 0.2, 0.3, 0.5]
)

# search
reg = GridSearchCV(rf, rf_param_grid,scoring='roc_auc', return_train_score=True, cv=5, n_jobs=4)
search = reg.fit(pd.concat([X_train_woe_enc,X_train_t_numoutscal],axis=1), y_train)

# best hyperparameters
search.best_params_

In [None]:
results = pd.DataFrame(search.cv_results_)[['params', 'mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']]
results

In [None]:
# plot results
results.index = rf_param_grid['min_samples_split']
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
plt.ylim(0.6, 1)
plt.ylabel('roc_auc')
plt.xlabel('min_samples_split')

## Veamos el GridSearch con todos los parámetros para evaluar la mejor combinación

In [None]:
# random forests
rf = RandomForestClassifier(random_state=39)

# hyperparameter space
rf_param_grid = dict(
    n_estimators=[10, 20, 50, 100, 200],# 5 valores
    max_depth=[1, 2, 3, 4, 5],#5 valores
    min_samples_split=[0.01, 0.05, 0.1, 0.2, 0.3, 0.5] #6 valores
)

# search
reg = GridSearchCV(rf, rf_param_grid,scoring='roc_auc',return_train_score=True, cv=5, n_jobs=4)
search = reg.fit(pd.concat([X_train_woe_enc,X_train_t_numoutscal],axis=1), y_train)

# best hyperparameters
search.best_params_

In [None]:
results = pd.DataFrame(search.cv_results_)[['params', 'mean_train_score', 'std_train_score','mean_test_score', 'std_test_score']]
results.sort_values(by='mean_test_score', ascending=False, inplace=True)
results.reset_index(drop=True, inplace=True)
results

In [None]:
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
plt.ylabel('Mean ROC_AUC')
plt.xlabel('Hyperparameter space')

In [None]:
# Cross-validation options -> any of these splitters can be passed as the `cv` argument
# (reference only: every line below is commented out and nothing is executed)
# K-Fold Cross-Validation
#kf = KFold(n_splits=5, shuffle=True, random_state=4)
# Repeated K-Fold Cross-Validation
#rkf = RepeatedKFold(n_splits=5, n_repeats=10,random_state=4)
# Leave One Out Cross-Validation
#loo = LeaveOneOut()
# Leave P Out Cross-Validation
#lpo = LeavePOut(p=2)
# Stratified K Fold Cross-Validation
#skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

## Veamos el RandomSearch para encontrar la mejor combinación en el Random Forest

In [None]:
# Base estimator: random forest with a fixed seed
rf = RandomForestClassifier(random_state=39)

# Sampling distributions for the randomized search.
# NOTE(review): scipy's randint excludes the upper bound, so max_depth=5 and
# n_estimators=200 are never drawn here, unlike in the grid search -- confirm intended.
rf_param_grid = {
    'n_estimators': stats.randint(10, 200),
    'min_samples_split': stats.uniform(0, 1),
    'max_depth': stats.randint(1, 5),
}

# 60 random candidates, 5-fold CV, scored by ROC-AUC
reg_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_grid,
    n_iter=60,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    random_state=10,
    return_train_score=True,
)
search_rf = reg_rf.fit(pd.concat([X_train_woe_enc, X_train_t_numoutscal], axis=1), y_train)

# Best combination found
search_rf.best_params_

In [None]:
# Score summary of every sampled candidate, best mean test-fold ROC-AUC first
results = (
    pd.DataFrame(search_rf.cv_results_)[['params', 'mean_train_score', 'std_train_score','mean_test_score', 'std_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
    .reset_index(drop=True)
)
results

In [None]:
# Train vs. test-fold mean ROC-AUC (error bars = std across folds) for each
# candidate, in the row order of `results`
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
# Legend entries come from the Series names, so the two curves can be told apart
plt.legend()
plt.ylabel('Mean ROC_AUC')
plt.xlabel('Hyperparameter space')

#### CONCLUSION RF: Evaluemos como le va al modelo con los hiperparametros elegidos en train y test

In [None]:
# Refit the random forest on the whole training set with the best hyperparameters
# found by the randomized search. They are read from `search_rf.best_params_`
# instead of being copied by hand, so this cell stays consistent if the search is re-run.
rf = RandomForestClassifier(random_state=39, **search_rf.best_params_)
rf.fit(pd.concat([X_train_woe_enc,X_train_t_numoutscal],axis=1), y_train)
# predict_proba returns one column per class; column 1 is used as the score below
pred_train_rf = rf.predict_proba(pd.concat([X_train_woe_enc,X_train_t_numoutscal],axis=1))
pred_test_rf = rf.predict_proba(pd.concat([X_test_woe_enc,X_test_t_numoutscal],axis=1))
print('Train roc_auc: ', roc_auc_score(y_train, pred_train_rf[:,1]))
print('Test roc_auc: ', roc_auc_score(y_test, pred_test_rf[:,1]))

## **Gradient Boosting**


#### Evaluemos la sensibilidad de cada parámetro

#### Nro de Arboles

In [None]:
# Base estimator: gradient boosting with a fixed seed
gbc = GradientBoostingClassifier(random_state=0)

# Sensitivity analysis: only the number of trees varies; everything else keeps its default
gbc_param_grid = {
    'n_estimators': [10, 20, 50, 100, 200],
}

# 5-fold cross-validated grid search scored by ROC-AUC
reg = GridSearchCV(
    estimator=gbc,
    param_grid=gbc_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    return_train_score=True,
)
search = reg.fit(pd.concat([X_train_woe_enc, X_train_t_numoutscal], axis=1), y_train)

# Best value found
search.best_params_

In [None]:
# Train / test-fold score summary of every candidate (one row per n_estimators value)
results = pd.DataFrame(search.cv_results_).loc[
    :, ['params', 'mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']
]
results

In [None]:
# Plot mean CV ROC-AUC (error bars = std across folds) for each n_estimators candidate
results.index = gbc_param_grid['n_estimators']
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
# Legend entries come from the Series names, so the train and test curves can be told apart
plt.legend()
plt.ylim(0.6, 1)
plt.ylabel('roc_auc')
plt.xlabel('n_estimators')

#### Nro mínimo de observaciones para partir el nodo

In [None]:
# Base estimator: gradient boosting with a fixed seed
gbc = GradientBoostingClassifier(random_state=0)

# Sensitivity analysis: only min_samples_split varies (given as a fraction of the samples)
gbc_param_grid = {
    'min_samples_split': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
}

# 5-fold cross-validated grid search scored by ROC-AUC
reg = GridSearchCV(
    estimator=gbc,
    param_grid=gbc_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    return_train_score=True,
)
search = reg.fit(pd.concat([X_train_woe_enc, X_train_t_numoutscal], axis=1), y_train)

# Best value found
search.best_params_

In [None]:
# Train / test-fold score summary of every candidate (one row per min_samples_split value)
results = pd.DataFrame(search.cv_results_).loc[
    :, ['params', 'mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']
]
results

In [None]:
# Plot mean CV ROC-AUC (error bars = std across folds) for each min_samples_split candidate
results.index = gbc_param_grid['min_samples_split']
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
# Legend entries come from the Series names, so the train and test curves can be told apart
plt.legend()
plt.ylim(0.6, 1)
plt.ylabel('roc_auc')
plt.xlabel('min_samples_split')

#### Profundidad

In [None]:
# Base estimator: gradient boosting with a fixed seed
gbc = GradientBoostingClassifier(random_state=0)

# Sensitivity analysis: only the tree depth varies; everything else keeps its default
gbc_param_grid = {
    'max_depth': [1, 2, 3, 4, 5],
}

# 5-fold cross-validated grid search scored by ROC-AUC
reg = GridSearchCV(
    estimator=gbc,
    param_grid=gbc_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    return_train_score=True,
)
search = reg.fit(pd.concat([X_train_woe_enc, X_train_t_numoutscal], axis=1), y_train)

# Best value found
search.best_params_

In [None]:
# Train / test-fold score summary of every candidate (one row per max_depth value)
results = pd.DataFrame(search.cv_results_).loc[
    :, ['params', 'mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']
]
results

In [None]:
# Plot mean CV ROC-AUC (error bars = std across folds) for each max_depth candidate
results.index = gbc_param_grid['max_depth']
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
# Legend entries come from the Series names, so the train and test curves can be told apart
plt.legend()
plt.ylim(0.6, 1)
plt.ylabel('roc_auc')
plt.xlabel('max_depth')

## Veamos al GridSearch con todos los parámetros para evaluar la mejor combinación

In [None]:
# Base estimator: gradient boosting with a fixed seed
gbc = GradientBoostingClassifier(random_state=0)

# Exhaustive grid over three hyperparameters (5 x 6 x 5 = 150 combinations)
gbc_param_grid = {
    'n_estimators': [10, 20, 50, 100, 200],
    'min_samples_split': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
    'max_depth': [1, 2, 3, 4, 5],
}

# 5-fold cross-validated grid search scored by ROC-AUC
reg = GridSearchCV(
    estimator=gbc,
    param_grid=gbc_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    return_train_score=True,
)
search = reg.fit(pd.concat([X_train_woe_enc, X_train_t_numoutscal], axis=1), y_train)

# Best combination found
search.best_params_

In [None]:
# Score summary of every candidate, best mean test-fold ROC-AUC first
results = (
    pd.DataFrame(search.cv_results_)[['params', 'mean_train_score', 'std_train_score','mean_test_score', 'std_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
    .reset_index(drop=True)
)
results

In [None]:
# Train vs. test-fold mean ROC-AUC (error bars = std across folds) for each
# candidate, in the row order of `results`
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
# Legend entries come from the Series names, so the two curves can be told apart
plt.legend()
plt.ylabel('Mean ROC_AUC')
plt.xlabel('Hyperparameter space')

## Veamos el RandomSearch para encontrar la mejor combinación en el Gradient Boosting

In [None]:
# Base estimator: gradient boosting with a fixed seed
gbc = GradientBoostingClassifier(random_state=0)

# Sampling distributions for the randomized search.
# NOTE(review): scipy's randint excludes the upper bound, so max_depth=5 and
# n_estimators=200 are never drawn here, unlike in the grid search -- confirm intended.
gbc_param_grid = {
    'n_estimators': stats.randint(10, 200),
    'min_samples_split': stats.uniform(0, 1),
    'max_depth': stats.randint(1, 5),
}

# 60 random candidates, 5-fold CV, scored by ROC-AUC
reg_gbc = RandomizedSearchCV(
    estimator=gbc,
    param_distributions=gbc_param_grid,
    n_iter=60,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    random_state=10,
    return_train_score=True,
)
search_gbc = reg_gbc.fit(pd.concat([X_train_woe_enc, X_train_t_numoutscal], axis=1), y_train)

# Best combination found
search_gbc.best_params_

In [None]:
# Score summary of every sampled candidate, best mean test-fold ROC-AUC first
results = (
    pd.DataFrame(search_gbc.cv_results_)[['params', 'mean_train_score', 'std_train_score','mean_test_score', 'std_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
    .reset_index(drop=True)
)
results

In [None]:
# Train vs. test-fold mean ROC-AUC (error bars = std across folds) for each
# candidate, in the row order of `results`
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
# Legend entries come from the Series names, so the two curves can be told apart
plt.legend()
plt.ylabel('Mean ROC_AUC')
plt.xlabel('Hyperparameter space')

### CONCLUSION GBC: Evaluemos como le va al modelo con los parámetros elegidos en train y test

In [None]:
# Refit gradient boosting on the whole training set with the best hyperparameters
# found by the randomized search. They are read from `search_gbc.best_params_`
# instead of being copied by hand, so `gbc` stays consistent with `search_gbc`
# (which later cells also use for predictions) if the search is re-run.
gbc = GradientBoostingClassifier(random_state=0, **search_gbc.best_params_)
gbc.fit(pd.concat([X_train_woe_enc,X_train_t_numoutscal],axis=1), y_train)
# predict_proba returns one column per class; column 1 is used as the score below
pred_train_gbc = gbc.predict_proba(pd.concat([X_train_woe_enc,X_train_t_numoutscal],axis=1))
pred_test_gbc = gbc.predict_proba(pd.concat([X_test_woe_enc,X_test_t_numoutscal],axis=1))
print('Train roc_auc: ', roc_auc_score(y_train, pred_train_gbc[:,1]))
print('Test roc_auc: ', roc_auc_score(y_test, pred_test_gbc[:,1]))

### Entonces, basado en sus rendimientos en test, cuál es el mejor modelo?

In [None]:
# Side-by-side test-set comparison of the three models.
# GINI is derived from ROC-AUC as 2*AUC - 1.
print('Logit Test roc_auc: ', roc_auc_score(y_test, pred_test_logit[:,1]), 'Logit Test GINI: ', 2*roc_auc_score(y_test, pred_test_logit[:,1])-1)
print('RF Test roc_auc: ', roc_auc_score(y_test, pred_test_rf[:,1]), 'RF Test GINI: ', 2*roc_auc_score(y_test, pred_test_rf[:,1])-1)
print('GBC Test roc_auc: ', roc_auc_score(y_test, pred_test_gbc[:,1]), 'GBC Test GINI: ', 2*roc_auc_score(y_test, pred_test_gbc[:,1])-1)

#### FINALMENTE: Si bien no hay una gran diferencia entre los 3 modelos, en cuanto a performance, el GINI más alto es el correspondiente al Modelo de **Gradient Boosting**

### **6. Valoración del Modelo**
Empecemos a revisar con el modelo final algunas métricas de valoración del modelo

### **Accuracy**
Porcentaje de predicciones correctas

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Share of each target class in the test set (class counts over total rows)
br_test = y_test.value_counts().div(len(y_test))
br_test

In [None]:
# Naive baseline: predict class 0 for every row (same length as each target)
y_train_base = pd.Series(np.zeros(y_train.shape[0]))
y_test_base = pd.Series(np.zeros(y_test.shape[0]))

In [None]:
# Accuracy of the all-zeros baseline vs. the GBC hard predictions on the test set
print('Accuracy Baseline test: ', accuracy_score(y_test, y_test_base))
print('Accuracy GBC test:', accuracy_score(y_test, gbc.predict(pd.concat([X_test_woe_enc,X_test_t_numoutscal],axis=1))))

In [None]:
from sklearn.metrics import (
    #accuracy_score,
    balanced_accuracy_score,
    recall_score,
)

In [None]:
# Balanced accuracy of the all-zeros baseline vs. the GBC hard predictions on the test set
print('Balanced accuracy, Baseline test: ', balanced_accuracy_score(y_test, y_test_base))
print('Balanced accuracy, GBC test:',  balanced_accuracy_score(y_test,gbc.predict(pd.concat([X_test_woe_enc,X_test_t_numoutscal],axis=1))))

## Precision, Recall, F-measure, Support

- **Precision** = tp / (tp + fp)

- **Recall** = tp / (tp + fn)

- **F1** = 2 * (precision * recall) / (precision + recall)

- **Support** = Number of cases on each class

In [None]:
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    precision_recall_fscore_support,
)

In [None]:
# Precision: baseline vs. GBC on the test set.
# The baseline never predicts class 1, so its precision has a zero denominator
# (tp + fp = 0); NOTE(review): scikit-learn presumably reports 0 with a warning here -- confirm.
print('Precision Baseline test: ', precision_score(y_test, y_test_base))
print('Precision GBC test:', precision_score(y_test,gbc.predict(pd.concat([X_test_woe_enc,X_test_t_numoutscal],axis=1))))

In [None]:
# Recall: baseline (all zeros, so no positive is ever recovered) vs. GBC on the test set
print('Recall Baseline test: ', recall_score(y_test, y_test_base))
print('Recall GBC test:', recall_score(y_test,gbc.predict(pd.concat([X_test_woe_enc,X_test_t_numoutscal],axis=1))))

In [None]:
# F1 score: all-zeros baseline vs. GBC hard predictions on the test set
print('F-measure Baseline test: ', f1_score(y_test, y_test_base))
print('F-measure GBC test:', f1_score(y_test, gbc.predict(pd.concat([X_test_woe_enc,X_test_t_numoutscal],axis=1))))

In [None]:
# Precision, recall, F1 and support for the GBC test predictions in a single call;
# no `average` is passed, so each result holds one value per class
precision, recall, fscore, support = precision_recall_fscore_support(
    y_test, gbc.predict(pd.concat([X_test_woe_enc,X_test_t_numoutscal],axis=1)))

print('Precision: ', precision)
print('Recall: ', recall)
print('F1-score: ', fscore)
print('Support: ', support)

In [None]:
# Assemble the complete model matrices once (first frame's columns, then the second's)
# so that later cells do not have to repeat the concatenation
X_train_woe_enc_tot = pd.concat(objs=[X_train_woe_enc, X_train_t_numoutscal], axis='columns')
X_test_woe_enc_tot = pd.concat(objs=[X_test_woe_enc, X_test_t_numoutscal], axis='columns')

In [None]:
X_train_woe_enc_tot.shape

In [None]:
X_test_woe_enc_tot.shape

## Matriz de Confusión

TN | FP

FN | TP

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the all-zeros baseline (rows = true class, columns = predicted class)
confusion_matrix(y_test, y_test_base, labels=[0,1])

In [None]:
# Confusion matrix of the GBC hard predictions on the test set
confusion_matrix(y_test, gbc.predict(X_test_woe_enc_tot), labels=[0,1])

## Identificamos un punto de corte óptimo que maximice el F1 Score

In [None]:
from yellowbrick.classifier import (
    DiscriminationThreshold,
)

In [None]:
# Threshold plot used to pick the cut-off that maximises the F-score (argmax='fscore').
# NOTE(review): yellowbrick's DiscriminationThreshold.fit runs repeated shuffle-split
# trials on the data it receives -- verify whether the already-fitted `gbc` is scored
# or clones are refitted on these test rows.
visualizer = DiscriminationThreshold(gbc, is_fitted=True, random_state=0, argmax='fscore')#fbeta=3
visualizer.fit(X_test_woe_enc_tot, y_test)
visualizer.score(X_test_woe_enc_tot, y_test)
visualizer.show()

### Adicionamos un cálculo de ROC

In [None]:
from yellowbrick.classifier import ROCAUC

In [None]:
# ROC curve of the already-fitted GBC on the test set (micro/macro averages disabled)
visualizer2 = ROCAUC(gbc, is_fitted=True, micro=False, macro=False,)
visualizer2.fit(X_test_woe_enc_tot, y_test)
visualizer2.score(X_test_woe_enc_tot, y_test)
visualizer2.show()

### 7. Técnicas de balanceo
En este punto, propongamos algunos métodos de balanceo y revisemos su impacto en el performance del modelo candidato

In [None]:
pip install imbalanced-learn

In [None]:
from imblearn.under_sampling import RandomUnderSampler

#### Undersampling

In [None]:
# Random under-sampling of the training set only (the test set is left untouched).
# replacement=True means rows are drawn with replacement.
rus = RandomUnderSampler(random_state=0, replacement=True, sampling_strategy='auto')
X_train_woe_enc_tot_rus, y_train_rus = rus.fit_resample(X_train_woe_enc_tot, y_train)

In [None]:
# Base estimator for the balanced (under-sampled) experiment
gbc2 = GradientBoostingClassifier(random_state=0)

# Sampling distributions for the randomized search.
# NOTE(review): scipy's randint excludes the upper bound, so max_depth=5 and
# n_estimators=200 are never drawn -- confirm intended.
gbc_param_grid = {
    'n_estimators': stats.randint(10, 200),
    'min_samples_split': stats.uniform(0, 1),
    'max_depth': stats.randint(1, 5),
}

# 60 random candidates, 5-fold CV, scored by ROC-AUC, fitted on the under-sampled data
reg_gbc2 = RandomizedSearchCV(
    estimator=gbc2,
    param_distributions=gbc_param_grid,
    n_iter=60,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
    random_state=10,
    return_train_score=True,
)
search_gbc2 = reg_gbc2.fit(X_train_woe_enc_tot_rus, y_train_rus)

# Best combination found
search_gbc2.best_params_

In [None]:
# Score summary of every sampled candidate, best mean test-fold ROC-AUC first
results = (
    pd.DataFrame(search_gbc2.cv_results_)[['params', 'mean_train_score', 'std_train_score','mean_test_score', 'std_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
    .reset_index(drop=True)
)
results

In [None]:
# Train vs. test-fold mean ROC-AUC (error bars = std across folds) for each
# candidate, in the row order of `results`
results['mean_train_score'].plot(yerr=[results['std_train_score'], results['std_train_score']], subplots=True)
results['mean_test_score'].plot(yerr=[results['std_test_score'], results['std_test_score']], subplots=True)
# Legend entries come from the Series names, so the two curves can be told apart
plt.legend()
plt.ylabel('Mean ROC_AUC')
plt.xlabel('Hyperparameter space')

In [None]:
# GBC trained on the under-sampled data, evaluated on the ORIGINAL
# (unbalanced) train and test sets. GINI = 2*AUC - 1.
pred_train_gbc2 = search_gbc2.predict_proba(X_train_woe_enc_tot)
pred_test_gbc2 = search_gbc2.predict_proba(X_test_woe_enc_tot)
print('Train roc_auc: ', roc_auc_score(y_train, pred_train_gbc2[:,1]), 'GINI Train ', 2*roc_auc_score(y_train, pred_train_gbc2[:,1])-1)
print('Test roc_auc: ', roc_auc_score(y_test, pred_test_gbc2[:,1]), 'GINI Test ', 2*roc_auc_score(y_test, pred_test_gbc2[:,1])-1)

In [None]:
# Previous GBC model (no balancing), for comparison. GINI = 2*AUC - 1.
# Note: this reassigns pred_train_gbc / pred_test_gbc (first computed from `gbc`)
# with predictions from the fitted search object `search_gbc`; later cells use these values.
pred_train_gbc = search_gbc.predict_proba(X_train_woe_enc_tot)
pred_test_gbc = search_gbc.predict_proba(X_test_woe_enc_tot)
print('Train roc_auc: ', roc_auc_score(y_train, pred_train_gbc[:,1]), 'GINI Train ', 2*roc_auc_score(y_train, pred_train_gbc[:,1])-1)
print('Test roc_auc: ', roc_auc_score(y_test, pred_test_gbc[:,1]), 'GINI Test ', 2*roc_auc_score(y_test, pred_test_gbc[:,1])-1)

### 8. Calibración del Modelo:
En esta parte verificaremos y, de ser necesario, calibraremos el modelo, revisando la relación entre los ratios de default reales y las PDs promedio que arroja el modelo.

In [None]:
from sklearn.calibration import calibration_curve

In [None]:
# Observed default rate and mean predicted PD per bucket for the original
# (unbalanced) model, using 10 equal-width probability bins
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test,
    pred_test_gbc[:, 1],
    strategy='uniform',
    n_bins=10,
)

# Both arrays have one entry per bin returned by calibration_curve
(len(mean_predicted_value), len(fraction_of_positives))

In [None]:
mean_predicted_value

In [None]:
fraction_of_positives

In [None]:
# Helper that consolidates the calibration steps above into a single figure
def plot_calibration_curve(y_true, probs, bins, strategy, label='Model'):
    """Plot a calibration (reliability) curve and the distribution of scores.

    Parameters
    ----------
    y_true : array-like
        True binary labels, forwarded to ``calibration_curve``.
    probs : array-like
        Predicted probabilities, forwarded to ``calibration_curve`` and also
        shown as a histogram.
    bins : int
        Number of bins for both the calibration curve and the histogram.
    strategy : str
        Binning strategy forwarded to ``calibration_curve`` (the notebook
        uses ``'uniform'``).
    label : str, default 'Model'
        Legend label for the calibration curve. The label used to be
        hard-coded as 'Logistic Regression' although the helper is called
        with gradient-boosting probabilities.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """

    fraction_of_positives, mean_predicted_value = calibration_curve(
        y_true, probs, n_bins=bins, strategy=strategy)

    # Upper end of the diagonal reference line
    max_val = max(mean_predicted_value)

    plt.figure(figsize=(8,10))

    # Top panel: observed positive rate vs. mean predicted probability per bin
    plt.subplot(2, 1, 1)
    plt.plot(mean_predicted_value, fraction_of_positives, label=label)
    # Diagonal = perfect calibration (predicted probability equals observed rate)
    plt.plot(np.linspace(0, max_val, bins), np.linspace(0, max_val, bins),
         linestyle='--', color='red', label='Perfect calibration')

    plt.xlabel('Probability Predictions')
    plt.ylabel('Fraction of positive examples')
    plt.title('Calibration Curve')
    plt.legend(loc='upper left')

    # Bottom panel: density of the predicted probabilities over [0, 1]
    plt.subplot(2, 1, 2)
    plt.hist(probs, range=(0, 1), bins=bins, density=True, stacked=True, alpha=0.3)
    plt.xlabel('Probability Predictions')
    plt.ylabel('Fraction of examples')
    plt.title('Density')
    plt.show()

In [None]:
# Original model (no balancing): calibration curve on the test set
plot_calibration_curve(y_test, pred_test_gbc[:, 1], bins=10, strategy='uniform')

In [None]:
from sklearn.metrics import brier_score_loss

In [None]:
# Brier score of the original (unbalanced) model on the test set
brier_score_loss(y_test, pred_test_gbc[:, 1])

In [None]:
# Model trained with under-sampling: calibration curve on the test set
plot_calibration_curve(y_test, pred_test_gbc2[:, 1], bins=10, strategy='uniform')

In [None]:
# Brier score of the under-sampled model on the test set
brier_score_loss(y_test, pred_test_gbc2[:, 1])

### Planteemos dos métodos, el ajuste por función sigmoide y el ajuste por función isotónica

In [None]:
from sklearn.calibration import CalibratedClassifierCV

In [None]:
# Calibrate the original model (the already-fitted search object `search_gbc`).
# NOTE(review): both calibrators are fitted on (X_test, y_test) and the calibrated
# probabilities are then produced for the same rows, so the metrics computed on
# prob_sigmoid / prob_isotonic afterwards are in-sample for the calibrator --
# consider a separate calibration split.
# NOTE(review): newer scikit-learn releases appear to deprecate cv='prefit' --
# check against the installed version.
# Sigmoid calibration
cal_sigmoid = CalibratedClassifierCV(search_gbc, cv='prefit', method='sigmoid')
cal_sigmoid.fit(X_test_woe_enc_tot, y_test)
prob_sigmoid = cal_sigmoid.predict_proba(X_test_woe_enc_tot)[:, 1]

# Isotonic calibration
cal_isotonic = CalibratedClassifierCV(search_gbc, cv='prefit', method='isotonic')
cal_isotonic.fit(X_test_woe_enc_tot, y_test)
prob_isotonic = cal_isotonic.predict_proba(X_test_woe_enc_tot)[:, 1]

#### Revisemos la calibración Sigmoide

In [None]:
# Calibration curve of the sigmoid-calibrated probabilities (original model)
plot_calibration_curve(y_test, prob_sigmoid, bins=10, strategy='uniform')

### Acaso la calibración varía la discriminación del modelo?

In [None]:
# ROC-AUC before vs. after sigmoid calibration (original model)
print('Test set')
print('GBC roc-auc: {}'.format(roc_auc_score(y_test, pred_test_gbc[:,1])))
print('Test set con Calibrado')
print('GBC roc-auc: {}'.format(roc_auc_score(y_test, prob_sigmoid)))

In [None]:
# Brier score before vs. after sigmoid calibration (original model)
print('Test set')
print('Brier Score: {}'.format(brier_score_loss(y_test, pred_test_gbc[:, 1])))
print('Test set con Calibrado')
print('Brier Score: {}'.format(brier_score_loss(y_test, prob_sigmoid)))

### Ahora revisemos la calibración isotónica

In [None]:
# Calibration curve of the isotonic-calibrated probabilities (original model)
plot_calibration_curve(y_test, prob_isotonic, bins=10, strategy='uniform')

In [None]:
# ROC-AUC before vs. after isotonic calibration (original model)
print('Test set')
print('GBC roc-auc: {}'.format(roc_auc_score(y_test, pred_test_gbc[:,1])))
print('Test set con Calibrado')
print('GBC roc-auc: {}'.format(roc_auc_score(y_test, prob_isotonic)))

In [None]:
# Brier score before vs. after isotonic calibration (original model)
print('Test set')
print('Brier Score: {}'.format(brier_score_loss(y_test, pred_test_gbc[:, 1])))
print('Test set con Calibrado')
print('Brier Score: {}'.format(brier_score_loss(y_test, prob_isotonic)))

In [None]:
# Calibrate the model trained with under-sampling (fitted search object `search_gbc2`).
# NOTE(review): both calibrators are fitted on (X_test, y_test) and evaluated on the
# same rows afterwards -- consider a separate calibration split.
# Sigmoid calibration
cal_sigmoid2 = CalibratedClassifierCV(search_gbc2, cv='prefit', method='sigmoid')
cal_sigmoid2.fit(X_test_woe_enc_tot, y_test)
prob_sigmoid2 = cal_sigmoid2.predict_proba(X_test_woe_enc_tot)[:, 1]

# Isotonic calibration
cal_isotonic2 = CalibratedClassifierCV(search_gbc2, cv='prefit', method='isotonic')
cal_isotonic2.fit(X_test_woe_enc_tot, y_test)
prob_isotonic2 = cal_isotonic2.predict_proba(X_test_woe_enc_tot)[:, 1]

In [None]:
# Sigmoid calibration (under-sampled model): calibration curve on the test set
plot_calibration_curve(y_test, prob_sigmoid2, bins=10, strategy='uniform')

In [None]:
# ROC-AUC before vs. after sigmoid calibration (under-sampled model)
print('Test set')
print('GBC roc-auc: {}'.format(roc_auc_score(y_test, pred_test_gbc2[:,1])))
print('Test set con Calibrado')
print('GBC roc-auc: {}'.format(roc_auc_score(y_test, prob_sigmoid2)))

In [None]:
# Brier score before vs. after sigmoid calibration (under-sampled model)
print('Test set')
print('Brier Score: {}'.format(brier_score_loss(y_test, pred_test_gbc2[:, 1])))
print('Test set con Calibrado')
print('Brier Score: {}'.format(brier_score_loss(y_test, prob_sigmoid2)))

In [None]:
# Isotonic calibration (under-sampled model): calibration curve on the test set
plot_calibration_curve(y_test, prob_isotonic2, bins=10, strategy='uniform')

In [None]:
# ROC-AUC before vs. after isotonic calibration (under-sampled model)
print('Test set')
print('GBC roc-auc: {}'.format(roc_auc_score(y_test, pred_test_gbc2[:,1])))
print('Test set con Calibrado')
print('GBC roc-auc: {}'.format(roc_auc_score(y_test, prob_isotonic2)))

In [None]:
# Brier score before vs. after isotonic calibration (under-sampled model)
print('Test set')
print('Brier Score: {}'.format(brier_score_loss(y_test, pred_test_gbc2[:, 1])))
print('Test set con Calibrado')
print('Brier Score: {}'.format(brier_score_loss(y_test, prob_isotonic2)))

## **9. Unboxing de Modelos**

---



#### Primero, entendamos la importancia de cada variable. Hagamos el ejercicio con cada tipo de algoritmo

#### **Logit**

In [None]:
# Logistic-regression coefficients per feature, smallest first, as a horizontal
# bar chart (signed coefficients are what is plotted under the "Importance" label)
fi = pd.Series(data=logit.coef_[0], index=logit.feature_names_in_).sort_values(ascending=True)
fi.plot.barh(color='blue')
plt.title("Feature Importance")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

#### **Random Forest**

In [None]:
# Random-forest feature importances, smallest first, as a horizontal bar chart
fi = pd.Series(data=rf.feature_importances_, index=rf.feature_names_in_).sort_values(ascending=True)
fi.plot.barh(color='blue')
plt.title("Feature Importance")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

#### **Gradient Boosting**

In [None]:
# Gradient-boosting feature importances, smallest first, as a horizontal bar chart
fi = pd.Series(data=gbc.feature_importances_, index=gbc.feature_names_in_).sort_values(ascending=True)
fi.plot.barh(color='blue')
plt.title("Feature Importance")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

### Ahora realicemos un análisis de la importancia de las variables usando **SHAP**

In [None]:
pip install shap

In [None]:
import shap

In [None]:
# Build a SHAP explainer for the fitted `gbc` model and compute SHAP values
# for every row of the test matrix
explainer = shap.Explainer(gbc)
shap_values_bin = explainer(X_test_woe_enc_tot)
print(shap_values_bin.shape)

In [None]:
# Waterfall plot for the first test observation: contribution of each feature
# (up to 30 shown) to the model output
shap.plots.waterfall(shap_values_bin[0],max_display=30)

### En este punto hagamos la prueba de cómo se incrementa o reduce el riesgo según el aporte de cada variable

In [None]:
pred_test_gbc[0,1]

In [None]:
import math

In [None]:
# Log-odds (raw model output) of the first test observation. It is computed from
# the fitted model instead of being typed in by hand from the waterfall plot,
# so it stays correct when the data or the model change.
x = float(np.ravel(gbc.decision_function(X_test_woe_enc_tot.iloc[[0]]))[0])
# Its probability is the logistic (sigmoid) transform of the log-odds:
1/(1+math.exp(-x))

#### Podemos graficar la relación entre los shap values (log odds) y los valores de una variable

In [None]:
# SHAP value vs. feature value for the EDAD_T column, across all test rows
shap.plots.scatter(shap_values_bin[:, "EDAD_T"])

In [None]:
# SHAP value vs. feature value for the INGRESO_CLIENTE column, across all test rows
shap.plots.scatter(shap_values_bin[:, "INGRESO_CLIENTE"])

In [None]:
# Force plot for the first observation; link='logit' shows the output on the
# probability scale together with how each feature explains it
shap.initjs()
shap.plots.force(shap_values_bin[0],link='logit')

### Con el gráfico de barras vamos a evaluar la importancia media de cada variable

In [None]:
# Global bar plot over all test rows: average importance of each feature (up to 30 shown)
shap.plots.bar(shap_values_bin,max_display=30)

#### Podemos también construir un gráfico local, es decir, para una observación

In [None]:
# Local bar plot: SHAP values of the first test observation only
shap.plots.bar(shap_values_bin[0],max_display=30)

In [None]:
# Beeswarm plot: summarises the SHAP values of the whole test set (up to 30 features)
shap.plots.beeswarm(shap_values_bin, max_display=30)

## **10. Pickling y Unpickling del Modelo y del Pipeline del Feature Engineering**

In [None]:
import pickle

In [None]:
# Serialize the feature-engineering pipeline to disk
with open('fe_pipeline.pickle', mode='wb') as fe_data_file:
    pickle.dump(pipe_5, fe_data_file)

In [None]:
# Serialize the final model (the fitted `gbc`) to disk
with open('final_model.pickle', mode='wb') as modelFile:
    pickle.dump(gbc, modelFile)

In [None]:
# Load the feature-engineering pipeline back from disk.
# Unpickling can execute arbitrary code: only load pickle files you created or trust.
with open('fe_pipeline.pickle', mode='rb') as fe_data_file:
    fe_final = pickle.load(fe_data_file)

In [None]:
# Load the final model back from disk.
# Unpickling can execute arbitrary code: only load pickle files you created or trust.
with open('final_model.pickle', mode='rb') as modelFile:
    modelo_final = pickle.load(modelFile)

In [None]:
# Smoke-test the reloaded pipeline: transform the train set first, then the test set
X_train_t_p, X_test_t_p = (fe_final.transform(part) for part in (X_train, X_test))

In [None]:
# Rebuild the model matrices from the pipeline output: the cat_cols_2 columns
# first, then the num_cols_2 columns
X_train_woe_enc_tot_p = pd.concat(
    objs=[X_train_t_p[cat_cols_2], X_train_t_p[num_cols_2]],
    axis='columns',
)
X_test_woe_enc_tot_p = pd.concat(
    objs=[X_test_t_p[cat_cols_2], X_test_t_p[num_cols_2]],
    axis='columns',
)

In [None]:
# Smoke-test the reloaded model: score the matrices produced by the reloaded
# pipeline and report train / test ROC-AUC
pred_train_prueba = modelo_final.predict_proba(X_train_woe_enc_tot_p)
pred_test_prueba = modelo_final.predict_proba(X_test_woe_enc_tot_p)
print('GBC Train Prueba roc-auc: {}'.format(roc_auc_score(y_train, pred_train_prueba[:,1])))
print('GBC Test Prueba roc-auc: {}'.format(roc_auc_score(y_test, pred_test_prueba[:,1])))

In [None]:
# Export the raw test features together with the target to CSV (row index not written)
pd.concat(objs=[X_test, y_test], axis='columns').to_csv(
    'Base_SolicitudesCreditoEfectivo_Test.csv',
    index=False,
)