In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from functions import *

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance

import mlflow
import optuna
from optuna.samplers import RandomSampler, TPESampler
import pickle

from helper import Helper
#Sparky
from sparky_bc.sparky import Sparky

In [None]:
# Sparky handle. In this notebook it is only referenced by the disabled query in the
# data-loading cell. NOTE(review): the user name and host are hard-coded here —
# consider moving them to configuration / environment variables before sharing.
SPARKY_ARGS = ('lasalaza', 'IMPALA_PROD')
SPARKY_HOST = "sbmdeblze004.bancolombia.corp"
sp = Sparky(*SPARKY_ARGS, hostname=SPARKY_HOST)

In [None]:
# Seed NumPy's global generator so later np.random draws are repeatable on a clean run.
np.random.seed(42)

In [None]:
# Constants.

# Seed passed as random_state to the gradient-boosting models and to permutation_importance.
SEED = 42

# Consecutive cut points of the G bands: band G<k> runs from cut point k-1 to cut point k.
# NOTE(review): presumably these are limits on the predicted default probability —
# confirm against the helpers in `functions` that consume LIMITES_G.
_CORTES_G = [0, 0.009, 0.017, 0.03, 0.05, 0.08, 0.12, 0.28, 1]

# Band label -> [lower limit, upper limit], built from adjacent cut points.
LIMITES_G = {
    f'G{k}': [inferior, superior]
    for k, (inferior, superior) in enumerate(zip(_CORTES_G, _CORTES_G[1:]), start=1)
}

In [None]:
# Model-building base, read from a local CSV file.
# Alternative source (disabled): the same base can be pulled through Sparky with
#   sp.helper.obtener_dataframe("SELECT * FROM resultados_bipa_vpr.score_orig_pasivos_17939_base_entrenamiento_v2")
RUTA_BASE = 'Dataset_curso_ML_python.csv'

df_desembolsos = pd.read_csv(RUTA_BASE)

# (rows, columns) of the loaded base.
df_desembolsos.shape

In [None]:
# Show the loaded base as the cell output.
df_desembolsos

In [None]:
# Default rate of the full base. NOTE(review): get_default_rate is not defined in this
# notebook — presumably it comes from `from functions import *`; confirm there what it returns.
get_default_rate(df_desembolsos)

In [None]:
# Columns left out of the predictor list; `default` is the target used further down,
# and `f_analisis` is the column the time-based partition is made on.
columnas_excluidas = ['id', 'tipo_doc', 'f_analisis', 'llave_sistema', 'segm', 'default']

variables_predictoras = [
    columna
    for columna in df_desembolsos.columns
    if columna not in columnas_excluidas
]
len(variables_predictoras)

In [None]:
# Two pure-noise columns used as a baseline for feature selection: the selection
# cells further down keep only the predictors whose importance beats the noise.
#
# The draws come from a dedicated generator seeded with SEED instead of the global
# np.random stream, so re-running this cell (or any earlier cell that consumes random
# numbers) always reproduces the same noise and therefore the same selection.
# RandomState is used rather than default_rng because it is the same legacy stream
# that np.random.seed(42) selects; on a clean Run All the values should match the
# previous ones, provided nothing before this cell consumed global random numbers.
rng_ruido = np.random.RandomState(SEED)
n_filas = len(df_desembolsos.index)

# Draw order (normal first, then uniform) is kept from the original cell.
df_desembolsos = df_desembolsos.assign(
    random_normal=rng_ruido.normal(0, 1, size=n_filas),
    random_uniform=rng_ruido.uniform(0, 1, size=n_filas),
)

In [None]:
# Client count and default rate per analysis date.

df_fechas = (
    df_desembolsos
    .groupby('f_analisis')
    .agg(clientes=('id', 'count'), default=('default', 'sum'))
    .reset_index()
)

# Share of clients per date, its running total, and the observed default rate (TDO).
total_clientes = df_fechas['clientes'].sum()
df_fechas['%_clientes'] = df_fechas['clientes'] / total_clientes
df_fechas['%_clientes_acum'] = df_fechas['%_clientes'].cumsum()
df_fechas['TDO'] = df_fechas['default'] / df_fechas['clientes']

df_fechas

In [None]:
# Line chart of the default rate (TDO) per analysis date.
# Dates are converted to strings so they are plotted as categories.
x = list(map(str, df_fechas['f_analisis']))
y = df_fechas['TDO']

fig, ax = plt.subplots()
fig.set_figwidth(15)

ax.plot(x, y)
ax.set(
    ylabel='Tasa de Default',
    xlabel='Fecha de Análisis',
    ylim=[0.02, 0.2],
)
ax.grid(color='c', linestyle='--', linewidth=1)

# Vertical tick labels so the dates do not overlap.
plt.xticks(rotation=90)
plt.show()

# 1. Particionamiento de la base

In [None]:
# Time-based partition: analysis dates up to the cutoff form the training base,
# later dates are held out as the out-of-time sample.
FECHA_CORTE = 201809

es_entrenamiento = df_desembolsos['f_analisis'] <= FECHA_CORTE
es_fuera_tiempo = df_desembolsos['f_analisis'] > FECHA_CORTE

df_entrenamiento = df_desembolsos[es_entrenamiento]
df_fuera_tiempo = df_desembolsos[es_fuera_tiempo]

print('Entrenamiento', df_entrenamiento.shape)
print('Fuera de tiempo', df_fuera_tiempo.shape)

In [None]:
# Feature matrices and targets for both partitions: all candidate predictors plus
# the two noise columns.
columnas_modelo = variables_predictoras + ['random_uniform', 'random_normal']

X = df_entrenamiento[columnas_modelo]
y = df_entrenamiento['default']

X_oot = df_fuera_tiempo[columnas_modelo]
y_oot = df_fuera_tiempo['default']

In [None]:
# 70/30 random split of the training base. random_state uses the notebook-wide SEED
# constant (same value as the previous literal) so every seeded step shares one setting.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [None]:
# Shape and observed default rate of each partition.
particiones = [
    ('Entrenamiento:', X_train, y_train),
    ('Prueba', X_test, y_test),
    ('Fuera de tiempo (OOT)', X_oot, y_oot),
]
for etiqueta, X_particion, y_particion in particiones:
    print(etiqueta, X_particion.shape, 'Tasa Default:', sum(y_particion) / len(y_particion))

# 2. Ensamble débil para depurar variables

In [None]:
# Random forest used only to screen variables against the noise columns.
# random_state uses the notebook-wide SEED constant (same value as the previous literal).
clf = RandomForestClassifier(max_depth=10, n_estimators=100, verbose=2, random_state=SEED)
clf.fit(X_train, y_train)

In [None]:
# Pair every training column with the forest's `feature_importances_` score.
importancias = {
    columna: importancia
    for columna, importancia in zip(X_train.columns, clf.feature_importances_)
}
importancias

In [None]:
# Importances as a two-column frame, most important variable first.
df_importancias = (
    pd.DataFrame(importancias.items(), columns=['variable', 'importancia'])
    .sort_values(by='importancia', ascending=False)
)
df_importancias

In [None]:
# Keep only the variables that are more important than BOTH noise columns.
# The threshold is the larger of the two noise importances, taken with .max() so the
# result does not depend on df_importancias having been sorted in descending order.
es_ruido = df_importancias['variable'].isin(['random_uniform', 'random_normal'])
umbral = df_importancias.loc[es_ruido, 'importancia'].max()

supera_umbral = df_importancias['importancia'] > umbral
variables_predictoras = df_importancias.loc[supera_umbral, 'variable'].tolist()
len(variables_predictoras)

# 3. Búsqueda de Hiperparámetros (espacio amplio)

In [None]:
# Rebuild the matrices with the reduced predictor list. The two noise columns stay in
# the training matrix because the permutation-importance step further down uses them
# as the baseline again.
columnas_entrenamiento = variables_predictoras + ['random_uniform', 'random_normal']

X, y = df_entrenamiento[columnas_entrenamiento], df_entrenamiento['default']
# NOTE(review): X_oot omits the noise columns, so a model fitted on X cannot score it as is.
X_oot, y_oot = df_fuera_tiempo[variables_predictoras], df_fuera_tiempo['default']

# random_state uses the notebook-wide SEED constant (same value as the previous literal).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [None]:
def objective(trial):
    """Optuna objective for the wide hyper-parameter search.

    Samples one HistGradientBoostingClassifier configuration from ``trial``, fits it
    on the notebook-level ``X_train``/``y_train``, scores the train and test splits
    with ``get_metrics`` and logs parameters and metrics to a nested MLflow run.

    Parameters
    ----------
    trial : optuna trial
        Object used to sample the hyper-parameters (``suggest_float`` / ``suggest_int``).

    Returns
    -------
    The sum of ``n_en_rango_g1_g6_entr`` (train) and ``n_en_rango_g1_g6_prueba`` (test)
    taken from the ``get_metrics`` results. It is logged under the name ``loss``, but
    the study that runs this objective is created with ``direction='maximize'``.
    NOTE(review): presumably these metrics count clients falling in bands G1-G6 —
    confirm in ``functions.get_metrics``.
    """
    # The suggest_* calls stay in this exact order: a seeded sampler hands out values
    # sequentially, so reordering them would change what each trial receives.
    params = {}
    params['learning_rate'] = trial.suggest_float('learning_rate', 0.05, 0.8)
    params['max_iter'] = trial.suggest_int('max_iter', 50, 800)
    params['max_leaf_nodes'] = trial.suggest_int('max_leaf_nodes', 10, 80)
    params['max_depth'] = trial.suggest_int('max_depth', 3, 30)
    params['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 1000, 15000)
    params['l2_regularization'] = trial.suggest_float('l2_regularization', 0, 0.5)
    params['max_bins'] = trial.suggest_int('max_bins', 5, 255)
    params['validation_fraction'] = trial.suggest_float('validation_fraction', 0.05, 0.5)

    modelo = HistGradientBoostingClassifier(random_state=SEED, **params)
    modelo.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    proba_entr = modelo.predict_proba(X_train)[:, 1]
    proba_prueba = modelo.predict_proba(X_test)[:, 1]

    metrics_train = get_metrics(y_train, proba_entr, [('G1', 'G6')], LIMITES_G, 'entr')
    metrics_test = get_metrics(y_test, proba_prueba, [('G1', 'G6')], LIMITES_G, 'prueba')

    loss = metrics_train['n_en_rango_g1_g6_entr'] + metrics_test['n_en_rango_g1_g6_prueba']

    # One nested MLflow run per trial, under whichever run is active when the study runs.
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        mlflow.log_metrics(metrics_train)
        mlflow.log_metrics(metrics_test)
        mlflow.log_metric('loss', loss)

    return loss

In [None]:
# Run the optimisation over the wide search space, under a parent MLflow run.

with mlflow.start_run() as run:
    # Printed in the form of an MLflow filter on the parent run id of the nested trial runs.
    print(f"tags.mlflow.parentRunId = '{run.info.run_id}'")

    # Seeded random sampling.
    sampler = RandomSampler(seed=10)

    study = optuna.create_study(
        sampler=sampler,
        direction='maximize',
    )
    study.optimize(
        objective,
        n_trials=5,
        show_progress_bar=True,
    )

### Reto 1
El modelo propuesto se entrena usando Histogram-based Gradient Boosting Classification Tree. Se quiere implementar el mismo modelo usando LightGBM: ¿cómo podemos hacerlo?

In [None]:
# Best hyper-parameters found by the wide-space study (`study.best_params`).

params = study.best_params
params

In [None]:
# Refit the best configuration of the wide search and score both splits.

clf = HistGradientBoostingClassifier(random_state=SEED, **params)
clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
y_proba_train, y_proba_test = (
    clf.predict_proba(particion)[:, 1] for particion in (X_train, X_test)
)

In [None]:
# Table with the default rate (TDO) and % of clients per G band on the training split.

cumulative_gains_table(y_train, y_proba_train, LIMITES_G, percentage=True)

In [None]:
# Table with the default rate (TDO) and % of clients per G band on the test split.

cumulative_gains_table(y_test, y_proba_test, LIMITES_G, percentage=True)

### Reto 2
Mostrar en la tabla anterior dos columnas que indiquen la proporción de clientes buenos y malos por cada G sobre la distribución total.

In [None]:
# Hyper-parameter importances for the wide-space study.

optuna.visualization.plot_param_importances(study)

In [None]:
# Permutation importances of the best model from the wide search, on the training split.

importances = permutation_importance(estimator=clf, X=X_train, y=y_train, n_repeats=5, n_jobs=-1, random_state=SEED)

# Label each score with the columns of the matrix that was actually permuted.
# Reading the names from X_train (rather than from the global variables_predictoras,
# which the selection cell overwrites) keeps names and scores aligned even if cells
# are re-run out of order; zip would otherwise mislabel or truncate silently.
importances_dict = dict(zip(X_train.columns, importances['importances_mean']))

importances_df = pd.DataFrame(importances_dict.items(), columns=['feature', 'importance'])
importances_df = importances_df.sort_values(by='importance', ascending=False)

importances_df

In [None]:
# Keep the variables that are more important than both random-noise columns.
# The threshold is the larger of the two noise importances, taken with .max() so the
# result does not depend on importances_df having been sorted in descending order.

es_ruido = importances_df['feature'].isin(['random_uniform', 'random_normal'])
threshold = importances_df.loc[es_ruido, 'importance'].max()
variables_predictoras = importances_df.loc[importances_df['importance'] > threshold, 'feature'].tolist()

len(variables_predictoras)

# 4. Búsqueda de Hiperparámetros (espacio acotado)

In [None]:
# Rebuild the matrices with the reduced predictor list. The two noise columns stay in
# the training matrix because the permutation-importance step further down uses them
# as the baseline again.
columnas_entrenamiento = variables_predictoras + ['random_uniform', 'random_normal']

X, y = df_entrenamiento[columnas_entrenamiento], df_entrenamiento['default']
# NOTE(review): X_oot omits the noise columns, so a model fitted on X cannot score it as is.
X_oot, y_oot = df_fuera_tiempo[variables_predictoras], df_fuera_tiempo['default']

# random_state uses the notebook-wide SEED constant (same value as the previous literal).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [None]:
def objective(trial):
    """Optuna objective for the second hyper-parameter search.

    Samples one HistGradientBoostingClassifier configuration from ``trial``, fits it
    on the notebook-level ``X_train``/``y_train``, scores the train and test splits
    with ``get_metrics`` and logs parameters and metrics to a nested MLflow run.

    NOTE(review): this redefines ``objective`` with the same search ranges and logic
    as the definition used for the wide search, although the section is titled
    "espacio acotado" (narrowed space). Either narrow the ranges here or reuse the
    earlier definition — it reads X_train/y_train at call time, so it would already
    see the rebuilt matrices.

    Parameters
    ----------
    trial : optuna trial
        Object used to sample the hyper-parameters (``suggest_float`` / ``suggest_int``).

    Returns
    -------
    The sum of ``n_en_rango_g1_g6_entr`` (train) and ``n_en_rango_g1_g6_prueba`` (test)
    taken from the ``get_metrics`` results. It is logged under the name ``loss``, but
    the study that runs this objective is created with ``direction='maximize'``.
    """
    # The suggest_* calls stay in this exact order: a seeded sampler hands out values
    # sequentially, so reordering them would change what each trial receives.
    params = {}
    params['learning_rate'] = trial.suggest_float('learning_rate', 0.05, 0.8)
    params['max_iter'] = trial.suggest_int('max_iter', 50, 800)
    params['max_leaf_nodes'] = trial.suggest_int('max_leaf_nodes', 10, 80)
    params['max_depth'] = trial.suggest_int('max_depth', 3, 30)
    params['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 1000, 15000)
    params['l2_regularization'] = trial.suggest_float('l2_regularization', 0, 0.5)
    params['max_bins'] = trial.suggest_int('max_bins', 5, 255)
    params['validation_fraction'] = trial.suggest_float('validation_fraction', 0.05, 0.5)

    modelo = HistGradientBoostingClassifier(random_state=SEED, **params)
    modelo.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    proba_entr = modelo.predict_proba(X_train)[:, 1]
    proba_prueba = modelo.predict_proba(X_test)[:, 1]

    metrics_train = get_metrics(y_train, proba_entr, [('G1', 'G6')], LIMITES_G, 'entr')
    metrics_test = get_metrics(y_test, proba_prueba, [('G1', 'G6')], LIMITES_G, 'prueba')

    loss = metrics_train['n_en_rango_g1_g6_entr'] + metrics_test['n_en_rango_g1_g6_prueba']

    # One nested MLflow run per trial, under whichever run is active when the study runs.
    with mlflow.start_run(nested=True):
        mlflow.log_params(params)
        mlflow.log_metrics(metrics_train)
        mlflow.log_metrics(metrics_test)
        mlflow.log_metric('loss', loss)

    return loss

In [None]:
# Run the second optimisation (TPE sampler), under a parent MLflow run.

with mlflow.start_run() as run:
    # Printed in the form of an MLflow filter on the parent run id of the nested trial runs.
    print(f"tags.mlflow.parentRunId = '{run.info.run_id}'")

    # Fixed seed so the sampler behaves deterministically.
    sampler = TPESampler(seed=10)

    study_2 = optuna.create_study(
        sampler=sampler,
        direction='maximize',
    )
    study_2.optimize(
        objective,
        n_trials=5,
        show_progress_bar=True,
    )

In [None]:
# Best hyper-parameters (best model) found by the second study (`study_2.best_params`).

params_2 = study_2.best_params
params_2

In [None]:
# Refit the best configuration of the second search and score both splits.

clf = HistGradientBoostingClassifier(random_state=SEED, **params_2)
clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
y_proba_train, y_proba_test = (
    clf.predict_proba(particion)[:, 1] for particion in (X_train, X_test)
)

In [None]:
# Table with the default rate (TDO) and % of clients per G band on the training split.

cumulative_gains_table(y_train, y_proba_train, LIMITES_G, percentage=True)

In [None]:
# Table with the default rate (TDO) and % of clients per G band on the test split.

cumulative_gains_table(y_test, y_proba_test, LIMITES_G, percentage=True)

In [None]:
# Hyper-parameter importances for the second study.

optuna.visualization.plot_param_importances(study_2)

In [None]:
# Permutation importances of the best model from the second search, on the training split.

importances = permutation_importance(estimator=clf, X=X_train, y=y_train, n_repeats=5, n_jobs=-1, random_state=SEED)

# Label each score with the columns of the matrix that was actually permuted.
# Reading the names from X_train (rather than from the global variables_predictoras,
# which the selection cell overwrites) keeps names and scores aligned even if cells
# are re-run out of order; zip would otherwise mislabel or truncate silently.
importances_dict = dict(zip(X_train.columns, importances['importances_mean']))

importances_df = pd.DataFrame(importances_dict.items(), columns=['feature', 'importance'])
importances_df = importances_df.sort_values(by='importance', ascending=False)

importances_df

In [None]:
# Keep the variables that are more important than both random-noise columns.
# The threshold is the larger of the two noise importances, taken with .max() so the
# result does not depend on importances_df having been sorted in descending order.

es_ruido = importances_df['feature'].isin(['random_uniform', 'random_normal'])
threshold = importances_df.loc[es_ruido, 'importance'].max()
variables_predictoras = importances_df.loc[importances_df['importance'] > threshold, 'feature'].tolist()

len(variables_predictoras)

# 5. Mejor Modelo Seleccionado

In [None]:
# Final matrices: only the selected predictors, without the noise columns, for both
# the training base and the out-of-time partition.
X, y = df_entrenamiento[variables_predictoras], df_entrenamiento['default']
X_oot, y_oot = df_fuera_tiempo[variables_predictoras], df_fuera_tiempo['default']

# random_state uses the notebook-wide SEED constant (same value as the previous literal).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [None]:
# Final model: best hyper-parameters of the second study, fitted on the selected predictors.

clf = HistGradientBoostingClassifier(random_state=SEED).set_params(**params_2)
clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class.
y_proba_train = clf.predict_proba(X_train)[:, 1]
y_proba_test = clf.predict_proba(X_test)[:, 1]

# The out-of-time partition was built alongside X but never scored; score it here,
# since X_oot now has exactly the columns the model was fitted on.
y_proba_oot = clf.predict_proba(X_oot)[:, 1]

# Table with the default rate (TDO) and % of clients per G band on the out-of-time partition.
cumulative_gains_table(y_oot, y_proba_oot, LIMITES_G, percentage=True)

In [None]:
# Table with the default rate (TDO) and % of clients per G band on the training split.

cumulative_gains_table(y_train, y_proba_train, LIMITES_G, percentage=True)

In [None]:
# Table with the default rate (TDO) and % of clients per G band on the test split.

cumulative_gains_table(y_test, y_proba_test, LIMITES_G, percentage=True)

In [None]:
# Save the final model as a pickle file in the working directory.
# NOTE(review): only load this file back in a trusted environment — unpickling executes code.

with open('modelo_score.pkl', 'wb') as archivo_pkl:
    pickle.dump(clf, archivo_pkl)