<a href="https://colab.research.google.com/github/jpantojaj/DIP-CS_AI/blob/main/Credit_Scoring_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **1. Carga Inicial de Librerías**

In [None]:
import math
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from scipy import stats
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, PowerTransformer

In [None]:
class CategoricalImputerWithDecoding(BaseEstimator, TransformerMixin):
    """Impute categorical columns and return them with their original labels.

    The input is ordinal-encoded, passed through an ``IterativeImputer`` whose
    estimator is a ``RandomForestClassifier``, and then decoded back with the
    encoder's ``inverse_transform``.

    Parameters
    ----------
    random_state : int, default=42
        Seed forwarded to both the random forest and the iterative imputer.
    max_iter : int, default=10
        Maximum number of imputation rounds for the iterative imputer.

    Notes
    -----
    The helper estimators are created in ``__init__`` and stored as
    ``ordinal_encoder`` / ``imputer``; those attribute names are part of the
    state of any already-pickled instance, so they must not be renamed.
    """

    def __init__(self, random_state=42, max_iter=10):
        self.random_state = random_state
        self.max_iter = max_iter
        self.columns_ = None
        self.imputer = IterativeImputer(
            estimator=RandomForestClassifier(random_state=self.random_state),
            max_iter=self.max_iter,
            random_state=self.random_state,
            add_indicator=False,
        )
        # Categories unseen during fit are encoded as -1.
        self.ordinal_encoder = OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        )

    def set_output(self, *, transform=None):
        """Ignore the output-container request and return ``self``.

        ``transform`` always returns a pandas DataFrame regardless of the
        value passed here.
        """
        return self

    def fit(self, X, y=None):
        """Fit the encoder and the imputer on ``X``; ``y`` is ignored.

        Returns
        -------
        self
        """
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        self.columns_ = frame.columns.tolist()
        codes = self.ordinal_encoder.fit_transform(frame)
        self.imputer.fit(codes)
        return self

    def transform(self, X):
        """Return ``X`` imputed and decoded, as a DataFrame.

        Non-DataFrame input is wrapped in a DataFrame using the column names
        remembered from ``fit``. The result keeps the input's index.
        """
        if isinstance(X, pd.DataFrame):
            frame = X
        else:
            frame = pd.DataFrame(X, columns=self.columns_)
        decoded = self.ordinal_encoder.inverse_transform(
            self.imputer.transform(self.ordinal_encoder.transform(frame))
        )
        return pd.DataFrame(decoded, columns=self.columns_, index=frame.index)

In [None]:
class PowerWinsorizer(BaseEstimator, TransformerMixin):
    """Cap extreme values of numeric columns in Yeo-Johnson-transformed space.

    ``fit`` learns a standardized Yeo-Johnson ``PowerTransformer`` and, for
    every column, the lower and upper percentile of the transformed training
    values. ``transform`` applies the power transform, caps the values on the
    side(s) selected by ``capping_method``, maps them back with the inverse
    transform and casts each column to the dtype seen during ``fit``.

    Parameters
    ----------
    capping_method : {'both', 'left', 'right'}, default='both'
        Tail(s) to cap. Any other value leaves the data uncapped.
    lower_percentile : float, default=0.01
        Lower cap as a fraction; multiplied by 100 for ``np.percentile``.
    upper_percentile : float, default=0.99
        Upper cap as a fraction; multiplied by 100 for ``np.percentile``.

    Notes
    -----
    Attribute names (``pt``, ``columns_``, ``capping_values_``,
    ``original_dtypes_``) are kept unchanged so that instances pickled with
    the previous definition still work after unpickling.
    """

    def __init__(self, capping_method='both', lower_percentile=0.01, upper_percentile=0.99):
        self.capping_method = capping_method
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile
        self.pt = PowerTransformer(method='yeo-johnson', standardize=True)
        self.columns_ = None
        self.capping_values_ = {}
        self.original_dtypes_ = None

    def fit(self, X, y=None):
        """Learn the power transform and the per-column caps; ``y`` is ignored.

        A Series is treated as a one-column frame; other non-DataFrame input
        must expose ``shape[1]`` and gets columns named ``col_0``, ``col_1``, ...

        Returns
        -------
        self
        """
        if isinstance(X, pd.Series):
            X = pd.DataFrame(X)
        elif not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=[f'col_{i}' for i in range(X.shape[1])])
        self.columns_ = X.columns.tolist()
        self.original_dtypes_ = X.dtypes
        self.pt.fit(X)
        X_transformed = self.pt.transform(X)

        lower_q = self.lower_percentile * 100
        upper_q = self.upper_percentile * 100
        # Rebuild the dict from scratch so a refit on different columns does
        # not keep caps from a previous fit.
        # NOTE(review): np.percentile yields NaN caps for a column containing
        # NaN, which makes transform leave that column uncapped.
        caps = {}
        for i, col in enumerate(self.columns_):
            lower_cap, upper_cap = np.percentile(X_transformed[:, i], [lower_q, upper_q])
            caps[col] = {'lower_cap': lower_cap, 'upper_cap': upper_cap}
        self.capping_values_ = caps

        return self

    def transform(self, X):
        """Return ``X`` winsorized, as a DataFrame with the input's index.

        Raises
        ------
        ValueError
            If a Series is passed but the transformer was fitted on more than
            one column.
        """
        if isinstance(X, pd.Series):
            # pd.DataFrame(series, columns=...) selects by name instead of
            # renaming, so a Series named differently from the fitted column
            # would lose its data. Relabel explicitly instead.
            X = X.to_frame()
            X.columns = self.columns_
        elif not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X, columns=self.columns_)
        original_index = X.index

        X_capped = self.pt.transform(X).copy()
        cap_right = self.capping_method in ('right', 'both')
        cap_left = self.capping_method in ('left', 'both')
        for i, col in enumerate(self.columns_):
            limits = self.capping_values_[col]
            column = X_capped[:, i]  # basic slice: a view, so writes hit X_capped
            # Boolean masks (rather than np.clip) keep NaN values and NaN caps
            # untouched, because comparisons with NaN are False.
            if cap_right:
                column[column > limits['upper_cap']] = limits['upper_cap']
            if cap_left:
                column[column < limits['lower_cap']] = limits['lower_cap']

        X_final = self.pt.inverse_transform(X_capped)
        X_final_df = pd.DataFrame(X_final, columns=self.columns_, index=original_index)
        # Restore the dtypes recorded at fit time (e.g. integer columns).
        for col in X_final_df.columns:
            if col in self.original_dtypes_.index:
                X_final_df[col] = X_final_df[col].astype(self.original_dtypes_[col])

        return X_final_df

    def set_output(self, *, transform=None):
        """Ignore the output-container request and return ``self``."""
        return self

## **2. Carga y Análisis inicial de datos**

In [None]:
# Load the validation sample from the comma-separated file.
df_val = pd.read_csv(
    'Base_SolicitudesCreditoEfectivo_Val.csv',
    sep=",",
)
# Preview the first rows.
df_val.head()

In [None]:
# Reformat some fields: cast these columns to str.
# Note that astype(str) turns missing values into the literal string 'nan'.
for field in (
    'CODMES',
    'CODSOLICITUD',
    'FLG_GARANTIA',
    'TARJETA_RELACIONADA',
    'VEHICULAR_RELACIONADA',
    'HIPOTECARIO_RELACIONADA',
    'CLASIF_SISTEMA_ULT_12M',
    'FLG_PDH',
    'FLG_TC_VISA',
    'FLG_TC_MC',
):
    df_val[field] = df_val[field].astype(str)

In [None]:
# Remove a field that is not needed.
df_val.drop(columns='MIN_MES_DE_DEFAULT', inplace=True)

In [None]:
# Patch: in every object-dtype column, turn the literal string 'nan' back
# into a real missing value.
text_columns = df_val.select_dtypes(include=['object']).columns
for text_col in text_columns:
    df_val[text_col] = df_val[text_col].replace('nan', np.nan)

In [None]:
# Bar chart of the target classes, followed by their counts and the bad rate.
sns.countplot(data=df_val, x="FLG_DEFAULT_12M")
target_count = df_val["FLG_DEFAULT_12M"].value_counts()
# value_counts is indexed by the target value, so 0 and 1 are label lookups.
n_good = target_count[0]
print('# Buen_Pagador:', n_good)
n_bad = target_count[1]
print('# 1 Mora_12M:', n_bad)
print('Bad rate:', n_bad/(n_good+n_bad))

In [None]:
# Mean of the default flag per CODMES value (i.e. the bad rate per CODMES),
# plotted as a line with the y-axis fixed to the 0-15% range.
a2=df_val.pivot_table(values="FLG_DEFAULT_12M", index="CODMES", aggfunc="mean", sort=True)
a2.plot(alpha = 0.4, figsize=(9,4), ylim=(0.0,0.15))

## **3. Carga de los artefactos desarrollados en el entrenamiento**

In [None]:
import pickle

In [None]:
pip install feature_engine

In [None]:
# Load the feature-engineering pipeline.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# NOTE(review): presumably the pickle references the custom transformer classes
# defined earlier in this notebook, so those cells must run first — confirm.
with open('fe_pipeline.pkl','rb') as fe_data_file:
     fe_final = pickle.load(fe_data_file)

In [None]:
# Load the model.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('final_model.pkl','rb') as modelFile:
     modelo_final = pickle.load(modelFile)

In [None]:
# Try out the pipeline: keep the target aside and transform the remaining columns.
df_val_y = df_val['FLG_DEFAULT_12M']
df_val_xt = fe_final.transform(
    df_val.drop(columns=['FLG_DEFAULT_12M'])
)

In [None]:
# Put the transformed features and the target side by side (axis=1).
# pd.concat aligns rows on the index; this assumes df_val_xt keeps df_val's
# index — TODO confirm.
tot_df_val_xt=pd.concat([df_val_xt, df_val_y],axis=1)

## **4. Pruebas de Validación**

### **Bootstrapping**

In [None]:
from sklearn.model_selection  import KFold, StratifiedKFold, cross_val_score
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score

In [None]:
# Number of bootstrap resamples to draw.
bootstrap_iter = 50

In [None]:
# Accumulates one ROC AUC value per bootstrap resample.
roc_auc = []

In [None]:
# Bootstrap the ROC AUC of the already-trained model on the validation set.
#
# The model is not refitted, so the validation set is scored once and only
# the (label, score) pairs are resampled; this assumes predict_proba scores
# each row independently of the others.
y_score = modelo_final.predict_proba(df_val_xt)[:, 1]

# Start from an empty list so re-running this cell does not pile up results.
roc_auc = []
for i in range(bootstrap_iter):
    # Seeding each draw with the iteration number makes the run reproducible.
    y_, score_ = resample(df_val_y, y_score, random_state=i)
    roc_auc.append(roc_auc_score(y_, score_))

In [None]:
# Convert the list of bootstrap AUC values to a NumPy array for vectorized arithmetic.
roc_auc_final = np.array(roc_auc)

In [None]:
# Display the bootstrap AUC values.
roc_auc_final

In [None]:
# Gini coefficient derived from each AUC value: Gini = 2 * AUC - 1.
gini_final=2*roc_auc_final-1

In [None]:
# Display the bootstrap Gini values.
gini_final

In [None]:
# Summary of the bootstrap Gini values: mean and standard deviation
# (NumPy's default, i.e. ddof=0).
print('Gini')
for label, value in (
    ('Average: ', gini_final.mean()),
    ('Standard deviation: ', gini_final.std()),
):
    print(label, value)

In [None]:
# Kernel density estimate of the bootstrap Gini values.
# `fill=True` replaces `shade=True`, which seaborn deprecated in favour of
# `fill`; the deprecation warning was hidden by the notebook's
# warnings.filterwarnings('ignore').
sns.kdeplot(gini_final, fill=True, color='blue')
plt.title('Gráfico de Gini')
plt.xlabel('Valor')
plt.ylabel('Densidad')
plt.grid(True)
plt.show()