## Setup environment

In [1]:
import pickle
import os
import gc
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append(r'C:\Users\jeanr\Documents\projetos\classificacao\lending-club\global')
from util import *
import optuna
import arfs.feature_selection.allrelevant as arfsgroot
import shap
import json
import joblib
from venn_abers import VennAbersCalibrator

sns.set(style='whitegrid')

  from .autonotebook import tqdm as notebook_tqdm


## Read train, validation, calibration and test data

In [2]:
# Load the four pre-split partitions from the processed-data folder.
# The files are read in the same order as the names on the left-hand side.
train_df, validation_df, calibration_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/processed/train_df.parquet',
        '../data/processed/validation_df.parquet',
        '../data/processed/calibration_df.parquet',
        '../data/processed/feature_engineering_test.parquet',
    )
)

## Preparando os dados

In [3]:
# Profile the raw training frame with the project helper `generate_metadata`,
# ordering the result by the PC_NULOS (null percentage) column.
metadados = generate_metadata(
    train_df,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
# Preview the first ten rows of the metadata table.
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,verification_status_joint,Explicativa,88683,99.93,1,category
1,emp_length,Explicativa,4446,5.01,11,category
2,num_op_rev_tl,Explicativa,0,0.0,48,Int16
3,pub_rec_bankruptcies,Explicativa,0,0.0,9,Int16
4,percent_bc_gt_75,Explicativa,0,0.0,136,float16
5,pct_tl_nvr_dlq,Explicativa,0,0.0,425,float16
6,num_tl_op_past_12m,Explicativa,0,0.0,22,Int16
7,num_tl_90g_dpd_24m,Explicativa,0,0.0,20,Int16
8,num_tl_30dpd,Explicativa,0,0.0,5,Int16
9,num_tl_120dpd_2m,Explicativa,0,0.0,4,Int16


## Excluindo variáveis com 70% ou mais de nulos

In [4]:
# Null-percentage threshold: features at or above this value are dropped.
missing_cutoff = 70

# Metadata rows whose null percentage reaches the cutoff.
drop_vars_nulos = metadados.loc[metadados['PC_NULOS'].ge(missing_cutoff)]
# Names of the features to remove, as a plain Python list.
lista_drop_vars = drop_vars_nulos['FEATURE'].tolist()
print('Variáveis que serão excluídas por alto percentual de nulos: ',lista_drop_vars)
# Remove the high-null columns from the training frame (new frame, train_df untouched).
train_df_02 = train_df.drop(columns=lista_drop_vars)
train_df_02.shape

Variáveis que serão excluídas por alto percentual de nulos:  ['verification_status_joint']


(88744, 101)

In [5]:
# Persist the list of dropped columns as a pickle artifact so the same
# columns can be removed from the other partitions.
with open('../artifacts/prd_drop_nullvars.pkl', 'wb') as artifact_file:
    pickle.dump(lista_drop_vars, artifact_file)

In [6]:
# Reload the dropped-column list from the pickle artifact written above.
# NOTE(review): pickle.load executes arbitrary code on malicious input; only
# load artifacts produced by this project.
with open('../artifacts/prd_drop_nullvars.pkl', 'rb') as artifact_file:
    lista_drop_vars = pickle.load(artifact_file)

## Aplicando a retirada de nulos nas outras bases

In [7]:
  # Remove the high-null columns (list loaded from the artifact) from the validation frame.
validation_df_02 = validation_df.drop(columns=lista_drop_vars)
validation_df_02.shape

(264688, 101)

In [8]:
  # Remove the high-null columns (list loaded from the artifact) from the calibration frame.
calibration_df_02 = calibration_df.drop(columns=lista_drop_vars)
calibration_df_02.shape

(66173, 101)

In [9]:
# Read the feature list from the JSON artifact.
with open('../artifacts/features_list.json', 'r') as features_file:
    features = json.load(features_file)

In [10]:
# Take the target out of the feature list when it is present (first
# occurrence only, mutating the list in place), then keep just those
# columns in the test frame.
try:
    features.remove('default')
except ValueError:
    # 'default' was not listed; nothing to remove.
    pass
test_df = test_df.loc[:, features]

In [11]:
  # Remove the high-null columns (list loaded from the artifact) from the test frame.
test_df_02 = test_df.drop(columns=lista_drop_vars)
test_df_02.shape

(538826, 100)

## Tratamento de nulos

In [12]:
# Remove the target column from the training frame before imputation; it is
# re-attached from train_df when the ABT is built.
# errors='ignore' keeps this cell idempotent: the statement rebinds
# train_df_02 to its own output, so a second run would otherwise raise
# KeyError because 'default' is already gone.
train_df_02 = train_df_02.drop(columns=['default'], errors='ignore')
train_df_02.head()

Unnamed: 0,funded_amnt,funded_amnt_inv,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,...,income_to_funded_ratio,debt_to_income_ratio,funded_amnt_per_income,fico_avg,credit_utilization_ratio,total_credit_lines,delinquency_ratio,int_rate_to_income_ratio,public_records_impact,pct_active_bc
2061403,14400,14400.0,36,0.099915,B,B3,2 years,RENT,66000.0,Source Verified,...,4.583333,0.000121,0.218182,0.502985,0.563946,73,0.333333,1.513857e-06,14400,0.5
1693728,9325,9328.0,36,0.129883,B,B5,10+ years,MORTGAGE,41000.0,Verified,...,4.396783,0.000701,0.227439,0.50274,0.602335,27,0.0,3.167874e-06,0,0.166667
1516805,5000,5000.0,36,0.143066,C,C4,,OWN,72000.0,Verified,...,14.4,0.000368,0.069444,0.502963,0.834756,20,0.076923,1.987033e-06,0,0.25
74963,28000,27952.0,60,0.119873,B,B3,1 year,RENT,120000.0,Verified,...,4.285714,0.000146,0.233333,0.502721,0.446627,36,0.0,9.98942e-07,0,0.4
2107479,10775,10776.0,36,0.156128,D,D1,< 1 year,MORTGAGE,55200.0,Source Verified,...,5.12297,0.000194,0.195199,0.502985,0.612586,15,0.0,2.828405e-06,10775,0.333333


In [13]:
# Impute numeric nulls on the training frame. The helper returns the filled
# frame together with the per-column fill values (`means`).
train_df_03, means = fillna_numeric(train_df_02)

# Persist the fill values so the other partitions can reuse the training statistics.
with open('../artifacts/prd_fillna_num.pkl', 'wb') as artifact_file:
    pickle.dump(means, artifact_file)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(means[col], inplace=True)


In [14]:
# Impute categorical nulls on top of the numerically imputed frame.
# The input must be train_df_03 (output of fillna_numeric), not train_df_02:
# rebinding train_df_03 from train_df_02 would discard the numeric imputation
# whenever the helper works on a copy of its argument.
# The helper returns the filled frame together with the per-column fill
# values (`modes`).
train_df_03, modes = fillna_categorical(train_df_03)

# Persist the fill values so the other partitions can reuse the training statistics.
with open('../artifacts/prd_fillna_catg.pkl', 'wb') as f:
    pickle.dump(modes, f)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(modes[col], inplace=True)


## Aplicando tratamento de nulos nas outras bases

In [15]:
# Reload the numeric fill values saved from the training frame.
with open('../artifacts/prd_fillna_num.pkl', 'rb') as artifact_file:
    loaded_means = pickle.load(artifact_file)

In [16]:
# Reload the categorical fill values saved from the training frame.
with open('../artifacts/prd_fillna_catg.pkl', 'rb') as artifact_file:
    loaded_modes = pickle.load(artifact_file)

In [17]:
# Fill numeric nulls in the validation frame with the fill values loaded
# from the training artifact.
validation_df_03 = fillna_num_prod(
    validation_df_02,
    loaded_means,
)
validation_df_03.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mean_value, inplace=True)


(264688, 101)

In [18]:
# Fill numeric nulls in the calibration frame with the fill values loaded
# from the training artifact.
calibration_df_03 = fillna_num_prod(
    calibration_df_02,
    loaded_means,
)
calibration_df_03.shape

(66173, 101)

In [19]:
# Fill numeric nulls in the test frame with the fill values loaded from the
# training artifact.
test_df_03 = fillna_num_prod(
    test_df_02,
    loaded_means,
)
test_df_03.shape

(538826, 100)

In [20]:
# Apply the loaded categorical fill values to every partition, in the order
# train, validation, calibration, test, rebinding each *_03 name to the result.
train_df_03, validation_df_03, calibration_df_03, test_df_03 = (
    fillna_catg_prod(partition, loaded_modes)
    for partition in (
        train_df_03,
        validation_df_03,
        calibration_df_03,
        test_df_03,
    )
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)


In [21]:
# Sanity check: (rows, columns) of each imputed partition, in the order
# train, calibration, validation, test.
tuple(
    partition.shape
    for partition in (train_df_03, calibration_df_03, validation_df_03, test_df_03)
)

((88744, 100), (66173, 101), (264688, 101), (538826, 100))

In [22]:
# Re-profile the training frame after imputation, ordered by the PC_NULOS
# (null percentage) column, to inspect the remaining nulls.
metadados = generate_metadata(
    train_df_03,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,funded_amnt,Explicativa,0,0.0,1300,Int32
1,num_il_tl,Explicativa,0,0.0,76,Int16
2,percent_bc_gt_75,Explicativa,0,0.0,136,float16
3,pct_tl_nvr_dlq,Explicativa,0,0.0,425,float16
4,num_tl_op_past_12m,Explicativa,0,0.0,22,Int16
5,num_tl_90g_dpd_24m,Explicativa,0,0.0,20,Int16
6,num_tl_30dpd,Explicativa,0,0.0,5,Int16
7,num_tl_120dpd_2m,Explicativa,0,0.0,4,Int16
8,num_sats,Explicativa,0,0.0,56,Int16
9,num_rev_tl_bal_gt_0,Explicativa,0,0.0,35,Int16


## Compondo tabela analítica de modelagem (ABT)

In [23]:
# Re-attach the target column to the imputed training frame.
# Rows are matched on the DataFrame index with an inner join, so only index
# labels present in both frames are kept.
target_column = train_df[['default']]
abt_train = train_df_03.join(target_column, how='inner')

In [24]:
# The remaining partitions become ABTs as independent copies of the imputed
# frames (copied in the order calibration, validation, test).
abt_calibration, abt_validation, abt_test = (
    imputed.copy()
    for imputed in (calibration_df_03, validation_df_03, test_df_03)
)

In [25]:
# Sanity check: (rows, columns) of each ABT, in the order
# train, calibration, validation, test.
tuple(
    abt.shape
    for abt in (abt_train, abt_calibration, abt_validation, abt_test)
)

((88744, 101), (66173, 101), (264688, 101), (538826, 100))

## Salvando ABTs de treino, calibração, validação e teste pós preparação dos dados

In [26]:
# Save training data
abt_train.to_parquet('../data/processed/abt_train.parquet')

# Save calibration data
abt_calibration.to_parquet('../data/processed/abt_calibration.parquet')

# Save validation data
abt_validation.to_parquet('../data/processed/abt_validation.parquet')

# Save test data
abt_test.to_parquet('../data/processed/abt_test.parquet')

In [27]:
# Profile the final training ABT, ordered by the PC_NULOS (null percentage)
# column.
metadados = generate_metadata(
    abt_train,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,funded_amnt,Explicativa,0,0.0,1300,Int32
1,num_op_rev_tl,Explicativa,0,0.0,48,Int16
2,pub_rec_bankruptcies,Explicativa,0,0.0,9,Int16
3,percent_bc_gt_75,Explicativa,0,0.0,136,float16
4,pct_tl_nvr_dlq,Explicativa,0,0.0,425,float16
5,num_tl_op_past_12m,Explicativa,0,0.0,22,Int16
6,num_tl_90g_dpd_24m,Explicativa,0,0.0,20,Int16
7,num_tl_30dpd,Explicativa,0,0.0,5,Int16
8,num_tl_120dpd_2m,Explicativa,0,0.0,4,Int16
9,num_sats,Explicativa,0,0.0,56,Int16


In [28]:
# Profile the final test ABT, ordered by the PC_NULOS (null percentage)
# column.
metadados = generate_metadata(
    abt_test,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,funded_amnt,Explicativa,0,0.0,1560,Int32
1,num_il_tl,Explicativa,0,0.0,101,Int16
2,percent_bc_gt_75,Explicativa,0,0.0,209,float16
3,pct_tl_nvr_dlq,Explicativa,0,0.0,593,float16
4,num_tl_op_past_12m,Explicativa,0,0.0,26,Int16
5,num_tl_90g_dpd_24m,Explicativa,0,0.0,25,Int16
6,num_tl_30dpd,Explicativa,0,0.0,2,Int16
7,num_tl_120dpd_2m,Explicativa,0,0.0,2,Int16
8,num_sats,Explicativa,0,0.0,76,Int16
9,num_rev_tl_bal_gt_0,Explicativa,0,0.0,45,Int16
