## Setup environment

In [1]:
import pickle
import os
import gc
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import sys
# NOTE(review): hard-coded, machine-specific absolute Windows path. The notebook
# only runs on a machine where this directory exists; consider a path relative
# to the repository or an environment variable.
sys.path.append(r'C:\Users\jeanr\Documents\projetos\classificacao\lending-club\global')
# NOTE(review): star import hides where names come from. Helpers used later in
# this notebook (generate_metadata, fillna_numeric, fillna_categorical,
# fillna_num_prod, fillna_catg_prod) are not imported anywhere else in view,
# so they presumably come from this module -- confirm and import them by name.
from util import *
import optuna
import arfs.feature_selection.allrelevant as arfsgroot
import shap
import json
import joblib
from venn_abers import VennAbersCalibrator

# Use seaborn's 'whitegrid' style for plots.
sns.set(style='whitegrid')

  from .autonotebook import tqdm as notebook_tqdm


## Read train, validation, calibration and test data

In [2]:
# Read the four partitions in a fixed order: train, validation, calibration, test.
# The test partition comes from a differently named file
# (feature_engineering_test.parquet) than the other three (*_df.parquet).
train_df, validation_df, calibration_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/processed/train_df.parquet',
        '../data/processed/validation_df.parquet',
        '../data/processed/calibration_df.parquet',
        '../data/processed/feature_engineering_test.parquet',
    )
)

## Preparando os dados

In [3]:
# Build the per-column metadata table for the raw training frame, declaring
# 'id'/'issue_d' as identifiers and 'default' as the target, and preview it.
metadados = generate_metadata(
    train_df,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,verification_status_joint,Explicativa,354761,99.94,1,category
1,emp_length,Explicativa,18017,5.08,11,category
2,num_rev_accts,Explicativa,0,0.0,84,Int16
3,tax_liens,Explicativa,0,0.0,25,Int16
4,pub_rec_bankruptcies,Explicativa,0,0.0,12,Int16
5,percent_bc_gt_75,Explicativa,0,0.0,184,float16
6,pct_tl_nvr_dlq,Explicativa,0,0.0,503,float16
7,num_tl_op_past_12m,Explicativa,0,0.0,28,Int16
8,num_tl_90g_dpd_24m,Explicativa,0,0.0,23,Int16
9,num_tl_30dpd,Explicativa,0,0.0,5,Int16


## Excluindo variáveis com 70% ou mais de nulos

In [4]:
# Null percentage (PC_NULOS is expressed in percent) at or above which a
# column is discarded.
missing_cutoff = 70

# Metadata rows for the columns that reach the cutoff, and their names.
drop_vars_nulos = metadados.loc[metadados['PC_NULOS'] >= missing_cutoff]
lista_drop_vars = list(drop_vars_nulos['FEATURE'].values)
print('Variáveis que serão excluídas por alto percentual de nulos: ', lista_drop_vars)

# Remove those columns from the training frame and show the resulting shape.
train_df_02 = train_df.drop(columns=lista_drop_vars)
train_df_02.shape

Variáveis que serão excluídas por alto percentual de nulos:  ['verification_status_joint']


(354976, 100)

In [5]:
# Persist the list of dropped column names as a pickle artifact.
with open('../artifacts/prd_drop_nullvars.pkl', 'wb') as artifact:
    pickle.dump(lista_drop_vars, artifact)

In [6]:
# Reload the dropped-column list from the pickle artifact.
# NOTE: pickle.load can execute arbitrary code; only load artifacts this
# project produced itself.
with open('../artifacts/prd_drop_nullvars.pkl', 'rb') as artifact:
    lista_drop_vars = pickle.load(artifact)

## Aplicando a retirada de nulos nas outras bases

In [7]:
  # retirando lista de variáveis com alto percentual de nulos
# Apply the train-derived drop list to the validation split.
validation_df_02 = validation_df.drop(columns=lista_drop_vars)
validation_df_02.shape

(264688, 100)

In [8]:
  # retirando lista de variáveis com alto percentual de nulos
# Apply the train-derived drop list to the calibration split.
calibration_df_02 = calibration_df.drop(columns=lista_drop_vars)
calibration_df_02.shape

(66173, 100)

In [9]:
# Read the list of feature names stored in features_list.json.
with open('../artifacts/features_list.json', 'r') as features_file:
    features = json.load(features_file)

In [10]:
# Strip the target name from the feature list (first occurrence, in place),
# then keep only the listed columns of the test frame.
try:
    features.remove('default')
except ValueError:
    # 'default' was not in the list; nothing to strip.
    pass
test_df = test_df[features]

In [11]:
  # retirando lista de variáveis com alto percentual de nulos
# Apply the train-derived drop list to the test split.
test_df_02 = test_df.drop(columns=lista_drop_vars)
test_df_02.shape

(538826, 99)

## Tratamento de nulos

In [12]:
# Feature-only training frame: the high-null columns and the 'default' target
# are both removed. It is rebuilt from `train_df` rather than from
# `train_df_02` itself, so re-running this cell gives the same result instead
# of raising KeyError because 'default' has already been dropped.
train_df_02 = train_df.drop(columns=[*lista_drop_vars, 'default'])
train_df_02.head()

Unnamed: 0,loan_amnt,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,dti,...,income_to_loan_ratio,debt_to_income_ratio,loan_amnt_per_income,fico_avg,credit_utilization_ratio,total_credit_lines,delinquency_ratio,int_rate_to_income_ratio,public_records_impact,pct_active_bc
2161167,27000,36,0.165527,D,D2,10+ years,MORTGAGE,77000.0,Verified,14.140625,...,2.851852,0.000184,0.350649,0.503008,0.89952,42,0.096774,2e-06,0,0.428571
157251,10000,36,0.153076,C,C2,10+ years,OWN,49417.160156,Source Verified,4.179688,...,4.941716,8.5e-05,0.202359,0.502941,0.497266,19,0.0,3e-06,10000,0.181818
2285898,6500,36,0.09491,B,B2,10+ years,MORTGAGE,41600.0,Source Verified,22.125,...,6.4,0.000532,0.15625,0.502797,0.361711,27,0.058824,2e-06,0,0.5
2085252,9000,36,0.109924,B,B4,,MORTGAGE,26520.0,Verified,27.09375,...,2.946667,0.001022,0.339367,0.502985,0.575702,48,0.0,4e-06,9000,0.35
42558,10400,60,0.175659,D,D2,2 years,RENT,105680.0,Source Verified,18.515625,...,10.161538,0.000175,0.09841,0.502963,0.882476,58,0.0,2e-06,0,0.470588


In [13]:
# Fill missing numeric values in the training features. The helper also
# returns the per-column fill values (bound to `means`).
train_df_03, means = fillna_numeric(train_df_02)

# Persist the fill values so the same imputation can be applied elsewhere.
with open('../artifacts/prd_fillna_num.pkl', 'wb') as artifact:
    pickle.dump(means, artifact)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(means[col], inplace=True)


In [14]:
# Fill missing categorical values on top of the numerically imputed frame.
# The input must be `train_df_03`, not `train_df_02`: this call rebinds
# `train_df_03`, so starting from the pre-imputation frame would keep the
# numeric fill only if the helpers happen to mutate their argument in place
# (the pandas warning printed by these helpers says that will stop working).
train_df_03, modes = fillna_categorical(train_df_03)

# Persist the per-column fill values (bound to `modes`) for the other splits.
with open('../artifacts/prd_fillna_catg.pkl', 'wb') as artifact:
    pickle.dump(modes, artifact)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(modes[col], inplace=True)


## Aplicando tratamento de nulos nas outras bases

In [15]:
# Reload the numeric fill values from their pickle artifact.
# NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
with open('../artifacts/prd_fillna_num.pkl', 'rb') as artifact:
    loaded_means = pickle.load(artifact)

In [16]:
# Reload the categorical fill values from their pickle artifact.
# NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
with open('../artifacts/prd_fillna_catg.pkl', 'rb') as artifact:
    loaded_modes = pickle.load(artifact)

In [17]:
# Apply the stored numeric fill values to the validation split.
validation_df_03 = fillna_num_prod(
    validation_df_02,
    loaded_means,
)
validation_df_03.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mean_value, inplace=True)


(264688, 100)

In [18]:
# Apply the stored numeric fill values to the calibration split.
calibration_df_03 = fillna_num_prod(
    calibration_df_02,
    loaded_means,
)
calibration_df_03.shape

(66173, 100)

In [19]:
# Apply the stored numeric fill values to the test split.
test_df_03 = fillna_num_prod(
    test_df_02,
    loaded_means,
)
test_df_03.shape

(538826, 99)

In [20]:
# Apply the stored categorical fill values to every split, in the order
# train, validation, calibration, test, rebinding each name to its result.
train_df_03, validation_df_03, calibration_df_03, test_df_03 = (
    fillna_catg_prod(split_frame, loaded_modes)
    for split_frame in (
        train_df_03,
        validation_df_03,
        calibration_df_03,
        test_df_03,
    )
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)


In [21]:
# Shapes of the imputed frames, in the order train, calibration, validation, test.
tuple(
    split_frame.shape
    for split_frame in (train_df_03, calibration_df_03, validation_df_03, test_df_03)
)

((354976, 99), (66173, 100), (264688, 100), (538826, 99))

In [22]:
# Recompute the metadata table on the imputed training features and preview
# it to check the remaining null counts.
metadados = generate_metadata(
    train_df_03,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,loan_amnt,Explicativa,0,0.0,1366,Int32
1,tax_liens,Explicativa,0,0.0,25,Int16
2,percent_bc_gt_75,Explicativa,0,0.0,184,float16
3,pct_tl_nvr_dlq,Explicativa,0,0.0,503,float16
4,num_tl_op_past_12m,Explicativa,0,0.0,28,Int16
5,num_tl_90g_dpd_24m,Explicativa,0,0.0,23,Int16
6,num_tl_30dpd,Explicativa,0,0.0,5,Int16
7,num_tl_120dpd_2m,Explicativa,0,0.0,6,Int16
8,num_sats,Explicativa,0,0.0,65,Int16
9,num_rev_tl_bal_gt_0,Explicativa,0,0.0,42,Int16


## Compondo tabela analítica de modelagem (ABT)

In [23]:
# Bring the target back onto the prepared training features: an index-aligned
# inner join with the single 'default' column of the original training frame.
abt_train = pd.merge(
    train_df_03,
    train_df[['default']],
    how='inner',
    left_index=True,
    right_index=True,
)

In [24]:
# The remaining ABTs are independent copies of the imputed frames.
abt_calibration, abt_validation, abt_test = (
    split_frame.copy()
    for split_frame in (calibration_df_03, validation_df_03, test_df_03)
)

In [25]:
# Shapes of the four ABTs, in the order train, calibration, validation, test.
tuple(
    abt_frame.shape
    for abt_frame in (abt_train, abt_calibration, abt_validation, abt_test)
)

((354976, 100), (66173, 100), (264688, 100), (538826, 99))

## Salvando as ABTs de treino, calibração, validação e teste pós preparação dos dados

In [26]:
# Save training data
abt_train.to_parquet('../data/processed/abt_train.parquet')

# Save calibration data
abt_calibration.to_parquet('../data/processed/abt_calibration.parquet')

# Save validation data
abt_validation.to_parquet('../data/processed/abt_validation.parquet')

# Save test data
abt_test.to_parquet('../data/processed/abt_test.parquet')

In [27]:
# Metadata table for the final training ABT (features plus 'default').
metadados = generate_metadata(
    abt_train,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,loan_amnt,Explicativa,0,0.0,1366,Int32
1,num_op_rev_tl,Explicativa,0,0.0,57,Int16
2,pub_rec_bankruptcies,Explicativa,0,0.0,12,Int16
3,percent_bc_gt_75,Explicativa,0,0.0,184,float16
4,pct_tl_nvr_dlq,Explicativa,0,0.0,503,float16
5,num_tl_op_past_12m,Explicativa,0,0.0,28,Int16
6,num_tl_90g_dpd_24m,Explicativa,0,0.0,23,Int16
7,num_tl_30dpd,Explicativa,0,0.0,5,Int16
8,num_tl_120dpd_2m,Explicativa,0,0.0,6,Int16
9,num_sats,Explicativa,0,0.0,65,Int16


In [28]:
# Metadata table for the final test ABT.
# NOTE(review): 'default' was stripped from the test feature list earlier, so
# `targets` names a column this frame does not appear to contain; presumably
# generate_metadata tolerates that -- confirm.
metadados = generate_metadata(
    abt_test,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,loan_amnt,Explicativa,0,0.0,1560,Int32
1,tax_liens,Explicativa,0,0.0,16,Int16
2,percent_bc_gt_75,Explicativa,0,0.0,209,float16
3,pct_tl_nvr_dlq,Explicativa,0,0.0,593,float16
4,num_tl_op_past_12m,Explicativa,0,0.0,26,Int16
5,num_tl_90g_dpd_24m,Explicativa,0,0.0,25,Int16
6,num_tl_30dpd,Explicativa,0,0.0,2,Int16
7,num_tl_120dpd_2m,Explicativa,0,0.0,2,Int16
8,num_sats,Explicativa,0,0.0,76,Int16
9,num_rev_tl_bal_gt_0,Explicativa,0,0.0,45,Int16
