## Setup environment

In [2]:
import pickle
import os
import gc
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import sys
# NOTE(review): absolute, machine-specific path — `util` is only importable on a host
# that has this exact directory.
sys.path.append(r'/home/jeanlr/projetos/lending-club/global')
# Wildcard import: generate_metadata, fillna_numeric, fillna_categorical, fillna_num_prod and
# fillna_catg_prod are used below with no other visible definition, so they presumably come
# from this module — confirm against util.py.
from util import *
import optuna
import arfs.feature_selection.allrelevant as arfsgroot
import shap
import json
import joblib
from venn_abers import VennAbersCalibrator

# Select seaborn's 'whitegrid' style for the plots drawn in this notebook.
# `set_theme` is the preferred interface; `sns.set` is only an alias for it that the
# seaborn documentation marks as subject to removal.
sns.set_theme(style='whitegrid')

  from .autonotebook import tqdm as notebook_tqdm


## Read train, validation, calibration and test data

In [3]:
# Load the four pre-split datasets. The paths are listed in the same order as the
# names they are bound to on the last line.
processed_files = (
    '../data/processed/train_df.parquet',
    '../data/processed/validation_df.parquet',
    '../data/processed/calibration_df.parquet',
    '../data/processed/feature_engineering_test.parquet',
)
train_df, validation_df, calibration_df, test_df = map(pd.read_parquet, processed_files)

## Preparando os dados

In [4]:
# Per-column profile of the raw training frame, ordered by the PC_NULOS column.
# 'id' and 'issue_d' are passed as identifiers and 'default' as the target.
metadados = generate_metadata(
    train_df,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,verification_status_joint,Explicativa,86598,99.95,1,category
1,emp_length,Explicativa,4432,5.12,11,category
2,num_op_rev_tl,Explicativa,0,0.0,50,Int16
3,pub_rec_bankruptcies,Explicativa,0,0.0,11,Int16
4,percent_bc_gt_75,Explicativa,0,0.0,138,float16
5,pct_tl_nvr_dlq,Explicativa,0,0.0,421,float16
6,num_tl_op_past_12m,Explicativa,0,0.0,22,Int16
7,num_tl_90g_dpd_24m,Explicativa,0,0.0,18,Int16
8,num_tl_30dpd,Explicativa,0,0.0,5,Int16
9,num_tl_120dpd_2m,Explicativa,0,0.0,5,Int16


## Excluindo variáveis com 70% ou mais de nulos

In [5]:
# Null-percentage threshold: features at or above this value are discarded.
missing_cutoff = 70

# Rows of the metadata table whose null percentage reaches the threshold.
high_null_mask = metadados['PC_NULOS'] >= missing_cutoff
drop_vars_nulos = metadados[high_null_mask]
lista_drop_vars = list(drop_vars_nulos['FEATURE'].values)
print('Variáveis que serão excluídas por alto percentual de nulos: ',lista_drop_vars)

# Remove the listed high-null features from the training frame.
train_df_02 = train_df.drop(columns=lista_drop_vars)
train_df_02.shape

Variáveis que serão excluídas por alto percentual de nulos:  ['verification_status_joint']


(86641, 101)

In [6]:
# Persist the list of dropped features to a .pkl artifact.
with open('../artifacts/prd_drop_nullvars.pkl', 'wb') as pkl_file:
    pickle.dump(lista_drop_vars, pkl_file)

In [7]:
# Read the list of dropped features back from its .pkl artifact.
with open('../artifacts/prd_drop_nullvars.pkl', 'rb') as pkl_file:
    lista_drop_vars = pickle.load(pkl_file)

## Aplicando a retirada de nulos nas outras bases

In [8]:
  # Drop the high-null features (same list as for training) from the validation frame.
validation_df_02 = validation_df.drop(columns=lista_drop_vars)
validation_df_02.shape

(246072, 101)

In [9]:
  # Drop the high-null features (same list as for training) from the calibration frame.
calibration_df_02 = calibration_df.drop(columns=lista_drop_vars)
calibration_df_02.shape

(61519, 101)

In [10]:
# Read the feature names stored in features_list.json.
with open('../artifacts/features_list.json', 'r') as json_file:
    features = json.load(json_file)

In [11]:
# Take the target name out of the feature list (when present), then keep only the
# listed columns of the test frame.
if features.count('default') > 0:
    features.remove('default')
test_df = test_df.loc[:, features]

In [12]:
  # Drop the high-null features (same list as for training) from the test frame.
test_df_02 = test_df.drop(columns=lista_drop_vars)
test_df_02.shape

(538826, 100)

## Tratamento de nulos

In [13]:
# Take the target out of the training features; it is merged back in when the ABT is built.
# The name is rebound without the column, so running this cell a second time raises KeyError.
train_df_02 = train_df_02.drop(columns=['default'])
train_df_02.head()

Unnamed: 0,funded_amnt,funded_amnt_inv,term,int_rate,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,...,income_to_funded_ratio,debt_to_income_ratio,funded_amnt_per_income,fico_avg,credit_utilization_ratio,total_credit_lines,delinquency_ratio,int_rate_to_income_ratio,public_records_impact,pct_active_bc
1553721,5900,5900.0,36,0.129883,C,C1,2 years,MORTGAGE,67000.0,Verified,...,11.355932,0.000254,0.08806,0.502857,0.555979,48,0.0,1.93855e-06,0,0.4
1876396,13900,13904.0,36,0.229858,F,F2,3 years,RENT,72500.0,Verified,...,5.215827,0.000408,0.191724,0.502985,0.449595,33,0.052632,3.170461e-06,0,1.0
2163889,20000,20000.0,60,0.115295,B,B5,7 years,RENT,135000.0,Source Verified,...,6.75,0.000132,0.148148,0.502667,0.155219,47,0.03125,8.540401e-07,0,0.3
39523,18000,814.5,36,0.093323,B,B3,10+ years,OWN,102000.0,Not Verified,...,5.666667,0.000166,0.176471,0.502685,-10463.0,93,0.0,9.149289e-07,0,1.0
99086,14000,14000.0,36,0.211548,E,E2,< 1 year,RENT,50000.0,Verified,...,3.571429,0.000154,0.28,0.503008,0.887327,8,0.0,4.230957e-06,0,1.0


In [14]:
# Impute numeric nulls on the training frame. The second return value is saved to disk
# so the same fill values can be applied to the other datasets.
numeric_result = fillna_numeric(train_df_02)
train_df_03, means = numeric_result

with open('../artifacts/prd_fillna_num.pkl', 'wb') as pkl_file:
    pickle.dump(means, pkl_file)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(means[col], inplace=True)


In [15]:
# Impute categorical nulls on the training frame and save the fill values for reuse.
# The input must be train_df_03 (the output of the numeric step), not train_df_02:
# this assignment rebinds train_df_03, so starting from the un-imputed frame would
# replace the numerically-imputed result with one that never went through fillna_numeric.
train_df_03, modes = fillna_categorical(train_df_03)

with open('../artifacts/prd_fillna_catg.pkl', 'wb') as f:
    pickle.dump(modes, f)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(modes[col], inplace=True)


## Aplicando tratamento de nulos nas outras bases

In [16]:
# Reload the numeric fill values from their pickle artifact.
with open('../artifacts/prd_fillna_num.pkl', 'rb') as pkl_file:
    loaded_means = pickle.load(pkl_file)

In [17]:
# Reload the categorical fill values from their pickle artifact.
with open('../artifacts/prd_fillna_catg.pkl', 'rb') as pkl_file:
    loaded_modes = pickle.load(pkl_file)

In [18]:
# Numeric imputation of the validation frame with the fill values loaded from disk.
validation_df_03 = fillna_num_prod(
    validation_df_02,
    loaded_means,
)
validation_df_03.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mean_value, inplace=True)


(246072, 101)

In [19]:
# Numeric imputation of the calibration frame with the fill values loaded from disk.
calibration_df_03 = fillna_num_prod(
    calibration_df_02,
    loaded_means,
)
calibration_df_03.shape

(61519, 101)

In [20]:
# Numeric imputation of the test frame with the fill values loaded from disk.
test_df_03 = fillna_num_prod(
    test_df_02,
    loaded_means,
)
test_df_03.shape

(538826, 100)

In [21]:
# Apply the categorical fill values loaded from disk to all four frames, processed in
# the order train, validation, calibration, test.
train_df_03, validation_df_03, calibration_df_03, test_df_03 = [
    fillna_catg_prod(frame, loaded_modes)
    for frame in (train_df_03, validation_df_03, calibration_df_03, test_df_03)
]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)


In [22]:
# Shapes after imputation, in the order: train, calibration, validation, test.
tuple(
    frame.shape
    for frame in (train_df_03, calibration_df_03, validation_df_03, test_df_03)
)

((86641, 100), (61519, 101), (246072, 101), (538826, 100))

In [23]:
# Re-profile the training frame after imputation, ordered by the PC_NULOS column,
# to check the remaining null percentages.
metadados = generate_metadata(
    train_df_03,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,funded_amnt,Explicativa,0,0.0,1304,Int32
1,num_il_tl,Explicativa,0,0.0,80,Int16
2,percent_bc_gt_75,Explicativa,0,0.0,138,float16
3,pct_tl_nvr_dlq,Explicativa,0,0.0,421,float16
4,num_tl_op_past_12m,Explicativa,0,0.0,22,Int16
5,num_tl_90g_dpd_24m,Explicativa,0,0.0,18,Int16
6,num_tl_30dpd,Explicativa,0,0.0,5,Int16
7,num_tl_120dpd_2m,Explicativa,0,0.0,5,Int16
8,num_sats,Explicativa,0,0.0,61,Int16
9,num_rev_tl_bal_gt_0,Explicativa,0,0.0,36,Int16


## Compondo tabela analítica de modelagem (ABT)

In [24]:
# Bring the target back into the post-dataprep training table by aligning on the row
# index; only the 'default' column is taken from the original training frame.
target_frame = train_df[['default']]
abt_train = pd.merge(
    train_df_03,
    target_frame,
    how='inner',
    left_index=True,
    right_index=True,
)

In [25]:
# The calibration, validation and test ABTs are copies of their imputed frames.
abt_calibration, abt_validation, abt_test = [
    frame.copy()
    for frame in (calibration_df_03, validation_df_03, test_df_03)
]

In [26]:
# ABT shapes, in the order: train, calibration, validation, test.
tuple(
    abt.shape
    for abt in (abt_train, abt_calibration, abt_validation, abt_test)
)

((86641, 101), (61519, 101), (246072, 101), (538826, 100))

## Salvando as ABTs de treino, calibração, validação e teste pós preparação dos dados

In [27]:
# Save training data
abt_train.to_parquet('../data/processed/abt_train.parquet')

# Save calibration data
abt_calibration.to_parquet('../data/processed/abt_calibration.parquet')

# Save validation data
abt_validation.to_parquet('../data/processed/abt_validation.parquet')

# Save test data
abt_test.to_parquet('../data/processed/abt_test.parquet')

In [28]:
# Profile the final training ABT (features plus the 'default' target), ordered by PC_NULOS.
metadados = generate_metadata(
    abt_train,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,funded_amnt,Explicativa,0,0.0,1304,Int32
1,num_op_rev_tl,Explicativa,0,0.0,50,Int16
2,pub_rec_bankruptcies,Explicativa,0,0.0,11,Int16
3,percent_bc_gt_75,Explicativa,0,0.0,138,float16
4,pct_tl_nvr_dlq,Explicativa,0,0.0,421,float16
5,num_tl_op_past_12m,Explicativa,0,0.0,22,Int16
6,num_tl_90g_dpd_24m,Explicativa,0,0.0,18,Int16
7,num_tl_30dpd,Explicativa,0,0.0,5,Int16
8,num_tl_120dpd_2m,Explicativa,0,0.0,5,Int16
9,num_sats,Explicativa,0,0.0,61,Int16


In [29]:
# Profile the final test ABT, ordered by PC_NULOS.
# NOTE(review): 'default' was removed from the feature list used to subset the test frame,
# so the `targets` argument presumably matches no column here — confirm how
# generate_metadata handles that.
metadados = generate_metadata(
    abt_test,
    ids=['id', 'issue_d'],
    targets=['default'],
    orderby='PC_NULOS',
)
metadados.head(10)

Unnamed: 0,FEATURE,USO_FEATURE,QT_NULOS,PC_NULOS,CARDINALIDADE,TIPO_FEATURE
0,funded_amnt,Explicativa,0,0.0,1560,Int32
1,num_il_tl,Explicativa,0,0.0,101,Int16
2,percent_bc_gt_75,Explicativa,0,0.0,209,float16
3,pct_tl_nvr_dlq,Explicativa,0,0.0,593,float16
4,num_tl_op_past_12m,Explicativa,0,0.0,26,Int16
5,num_tl_90g_dpd_24m,Explicativa,0,0.0,25,Int16
6,num_tl_30dpd,Explicativa,0,0.0,2,Int16
7,num_tl_120dpd_2m,Explicativa,0,0.0,2,Int16
8,num_sats,Explicativa,0,0.0,76,Int16
9,num_rev_tl_bal_gt_0,Explicativa,0,0.0,45,Int16
