## Setup environment

In [1]:
import os
import gc
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append(r'/home/jeanlr/projetos/lending-club/global')
from util import *
import arfs.feature_selection.allrelevant as arfsgroot
import shap
import json
import joblib

# Set seaborn's plot style to 'whitegrid'.
sns.set(style='whitegrid')

  from .autonotebook import tqdm as notebook_tqdm


## Read the parquet file located at ../data/processed/feature_engineering_train.parquet

In [2]:
# Load the feature-engineered training table, then print its schema and memory usage.
feature_table_path = '../data/processed/feature_engineering_train.parquet'
df = pd.read_parquet(feature_table_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1765426 entries, 0 to 2925491
Columns: 152 entries, id to pct_active_bc
dtypes: Float64(6), Int16(55), Int32(16), Int64(15), category(22), datetime64[ns](9), float16(23), float32(6)
memory usage: 1.1 GB


## Split Out-Of-Time Train, Validation, Test

In [3]:
# Class balance of the target, computed only over rows where 'default' is present.
default_nao_nulo = df.loc[df['default'].notna()]
proporcao = default_nao_nulo['default'].value_counts(normalize=True)

# Show the share of each class (the printed label is runtime text and stays as is).
print("Proporção de 0 e 1:", proporcao, sep="\n")

Proporção de 0 e 1:
default
0.0    0.809375
1.0    0.190625
Name: proportion, dtype: float64


## Remove nulls on target

In [4]:
# Drop every row whose 'default' target is null.
target_columns = ['default']
df = df.dropna(subset=target_columns)

# Re-check the class distribution after the removal.
target_share = df['default'].value_counts(normalize=True)
print(target_share)

default
0.0    0.809375
1.0    0.190625
Name: proportion, dtype: float64


In [5]:
# Out-of-time boundaries on `issue_d`: training rows are issued before
# train_max_date; validation rows fall in [train_max_date, validation_max_date).
train_max_date = '2016-01-01'
validation_max_date = '2016-10-01'
# Defined here but not used in this cell.
test_start_date = '2017-03-01'

# Load the list of columns to keep from features_list.json.
# The encoding is pinned so the read does not depend on the platform's locale default.
with open('../artifacts/features_list.json', 'r', encoding='utf-8') as f:
    features = json.load(f)

train_df = df.loc[df['issue_d'] < train_max_date, features]
validation_df = df.loc[(df['issue_d'] >= train_max_date) & (df['issue_d'] < validation_max_date), features]

# Subsample the training window to speed up the process: keep a stratified 10%
# (the "test" side of the split) and discard the remaining 90%. Only the training
# set is subsampled. Indexing with [1] avoids binding the discarded part to `_`,
# which IPython uses to hold the last cell output.
train_df = train_test_split(train_df, test_size=0.10, random_state=42, stratify=train_df['default'])[1]

# Split the original validation window into two shuffled, stratified datasets:
# 80% kept for validation only and 20% set aside for calibration.
validation_df, calibration_df = train_test_split(validation_df, test_size=0.20, random_state=42, stratify=validation_df['default'])

print('Train Shape: ', train_df.shape)
print('Validation shape: ', validation_df.shape)
print('Calibration shape: ', calibration_df.shape)

Train Shape:  (86641, 102)
Validation shape:  (246072, 102)
Calibration shape:  (61519, 102)


In [6]:
# Show the column labels of the training subsample.
train_df.columns

Index(['funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'grade',
       'sub_grade', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status',
       ...
       'income_to_funded_ratio', 'debt_to_income_ratio',
       'funded_amnt_per_income', 'fico_avg', 'credit_utilization_ratio',
       'total_credit_lines', 'delinquency_ratio', 'int_rate_to_income_ratio',
       'public_records_impact', 'pct_active_bc'],
      dtype='object', length=102)

## Saving ABTs

In [7]:
# Persist the training, validation and calibration splits as parquet files,
# written in that order.
for split_frame, split_path in (
    (train_df, '../data/processed/train_df.parquet'),
    (validation_df, '../data/processed/validation_df.parquet'),
    (calibration_df, '../data/processed/calibration_df.parquet'),
):
    split_frame.to_parquet(split_path)