In [119]:
# Importing general packages
import pandas as pd

In [120]:
# Reading treated dataframe
loan_df = pd.read_csv('../raw_data/treated_df.csv')

## Preprocessing Pipeline

In [121]:
loan_df.columns

X = loan_df[[
    'good_bad'
    , 'loan_amnt'
    , 'term'
    , 'int_rate'
    # , 'installment'
    , 'grade'
    , 'emp_length'
    , 'home_ownership'
    , 'annual_inc'
    , 'purpose'
    , 'dti'
    # , 'total_pymnt'
    # , 'acc_now_delinq'
    , 'tot_cur_bal'
]]

# Columns used on logit:
# good_bad ~ loan_amnt + C(term) + int_rate + installment + C(grade) + C(emp_length) + C(home_ownership) + annual_inc + C(purpose) + dti + total_pymnt + recoveries + acc_now_delinq + tot_cur_bal 

In [122]:
loan_df.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'emp_length', 'home_ownership', 'annual_inc',
       'purpose', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'total_rev_hi_lim', 'good_bad'],
      dtype='object')

In [123]:
# loan_df = pd.read_csv('../raw_data/treated_df.csv')
y = loan_df[['good_bad']]
X = X.drop(columns = 'good_bad')

In [124]:
X.head()

Unnamed: 0,loan_amnt,term,int_rate,grade,emp_length,home_ownership,annual_inc,purpose,dti,tot_cur_bal
0,5000,36 months,10.65,B,10 years or more,RENT,24000.0,credit card,27.65,
1,2500,60 months,15.27,C,0,RENT,30000.0,"home improvement, major purchase or car",1.0,
2,2400,36 months,15.96,C,10 years or more,RENT,12252.0,small business or educational,8.72,
3,10000,36 months,13.49,C,10 years or more,RENT,49200.0,"renewable energy, moving, house or other",20.0,
4,3000,60 months,12.69,B,1 year,RENT,80000.0,"renewable energy, moving, house or other",17.94,


In [125]:
y.head()

Unnamed: 0,good_bad
0,1
1,0
2,1
3,1
4,1


In [126]:
loan_df.select_dtypes('object').columns

Index(['term', 'grade', 'emp_length', 'home_ownership', 'purpose'], dtype='object')

In [127]:
# Separating columns by preprocessing type

cols_cat = ['term', 'home_ownership', 'purpose']

cols_ord = ['grade', 'emp_length']

cols_robust = ['int_rate'
               # , 'installment'
               , 'annual_inc'
               , 'dti' # Quantos % da sua renda mensal está comprometida com dívidas?
               # , 'total_pymnt'
               # , 'acc_now_delinq'
               , 'tot_cur_bal'
               ]

cols_std = ['loan_amnt']

In [128]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, stratify = y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Stratify garante que a proporção do y no treino e no teste são iguais. Não balanceia o dataset

# This stratify parameter makes a split so that the proportion of values in the sample produced will be the same as the proportion of values provided to parameter stratify.

# For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's.

# Ainda temos que BALANCEAR o dataset -> Oversampling ou Undersampling. SMOTE?
# E então, aplicar no dado de teste.

((373028, 10), (93257, 10), (373028, 1), (93257, 1))

In [129]:
X_train['emp_length'].unique()

array(['2 to 4 years', '10 years or more', '7 to 9 years', '0', '1 year',
       '5 to 6 years', 'Unemployed'], dtype=object)

In [130]:
from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder


# Impute then scale numerical values: 
# Features with no outliers
standard_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler())
])

# Features with outliers
robust_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('robust_scaler', RobustScaler())
])

# Encode categorical values
cat_transformer = OneHotEncoder(handle_unknown='ignore')
ordinal_grade = OrdinalEncoder(categories = [['A', 'B', 'C', 'D', 'E', 'F', 'G']])
ordinal_emp_len = OrdinalEncoder(categories = [['Unemployed', '0', '1 year', '2 to 4 years', '5 to 6 years', '7 to 9 years', '10 years or more']])

# Parallelize the transformers
preprocessor = ColumnTransformer([
    ('standard_scaler', standard_transformer, cols_std), 
    ('robust_scaler', robust_transformer, cols_robust),
    ('cat_transformer', cat_transformer, cols_cat),
    ('ordinal_grade', ordinal_grade, ['grade']),
    ('ordinal_emp_len', ordinal_emp_len, ['emp_length'])
])

# # Adicionando o modelo na pipeline
# pipeline = Pipeline([
#     ('preprocessing', preprocessor)
#     # , ('balancing', sm) # REVISAR SMOTE AQUI
#     # , ('model', LogisticRegression(max_iter=1000))
#     ])

preprocessor

In [131]:
# Fazer o fit_transform só no treino
# Fazer transform no teste
# Verificar se o modelo deu certo -> evaluate ou scoring

X_train_transformed = preprocessor.fit_transform(X_train)

pd.DataFrame(
    X_train_transformed, 
    columns=preprocessor.get_feature_names_out()
).head()

Unnamed: 0,standard_scaler__loan_amnt,robust_scaler__int_rate,robust_scaler__annual_inc,robust_scaler__dti,robust_scaler__tot_cur_bal,cat_transformer__term_ 36 months,cat_transformer__term_ 60 months,cat_transformer__home_ownership_MORTGAGE,cat_transformer__home_ownership_OTHER,cat_transformer__home_ownership_OWN,cat_transformer__home_ownership_RENT,cat_transformer__purpose_credit card,cat_transformer__purpose_debt consolidation,"cat_transformer__purpose_home improvement, major purchase or car","cat_transformer__purpose_medical, wedding or vacation","cat_transformer__purpose_renewable energy, moving, house or other",cat_transformer__purpose_small business or educational,ordinal_grade__grade,ordinal_emp_len__emp_length
0,-0.520363,-0.681818,-0.367218,-0.191601,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
1,0.806893,-1.387273,0.805195,0.48294,0.724583,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,6.0
2,2.496129,0.058182,0.851172,-1.033246,-0.4908,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,5.0
3,0.010539,-1.387273,3.264966,-1.302712,2.322495,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0
4,2.496129,0.090909,0.897149,-0.817148,0.422015,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,6.0


In [132]:
from imblearn.over_sampling import SMOTE

# SMOTE só pode ser usado com variáveis numéricas.Por isso fazemos depois do encoding.

sm = SMOTE(random_state=42)
X_train_transf_bal, y_train_bal = sm.fit_resample(X_train_transformed, y_train)

print('Resampled dataset shape %s' % y_train_bal.value_counts())

Resampled dataset shape good_bad
0           332254
1           332254
dtype: int64


In [133]:
# Logit
# from sklearn.linear_model import LogisticRegression

# model = LogisticRegression(max_iter = 1000)

# model.fit(X_train_transf_bal, y_train_bal)

# X_test_transformed = preprocessor.transform(X_test)

# model.score(X_test_transformed, y_test)

  y = column_or_1d(y, warn=True)


0.6258725886528625

In [139]:
# XGBoost
from xgboost import XGBClassifier

model = XGBClassifier()

model.fit(X_train_transf_bal, y_train_bal)

X_test_transformed = preprocessor.transform(X_test)

model.score(X_test_transformed, y_test)

0.8484617776681643

### Performance Metrics

In [140]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# False = 0: No Default
# Positive = 1: Default

# The bad situation here is when the model predicts No Default (0) when in truth there is a Default (1) => FALSE NEGATIVE.
# Therefore, RECALL is the metric that fits our problem better.

y_pred = model.predict(X_test_transformed)

print('Accuracy =', round(accuracy_score(y_test, y_pred), 4)) # Accuracy
print('Precision =', round(precision_score(y_test, y_pred), 4)) # Precision
print('Recall =', round(recall_score(y_test, y_pred), 4)) # Recall
print('F1 score =', round(f1_score(y_test, y_pred), 4)) # F1 score

# BEAUTIFUL

Accuracy = 0.8485
Precision = 0.8983
Recall = 0.9358
F1 score = 0.9167
