In [1]:
# Importing general packages
import pandas as pd
import numpy as np

In [4]:
# Reading treated dataframe
loan_df = pd.read_csv('../raw_data/treated_df.csv')

## Preprocessing Pipeline

In [7]:
loan_df.columns

loan_df = loan_df[[
    'good_bad'
    , 'loan_amnt'
    , 'term'
    , 'int_rate'
    , 'installment'
    , 'grade'
    , 'emp_length'
    , 'home_ownership'
    , 'annual_inc'
    , 'purpose'
    , 'dti'
    , 'total_pymnt'
    , 'recoveries'
    , 'acc_now_delinq'
    , 'tot_cur_bal'
]]

# Columns used on logit:
# good_bad ~ loan_amnt + C(term) + int_rate + installment + C(grade) + C(emp_length) + C(home_ownership) + annual_inc + C(purpose) + dti + total_pymnt + recoveries + acc_now_delinq + tot_cur_bal 

In [8]:
# loan_df = pd.read_csv('../raw_data/treated_df.csv')
y = loan_df[['good_bad']]
X = loan_df.drop(columns = 'good_bad')

In [9]:
X.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,emp_length,home_ownership,annual_inc,purpose,dti,total_pymnt,recoveries,acc_now_delinq,tot_cur_bal
0,5000,36 months,10.65,162.87,B,10 years or more,RENT,24000.0,credit card,27.65,5861.071414,0.0,0.0,
1,2500,60 months,15.27,59.83,C,0,RENT,30000.0,"home improvement, major purchase or car",1.0,1008.71,117.08,0.0,
2,2400,36 months,15.96,84.33,C,10 years or more,RENT,12252.0,small business or educational,8.72,3003.653644,0.0,0.0,
3,10000,36 months,13.49,339.31,C,10 years or more,RENT,49200.0,"renewable energy, moving, house or other",20.0,12226.30221,0.0,0.0,
4,3000,60 months,12.69,67.79,B,1 year,RENT,80000.0,"renewable energy, moving, house or other",17.94,3242.17,0.0,0.0,


In [None]:
y.head()

Unnamed: 0,good_bad
0,1
1,0
2,1
3,1
4,1


In [15]:
# Separating columns by preprocessing type

cols_cat = loan_df.select_dtypes('object').columns

cols_robust = ['int_rate', 'installment', 'annual_inc', 'dti', 'total_pymnt', 'recoveries', 'acc_now_delinq', 'tot_cur_bal']

cols_std = ['loan_amnt']

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, stratify = y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Stratify garante que a proporção do y no treino e no teste são iguais. Não balanceia o dataset

# This stratify parameter makes a split so that the proportion of values in the sample produced will be the same as the proportion of values provided to parameter stratify.

# For example, if variable y is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones, stratify=y will make sure that your random split has 25% of 0's and 75% of 1's.

# Ainda temos que BALANCEAR o dataset -> Oversampling ou Undersampling. SMOTE?
# E então, aplicar no dado de teste.

((373028, 14), (93257, 14), (373028, 1), (93257, 1))

In [25]:
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression

# Impute then scale numerical values: 
# Features with no outliers
standard_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler())
])

# Features with outliers
robust_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('robust_scaler', RobustScaler())
])

# Encode categorical values
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Parallelize the transformers
preprocessor = ColumnTransformer([
    ('standard_scaler', standard_transformer, cols_std), 
    ('robust_scaler', robust_transformer, cols_robust),
    ('cat_transformer', cat_transformer, cols_cat)
])

# Adicionando o modelo na pipeline
pipeline = Pipeline([
    ('preprocessing', preprocessor)
    # , ('balancing', SMOTE(random_state=42)) # REVISAR SMOTE AQUI
    , ('model', LogisticRegression())
    ])

pipeline

In [24]:
# Fazer o fit_transform só no treino
# Fazer transform no teste
# Verificar se o modelo deu certo -> evaluate ou scoring

X_train_transformed = preprocessor.fit_transform(X_train)

pd.DataFrame(
    X_train_transformed, 
    columns=preprocessor.get_feature_names_out()
).head()

Unnamed: 0,standard_scaler__loan_amnt,robust_scaler__int_rate,robust_scaler__installment,robust_scaler__annual_inc,robust_scaler__dti,robust_scaler__total_pymnt,robust_scaler__recoveries,robust_scaler__acc_now_delinq,robust_scaler__tot_cur_bal,cat_transformer__term_ 36 months,...,cat_transformer__home_ownership_MORTGAGE,cat_transformer__home_ownership_OTHER,cat_transformer__home_ownership_OWN,cat_transformer__home_ownership_RENT,cat_transformer__purpose_credit card,cat_transformer__purpose_debt consolidation,"cat_transformer__purpose_home improvement, major purchase or car","cat_transformer__purpose_medical, wedding or vacation","cat_transformer__purpose_renewable energy, moving, house or other",cat_transformer__purpose_small business or educational
0,-1.39351,0.3,-0.913208,-0.750328,0.237303,-0.757876,0.0,0.0,0.58985,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.944094,-0.303636,-0.528592,-0.522956,0.774081,-0.368741,0.0,0.0,0.456799,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.648504,-0.121818,-0.252346,-0.573384,-0.38704,-0.317163,0.0,0.0,-0.4805,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.280526,1.065455,0.204096,-0.40927,0.416813,-0.511921,0.0,0.0,-0.280668,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-1.004418,-0.816364,-0.607902,-0.807171,0.169877,-0.63264,0.0,0.0,-0.46259,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


### Performance Metrics

In [38]:
# Train Pipeline
model = pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# False = 0: No Default
# Positive = 1: Default

# The bad situation here is when the model predicts No Default (0) when in truth there is a Default (1) => FALSE NEGATIVE.
# Therefore, RECALL is the metric that fits our problem better.

print('Accuracy =', round(accuracy_score(y_test, y_pred), 4)) # Accuracy
print('Precision =', round(precision_score(y_test, y_pred), 4)) # Precision
print('Recall =', round(recall_score(y_test, y_pred), 4)) # Recall
print('F1 score =', round(f1_score(y_test, y_pred), 4)) # F1 score

# BEAUTIFUL

Accuracy = 0.9522
Precision = 0.9504
Recall = 0.9984
F1 score = 0.9738


In [53]:
from sklearn.model_selection import cross_validate

# Cross-validate Pipeline
cv_scores = cross_validate(pipeline, X_train, y_train, cv=5, scoring = 'recall')
cv_scores

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

{'fit_time': array([3.50680971, 3.59750342, 3.29841256, 3.12315989, 3.02027011]),
 'score_time': array([0.181849  , 0.16251922, 0.16652513, 0.16394162, 0.16776824]),
 'test_score': array([0.99838979, 0.99822426, 0.99855533, 0.99866067, 0.99844996])}

In [52]:
# Plot precision vs recall tradeoff?

In [20]:
#### REVISAR SE ISSO VEM AQUI MESMO ###### #### REVISAR SE ISSO VEM AQUI MESMO ######
#### REVISAR SE ISSO VEM AQUI MESMO ###### #### REVISAR SE ISSO VEM AQUI MESMO ######
# BALANCING THE TRAINING SET


# SMOTE só pode ser usado com variáveis numéricas.Por isso fazemos depois do encoding.


sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_transformed, y_train)
print('Resampled dataset shape %s' % y_res.value_counts())

Resampled dataset shape good_bad
0           332254
1           332254
dtype: int64
