In [4]:
# Importing general packages
import pandas as pd

In [5]:
# Load the already-treated loan data from CSV into a DataFrame
treated_csv_path = '../raw_data/treated_df.csv'
loan_df = pd.read_csv(treated_csv_path)

## Preprocessing Pipeline

In [6]:
# Predictor columns fed to the model.
# Candidates currently left out: installment, delinq_2yrs, acc_now_delinq, total_rev_hi_lim
feature_cols = [
    'loan_amnt',
    'term',
    'grade',
    'emp_length',
    'home_ownership',
    'annual_inc',
    'purpose',
    'tot_cur_bal',
]

# Feature matrix: only the selected predictors, in the order listed above
X = loan_df[feature_cols]

# Regression target: the loan's interest rate column
y = loan_df['int_rate']

In [7]:
# Group the feature columns by the preprocessing each group receives.

# Categorical features (one-hot encoded): every object-dtype column of X.
# Derived from X rather than loan_df so that a text column which is not one of
# the selected features can never be handed to the ColumnTransformer, which
# would raise because that column does not exist in X.
cols_cat = list(X.select_dtypes('object').columns)

# Numeric features that go through the median-impute + RobustScaler branch
cols_robust = [
    'annual_inc',
    'tot_cur_bal',
]

# Numeric features that go through the mean-impute + StandardScaler branch
cols_std = ['loan_amnt']

# Numeric candidates currently not used as features:
# installment, delinq_2yrs, acc_now_delinq, total_rev_hi_lim

In [8]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split (and every metric computed from it) is the
# same on each Restart & Run All instead of changing from run to run.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

# Sanity check: row counts of the two splits and matching X/y lengths
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((373028, 8), (93257, 8), (373028,), (93257,))

In [9]:
import inspect

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor

# Impute then scale numerical values:
# Features with no outliers -> fill gaps with the mean, then standardize
standard_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler())
])

# Features with outliers -> fill gaps with the median, then scale with RobustScaler
robust_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('robust_scaler', RobustScaler())
])

# Encode categorical values as dense one-hot columns; categories unseen during
# fit are ignored (encoded as all zeros) instead of raising.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so pick whichever keyword the installed version accepts.
dense_output_kwarg = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters
    else 'sparse'
)
cat_transformer = OneHotEncoder(handle_unknown='ignore', **{dense_output_kwarg: False})

# Apply each transformer to its own column group and concatenate the results.
# Columns of X that are in none of the three groups are dropped
# (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer([
    ('standard_scaler', standard_transformer, cols_std),
    ('robust_scaler', robust_transformer, cols_robust),
    ('cat_transformer', cat_transformer, cols_cat)
])

# Add the model as the final step of the pipeline
pipeline = Pipeline([
    ('preprocessing', preprocessor)
    , ('model', XGBRegressor())
    ])

# Display the assembled pipeline
pipeline

### Performance Metrics

In [10]:
# Fit preprocessing + regressor on the training split.
# Pipeline.fit returns the pipeline itself, so `model` is the same object as `pipeline`.
pipeline.fit(X_train, y_train)
model = pipeline

# Predict the target for the held-out test split
y_pred = pipeline.predict(X_test)

In [11]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the whole pipeline on the training split,
# scored with the estimator's default scorer (no `scoring` given)
cv_scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=5,
)
cv_scores

{'fit_time': array([8.35486722, 8.31295419, 8.61103749, 8.4209137 , 8.7361691 ]),
 'score_time': array([0.13365984, 0.12852573, 0.12744355, 0.127388  , 0.1284523 ]),
 'test_score': array([0.92502358, 0.92423122, 0.92462922, 0.92638852, 0.92551141])}

In [12]:
# Average of the per-fold test scores, rounded to two decimals
mean_cv_score = cv_scores['test_score'].mean()
round(mean_cv_score, 2)

0.93

In [13]:
from sklearn.model_selection import cross_validate

# Metrics collected in each of the 5 folds. scikit-learn scorers follow a
# "greater is better" convention, so the error metrics are reported negated.
cv_metrics = [
    'max_error',
    'r2',
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
]

cv_results = cross_validate(model, X_train, y_train, cv=5, scoring=cv_metrics)

# One row per fold, one column per timing / metric
pd.DataFrame(cv_results)

Unnamed: 0,fit_time,score_time,test_max_error,test_r2,test_neg_mean_absolute_error,test_neg_mean_squared_error
0,8.619998,0.130035,-14.995337,0.925024,-0.967424,-1.416393
1,8.688036,0.136727,-18.180088,0.924231,-0.969927,-1.436461
2,8.888229,0.132811,-14.891439,0.924629,-0.968621,-1.428672
3,8.394525,0.128762,-14.754829,0.926389,-0.964308,-1.406528
4,8.569968,0.135856,-17.982016,0.925511,-0.967053,-1.417447


In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error
import math

# Error metrics of the fitted pipeline on the held-out test split
mse = mean_squared_error(y_test, y_pred)

rmse = math.sqrt(mse)

mae = mean_absolute_error(y_test, y_pred)

rsquared = r2_score(y_test, y_pred)

# Stored as `max_err` so the imported `max_error` function is not overwritten
# by a float; otherwise re-running this cell fails with
# "TypeError: ... object is not callable".
max_err = max_error(y_test, y_pred)

print('MSE =', round(mse, 4))
print('RMSE =', round(rmse, 4))
print('MAE =', round(mae, 4))
print('R2 =', round(rsquared, 4))
print('Max Error =', round(max_err, 4))

MSE = 1.4146
RMSE = 1.1894
MAE = 0.9663
R2 = 0.9255
Max Error = 18.0574


In [15]:
# Summary statistics of the target (int_rate), as a reference scale for the error metrics
target_summary = y.describe()
target_summary

count    466285.000000
mean         13.829236
std           4.357587
min           5.420000
25%          10.990000
50%          13.660000
75%          16.490000
max          26.060000
Name: int_rate, dtype: float64