In [24]:
# Importing general packages
import pandas as pd

In [25]:
# Load the already-treated loan data from CSV into a DataFrame.
# The path is relative to the notebook's working directory.
loan_df = pd.read_csv('../raw_data/treated_df.csv')

## Preprocessing Pipeline

In [26]:
# Feature matrix: the loan/borrower columns fed to the model.
# Candidate columns currently left out of the feature set:
# 'installment', 'delinq_2yrs', 'acc_now_delinq', 'total_rev_hi_lim'
X = loan_df[[
    'loan_amnt'
    , 'term'
    , 'grade'
    , 'emp_length'
    , 'home_ownership'
    , 'annual_inc'
    , 'purpose'
    , 'tot_cur_bal'
    ]]

# Regression target: the loan's interest rate column
y = loan_df['int_rate']

In [27]:
# Separating columns by preprocessing type

# Object-dtype features, routed to the one-hot encoder.
# Derived from X (not loan_df) so that only columns actually present in the
# feature matrix are passed to the ColumnTransformer; an object column that
# exists in loan_df but was not selected as a feature would otherwise make the
# pipeline fail at fit time.
cols_cat = X.select_dtypes('object').columns

# Numeric features routed to the median-impute + RobustScaler branch
cols_robust = [
    'annual_inc'
    , 'tot_cur_bal'
    ]

# Numeric features routed to the mean-impute + StandardScaler branch
cols_std = ['loan_amnt']

# Candidate features considered but not currently used:
# 'installment', 'delinq_2yrs', 'acc_now_delinq', 'total_rev_hi_lim'
# NOTE(review): any feature in X that is neither object-dtype nor listed above
# is dropped by the ColumnTransformer's default remainder — confirm the dtypes
# of 'term' and 'emp_length'.

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation.
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Display the shapes of the four resulting objects
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((373028, 8), (93257, 8), (373028,), (93257,))

In [34]:
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor

# Impute then scale numerical values:
# Features with no outliers -> mean imputation + standard scaling
standard_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler())
])

# Features with outliers -> median imputation + robust scaling
robust_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('robust_scaler', RobustScaler())
])

# Encode categorical values as dense one-hot columns; categories unseen during
# fit are ignored at transform time.
# scikit-learn >= 1.2 names the dense-output flag `sparse_output`; the old
# `sparse` keyword was removed in 1.4. Try the new name first and fall back so
# the notebook runs on both old and new versions.
try:
    cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Apply each transformer to its own column group and concatenate the results
preprocessor = ColumnTransformer([
    ('standard_scaler', standard_transformer, cols_std),
    ('robust_scaler', robust_transformer, cols_robust),
    ('cat_transformer', cat_transformer, cols_cat)
])

# Adding the model to the pipeline
pipeline = Pipeline([
    ('preprocessing', preprocessor)
    , ('model', XGBRegressor())
    ])

# Display the assembled pipeline
pipeline

### Performance Metrics

In [35]:
# Fit preprocessing + regressor on the training split.
# Pipeline.fit returns the pipeline itself, so `model` and `pipeline` refer to
# the same fitted object.
model = pipeline.fit(
    X_train,
    y_train,
)

# Predict the target for the held-out test split
y_pred = model.predict(X_test)

In [36]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the whole pipeline on the training split,
# using the estimator's default scorer.
cv_scores = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=5,
)
cv_scores

{'fit_time': array([9.56797671, 9.77753139, 8.80796647, 9.82687259, 8.81782413]),
 'score_time': array([0.12808037, 0.13240767, 0.12311625, 0.12874293, 0.12106872]),
 'test_score': array([0.92611924, 0.92481608, 0.92415931, 0.92543486, 0.92516391])}

In [38]:
# Mean of the per-fold test scores from the cross-validation above, rounded to 2 decimals
round(cv_scores['test_score'].mean(), 2)

0.93

In [11]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation reporting several regression metrics at once.
# The "neg_" scorers are negated errors, so values closer to zero are better.
# NOTE(review): the output recorded for this cell carries an earlier execution
# count than the surrounding cells, so it is likely stale — re-run to refresh.
regression_scorers = [
    'max_error',
    'r2',
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
]

cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=5,
    scoring=regression_scorers,
)

# Cross validation output, one row per fold
pd.DataFrame(cv_results)

Unnamed: 0,fit_time,score_time,test_max_error,test_r2,test_neg_mean_absolute_error,test_neg_mean_squared_error
0,0.870388,0.102091,-17.646552,0.914112,-1.021803,-1.6415
1,0.834803,0.100489,-14.696105,0.914132,-1.021675,-1.637529
2,0.862641,0.114943,-17.711588,0.913907,-1.021519,-1.632341
3,0.86318,0.109725,-17.728769,0.913208,-1.021383,-1.64243
4,0.840949,0.099735,-14.313057,0.913503,-1.0204,-1.628548


In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error
import math

# Hold-out evaluation: compare the test-split predictions with the true values.
mse = mean_squared_error(y_test, y_pred)

# RMSE is in the same units as the target
rmse = math.sqrt(mse)

mae = mean_absolute_error(y_test, y_pred)

rsquared = r2_score(y_test, y_pred)

# Stored under a different name than the imported function: rebinding
# `max_error` to a float would make any later call to max_error(...) fail.
max_err = max_error(y_test, y_pred)

print('MSE =', round(mse, 4))
print('RMSE =', round(rmse, 4))
print('MAE =', round(mae, 4))
print('R2 =', round(rsquared, 4))
print('Max Error =', round(max_err, 4))

MSE = 1.4142
RMSE = 1.1892
MAE = 0.9652
R2 = 0.9256
Max Error = 15.3597


In [13]:
# Summary statistics of the target, to put the error metrics above in context
y.describe()

count    466285.000000
mean         13.829236
std           4.357587
min           5.420000
25%          10.990000
50%          13.660000
75%          16.490000
max          26.060000
Name: int_rate, dtype: float64