In [1]:
# Importing general packages
import pandas as pd

In [2]:
# Load the already-treated loan dataset produced upstream of this notebook
treated_csv_path = '../raw_data/treated_df.csv'
loan_df = pd.read_csv(treated_csv_path)

## Preprocessing Pipeline

In [3]:
# Predictor columns used to model the interest rate.
# NOTE(review): 'installment' is usually computed from the loan amount, term and
# interest rate, and 'grade' is typically assigned together with the rate, so both
# may leak the target into the features. Confirm they are known at prediction time.
feature_cols = [
    'loan_amnt', 'term', 'grade', 'installment', 'emp_length', 'home_ownership',
    'annual_inc', 'purpose', 'delinq_2yrs', 'acc_now_delinq', 'total_rev_hi_lim',
]
target_col = 'int_rate'

# Feature matrix and regression target
X = loan_df[feature_cols]
y = loan_df[target_col]

In [4]:
# Separating columns by preprocessing type.
# NOTE(review): ColumnTransformer drops (remainder='drop' is its default) every
# column that is not listed in one of these groups. Confirm that each feature in X,
# 'emp_length' in particular, ends up in exactly one group.

# Categorical (object dtype) features -> one-hot encoded.
# Derived from X rather than loan_df so that an object column outside the feature
# set can never be requested from the ColumnTransformer.
cols_cat = X.select_dtypes('object').columns.tolist()

# Numerical features with outliers -> median imputation + RobustScaler
cols_robust = ['installment', 'annual_inc', 'delinq_2yrs', 'acc_now_delinq', 'total_rev_hi_lim']

# Numerical features without outliers -> mean imputation + StandardScaler
cols_std = ['loan_amnt']

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split, and
# every metric computed from it further down, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Sanity check of the resulting shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((373028, 11), (93257, 11), (373028,), (93257,))

In [23]:
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression

# Impute then scale numerical values:
# Features with no outliers -> mean imputation followed by standardization
standard_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler())
])

# Features with outliers -> median imputation followed by robust scaling
robust_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('robust_scaler', RobustScaler())
])

# Encode categorical values as dense one-hot columns; categories unseen during
# fit are ignored (encoded as all zeros) instead of raising.
# The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so prefer the new name and fall back only on older versions.
try:
    cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Apply each transformer to its own group of columns and concatenate the results
preprocessor = ColumnTransformer([
    ('standard_scaler', standard_transformer, cols_std),
    ('robust_scaler', robust_transformer, cols_robust),
    ('cat_transformer', cat_transformer, cols_cat)
])

# Chain the preprocessing with the regression model in a single pipeline
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', LinearRegression())
])

pipeline

In [7]:
# Run fit_transform on the training split only
# Use transform (not fit) on the test split
# Check that the model worked -> evaluate or scoring

# Fit the preprocessing step on the training data and keep the transformed matrix
X_train_transformed = preprocessor.fit_transform(X_train)

# Preview the first transformed rows, labelled with the generated feature names
transformed_feature_names = preprocessor.get_feature_names_out()
transformed_preview = pd.DataFrame(X_train_transformed, columns=transformed_feature_names)
transformed_preview.head()

Unnamed: 0,standard_scaler__loan_amnt,robust_scaler__installment,robust_scaler__annual_inc,robust_scaler__delinq_2yrs,robust_scaler__acc_now_delinq,robust_scaler__total_rev_hi_lim,cat_transformer__term_ 36 months,cat_transformer__term_ 60 months,cat_transformer__grade_A,cat_transformer__grade_B,...,cat_transformer__home_ownership_MORTGAGE,cat_transformer__home_ownership_OTHER,cat_transformer__home_ownership_OWN,cat_transformer__home_ownership_RENT,cat_transformer__purpose_credit card,cat_transformer__purpose_debt consolidation,"cat_transformer__purpose_home improvement, major purchase or car","cat_transformer__purpose_medical, wedding or vacation","cat_transformer__purpose_renewable energy, moving, house or other",cat_transformer__purpose_small business or educational
0,-1.003503,-0.553585,1.448276,0.0,0.0,2.134715,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.761905,-0.294154,0.491954,0.0,0.0,-0.233161,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-1.3659,-0.903424,-0.643678,1.0,0.0,0.082902,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.520307,-0.135207,-0.335632,0.0,0.0,0.326425,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.058251,0.268443,-0.413793,0.0,0.0,-0.233161,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


### Performance Metrics

In [24]:
# Fit the whole pipeline (preprocessing + linear regression) on the training split.
# Pipeline.fit returns the fitted pipeline itself, so `model` is the same object
# as `pipeline`.
model = pipeline.fit(X_train, y_train)

# Predict the target for the held-out test split
y_pred = model.predict(X_test)

In [25]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the pipeline on the training split, using the
# estimator's default scorer
cv_scores = cross_validate(estimator=model, X=X_train, y=y_train, cv=5)
cv_scores

{'fit_time': array([1.1235857 , 1.03957677, 1.05877352, 1.08709407, 1.10635448]),
 'score_time': array([0.13020015, 0.10562921, 0.10588956, 0.10871553, 0.12062359]),
 'test_score': array([0.91836105, 0.91821048, 0.91763118, 0.91929278, 0.91894047])}

In [None]:
# Average test score across the folds, rounded to 4 decimals
mean_test_score = cv_scores['test_score'].mean()
round(mean_test_score, 4)

0.9185

In [26]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation of the model, reporting several metrics per fold
cv_metric_names = [
    'max_error',
    'r2',
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
]
cv_results = cross_validate(model, X_train, y_train, cv=5, scoring=cv_metric_names)

# One row per fold, one column per timing / metric
pd.DataFrame(cv_results)

Unnamed: 0,fit_time,score_time,test_max_error,test_r2,test_neg_mean_absolute_error,test_neg_mean_squared_error
0,1.189728,0.116011,-15.825178,0.918361,-0.994351,-1.551986
1,1.039383,0.109562,-14.938134,0.91821,-0.991462,-1.548531
2,1.095403,0.106259,-19.935395,0.917631,-0.993188,-1.551568
3,1.067282,0.107792,-20.933707,0.919293,-0.98758,-1.538667
4,1.120983,0.131745,-14.428456,0.91894,-0.993159,-1.542818


In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error
import math

# Hold-out evaluation: compare the test-split predictions with the true targets.
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
rsquared = r2_score(y_test, y_pred)

# Stored under a name different from the imported `max_error` function: rebinding
# `max_error` to its own result would leave it pointing at a float, so any later
# call to `max_error(...)` would fail with "'float' object is not callable".
max_err = max_error(y_test, y_pred)

print('MSE =', round(mse, 4))
print('RMSE =', round(rmse, 4))
print('MAE =', round(mae, 4))
print('R2 =', round(rsquared, 4))
print('Max Error =', round(max_err, 4))

MSE = 1.5439
RMSE = 1.2425
MAE = 0.9933
R2 = 0.9189
Max Error = 13.8315


In [21]:
# Summary statistics of the target, to put the error metrics above into context
target_summary = y.describe()
target_summary

count    466285.000000
mean         13.829236
std           4.357587
min           5.420000
25%          10.990000
50%          13.660000
75%          16.490000
max          26.060000
Name: int_rate, dtype: float64