In [1]:
import numpy as np
from scipy.stats import randint, uniform
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

import joblib

# Show every column when a DataFrame is displayed. Uses the public
# `pd.set_option` entry point rather than the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by the estimators below
# (e.g. convergence or deprecation notices) — consider narrowing the filter.
warnings.simplefilter(action='ignore')

In [2]:
# Read the training and validation splits from disk.
lending_train = pd.read_csv('lending_train.csv')
lending_val = pd.read_csv('lending_val.csv')

# The regression target is the 'int_rate' column; pull it out of each split
# before the frames are narrowed to the selected features below.
rate_train, rate_val = lending_train['int_rate'], lending_val['int_rate']

# Column '0' of the feature file lists the column names to keep
# (presumably written by an earlier Lasso feature-selection step — verify).
sel_feats = pd.read_csv('lasso_sel_feats.csv')['0'].to_list()

# Restrict both splits to exactly those columns.
lending_train, lending_val = lending_train[sel_feats], lending_val[sel_feats]

## Baseline Model

We'll start with a simple baseline: predict the median interest rate for every loan and measure the RMSE of that constant prediction against the true values.

In [3]:
def rmse(true, predictions):
    """Return the root-mean-squared error between `true` and `predictions`.

    The two arguments are combined with ``-``, so `predictions` may be an
    array-like that lines up with `true` or a single scalar that broadcasts
    against it.
    """
    residuals = true - predictions
    mean_sq_err = np.mean(np.square(residuals))
    return np.sqrt(mean_sq_err)

# Constant-prediction baseline: predict a single interest rate for every loan
# and score it on the validation split. The constant is the median of the
# *training* target, so no information from the validation labels leaks into
# the benchmark the models below are compared against.
baseline = rmse(rate_val, rate_train.median())
print(f'Baseline rmse: {baseline: 0.6f}')

Baseline rmse:  5.242156


## Lasso Model

In [4]:
# Tune the L1 penalty strength with a small randomized search.
lin_model = Lasso(random_state=3)
# scipy's uniform(loc, scale) samples from [loc, loc + scale], so alpha is
# drawn from [0.0005, 0.0105].
lin_params = {'alpha': uniform(0.0005, 0.01)}

lin_rs = RandomizedSearchCV(lin_model,
                            lin_params,
                            n_iter=5,
                            scoring='neg_mean_squared_error',
                            cv=5,
                            random_state=6)
lin_rs.fit(lending_train, rate_train)

lin_best_params = lin_rs.best_params_

# The search already refits the winning configuration on the full training
# set (refit=True is the default), so reuse that estimator instead of fitting
# an identical Lasso a second time.
lin_best = lin_rs.best_estimator_

train_preds = lin_best.predict(lending_train)
val_preds = lin_best.predict(lending_val)

# Report error and explained variance on both splits to expose over-fitting.
print(f'Training rmse: {rmse(rate_train, train_preds)}')
print(f'Validation rmse: {rmse(rate_val, val_preds)}\n')
print(f'Training r-squared: {r2_score(rate_train, train_preds)}')
print(f'Validation r-squared: {r2_score(rate_val, val_preds)}')

Training rmse: 4.404163801880641
Validation rmse: 4.416590789618043

Training r-squared: 0.28248755123674407
Validation r-squared: 0.2804741184879467


## Random Forest

In [5]:
# Base forest whose fixed settings (tree count, seed, parallelism) are shared
# by every candidate evaluated in the search.
rf = RandomForestRegressor(n_estimators=100, random_state=3, n_jobs=-1)
rf_params = {'max_features': [0.9, 0.8, 0.7],
             'max_depth': [9, 12, 15, 18],
             'min_samples_leaf': [3, 5, 8, 10, 12]}

# Sample 5 of the 60 possible combinations and score each with 5-fold CV.
rf_rs = RandomizedSearchCV(rf,
                           rf_params,
                           n_iter=5,
                           scoring='neg_mean_squared_error',
                           cv=5,
                           random_state=6)
rf_rs.fit(lending_train, rate_train)

rf_rs_best_params = rf_rs.best_params_
print(f'Best parameters: {rf_rs_best_params}')

# Use the estimator the search refit on the full training set (refit=True is
# the default). Rebuilding a forest from `best_params_` alone would drop the
# base estimator's n_estimators=100 and n_jobs=-1, so the persisted model
# could differ from the configuration that was actually validated, and the
# forest would be trained a second time for nothing.
rf_rs_best = rf_rs.best_estimator_

#persist model for future use
joblib.dump(rf_rs_best, 'rf_regression.pkl')

Best parameters: {'min_samples_leaf': 3, 'max_features': 0.9, 'max_depth': 15}


['rf_regression.pkl']

In [6]:
# Predict with the tuned forest on the data it was fit to and on the
# held-out validation split.
rf_train_preds = rf_rs_best.predict(lending_train)
rf_val_preds = rf_rs_best.predict(lending_val)

# Compute every metric up front, then report them in one place.
rf_train_rmse = rmse(rate_train, rf_train_preds)
rf_val_rmse = rmse(rate_val, rf_val_preds)
rf_train_r2 = r2_score(rate_train, rf_train_preds)
rf_val_r2 = r2_score(rate_val, rf_val_preds)

print(f'Training rmse: {rf_train_rmse}')
print(f'Validation rmse: {rf_val_rmse}\n')
print(f'Training r-squared: {rf_train_r2}')
print(f'Validation r-squared: {rf_val_r2}')

Training rmse: 3.256507655047351
Validation rmse: 3.960998550183124

Training r-squared: 0.6077104665237907
Validation r-squared: 0.4212627154472437
