In [1]:
#Load dataset
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes
import pandas as pd
import numpy as np

# Load the scikit-learn diabetes dataset and wrap it in pandas objects:
#   df - feature table with the dataset's own column names
#   X  - independent copy of the features used for modelling
#   y  - single-column frame ('target') holding the regression target
diabetes = load_diabetes()
df = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)
X = df.copy()
y = pd.DataFrame({'target': diabetes.target.copy()})

In [2]:
#Prepare train and test sets for feature selection
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=123,
)
print(f'Train sets shapes: {X_train.shape}, {y_train.shape}')

Train sets shapes: (375, 10), (375, 1)


In [3]:
#Import necessary components
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier


In [4]:
# Registry of candidate models, keyed by a display name.
# Nothing is fitted here for the untuned models; they are only instantiated.
models = {}

lr = LinearRegression()
rf = RandomForestRegressor(random_state=123)
gb = GradientBoostingRegressor(random_state=123)
models.update({
    'LinearRegression': lr,
    'RandomForestRegressor': rf,
    'GradientBoostingRegressor': gb,
})


# Hyperparameter grids for the tuned variants.
# max_features is capped at 10 because the training data has 10 columns;
# the previous value of 80 exceeded the number of available features.
params_rfr = {'n_estimators': [10, 50], 'max_depth': [2, 4], 'max_features': [5, 10], 'min_samples_leaf': [3, 6], 'random_state': [123]}

params_gbr = {'n_estimators': [10, 50], 'max_depth': [2, 4], 'max_features': [5, 10], 'min_samples_leaf': [5, 10], 'random_state': [123]}

rfr = RandomForestRegressor(random_state=123)
gbr = GradientBoostingRegressor(random_state=123)

# y_train is a one-column DataFrame; flatten it so the estimators receive a
# 1-D target instead of a column vector (avoids DataConversionWarning).
y_train_1d = np.ravel(y_train)

# Settings shared by both searches: 5-fold CV, select by mean absolute error,
# refit the best candidate on the full training set.
search_kwargs = dict(scoring='neg_mean_absolute_error', n_jobs=4, cv=5, refit=True, return_train_score=True)

grid_rfr = GridSearchCV(estimator=rfr, param_grid=params_rfr, **search_kwargs)
grid_rfr.fit(X_train, y_train_1d)
models.update({'RandomForestRegressor_tuned': grid_rfr.best_estimator_})
print(grid_rfr.best_estimator_)

grid_gbr = GridSearchCV(estimator=gbr, param_grid=params_gbr, **search_kwargs)
grid_gbr.fit(X_train, y_train_1d)
models.update({'GradientBoostingRegressor_tuned': grid_gbr.best_estimator_})
print(grid_gbr.best_estimator_)

RandomForestRegressor(max_depth=4, max_features=5, min_samples_leaf=6,
                      n_estimators=10, random_state=123)
GradientBoostingRegressor(max_depth=2, max_features=5, min_samples_leaf=10,
                          n_estimators=50, random_state=123)


In [5]:
# Baseline: mean 5-fold cross-validated R^2 of a plain linear regression on the
# training set. Note that 'test_score' here is the cross-validation fold score,
# not the score on the held-out X_test.
cv = cross_validate(LinearRegression(), X_train, y_train, cv=5, scoring='r2', return_train_score=True)
print('test_score: ', cv['test_score'].mean(), ' train_score: ', cv['train_score'].mean())

# Fit every registered model on the training set and record its R^2
# (the regressors' default .score metric) on the train and test splits.

# The targets are one-column DataFrames; flatten them so the estimators
# receive 1-D targets instead of column vectors.
y_train_1d = np.ravel(y_train)
y_test_1d = np.ravel(y_test)

# Collect one record per model and build the frame once, rather than growing
# a DataFrame row by row inside the loop.
score_rows = []
for model_name, model in models.items():
    model.fit(X_train, y_train_1d)
    score_rows.append({
        'model': model_name,
        'test_score': model.score(X_test, y_test_1d),
        'train_score': model.score(X_train, y_train_1d),
    })

dataset_scores = pd.DataFrame(score_rows, columns=['model', 'test_score', 'train_score'])

dataset_scores.sort_values('test_score', ascending=False)

test_score:  0.4629812448078895  train_score:  0.503872956635694


Unnamed: 0,model,test_score,train_score
4,GradientBoostingRegressor_tuned,0.592157,0.59593
0,LinearRegression,0.59105,0.501008
3,RandomForestRegressor_tuned,0.583714,0.579219
2,GradientBoostingRegressor,0.568998,0.807729
1,RandomForestRegressor,0.548415,0.915738
