In [None]:
# Original Code source: Jaques Grobler: https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
# License: BSD 3 clause
# Modified by SR

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, \
                            r2_score, root_mean_squared_error


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

## The dataset

https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset

Ten baseline variables, age, sex, body mass index, average blood pressure, and six blood serum measurements were obtained for each of n = 442 diabetes patients, as well as the response of interest, a quantitative measure of disease progression one year after baseline.

In [None]:
# Fetch the diabetes data as pandas objects, keeping the raw (unscaled) feature values
diabetes_X, diabetes_y = datasets.load_diabetes(
    return_X_y=True,
    as_frame=True,
    scaled=False,
)

# Preview the first rows of the feature table
diabetes_X.head()

In [None]:
# Preview the first few values of the target
diabetes_y.head()

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
(
    diabetes_X_train,
    diabetes_X_test,
    diabetes_y_train,
    diabetes_y_test,
) = train_test_split(diabetes_X, diabetes_y, test_size=0.25, random_state=42)

In [None]:
# Display the training features produced by the split
diabetes_X_train

In [None]:
# Display the training targets produced by the split
diabetes_y_train

In [None]:
# Explore distribution of values being predicted
# (training targets only; the held-out test targets are not inspected here)
diabetes_y_train.hist()

In [None]:
# Box plot of the training targets; the Series is wrapped in a DataFrame before plotting
pd.DataFrame(diabetes_y_train).plot(kind="box")

## Storing results

Create a list to store results in.

In [None]:
# Each model cell below appends one dict with the keys 'model', 'RMSE' and 'MAPE'.
# Re-run this cell before re-running model cells: every re-run of a model cell
# appends another entry, which would otherwise show up as a duplicate row.
results_list = []

## Linear Regression

In [None]:
def standardise_data(X_train, X_test):
    """Standardise features using statistics learned from the training set only.

    A single ``StandardScaler`` is fitted on ``X_train`` and that same fitted
    scaler is then applied to ``X_test``. Re-fitting on the test set (as a
    second ``fit_transform`` call would do) scales the two sets with different
    statistics, so a model trained on one is evaluated on inputs in a
    different scale, and test-set information leaks into preprocessing.

    Parameters
    ----------
    X_train : array-like of shape (n_train_samples, n_features)
        Training features; used to fit the scaler.
    X_test : array-like of shape (n_test_samples, n_features)
        Test features; transformed with the training statistics.

    Returns
    -------
    train_std, test_std
        The standardised training and test features, in that order.
    """
    # Initialise a new scaling object for normalising input data
    sc = StandardScaler()

    # Learn the scaling from the training data and apply it there
    train_std = sc.fit_transform(X_train)
    # Apply the already-fitted scaling to the test data (no re-fitting)
    test_std = sc.transform(X_test)

    return train_std, test_std

# Standardised copies of the feature splits, used by the linear models below
diabetes_X_train_standardised, diabetes_X_test_standardised = standardise_data(
    X_train=diabetes_X_train,
    X_test=diabetes_X_test,
)

In [None]:
# Use a single feature: column index 2 of the standardised arrays, i.e. the
# third column, not the first (body mass index, going by the variable order
# given in the dataset description above).
# Indexing with np.newaxis keeps the result 2-D with shape (n_samples, 1).
single_feat_train = diabetes_X_train_standardised[:, np.newaxis, 2]
single_feat_test = diabetes_X_test_standardised[:, np.newaxis, 2]

# Create linear regression object
regr = LinearRegression()

# Train the model using the training sets
regr.fit(single_feat_train, diabetes_y_train)

# Make predictions using the testing set
# NOTE: diabetes_y_pred is reassigned by every model cell; the plotting calls
# read whichever predictions were assigned most recently.
diabetes_y_pred = regr.predict(single_feat_test)

# Test-set metrics, printed for quick inspection
print(f"Mean absolute error: {mean_absolute_error(diabetes_y_test, diabetes_y_pred):.2f}")

print(f"Mean absolute percentage error: {mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred):.2%}" )

print("Root Mean squared error: %.2f" % root_mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred))

# Record RMSE and MAPE for the comparison table built at the end of the notebook
results_list.append(
    {'model': 'Linear Regression',
     'RMSE': root_mean_squared_error(diabetes_y_test, diabetes_y_pred),
     'MAPE': mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred)}
)

# Plot outputs
def plot_actual_vs_predicted(actual, predicted):
    """Scatter predicted values against true values with a slope-1 reference line.

    Parameters
    ----------
    actual : array-like
        True target values (plotted on the x-axis).
    predicted : array-like
        Model predictions for the same samples (plotted on the y-axis).

    Shows the figure and returns nothing.
    """
    _fig, axes = plt.subplots(figsize=(6, 6))

    axes.scatter(actual, predicted, color="black")
    # Reference line through (1, 1) with slope 1: points on it are exact predictions
    axes.axline((1, 1), slope=1)

    axes.set_xlabel('True Values')
    axes.set_ylabel('Predicted Values')
    axes.set_title('True vs Predicted Values')
    plt.show()

def plot_residuals(actual, predicted):
    """Plot a histogram of the residuals (actual minus predicted).

    Parameters
    ----------
    actual : array-like
        True target values.
    predicted : array-like
        Model predictions for the same samples.

    A vertical red line marks zero error. Shows the figure and returns nothing.
    """
    errors = actual - predicted

    _fig, axes = plt.subplots(figsize=(10, 5))
    axes.hist(errors, bins=20)
    # Zero-error marker: mass to one side indicates systematic over/under-prediction
    axes.axvline(x=0, color='r')
    axes.set_xlabel('Residual')
    axes.set_ylabel('Frequency')
    axes.set_title('Distribution of Residuals')
    plt.show()

# True vs predicted scatter for the single-feature model's test predictions
plot_actual_vs_predicted(diabetes_y_test, diabetes_y_pred)

In [None]:
# Residual histogram for whatever diabetes_y_pred currently holds
# (the single-feature linear model, when the cells are run in order)
plot_residuals(diabetes_y_test, diabetes_y_pred)

## Multiple Linear Regression

In [None]:
# Create linear regression object
# (rebinds `regr`, replacing the single-feature model from the earlier cell)
regr = LinearRegression()

# Train the model using the training sets
# (all standardised feature columns this time, rather than one)
regr.fit(diabetes_X_train_standardised, diabetes_y_train)

# Make predictions using the testing set
# NOTE: diabetes_y_pred is reassigned by every model cell; the plotting calls
# read whichever predictions were assigned most recently.
diabetes_y_pred = regr.predict(diabetes_X_test_standardised)

# Test-set metrics, printed for quick inspection
print(f"Mean absolute error: {mean_absolute_error(diabetes_y_test, diabetes_y_pred):.2f}")

print(f"Mean absolute percentage error: {mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred):.2%}" )

print("Root Mean squared error: %.2f" % root_mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred))

# Record RMSE and MAPE for the comparison table built at the end of the notebook
results_list.append(
    {'model': 'Multiple Linear Regression',
     'RMSE': root_mean_squared_error(diabetes_y_test, diabetes_y_pred),
     'MAPE': mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred)}
)

# True vs predicted scatter for this model's test predictions
plot_actual_vs_predicted(diabetes_y_test, diabetes_y_pred)

In [None]:
# Residual histogram for whatever diabetes_y_pred currently holds
# (the multiple linear regression, when the cells are run in order)
plot_residuals(diabetes_y_test, diabetes_y_pred)

## Decision Tree

In [None]:
# Decision tree regressor with a fixed seed for reproducibility
regr_dt = DecisionTreeRegressor(random_state=42)

# Train the model using the training sets
# (the unscaled feature frame, not the standardised arrays)
regr_dt.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
# NOTE: diabetes_y_pred is reassigned by every model cell; the plotting calls
# read whichever predictions were assigned most recently.
diabetes_y_pred = regr_dt.predict(diabetes_X_test)

# Test-set metrics, printed for quick inspection
print(f"Mean absolute error: {mean_absolute_error(diabetes_y_test, diabetes_y_pred):.2f}")


print(f"Mean absolute percentage error: {mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred):.2%}" )

print("Root Mean squared error: %.2f" % root_mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred))

# Record RMSE and MAPE for the comparison table built at the end of the notebook
results_list.append(
    {'model': 'Decision Tree',
     'RMSE': root_mean_squared_error(diabetes_y_test, diabetes_y_pred),
     'MAPE': mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred)}
)

# True vs predicted scatter for this model's test predictions
plot_actual_vs_predicted(diabetes_y_test, diabetes_y_pred)

In [None]:
# Residual histogram for whatever diabetes_y_pred currently holds
# (the decision tree, when the cells are run in order)
plot_residuals(diabetes_y_test, diabetes_y_pred)

## Random Forest

In [None]:
# Random forest regressor with a fixed seed for reproducibility
regr_rf = RandomForestRegressor(random_state=42)

# Train the model using the training sets
# (the unscaled feature frame, not the standardised arrays)
regr_rf.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
# NOTE: diabetes_y_pred is reassigned by every model cell; the plotting calls
# read whichever predictions were assigned most recently.
diabetes_y_pred = regr_rf.predict(diabetes_X_test)

# Test-set metrics, printed for quick inspection
print(f"Mean absolute error: {mean_absolute_error(diabetes_y_test, diabetes_y_pred):.2f}")


print(f"Mean absolute percentage error: {mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred):.2%}" )

print("Root Mean squared error: %.2f" % root_mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred))

# Record RMSE and MAPE for the comparison table built at the end of the notebook
results_list.append(
    {'model': 'Random Forest',
     'RMSE': root_mean_squared_error(diabetes_y_test, diabetes_y_pred),
     'MAPE': mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred)}
)

# True vs predicted scatter for this model's test predictions
plot_actual_vs_predicted(diabetes_y_test, diabetes_y_pred)

In [None]:
# Residual histogram for whatever diabetes_y_pred currently holds
# (the random forest, when the cells are run in order)
plot_residuals(diabetes_y_test, diabetes_y_pred)

## XGBoost

In [None]:
# XGBoost regressor with a fixed seed for reproducibility
regr_xg = XGBRegressor(random_state=42)

# Train the model using the training sets
# (the unscaled feature frame, not the standardised arrays)
regr_xg.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
# NOTE: diabetes_y_pred is reassigned by every model cell; the plotting calls
# read whichever predictions were assigned most recently.
diabetes_y_pred = regr_xg.predict(diabetes_X_test)

# Test-set metrics, printed for quick inspection
print(f"Mean absolute error: {mean_absolute_error(diabetes_y_test, diabetes_y_pred):.2f}")


print(f"Mean absolute percentage error: {mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred):.2%}" )

print("Root Mean squared error: %.2f" % root_mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred))

# Record RMSE and MAPE for the comparison table built at the end of the notebook
results_list.append(
    {'model': 'XGBoost',
     'RMSE': root_mean_squared_error(diabetes_y_test, diabetes_y_pred),
     'MAPE': mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred)}
)

# True vs predicted scatter for this model's test predictions
plot_actual_vs_predicted(diabetes_y_test, diabetes_y_pred)

In [None]:
# Residual histogram for whatever diabetes_y_pred currently holds
# (XGBoost, when the cells are run in order)
plot_residuals(diabetes_y_test, diabetes_y_pred)

## Light GBM

In [None]:
# LightGBM regressor with a fixed seed for reproducibility
regr_lgbm = LGBMRegressor(random_state=42)

# Train the model using the training sets
# (the unscaled feature frame, not the standardised arrays)
regr_lgbm.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
# NOTE: diabetes_y_pred is reassigned by every model cell; the plotting calls
# read whichever predictions were assigned most recently.
diabetes_y_pred = regr_lgbm.predict(diabetes_X_test)

# Test-set metrics, printed for quick inspection
print(f"Mean absolute error: {mean_absolute_error(diabetes_y_test, diabetes_y_pred):.2f}")


print(f"Mean absolute percentage error: {mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred):.2%}" )

print("Root Mean squared error: %.2f" % root_mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred))

# Record RMSE and MAPE for the comparison table built at the end of the notebook
results_list.append(
    {'model': 'Light GBM',
     'RMSE': root_mean_squared_error(diabetes_y_test, diabetes_y_pred),
     'MAPE': mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred)}
)

# True vs predicted scatter for this model's test predictions
plot_actual_vs_predicted(diabetes_y_test, diabetes_y_pred)

In [None]:
# Residual histogram for whatever diabetes_y_pred currently holds
# (LightGBM, when the cells are run in order)
plot_residuals(diabetes_y_test, diabetes_y_pred)

## CatBoost

In [None]:
# CatBoost regressor with a fixed seed for reproducibility.
# verbose=0 turns off CatBoost's per-iteration training log, which would
# otherwise fill the cell output with one line per boosting iteration; it has
# no effect on the fitted model or its predictions.
regr_catboost = CatBoostRegressor(random_state=42, verbose=0)

# Train the model using the training sets
# (the unscaled feature frame, not the standardised arrays)
regr_catboost.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
# NOTE: diabetes_y_pred is reassigned by every model cell; the plotting calls
# read whichever predictions were assigned most recently.
diabetes_y_pred = regr_catboost.predict(diabetes_X_test)

# Test-set metrics, printed for quick inspection
print(f"Mean absolute error: {mean_absolute_error(diabetes_y_test, diabetes_y_pred):.2f}")


print(f"Mean absolute percentage error: {mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred):.2%}" )

print("Root Mean squared error: %.2f" % root_mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred))

# Record RMSE and MAPE for the comparison table built at the end of the notebook
results_list.append(
    {'model': 'CatBoost',
     'RMSE': root_mean_squared_error(diabetes_y_test, diabetes_y_pred),
     'MAPE': mean_absolute_percentage_error(diabetes_y_test, diabetes_y_pred)}
)

# True vs predicted scatter for this model's test predictions
plot_actual_vs_predicted(diabetes_y_test, diabetes_y_pred)

In [None]:
# Residual histogram for whatever diabetes_y_pred currently holds
# (CatBoost, when the cells are run in order)
plot_residuals(diabetes_y_test, diabetes_y_pred)

# Compare Results

In [None]:
# Build the comparison table: one row per dict appended to results_list
results_df = pd.DataFrame(results_list)

In [None]:
# Rank the models by MAPE (sort_values sorts ascending by default, so the smallest error comes first)
results_df.sort_values('MAPE')

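Let's remind ourselves: what is the scale of the target variable we are predicting? Error metrics such as RMSE are expressed in the target's units, so they need to be read against this scale.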

In [None]:
# Histogram of the test-set targets
diabetes_y_test.hist()

In [None]:
# Histogram of the training targets, for comparison with the test-set histogram
diabetes_y_train.hist()

## Plotting Residuals

Plotting the residuals (errors) can help us understand whether the model is consistently over- or under-estimating.