# 07. Tree algorithms

## Data preparation

Import libraries and scaler:

In [124]:
# Import the third party libraries
import joblib
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy.stats import randint, uniform
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
# Import scaler
scaler = joblib.load("pkls/scaler.pkl")

Import the data from the csv-file:

In [2]:
df_grouped = pd.read_csv("csv/df_grouped.csv", index_col=0)

In [3]:
df_grouped

Unnamed: 0,name,season,pts,reb,ast,blk,fga,fgm,fta,ftm,...,reb_scaled,ast_scaled,blk_scaled,fga_scaled,fgm_scaled,fta_scaled,ftm_scaled,turnover_scaled,age_scaled,eff_scaled.1
0,A.C. Green,1985,343.0,242.0,36.0,32.0,228.0,130.0,130.0,82.0,...,0.584318,-0.464939,0.510461,-0.269465,-0.069449,0.373497,0.160376,0.219002,-1.181264,-0.021090
1,A.C. Green,1986,696.0,507.0,65.0,58.0,478.0,257.0,226.0,182.0,...,2.358664,-0.180549,1.478240,0.615634,0.882597,1.306460,1.403696,0.590877,-0.936562,1.218922
2,A.C. Green,1987,834.0,616.0,81.0,32.0,561.0,284.0,346.0,266.0,...,3.088489,-0.023644,0.510461,0.909486,1.085000,2.472664,2.448084,1.045391,-0.447157,1.152998
3,A.C. Green,1988,831.0,542.0,75.0,38.0,563.0,295.0,302.0,241.0,...,2.593012,-0.082484,0.733795,0.916567,1.167461,2.045056,2.137254,0.818134,-0.447157,1.476769
4,A.C. Green,1989,728.0,494.0,64.0,36.0,553.0,255.0,276.0,207.0,...,2.271621,-0.190356,0.659350,0.881163,0.867604,1.792378,1.714526,0.694175,-0.202455,1.084648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15576,Zydrunas Ilgauskas,2008,533.0,323.0,46.0,50.0,463.0,217.0,111.0,91.0,...,1.126665,-0.366874,1.180462,0.562528,0.582740,0.188848,0.272275,0.198342,1.510459,1.145182
15577,Zydrunas Ilgauskas,2009,351.0,264.0,39.0,43.0,336.0,147.0,63.0,48.0,...,0.731622,-0.435519,0.919906,0.112898,0.057990,-0.277634,-0.262352,-0.194193,1.999864,0.040535
15578,Zydrunas Ilgauskas,2010,235.0,188.0,16.0,40.0,199.0,104.0,35.0,27.0,...,0.222753,-0.661070,0.808239,-0.372136,-0.264356,-0.549748,-0.523449,-0.214853,1.999864,-0.240238
15579,Zylan Cheatham,2019,10.0,6.0,2.0,0.0,7.0,5.0,0.0,0.0,...,-0.995855,-0.798362,-0.680650,-1.051892,-1.006502,-0.889891,-0.859146,-1.041242,-0.691859,-0.936876


## Feature engineering

Use pts, reb, ast, blk, fga, fgm, fta, ftm, turnover, age as the features, scale the x and y, and create a dataframe of scaled features:

In [4]:
scaled_features = ["pts_scaled", "reb_scaled", "ast_scaled", "blk_scaled", "fga_scaled", "fgm_scaled", "fta_scaled", "ftm_scaled", "turnover_scaled", "age_scaled"]
x_scaled = df_grouped[scaled_features]
y_scaled = df_grouped["eff_scaled"]

## Data splitting

Split the data on the training, validation and test datasets (80/10/10):

In [5]:
x_train, x_temp, y_train, y_temp = train_test_split(x_scaled.values.reshape(-1, 10), y_scaled.values.reshape(-1, 1), test_size=0.2, random_state=38)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=38)
((x_train.shape, y_train.shape), (x_val.shape, y_val.shape), (x_test.shape, y_test.shape))

(((12464, 10), (12464, 1)), ((1558, 10), (1558, 1)), ((1559, 10), (1559, 1)))

## Decision Tree (sklearn)

### Build and Train the Model

Built and train the model using Decision Tree:

In [6]:
model = DecisionTreeRegressor()
model.fit(x_train, y_train)

### Evaluate the Model

In the scaled values:

In [7]:
y_pred_train = model.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 7.849707113940209e-05
MSE: 7.780738380826839e-06
RMSE: 0.0027893974942318346
R2-score: 0.9999922556363334


In [8]:
y_pred_val = model.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.3631503649802119
MSE: 0.2808612214635307
RMSE: 0.5299634152123435
R2-score: 0.7255873681531932


In [9]:
y_pred_test = model.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.3604379935893076
MSE: 0.28110906890770576
RMSE: 0.5301971981326437
R2-score: 0.6994403713688133


In the original values:

In [10]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.0004847357214175801
MSE: 0.0002967043801188997
RMSE: 0.017225109001655105
R2-score: 0.9999922556363334


In [11]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 2.2425289453045902
MSE: 10.710134505887128
RMSE: 3.272634184550288
R2-score: 0.7255873681531932


In [12]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 2.225779488492533
MSE: 10.719585719729404
RMSE: 3.274077842649653
R2-score: 0.6994403713688133


In [13]:
df_metrics_dt = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_dt.round(4)

Unnamed: 0,train,val,test
MAE,0.0005,2.2425,2.2258
MSE,0.0003,10.7101,10.7196
RMSE,0.0172,3.2726,3.2741
R2,1.0,0.7256,0.6994


### Estimate Hyperparameters

We can see significant overfitting. Use RandomizedSearchCV to estimate approximatelly good hyperparameters that reduce overfitting.

In [15]:
param_distributions = {
    "max_depth": randint(8, 10),
    "min_samples_split": randint(20, 100),
    "min_samples_leaf": randint(10, 100),
}
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=1000, cv=5, random_state=38)
random_search.fit(x_train, y_train)
print(random_search.best_params_)


{'max_depth': 9, 'min_samples_leaf': 42, 'min_samples_split': 33}


In the scaled values:

In [16]:
y_pred_train = random_search.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.24718412525117936
MSE: 0.12537772012277845
RMSE: 0.3540871645834941
R2-score: 0.8752084168892726


In [17]:
y_pred_val = random_search.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.2750648069941231
MSE: 0.16374349460551824
RMSE: 0.4046523132338653
R2-score: 0.8400160653423343


In [18]:
y_pred_test = random_search.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.2668681373980147
MSE: 0.14421170090808166
RMSE: 0.37975215721320355
R2-score: 0.8458099717749211


In the original values:

In [19]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.5264133239292483
MSE: 4.781052505430302
RMSE: 2.1865618000482634
R2-score: 0.8752084168892726


In [20]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.6985823256781065
MSE: 6.244061898437694
RMSE: 2.4988120974650525
R2-score: 0.8400160653423343


In [21]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.6479661881303092
MSE: 5.49925228552377
RMSE: 2.3450484612314026
R2-score: 0.8458099717749212


In [21]:
df_metrics_rs = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_rs.round(4)

Unnamed: 0,train,val,test
MAE,1.5264,1.6986,1.648
MSE,4.7811,6.2441,5.4993
RMSE,2.1866,2.4988,2.345
R2,0.8752,0.84,0.8458


### Evaluate the Model with the New Hyperparameters

Using approximate estimation of hyperparameters, manual settings help to decrease overfitting even more:

In [22]:
params = {"max_depth": 5, "min_samples_leaf": 35, "min_samples_split": 80}
model = DecisionTreeRegressor(**params)
model.fit(x_train, y_train)

In the scaled values:

In [23]:
y_pred_train = model.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.2734420146039933
MSE: 0.15052084312756708
RMSE: 0.38797015752189895
R2-score: 0.8501828372165626


In [24]:
y_pred_val = model.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.2875313226543811
MSE: 0.17565667755400088
RMSE: 0.41911415814071573
R2-score: 0.8283764097518237


In [25]:
y_pred_test = model.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.28034908680837467
MSE: 0.15414513457780082
RMSE: 0.3926132124340708
R2-score: 0.8351892218062194


In the original values:

In [26]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.6885612455470584
MSE: 5.739840008653908
RMSE: 2.395796320360708
R2-score: 0.8501828372165627


In [27]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.7755656496980197
MSE: 6.6983495751295
RMSE: 2.5881169940961906
R2-score: 0.8283764097518237


In [28]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.7312138512975113
MSE: 5.878045805517824
RMSE: 2.4244681490004822
R2-score: 0.8351892218062194


In [29]:
df_metrics_dt = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_dt.round(4)

Unnamed: 0,train,val,test
MAE,1.6886,1.7756,1.7312
MSE,5.7398,6.6983,5.878
RMSE,2.3958,2.5881,2.4245
R2,0.8502,0.8284,0.8352


## Random Forest (sklearn)

### Build and train the model

Built and train the model using Random Forest:

In [30]:
model = RandomForestRegressor()
model.fit(x_train, y_train.ravel())

### Evaluate the Model

In the scaled values:

In [31]:
y_pred_train = model.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.0907532351212552
MSE: 0.017167182691676122
RMSE: 0.1310235959347633
R2-score: 0.982913073363055


In [32]:
y_pred_val = model.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.256701104048926
MSE: 0.14568400418359717
RMSE: 0.38168574008416556
R2-score: 0.8576609088371671


In [33]:
y_pred_test = model.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.24101478196858925
MSE: 0.12193297962272535
RMSE: 0.3491890313608452
R2-score: 0.8696302071800723


In the original values:

In [34]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.5604200801244893
MSE: 0.6546394506044791
RMSE: 0.8090979239897227
R2-score: 0.982913073363055


In [35]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.5851826450807218
MSE: 5.555395907032151
RMSE: 2.3569887371457994
R2-score: 0.8576609088371671


In [36]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.4883163475280745
MSE: 4.649693559182052
RMSE: 2.156314809850837
R2-score: 0.8696302071800723


In [37]:
df_metrics_rf = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_rf.round(4)

Unnamed: 0,train,val,test
MAE,0.5604,1.5852,1.4883
MSE,0.6546,5.5554,4.6497
RMSE,0.8091,2.357,2.1563
R2,0.9829,0.8577,0.8696


### Estimate Hyperparameters Using RandomizedSearchCV

We can see significant overfitting. Use RandomizedSearchCV to estimate approximatelly good hyperparameters that reduce overfitting.

In [38]:
param_distributions = {
    "max_depth": randint(6, 10),
    "min_samples_split": randint(20, 100),
    "min_samples_leaf": randint(10, 100),
}
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=100, cv=5, random_state=38)
random_search.fit(x_train, y_train.ravel())
print(random_search.best_params_)

{'max_depth': 9, 'min_samples_leaf': 14, 'min_samples_split': 28}


In the scaled values:

In [39]:
y_pred_train = random_search.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.2203255628443258
MSE: 0.10239904664889629
RMSE: 0.31999851038543337
R2-score: 0.8980796657665224


In [40]:
y_pred_val = random_search.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.25749061978298565
MSE: 0.1480215327690686
RMSE: 0.3847356660995554
R2-score: 0.8553770500409479


In [41]:
y_pred_test = random_search.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.24355410288337082
MSE: 0.12450516092787307
RMSE: 0.352852888507198
R2-score: 0.8668800509476489


In the original values:

In [42]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.3605561214177742
MSE: 3.904802368833588
RMSE: 1.9760572787329795
R2-score: 0.8980796657665224


In [43]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.5900580687540495
MSE: 5.644533330245292
RMSE: 2.3758226638882984
R2-score: 0.8553770500409479


In [44]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.5039971817002373
MSE: 4.74777903929251
RMSE: 2.1789398888662603
R2-score: 0.8668800509476489


In [45]:
df_metrics_rs = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_rs.round(4)

Unnamed: 0,train,val,test
MAE,1.3606,1.5901,1.504
MSE,3.9048,5.6445,4.7478
RMSE,1.9761,2.3758,2.1789
R2,0.8981,0.8554,0.8669


### Estimate Hyperparameters Using GridSearchCV

Check the hyperparameters in a more narrow range:

In [46]:
param_grid = {
    "max_depth": [9, 10],
    "min_samples_split": [25, 28, 30],
    "min_samples_leaf": [10, 14, 15]
}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(x_train, y_train.ravel())
print(grid_search.best_params_)

{'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 25}


In the scaled values:

In [47]:
y_pred_train = grid_search.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.20994798106955623
MSE: 0.09381951412332029
RMSE: 0.30629971290113917
R2-score: 0.9066190892395936


In [48]:
y_pred_val = grid_search.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.2553836032946158
MSE: 0.14656240112049898
RMSE: 0.38283469163661094
R2-score: 0.8568026799438889


In [49]:
y_pred_test = grid_search.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.24157555285996987
MSE: 0.12311902554707918
RMSE: 0.35088320784426147
R2-score: 0.8683620961086349


In the original values:

In [50]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.2964723980998747
MSE: 3.577637419298239
RMSE: 1.8914643584530582
R2-score: 0.9066190892395936


In [51]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.5770468042227286
MSE: 5.588891984898488
RMSE: 2.3640837516675437
R2-score: 0.8568026799438889


In [52]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.4917792242779684
MSE: 4.69492127454196
RMSE: 2.166776701587397
R2-score: 0.8683620961086349


In [53]:
df_metrics_gs = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_gs.round(4)

Unnamed: 0,train,val,test
MAE,1.2965,1.577,1.4918
MSE,3.5776,5.5889,4.6949
RMSE,1.8915,2.3641,2.1668
R2,0.9066,0.8568,0.8684


### Evaluate the Model with the New Hyperparameters

Using approximate estimation of hyperparameters, manual settings help to decrease overfitting even more:

In [54]:
params = {"max_depth": 6, "min_samples_leaf": 10, "min_samples_split": 25}
model = RandomForestRegressor(**params)
model.fit(x_train, y_train.ravel())

In the scaled values:

In [55]:
y_pred_train = model.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.2471388767865515
MSE: 0.12529816558487397
RMSE: 0.3539748092518364
R2-score: 0.8752875995121423


In [56]:
y_pred_val = model.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.26953403867340875
MSE: 0.15797099404146833
RMSE: 0.3974556504082793
R2-score: 0.8456560289651651


In [57]:
y_pred_test = model.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.2572152831224261
MSE: 0.13558110884040953
RMSE: 0.36821340122327095
R2-score: 0.8550377336426056


In the original values:

In [58]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.5261339052601686
MSE: 4.778018837068856
RMSE: 2.1858679825343654
R2-score: 0.8752875995121423


In [59]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.6644286823253003
MSE: 6.023938033868119
RMSE: 2.4543712094685515
R2-score: 0.845656028965165


In [60]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.5883578076761293
MSE: 5.170140272734873
RMSE: 2.2737942459103184
R2-score: 0.8550377336426056


In [61]:
df_metrics_rf = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_rf.round(4)

Unnamed: 0,train,val,test
MAE,1.5261,1.6644,1.5884
MSE,4.778,6.0239,5.1701
RMSE,2.1859,2.4544,2.2738
R2,0.8753,0.8457,0.855


## Gradient Boosted Trees (sklearn)

### Build and train the model

Built and train the model using GradienBoosting:

In [62]:
model = GradientBoostingRegressor()
model.fit(x_train, y_train.ravel())

### Evaluate the Model

In the scaled values:

In [63]:
y_pred_train = model.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.23425869639270977
MSE: 0.11412675194165171
RMSE: 0.3378265116027037
R2-score: 0.8864067871378009


In [64]:
y_pred_val = model.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.2598401514678394
MSE: 0.15160273712575173
RMSE: 0.3893619615804191
R2-score: 0.8518780703399489


In [65]:
y_pred_test = model.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.24327959798555399
MSE: 0.12521079553364822
RMSE: 0.35385137492123475
R2-score: 0.8661255919190405


In the original values:

In [66]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.446596115574872
MSE: 4.3520171907171585
RMSE: 2.0861488898727143
R2-score: 0.8864067871378009


In [67]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.60456691500423
MSE: 5.781096078756041
RMSE: 2.4043909995581085
R2-score: 0.8518780703399489


In [68]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.5023020569300456
MSE: 4.774687138247858
RMSE: 2.1851057499004156
R2-score: 0.8661255919190406


In [69]:
df_metrics_gb = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_gb.round(4)

Unnamed: 0,train,val,test
MAE,1.4466,1.6046,1.5023
MSE,4.352,5.7811,4.7747
RMSE,2.0861,2.4044,2.1851
R2,0.8864,0.8519,0.8661


### Estimate Hyperparameters Using RandomizedSearchCV

In this case, the model even with default hyperparameters has significantly less overfitting. However, we try to use RandomizedSearchCV to improve the val and test metrics.

In [70]:
param_distributions = {
    "max_depth": randint(6, 10),
    "min_samples_split": randint(20, 100),
    "min_samples_leaf": randint(10, 100),
    "max_features": [None, "sqrt"]
}
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=100, cv=5, random_state=38)
random_search.fit(x_train, y_train.ravel())
print(random_search.best_params_)


{'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 30, 'min_samples_split': 66}


In the scaled values:

In [71]:
y_pred_train = random_search.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.18111033589630157
MSE: 0.07052576114353447
RMSE: 0.2655668675560535
R2-score: 0.9298039446356807


In [72]:
y_pred_val = random_search.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.2451198254326184
MSE: 0.13711995603003696
RMSE: 0.370297118581872
R2-score: 0.8660283259581041


In [73]:
y_pred_test = random_search.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.23308161146287987
MSE: 0.11611355052739405
RMSE: 0.34075438445806394
R2-score: 0.8758522954767418


In the original values:

In [74]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.1183939483674321
MSE: 2.689372295831169
RMSE: 1.6399305765279117
R2-score: 0.9298039446356807


In [75]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.5136658437080395
MSE: 5.228821426007065
RMSE: 2.2866616334751115
R2-score: 0.8660283259581041


In [76]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.4393273716032964
MSE: 4.427780159981926
RMSE: 2.1042291129964736
R2-score: 0.8758522954767418


In [77]:
df_metrics_rs = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_rs.round(4)

Unnamed: 0,train,val,test
MAE,1.1184,1.5137,1.4393
MSE,2.6894,5.2288,4.4278
RMSE,1.6399,2.2867,2.1042
R2,0.9298,0.866,0.8759


### Estimate Hyperparameters Using GridSearchCV

Check the hyperparameters in a more narrow range:

In [78]:
param_grid = {
    "max_depth": [8, 9],
    "min_samples_split": [65, 66, 70],
    "min_samples_leaf": [29, 30, 31]
}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(x_train, y_train.ravel())
print(grid_search.best_params_)

{'max_depth': 9, 'min_samples_leaf': 31, 'min_samples_split': 70}


In the scaled values:

In [79]:
y_pred_train = grid_search.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train) 
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.1775520115028197
MSE: 0.06755227841265893
RMSE: 0.2599082115144863
R2-score: 0.9327635264256111


In [80]:
y_pred_val = grid_search.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.24522261431034295
MSE: 0.13658872469473027
RMSE: 0.3695791183153213
R2-score: 0.8665473601917423


In [81]:
y_pred_test = grid_search.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.23153441413502496
MSE: 0.11459042127275963
RMSE: 0.33851206961164565
R2-score: 0.8774808134214279


In the original values:

In [82]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.0964205560245623
MSE: 2.5759839119429904
RMSE: 1.6049871999311989
R2-score: 0.9327635264256111


In [83]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.5143005863815466
MSE: 5.208563880215483
RMSE: 2.2822278326704115
R2-score: 0.8665473601917423


In [84]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.4297731066860588
MSE: 4.36969838172154
RMSE: 2.0903823529970635
R2-score: 0.8774808134214279


In [85]:
df_metrics_gs = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_gs.round(4)

Unnamed: 0,train,val,test
MAE,1.0964,1.5143,1.4298
MSE,2.576,5.2086,4.3697
RMSE,1.605,2.2822,2.0904
R2,0.9328,0.8665,0.8775


### Evaluate the Model with the New Hyperparameters

Using approximate estimation of hyperparameters, manual settings help to decrease overfitting even more:

In [87]:
params = {"max_depth": 5, "min_samples_leaf": 31, "min_samples_split": 70}
model =  GradientBoostingRegressor(**params)
model.fit(x_train, y_train.ravel())

In the scaled values:

In [88]:
y_pred_train = model.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.2145395261313963
MSE: 0.09707793119945474
RMSE: 0.31157331592974186
R2-score: 0.9033759051637653


In [89]:
y_pred_val = model.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.24793762880775047
MSE: 0.14207537223076067
RMSE: 0.37692886892722965
R2-score: 0.8611866863951521


In [90]:
y_pred_test = model.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.23602028362732275
MSE: 0.11858527102301353
RMSE: 0.3443621219341836
R2-score: 0.8732095511599891


In the original values:

In [91]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.324826142713095
MSE: 3.701891258898559
RMSE: 1.924029952703065
R2-score: 0.9033759051637653


In [92]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.5310663649253462
MSE: 5.417787256768055
RMSE: 2.3276140695502026
R2-score: 0.861186686395152


In [93]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.4574742827041105
MSE: 4.52203474888921
RMSE: 2.1265076413898
R2-score: 0.8732095511599891


In [94]:
df_metrics_gb = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_gb.round(4)

Unnamed: 0,train,val,test
MAE,1.3248,1.5311,1.4575
MSE,3.7019,5.4178,4.522
RMSE,1.924,2.3276,2.1265
R2,0.9034,0.8612,0.8732


## XGBoost

### Build and Train the Model

Built and train the model using XGBoost regression:

In [95]:
model = XGBRegressor()
model.fit(x_train, y_train)

### Evaluate the Model

In the scaled values:

In [96]:
y_pred_train = model.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.1454748973354826
MSE: 0.042982169874809135
RMSE: 0.20732141682616664
R2-score: 0.957218770456513


In [97]:
y_pred_val = model.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.25669734306402314
MSE: 0.14599423419327437
RMSE: 0.38209191851343094
R2-score: 0.8573578017261542


In [98]:
y_pred_test = model.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.23648303200784174
MSE: 0.12141436886662976
RMSE: 0.3484456469331046
R2-score: 0.8701847017641919


In the original values:

In [99]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.8983377189470861
MSE: 1.6390473061021904
RMSE: 1.280252828976445
R2-score: 0.9572187697396022


In [100]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.5851594120092416
MSE: 5.5672259555501205
RMSE: 2.3594969708711475
R2-score: 0.8573578020230643


In [101]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.4603318396668528
MSE: 4.629917233151685
RMSE: 2.151724246540826
R2-score: 0.8701847029752257


In [102]:
df_metrics_xgb = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_xgb.round(4)

Unnamed: 0,train,val,test
MAE,0.8983,1.5852,1.4603
MSE,1.639,5.5672,4.6299
RMSE,1.2803,2.3595,2.1517
R2,0.9572,0.8574,0.8702


### Estimate Hyperparameters Using RandomizedSearchCV

We can see significant overfitting. Use RandomizedSearchCV to estimate approximatelly good hyperparameters that reduce overfitting.

XGBoost has more settings and works faster than GradientBoostedRegressor, so we can use more hyperparameters.

In [103]:
param_distributions = {
    "learning_rate": uniform(0.01, 0.6),
    "n_estimators": randint(100, 1000),
    "max_depth": randint(5, 10),
    "subsample": uniform(0.5, 0.5),
    "colsample_bytree": uniform(0.5, 0.5),
    "gamma": uniform(0, 0.5),
    "reg_lambda": uniform(0.1, 1),
    "reg_alpha": uniform(0.1, 1),
    "max_leaves": randint(1, 100)
}
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=1000, cv=5, random_state=38)
random_search.fit(x_train, y_train.ravel())
print(random_search.best_params_)


{'colsample_bytree': 0.9491515787947424, 'gamma': 0.003722285742570386, 'learning_rate': 0.04156013751659483, 'max_depth': 5, 'max_leaves': 82, 'n_estimators': 650, 'reg_alpha': 0.2847501424794141, 'reg_lambda': 0.702978329113069, 'subsample': 0.769618058989056}


In the scaled values:

In [104]:
y_pred_train = random_search.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.1758401922119549
MSE: 0.06281546558517148
RMSE: 0.2506301370250024
R2-score: 0.9374781947978107


In [105]:
y_pred_val = random_search.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.24399735057471872
MSE: 0.13536486764483047
RMSE: 0.36791964835386337
R2-score: 0.8677431174141785


In [106]:
y_pred_test = random_search.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.22528416783697136
MSE: 0.11147334794377094
RMSE: 0.33387624645034414
R2-score: 0.8808135639648998


In the original values:

In [107]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.0858497235044045
MSE: 2.3953541530131175
RMSE: 1.5476931714694349
R2-score: 0.937478194074244


In [108]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.5067343457136992
MSE: 5.1618943825261665
RMSE: 2.271980277759067
R2-score: 0.8677431154533456


In [109]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.3911765280891482
MSE: 4.250834398865515
RMSE: 2.0617551743273292
R2-score: 0.8808135648428824


In [110]:
df_metrics_rs = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_rs.round(4)

Unnamed: 0,train,val,test
MAE,1.0858,1.5067,1.3912
MSE,2.3954,5.1619,4.2508
RMSE,1.5477,2.272,2.0618
R2,0.9375,0.8677,0.8808


### Evaluate the Model with the New Hyperparameters

In [None]:
Using approximate estimation of hyperparameters, manual settings help to decrease overfitting even more:

In [111]:
params = {
    "colsample_bytree": 0.9492, 
    "gamma": 0.0038, 
    "learning_rate": 0.04, 
    "max_depth": 6, 
    "max_leaves": 80, 
    "n_estimators": 650, 
    "reg_alpha": 0.3 * 30, 
    "reg_lambda": 0.7 * 30, 
    "subsample": 0.77
}
model = XGBRegressor(**params)
model.fit(x_train, y_train.ravel())

In the scaled values:

In [112]:
y_pred_train = model.predict(x_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train, y_pred_train)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 0.20620473479861984
MSE: 0.0904693041425173
RMSE: 0.30078115656157267
R2-score: 0.9099536370910648


In [113]:
y_pred_val = model.predict(x_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val, y_pred_val)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 0.245897181693794
MSE: 0.1374414374117688
RMSE: 0.37073095016705687
R2-score: 0.8657142258071787


In [114]:
y_pred_test = model.predict(x_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test, y_pred_test)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 0.2282259772921526
MSE: 0.11217559437566456
RMSE: 0.3349262521446543
R2-score: 0.8800627275454358


In the original values:

In [115]:
y_train_original = scaler.inverse_transform(y_train.reshape(-1, 1))
y_pred_train_original = scaler.inverse_transform(y_pred_train.reshape(-1, 1))
mae_train = mean_absolute_error(y_train_original, y_pred_train_original)
mse_train = mean_squared_error(y_train_original, y_pred_train_original)
rmse_train = mse_train ** (0.5)
r2_train = r2_score(y_train_original, y_pred_train_original)
print(f"MAE: {mae_train}\nMSE: {mse_train}\nRMSE: {rmse_train}\nR2-score: {r2_train}")

MAE: 1.2733570799905032
MSE: 3.4498832427463246
RMSE: 1.8573861318385911
R2-score: 0.9099536365851442


In [116]:
y_val_original = scaler.inverse_transform(y_val.reshape(-1, 1))
y_pred_val_original = scaler.inverse_transform(y_pred_val.reshape(-1, 1))
mae_val = mean_absolute_error(y_val_original, y_pred_val_original)
mse_val = mean_squared_error(y_val_original, y_pred_val_original)
rmse_val = mse_val ** (0.5)
r2_val = r2_score(y_val_original, y_pred_val_original)
print(f"MAE: {mae_val}\nMSE: {mse_val}\nRMSE: {rmse_val}\nR2-score: {r2_val}")

MAE: 1.5184661964404387
MSE: 5.241080569911943
RMSE: 2.289340640864077
R2-score: 0.8657142249595333


In [117]:
y_test_original = scaler.inverse_transform(y_test.reshape(-1, 1))
y_pred_test_original = scaler.inverse_transform(y_pred_test.reshape(-1, 1))
mae_test = mean_absolute_error(y_test_original, y_pred_test_original)
mse_test = mean_squared_error(y_test_original, y_pred_test_original)
rmse_test = mse_test ** (0.5)
r2_test = r2_score(y_test_original, y_pred_test_original)
print(f"MAE: {mae_test}\nMSE: {mse_test}\nRMSE: {rmse_test}\nR2-score: {r2_test}")

MAE: 1.4093428238967822
MSE: 4.277613357608494
RMSE: 2.0682391925520833
R2-score: 0.8800627266943422


In [118]:
df_metrics_xgb = pd.DataFrame(
    index=["MAE", "MSE", "RMSE", "R2"],
    columns=["train", "val", "test"],
    data={
        "train": [mae_train, mse_train, rmse_train, r2_train],
        "val": [mae_val, mse_val, rmse_val, r2_val],
        "test": [mae_test, mse_test, rmse_test, r2_test]
    }
)
df_metrics_xgb.round(4)

Unnamed: 0,train,val,test
MAE,1.2734,1.5185,1.4093
MSE,3.4499,5.2411,4.2776
RMSE,1.8574,2.2893,2.0682
R2,0.91,0.8657,0.8801


## The Final Table of Metrics

Concatinate all dataframes with metrics into one table:

In [119]:
df_metrics = pd.concat([df_metrics_dt, df_metrics_rf, df_metrics_gb, df_metrics_xgb], axis=1)
# Set a multiindex columns
df_metrics.columns = pd.MultiIndex.from_product([
    ["Decision Tree", "Random Forest", "Gradient Boosted Tree (sklearn)", "XGBoost"],
    ["train", "val", "test"]
])
df_metrics

Unnamed: 0_level_0,Decision Tree,Decision Tree,Decision Tree,Random Forest,Random Forest,Random Forest,Gradient Boosted Tree (sklearn),Gradient Boosted Tree (sklearn),Gradient Boosted Tree (sklearn),XGBoost,XGBoost,XGBoost
Unnamed: 0_level_1,train,val,test,train,val,test,train,val,test,train,val,test
MAE,1.688561,1.775566,1.731214,1.526134,1.664429,1.588358,1.324826,1.531066,1.457474,1.273357,1.518466,1.409343
MSE,5.73984,6.69835,5.878046,4.778019,6.023938,5.17014,3.701891,5.417787,4.522035,3.449883,5.241081,4.277613
RMSE,2.395796,2.588117,2.424468,2.185868,2.454371,2.273794,1.92403,2.327614,2.126508,1.857386,2.289341,2.068239
R2,0.850183,0.828376,0.835189,0.875288,0.845656,0.855038,0.903376,0.861187,0.87321,0.909954,0.865714,0.880063


Prepare the data for plotly:

In [120]:
r2_data = df_metrics.loc["R2"]
r2_data

Decision Tree                    train    0.850183
                                 val      0.828376
                                 test     0.835189
Random Forest                    train    0.875288
                                 val      0.845656
                                 test     0.855038
Gradient Boosted Tree (sklearn)  train    0.903376
                                 val      0.861187
                                 test     0.873210
XGBoost                          train    0.909954
                                 val      0.865714
                                 test     0.880063
Name: R2, dtype: float64

In [127]:
# Extract the 3 color code in the format "#4C72B0"
deep_colors = sns.color_palette("deep", 3).as_hex() 
# Set the plot.
fig = px.bar(r2_data, 
             x=r2_data.index.get_level_values(0), 
             y="R2",
             color=r2_data.index.get_level_values(1), 
             barmode="group", 
             title="R2-score for Different Tree Models",
             labels={"x": "Model", "R2": "R2-score"},
             color_discrete_sequence=deep_colors,
             text="R2"
)
# Update the y-axis range if desired
fig.update_layout(
    yaxis=dict(range=[0.75, 0.95]),
    title_x=0.5
)
fig.update_traces(texttemplate="%{text:.4f}", textposition="outside")
# Show the plot.
fig.show()


<div style="text-align: center;">
    Fig. 7.1. R2-score for Different Tree Models.
</div>

# Conclusions

- Decison Tree has an R2-score of 0.8352.
- Random forest has an R2-score of 0.8550.
- Gradient Boosted Tree (sklearn) has an R2-score of 0.8732.
- XGBoost has an R2-score of 0.8801.