In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [None]:
# Load the pre-scaled training features (X) and targets (y) from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code — only load these files from a trusted source. If the
# files hold plain numeric arrays the flag is presumably unnecessary; TODO confirm.
X = np.load(file="../data/X_train_scaled.npy", allow_pickle=True)
y = np.load(file="../data/y_train_scaled.npy", allow_pickle=True)

In [None]:
# Sanity-check the dimensions of the arrays that were just loaded.
print("X shape:", np.shape(X))
print("y shape:", np.shape(y))

X shape: (3104579, 26)
y shape: (3104579, 1)


In [None]:
# Gradient boosting regressor with all-default hyperparameters; it is handed to
# GridSearchCV below as the estimator to tune.
gb = GradientBoostingRegressor()

In [None]:
# Candidate hyperparameter values for the search: 2 x 2 x 2 = 8 combinations.
param_grid = dict(
    learning_rate=[0.05, 0.1],
    n_estimators=[5, 50],
    subsample=[0.8, 1],
)

In [None]:
# Exhaustive search over param_grid using 10-fold cross-validation. Candidates
# are ranked by negative mean squared error (closer to zero is better), and
# verbose=10 logs the start and end of every individual fit.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    cv=10,
    scoring="neg_mean_squared_error",
    verbose=10,
)

In [None]:
# Flatten the (n, 1) target column into a 1-D vector of length n before fitting.
y = np.ravel(y)

In [None]:
# Seed NumPy's global RNG right before fitting. gb was created without a
# random_state, so its row subsampling presumably draws from this global
# state — TODO confirm against the scikit-learn docs.
np.random.seed(seed=2907)
# Run the full search; kept as the last expression so the fitted object is displayed.
grid_search.fit(X, y=y)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
[CV 1/10; 1/8] START learning_rate=0.05, n_estimators=5, subsample=0.8..........
[CV 1/10; 1/8] END learning_rate=0.05, n_estimators=5, subsample=0.8;, score=-1.006 total time=  43.3s
[CV 2/10; 1/8] START learning_rate=0.05, n_estimators=5, subsample=0.8..........
[CV 2/10; 1/8] END learning_rate=0.05, n_estimators=5, subsample=0.8;, score=-0.813 total time=  43.6s
[CV 3/10; 1/8] START learning_rate=0.05, n_estimators=5, subsample=0.8..........
[CV 3/10; 1/8] END learning_rate=0.05, n_estimators=5, subsample=0.8;, score=-0.812 total time=  43.8s
[CV 4/10; 1/8] START learning_rate=0.05, n_estimators=5, subsample=0.8..........
[CV 4/10; 1/8] END learning_rate=0.05, n_estimators=5, subsample=0.8;, score=-0.699 total time=  44.0s
[CV 5/10; 1/8] START learning_rate=0.05, n_estimators=5, subsample=0.8..........
[CV 5/10; 1/8] END learning_rate=0.05, n_estimators=5, subsample=0.8;, score=-0.768 total time=  43.7s
[CV 6/10; 1/8] STAR

GridSearchCV(cv=10, estimator=GradientBoostingRegressor(),
             param_grid={'learning_rate': [0.05, 0.1], 'n_estimators': [5, 50],
                         'subsample': [0.8, 1]},
             scoring='neg_mean_squared_error', verbose=10)

In [None]:
# Display the raw per-candidate timing and score tables collected by the search.
print("GridSearch results:")
cv_results = grid_search.cv_results_
cv_results

GridSearch results:


{'mean_fit_time': array([ 44.58770137,  56.42193801, 503.12883668, 625.14808743,
         52.68514543,  64.51117384, 534.25094497, 659.52282341]),
 'std_fit_time': array([ 1.23351612,  0.41231925, 18.56694758,  5.68773909,  0.36139317,
         0.44193294,  4.04770172,  6.32628813]),
 'mean_score_time': array([0.0513463 , 0.04928932, 0.25762007, 0.2631268 , 0.05661757,
        0.05961094, 0.26111152, 0.2697145 ]),
 'std_score_time': array([0.00655927, 0.00741082, 0.0102492 , 0.01363183, 0.00664163,
        0.01181175, 0.01308236, 0.00642211]),
 'param_learning_rate': masked_array(data=[0.05, 0.05, 0.05, 0.05, 0.1, 0.1, 0.1, 0.1],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[5, 5, 50, 50, 5, 5, 50, 50],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_subsample': masked_array(

In [None]:
# Persist the whole fitted search object so it can be inspected later without
# refitting. Only unpickle this file in a trusted environment.
with open("../grids/grid_search_gradient_boosting_01.pkl", "wb") as grid_file:
    pickle.Pickler(grid_file).dump(grid_search)

In [None]:
# Estimator for the best-scoring parameter combination found by the search.
# GridSearchCV was built without a `refit` argument, so (per its documented
# default) this model has been refit on the full X, y passed to fit().
best_regressor = grid_search.best_estimator_

In [None]:
# Print the selected estimator; its repr shows the chosen hyperparameters.
print("Best regressor", best_regressor)

Best regressor GradientBoostingRegressor(n_estimators=50, subsample=1)


In [None]:
# Persist only the selected model for downstream use. Only unpickle this file
# in a trusted environment.
with open("../models/model_gradient_boosting_01.pkl", "wb") as model_file:
    pickle.Pickler(model_file).dump(best_regressor)

In [None]:
# Predict on the same X that was passed to grid_search.fit, so these are
# in-sample (training) predictions, not predictions on held-out data.
y_hat = best_regressor.predict(X)

In [None]:
# NOTE: y_hat was predicted on the same X, y that the grid search was fitted
# on, so r2 and mse are in-sample (training) metrics and are optimistic as an
# estimate of performance on unseen data.
r2 = r2_score(y, y_hat)
mse = mean_squared_error(y, y_hat)

# The search was scored with "neg_mean_squared_error", so negating best_score_
# gives the mean held-out MSE of the selected parameters across the CV folds —
# a fairer generalization estimate than the training MSE above.
cv_mse = -grid_search.best_score_

print(f"R2 Score: {r2:.3f}")
print(f"MSE: {mse:.3f}")
print(f"Cross-validated MSE (best params): {cv_mse:.3f}")

R2 Score: 0.560
MSE: 0.440
