In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [2]:
X = np.load("../data/X_train_scaled.npy", allow_pickle=True)
y = np.load("../data/y_train_scaled.npy", allow_pickle=True)

In [3]:
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (3104579, 26)
y shape: (3104579, 1)


In [4]:
rf = RandomForestRegressor()

In [5]:
param_grid = {
    "bootstrap": [True, False],
    "max_depth": [2,6,12],
    "n_estimators": [5, 50],
}

In [6]:
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
    verbose=10
)

In [7]:
y = y.ravel()

In [8]:
np.random.seed(2907)
grid_search.fit(X, y)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10; 1/12] START bootstrap=True, max_depth=2, n_estimators=5...............
[CV 1/10; 1/12] END bootstrap=True, max_depth=2, n_estimators=5;, score=-0.986 total time=  25.7s
[CV 2/10; 1/12] START bootstrap=True, max_depth=2, n_estimators=5...............
[CV 2/10; 1/12] END bootstrap=True, max_depth=2, n_estimators=5;, score=-0.777 total time=  26.1s
[CV 3/10; 1/12] START bootstrap=True, max_depth=2, n_estimators=5...............
[CV 3/10; 1/12] END bootstrap=True, max_depth=2, n_estimators=5;, score=-0.663 total time=  26.1s
[CV 4/10; 1/12] START bootstrap=True, max_depth=2, n_estimators=5...............
[CV 4/10; 1/12] END bootstrap=True, max_depth=2, n_estimators=5;, score=-0.607 total time=  26.0s
[CV 5/10; 1/12] START bootstrap=True, max_depth=2, n_estimators=5...............
[CV 5/10; 1/12] END bootstrap=True, max_depth=2, n_estimators=5;, score=-0.728 total time=  25.8s
[CV 6/10; 1/12] START bootstrap=True, max_

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 6, 12],
                         'n_estimators': [5, 50]},
             scoring='neg_mean_squared_error', verbose=10)

In [9]:
print("GridSearch results:")
grid_search.cv_results_

GridSearch results:


{'mean_fit_time': array([  25.93389876,  247.11600022,   65.12240374,  644.75568738,
         101.87934301, 1021.94686739,   34.75123045,  343.92089531,
          97.00566423,  961.96774209,  156.36534138, 1599.17191017]),
 'std_fit_time': array([ 0.26509685,  8.92303769,  0.59417618,  5.8228937 ,  0.49918044,
         5.61983259,  0.39749035,  1.24517403,  0.75179426,  5.27321824,
         0.83007246, 39.01214001]),
 'mean_score_time': array([0.03386769, 0.22660906, 0.06175754, 0.51004894, 0.12733037,
        1.16862841, 0.03041189, 0.21382775, 0.06037731, 0.51408701,
        0.12608624, 1.17540734]),
 'std_score_time': array([0.00097734, 0.01324965, 0.00122357, 0.01153862, 0.00394691,
        0.03947756, 0.00094273, 0.0057709 , 0.001409  , 0.00977595,
        0.00378195, 0.0468983 ]),
 'param_bootstrap': masked_array(data=[True, True, True, True, True, True, False, False,
                    False, False, False, False],
              mask=[False, False, False, False, False, False, Fa

In [10]:
with open("../grids/grid_search_random_forest_01.pkl", mode="wb") as f:
    pickle.dump(grid_search, f)

In [11]:
best_regressor = grid_search.best_estimator_

In [12]:
print("Best regressor", best_regressor)

Best regressor RandomForestRegressor(max_depth=12, n_estimators=50)


In [13]:
with open("../models/model_random_forest_01.pkl", mode="wb") as f:
    pickle.dump(best_regressor, f)

In [14]:
y_hat = best_regressor.predict(X)

In [15]:
r2 = r2_score(y, y_hat)
mse = mean_squared_error(y, y_hat)

print(f"R2 Score: {r2:.3f}")
print(f"MSE: {mse:.3f}")

R2 Score: 0.694
MSE: 0.306
