In [1]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import pickle

In [2]:
X = np.load("../data/X_train_scaled.npy", allow_pickle=True)
y = np.load("../data/y_train_scaled.npy", allow_pickle=True)

In [3]:
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (3104579, 26)
y shape: (3104579, 1)


In [4]:
dtr = DecisionTreeRegressor()

In [5]:
param_grid = {
    "splitter":["best","random"],
    "max_depth": [2,6,12]
}

In [6]:
grid_search = GridSearchCV(
    dtr,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=10,
    verbose=10
)

In [7]:
np.random.seed(2907)
grid_search.fit(X, y)

Fitting 10 folds for each of 6 candidates, totalling 60 fits
[CV 1/10; 1/6] START max_depth=2, splitter=best.................................
[CV 1/10; 1/6] END .max_depth=2, splitter=best;, score=-0.986 total time=   7.4s
[CV 2/10; 1/6] START max_depth=2, splitter=best.................................
[CV 2/10; 1/6] END .max_depth=2, splitter=best;, score=-0.777 total time=   7.5s
[CV 3/10; 1/6] START max_depth=2, splitter=best.................................
[CV 3/10; 1/6] END .max_depth=2, splitter=best;, score=-0.663 total time=   7.2s
[CV 4/10; 1/6] START max_depth=2, splitter=best.................................
[CV 4/10; 1/6] END .max_depth=2, splitter=best;, score=-0.606 total time=   6.9s
[CV 5/10; 1/6] START max_depth=2, splitter=best.................................
[CV 5/10; 1/6] END .max_depth=2, splitter=best;, score=-0.728 total time=   7.3s
[CV 6/10; 1/6] START max_depth=2, splitter=best.................................
[CV 6/10; 1/6] END .max_depth=2, splitter=best;,

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [2, 6, 12],
                         'splitter': ['best', 'random']},
             scoring='neg_mean_squared_error', verbose=10)

In [8]:
print("GridSearch results:")
grid_search.cv_results_

GridSearch results:


{'mean_fit_time': array([ 7.29052346,  1.74035494, 20.70936797,  5.11392949, 33.93177483,
         8.02619903]),
 'std_fit_time': array([0.15019602, 0.11779713, 0.33531936, 0.08396749, 0.30240844,
        0.21710341]),
 'mean_score_time': array([0.01354132, 0.01313772, 0.02033403, 0.01886063, 0.03406024,
        0.03253558]),
 'std_score_time': array([0.00095661, 0.00040644, 0.00028913, 0.00079631, 0.00065641,
        0.00135057]),
 'param_max_depth': masked_array(data=[2, 2, 6, 6, 12, 12],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_splitter': masked_array(data=['best', 'random', 'best', 'random', 'best', 'random'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2, 'splitter': 'best'},
  {'max_depth': 2, 'splitter': 'random'},
  {'max_depth': 6, 'splitter': 'best'},
  {'max_depth': 6, 'splitter': 'random'},
  {'max

In [9]:
with open("../grids/grid_search_decision_tree_regressor_01.pkl", mode="wb") as f:
    pickle.dump(grid_search, f)

In [10]:
best_regressor = grid_search.best_estimator_

In [31]:
print("Best regressor", best_regressor)

Best regressor DecisionTreeRegressor(max_depth=12)


In [32]:
with open("../models/model_decision_tree_regressor_01.pkl", mode="wb") as f:
    pickle.dump(best_regressor, f)

In [33]:
y_hat = best_regressor.predict(X)

In [34]:
r2 = r2_score(y, y_hat)
mse = mean_squared_error(y, y_hat)

print(f"R2 Score: {r2:.3f}")
print(f"MSE: {mse:.3f}")

R2 Score: 0.673
MSE: 0.327
