In [1]:
import numpy as np
import pandas as pd
import pprint
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# setup pprint with 4 space indent
pp = pprint.PrettyPrinter(indent=4)

In [2]:

# Step 1: Load your dataset (replace with your dataset)
# Example: data = pd.read_csv('your_dataset.csv')
# X = data.drop('target', axis=1)
# y = data['target']

# For demonstration, let's create a synthetic dataset
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=42)

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 20), (200, 20), (800,), (200,))

In [3]:

# Step 3: Define the hyperparameter space for RandomizedSearchCV
random_grid = {
    'n_estimators': np.arange(100, 2001, 200),
    'max_depth': list(np.arange(10, 101, 10)) + [None],
    'min_samples_split': np.arange(2, 11, 2),
    'min_samples_leaf': np.arange(1, 5),
    'max_features': ['log2', 'sqrt', None],
    'min_impurity_decrease': [0.0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
}
pp.pprint(random_grid)


{   'max_depth': [   np.int64(10),
                     np.int64(20),
                     np.int64(30),
                     np.int64(40),
                     np.int64(50),
                     np.int64(60),
                     np.int64(70),
                     np.int64(80),
                     np.int64(90),
                     np.int64(100),
                     None],
    'max_features': ['log2', 'sqrt', None],
    'min_impurity_decrease': [0.0, 1e-05, 0.0001, 0.001, 0.01, 0.1],
    'min_samples_leaf': array([1, 2, 3, 4]),
    'min_samples_split': array([ 2,  4,  6,  8, 10]),
    'n_estimators': array([ 100,  300,  500,  700,  900, 1100, 1300, 1500, 1700, 1900])}


In [4]:

# Step 4: Perform Randomized Search
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, 
                               n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

# Step 5: Identify the best parameters from Randomized Search
best_random_params = rf_random.best_params_
print(f"Best parameters from Randomized Search: {best_random_params}")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=10, max_features=log2, min_impurity_decrease=0.01, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.4s
[CV] END max_depth=10, max_features=log2, min_impurity_decrease=0.01, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=log2, min_impurity_decrease=0.01, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.5s
[CV] END max_depth=50, max_features=None, min_impurity_decrease=0.0, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END max_depth=50, max_features=None, min_impurity_decrease=0.0, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END max_depth=50, max_features=None, min_impurity_decrease=0.0, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.0s
[CV] END max_depth=70, max_features=log2, min_impuri

In [5]:
print(f"best random parameters: {best_random_params}")

best random parameters: {'n_estimators': np.int64(1100), 'min_samples_split': np.int64(4), 'min_samples_leaf': np.int64(2), 'min_impurity_decrease': 1e-05, 'max_features': None, 'max_depth': np.int64(20)}


In [6]:
# Step 9: Evaluate the final model on the test set
random_best_model = rf_random.best_estimator_
y_pred = random_best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Random Best Model Mean Squared Error on test set: {mse}")

Random Best Model Mean Squared Error on test set: 6695.326311442586


In [7]:

# Step 6: Define the hyperparameter space for GridSearchCV based on Randomized Search results
param_grid = {
    'n_estimators': [best_random_params['n_estimators'] - 100, best_random_params['n_estimators'], best_random_params['n_estimators'] + 100],
    'max_depth': [best_random_params['max_depth'] - 10, best_random_params['max_depth'], best_random_params['max_depth'] + 10],
    'min_samples_split': [best_random_params['min_samples_split'] - 1, best_random_params['min_samples_split'], best_random_params['min_samples_split'] + 1],
    'min_samples_leaf': [best_random_params['min_samples_leaf'] - 1, best_random_params['min_samples_leaf'], best_random_params['min_samples_leaf'] + 1],
    'max_features': [best_random_params['max_features']],
    'min_impurity_decrease': [best_random_params['min_impurity_decrease']]
}
pp.pprint(param_grid)


{   'max_depth': [np.int64(10), np.int64(20), np.int64(30)],
    'max_features': [None],
    'min_impurity_decrease': [1e-05],
    'min_samples_leaf': [np.int64(1), np.int64(2), np.int64(3)],
    'min_samples_split': [np.int64(3), np.int64(4), np.int64(5)],
    'n_estimators': [np.int64(1000), np.int64(1100), np.int64(1200)]}


In [8]:

# Step 7: Perform Grid Search
rf_grid = GridSearchCV(estimator=rf, param_grid=param_grid, 
                       cv=3, verbose=2, n_jobs=-1)
rf_grid.fit(X_train, y_train)

# Step 8: Identify the best parameters from Grid Search
best_grid_params = rf_grid.best_params_

pp.pprint(f"Best parameters from Grid Search: {best_grid_params}")

# Step 9: Evaluate the final model on the test set
best_model = rf_grid.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on test set Narrow Grid: {mse}")

Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END max_depth=10, max_features=None, min_impurity_decrease=1e-05, min_samples_leaf=1, min_samples_split=4, n_estimators=1000; total time=   9.9s
[CV] END max_depth=10, max_features=None, min_impurity_decrease=1e-05, min_samples_leaf=1, min_samples_split=4, n_estimators=1000; total time=   9.9s
[CV] END max_depth=10, max_features=None, min_impurity_decrease=1e-05, min_samples_leaf=1, min_samples_split=4, n_estimators=1000; total time=  10.1s
[CV] END max_depth=10, max_features=None, min_impurity_decrease=1e-05, min_samples_leaf=1, min_samples_split=3, n_estimators=1000; total time=  10.2s
[CV] END max_depth=10, max_features=None, min_impurity_decrease=1e-05, min_samples_leaf=1, min_samples_split=3, n_estimators=1000; total time=  10.2s
[CV] END max_depth=10, max_features=None, min_impurity_decrease=1e-05, min_samples_leaf=1, min_samples_split=3, n_estimators=1000; total time=  10.4s
[CV] END max_depth=10, max_features=No

  _data = np.array(data, dtype=dtype, copy=copy,


("Best parameters from Grid Search: {'max_depth': np.int64(20), "
 "'max_features': None, 'min_impurity_decrease': 1e-05, 'min_samples_leaf': "
 "np.int64(2), 'min_samples_split': np.int64(3), 'n_estimators': "
 'np.int64(1100)}')
Mean Squared Error on test set Narrow Grid: 6695.326311442586


In [9]:

# Step 8: Perform Full Grid Search using original hyperparameter space from Random Search
rf_grid_full = GridSearchCV(estimator=rf, param_grid=random_grid, 
                       cv=3, verbose=2, n_jobs=-1)
rf_grid_full.fit(X_train, y_train)

# Step 8: Identify the best parameters from Grid Search
best_grid_full_params = rf_grid_full.best_params_
pp.pprint(f"Best parameters from Grid Search: {best_grid_full_params}")

# Step 9: Evaluate the final model on the test set
best_model = rf_grid_full.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on test set Full Grid: {mse}")

Fitting 3 folds for each of 39600 candidates, totalling 118800 fits
[CV] END max_depth=10, max_features=log2, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s[CV] END max_depth=10, max_features=log2, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s

[CV] END max_depth=10, max_features=log2, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=10, max_features=log2, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END max_depth=10, max_features=log2, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END max_depth=10, max_features=log2, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END max_depth=10, max_features=log2, min_impu

KeyboardInterrupt: 