# Randomized Grid Search for Hyperparameter Optimization of the Random Forest Regressor
In this notebook, after we found out that the RF regressor is the best model for our dataset, we will perform a randomized grid search to find the best hyperparameters for the RF regressor. We will use the `RandomizedSearchCV` class from the `sklearn` library to perform the randomized grid search. We will use the `mean_squared_error` as the scoring metric for the grid search.

## Imports

In [1]:
# Loop printing
from tqdm import tqdm

# Data management
import pandas as pd

# Test and train split and mean squared error metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Randomized search for hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Random forest regressor
from sklearn.ensemble import RandomForestRegressor

## Data Loading

In [2]:
df = pd.read_csv('../../data/no_outliers.csv', sep=';', index_col=1)
df = df.rename(columns={'Unnamed: 0': 'Timestamp'})
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

df['Month'] = df['Timestamp'].dt.month
df['Day'] = df['Timestamp'].dt.day
df['Hour'] = df['Timestamp'].dt.hour + df['Timestamp'].dt.minute / 60

df = df.drop(columns=df.columns[:9])
df = df.drop(columns=df.columns[1:10])
df

Unnamed: 0_level_0,Power_Total,SWD,SWDtop,CU,CM,CD,TT2M,ST,RH2m,WS10m,WS100m,PREC,SNOW,Month,Day,Hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
577035.0,0.0,0.0,0.0,0.00,0.0,0.0,8.38,8.38,93.34,3.46,9.10,0.0,0.0,1,1,2.25
577038.0,0.0,0.0,0.0,0.00,0.0,0.0,8.52,8.52,92.18,3.81,9.52,0.0,0.0,1,1,2.50
577041.0,0.0,0.0,0.0,0.00,0.0,0.0,8.66,8.66,90.91,4.15,9.66,0.0,0.0,1,1,2.75
577044.0,0.0,0.0,0.0,0.00,0.0,0.0,8.76,8.76,90.18,4.36,9.50,0.0,0.0,1,1,3.00
577047.0,0.0,0.0,0.0,0.00,0.0,0.0,8.74,8.74,90.39,4.38,9.16,0.0,0.0,1,1,3.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667201.0,0.0,0.0,0.0,0.76,0.0,0.0,13.14,13.14,72.48,7.39,13.79,0.0,0.0,12,31,21.50
667204.0,0.0,0.0,0.0,1.00,0.0,0.0,13.13,13.13,72.07,7.46,13.93,0.0,0.0,12,31,21.75
667207.0,0.0,0.0,0.0,1.00,0.0,0.0,13.12,13.12,71.67,7.52,14.04,0.0,0.0,12,31,22.00
667210.0,0.0,0.0,0.0,1.00,0.0,0.0,13.10,13.10,71.36,7.55,14.10,0.0,0.0,12,31,22.25


In [3]:
target = df.drop(columns=df.columns[1:])
target

Unnamed: 0_level_0,Power_Total
id,Unnamed: 1_level_1
577035.0,0.0
577038.0,0.0
577041.0,0.0
577044.0,0.0
577047.0,0.0
...,...
667201.0,0.0
667204.0,0.0
667207.0,0.0
667210.0,0.0


In [4]:
features = df.drop(columns=['Power_Total'])
features

Unnamed: 0_level_0,SWD,SWDtop,CU,CM,CD,TT2M,ST,RH2m,WS10m,WS100m,PREC,SNOW,Month,Day,Hour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
577035.0,0.0,0.0,0.00,0.0,0.0,8.38,8.38,93.34,3.46,9.10,0.0,0.0,1,1,2.25
577038.0,0.0,0.0,0.00,0.0,0.0,8.52,8.52,92.18,3.81,9.52,0.0,0.0,1,1,2.50
577041.0,0.0,0.0,0.00,0.0,0.0,8.66,8.66,90.91,4.15,9.66,0.0,0.0,1,1,2.75
577044.0,0.0,0.0,0.00,0.0,0.0,8.76,8.76,90.18,4.36,9.50,0.0,0.0,1,1,3.00
577047.0,0.0,0.0,0.00,0.0,0.0,8.74,8.74,90.39,4.38,9.16,0.0,0.0,1,1,3.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667201.0,0.0,0.0,0.76,0.0,0.0,13.14,13.14,72.48,7.39,13.79,0.0,0.0,12,31,21.50
667204.0,0.0,0.0,1.00,0.0,0.0,13.13,13.13,72.07,7.46,13.93,0.0,0.0,12,31,21.75
667207.0,0.0,0.0,1.00,0.0,0.0,13.12,13.12,71.67,7.52,14.04,0.0,0.0,12,31,22.00
667210.0,0.0,0.0,1.00,0.0,0.0,13.10,13.10,71.36,7.55,14.10,0.0,0.0,12,31,22.25


## Splitting into Train, Validation and Test Sets

In [5]:
features_train, features_temp, target_train, target_temp = train_test_split(features, target, test_size=0.25, random_state=42)
features_val, features_test, target_val, target_test = train_test_split(features_temp, target_temp, test_size=0.5, random_state=42)

print('Training features shape:', features_train.shape)
print('Validation features shape:', features_val.shape)
print('Testing features shape:', features_test.shape)
print('Training target shape:', target_train.shape)
print('Validation target shape:', target_val.shape)
print('Testing target shape:', target_test.shape)

Training features shape: (21621, 15)
Validation features shape: (3603, 15)
Testing features shape: (3604, 15)
Training target shape: (21621, 1)
Validation target shape: (3603, 1)
Testing target shape: (3604, 1)


## Randomized Grid Search

In [6]:
# Ravel the target arrays
target_train = target_train.values.ravel()
target_val = target_val.values.ravel()
target_test = target_test.values.ravel()

In [7]:
# Define the parameter grid with a larger hyperparameter space
param_dist = {
    'n_estimators': randint(10, 1000),          # Number of trees in the forest
    'max_features': ['sqrt', 'log2'],   # Number of features to consider at every split
    'max_depth': [None] + list(randint(5, 100).rvs(20)),  # Maximum number of levels in tree
    'min_samples_split': randint(2, 20),         # Minimum number of samples required to split a node
    'min_samples_leaf': randint(1, 20),          # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]                   # Method of selecting samples for training each tree
}

# Create a Random Forest Regressor
rf_regressor = RandomForestRegressor()

# Create randomized search
random_search = RandomizedSearchCV(estimator=rf_regressor, 
                                   param_distributions=param_dist, 
                                   n_iter=200,
                                   cv=5, 
                                   verbose=2, 
                                   random_state=42, 
                                   n_jobs=-1)

#random_search.fit(features_train, target_train)

#best_params = random_search.best_params_
#best_estimator = random_search.best_estimator_

#print('Best hyperparameters:', best_params)
#print('Best estimator:', best_estimator)

## Testing the Best Model

In [8]:
rf = RandomForestRegressor(n_estimators=242,
                            max_features='log2',
                            max_depth=74,
                            min_samples_split=6,
                            min_samples_leaf=1,
                            bootstrap=False)

rf.fit(features_train, target_train)

# Predict the target values
target_pred_train = rf.predict(features_train)
target_pred_val = rf.predict(features_val)

# Calculate the mean squared error
print('Mean squared error Training:', mean_squared_error(target_train, target_pred_train))
print('Mean squared error Validation:', mean_squared_error(target_val, target_pred_val))

Mean squared error Training: 59744989.53365708
Mean squared error Validation: 994384179.2456275


In [9]:
# export in pickle
import pickle

with open('rf_model.pkl', 'wb') as file:
    pickle.dump(rf, file)