<a href="https://colab.research.google.com/github/hjn14133/Machine-Learning/blob/main/Random_Forest_Hyperparameter_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd

# Load the data
features = pd.read_csv('temps.csv')

features = features.drop(['forecast_noaa', 'forecast_acc', 'forecast_under',
                          'month', 'day', 'week'] , 
              axis = 1)

print(features.shape)
features.head()

(348, 6)


Unnamed: 0,year,temp_2,temp_1,average,actual,friend
0,2016,45,45,45.6,45,29
1,2016,44,45,45.7,44,61
2,2016,45,44,45.8,41,56
3,2016,44,41,45.9,40,53
4,2016,41,40,46.0,44,41


In [3]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 42)

from pprint import pprint

# Look at parameters used by current forest
print('Parameters currently in use: \n')
pprint(rf.get_params())

Parameters currently in use: 

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [5]:
# Try a set of hyperparameters by creating Random Grid
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Number of trees in rabdom forest
n_estimators = [int(x) for x in np.linspace(start = 200, 
                                             stop = 2000, 
                                             num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [6]:
# Use random grid to search for best hyperparameters
# First creat the base model to tune

rf = RandomForestRegressor()

# Random search of parameters using 3 fold cross validation
# Search across 100 different combinations and use all available cores

rf_random = RandomizedSearchCV(estimator= rf, 
                               param_distributions= random_grid,
                               n_iter = 100, 
                               cv=3, 
                               verbose=2,
                               random_state = 42,
                               n_jobs = -1)

In [12]:
labels = np.array(features['actual'])

features= features.drop('actual', axis = 1)

# Training and Testing Sets

from sklearn.model_selection import train_test_split

# Split data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features,
                                                                            labels,
                                                                            test_size = 0.25,
                                                                            random_state = 42)

In [13]:
# Fit the random search model (Running the models... 6 mins in total)
rf_random.fit(train_features, train_labels)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   44.1s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  5.9min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [14]:
rf_random.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'n_estimators': 1800}

In [15]:
# Compare base model with the best random search model
# Evaluate Random Search 
def evaluate(model, test_features, test_labels):
  predictions = model.predict(test_features)
  errors = abs(predictions - test_labels)
  mape = 100 * np.mean(errors/test_labels)
  accuracy = 100 - mape
  print('Model Performance')
  print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
  print('Accuracy = {:0.2f}%.'.format(accuracy))
  return accuracy

In [17]:
base_model = RandomForestRegressor(n_estimators= 10,
                                   random_state = 42)
base_model.fit(train_features, train_labels)
base_accuracy = evaluate(base_model, test_features, test_labels)

Model Performance
Average Error: 4.0345 degrees.
Accuracy = 93.51%.


In [18]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, test_features, test_labels)

Model Performance
Average Error: 3.4980 degrees.
Accuracy = 94.47%.


In [20]:
print('Improvement of {:0.2f}%'.format(100*(random_accuracy- base_accuracy)/base_accuracy))

Improvement of 1.02%
