In [1]:
%matplotlib inline
###https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py


# Comparing randomized search and grid search for hyperparameter estimation


Compare randomized search and grid search for optimizing hyperparameters of a
random forest.
All parameters that influence the learning are searched simultaneously
(except for the number of estimators, which poses a time / quality tradeoff).

The randomized search and the grid search explore exactly the same space of
parameters. The result in parameter settings is quite similar, while the run
time for randomized search is drastically lower.

The performance is slightly worse for the randomized search, though this
is most likely a noise effect and would not carry over to a held-out test set.

Note that in practice, one would not search over this many different parameters
simultaneously using grid search, but pick only the ones deemed most important.



In [2]:
import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier


In [3]:
# get some data
digits = load_digits()
X, y = digits.data, digits.target


In [4]:
# Training and Testing Sets
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(X, y,test_size = 0.3, random_state = 42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (1257, 64)
Training Labels Shape: (1257,)
Testing Features Shape: (540, 64)
Testing Labels Shape: (540,)


In [5]:
# Evaluate the Default Model

base_model = RandomForestClassifier()
base_model.fit(train_features, train_labels)
pred_labels = base_model.predict(test_features)

from sklearn.metrics import accuracy_score
pa_mp = accuracy_score(test_labels, pred_labels, normalize=False)
print("Classification accuracy of RF (default) is", pa_mp/len(test_labels))


Classification accuracy of RF (default) is 0.9722222222222222


In [6]:
# specify parameters and distributions to sample from
clf = RandomForestClassifier()

param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=5)

start = time()
random_search.fit(train_features, train_labels)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

random_search.best_params_

RandomizedSearchCV took 33.30 seconds for 20 candidates parameter settings.


{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 6,
 'min_samples_split': 3}

In [7]:
best_random = random_search.best_estimator_
best_random.fit(train_features, train_labels)
pred_labels = best_random.predict(test_features)
pa_mp = accuracy_score(test_labels, pred_labels, normalize=False)
print("Classification accuracy of RF (Random Search) is", pa_mp/len(test_labels))

Classification accuracy of RF (Random Search) is 0.975925925925926


In [8]:
# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
start = time()
grid_search.fit(train_features, train_labels)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))

grid_search.best_params_

GridSearchCV took 119.40 seconds for 72 candidate parameter settings.


{'bootstrap': False,
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 10,
 'min_samples_split': 3}

In [9]:
best_grid = grid_search.best_estimator_
best_grid.fit(train_features, train_labels)
pred_labels = best_grid.predict(test_features)
pa_mp = accuracy_score(test_labels, pred_labels, normalize=False)
print("Classification accuracy of RF (Grid Search) is", pa_mp/len(test_labels))

Classification accuracy of RF (Grid Search) is 0.975925925925926
