In [10]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, LeaveOneOut, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint as sp_randint
np.random.seed(1)

In [11]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

# Gaussian noise shaped like the feature matrix. It is only consumed by the
# commented-out line below, but the draw itself still pulls numbers from the
# global NumPy random generator.
noise = np.random.normal(iris.data.mean(), 1, iris.data.shape)
#iris.data = iris.data + noise

# Hold out 20% of the samples as a test set; random_state pins the shuffle.
X_train, X_test, labels_train, labels_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=1,
)
print("{0} - {1}".format(X_train.shape, X_test.shape))

(120, 4) - (30, 4)


# SVM and Grid Search

In [12]:
# Optimize the parameters by cross-validation.
# Two sub-grids are searched: an RBF kernel over (gamma, C) and a linear
# kernel over C only.
parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [0.01, 1, 10, 100]},
    {'kernel': ['linear'], 'C': [0.01, 1, 10, 100]}
]

# Alternative splitter. It is NOT used by the search below, which runs with
# cv=10; pass cv=loo to GridSearchCV to switch to leave-one-out.
loo = LeaveOneOut()

# Grid search object with SVM classifier.
# return_train_score=True is required because 'mean_train_score' is read from
# cv_results_ below; recent scikit-learn releases default this flag to False,
# in which case that key is absent and the lookup raises KeyError.
clf = GridSearchCV(SVC(), parameters, cv=10, return_train_score=True)
clf.fit(X_train, labels_train)

print("Best parameters set found on training set:")
print(clf.best_params_)
print()

# Per-candidate cross-validation statistics, aligned with cv_results_['params'].
means_valid = clf.cv_results_['mean_test_score']
stds_valid = clf.cv_results_['std_test_score']
means_train = clf.cv_results_['mean_train_score']

print("Grid scores:")
for mean_valid, std_valid, mean_train, params in zip(means_valid, stds_valid, means_train, clf.cv_results_['params']):
    print("Validation: %0.3f (+/-%0.03f), Training: %0.3f  for %r" % (mean_valid, std_valid, mean_train, params))
print()

# Score the fitted search object on the held-out test split.
labels_predicted = clf.predict(X_test)
print("Test Accuracy [%0.3f]" % ((labels_predicted == labels_test).mean()))

Best parameters set found on training set:
{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores:
Validation: 0.367 (+/-0.482), Training: 0.367  for {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}
Validation: 0.367 (+/-0.482), Training: 0.367  for {'C': 0.01, 'gamma': 0.0001, 'kernel': 'rbf'}
Validation: 0.692 (+/-0.462), Training: 0.692  for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
Validation: 0.367 (+/-0.482), Training: 0.367  for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
Validation: 0.933 (+/-0.249), Training: 0.934  for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
Validation: 0.692 (+/-0.462), Training: 0.692  for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
Validation: 0.975 (+/-0.156), Training: 0.976  for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Validation: 0.933 (+/-0.249), Training: 0.934  for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
Validation: 0.867 (+/-0.340), Training: 0.889  for {'C': 0.01, 'kernel': 'linear'}
Validation: 0.975 (+/-0.156), Training: 0.983  for {'C':

# Random Forest and Random Search

In [4]:
from sklearn.model_selection import KFold, PredefinedSplit, ShuffleSplit

In [13]:
# cv parameter of RandomizedSearchCV or GridSearchCV can be fed with a customized cross-validation object.
# `ss` is such an object, but it is NOT used by the search below, which runs
# with cv=10; pass cv=ss to RandomizedSearchCV to use it instead.
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=1)

# Optimize the parameters by cross-validation.
# sp_randint entries are integer distributions to sample from; the
# n_estimators list gives explicit candidate values.
parameters = {
        "max_depth": sp_randint(2, 4),
        "max_features": sp_randint(1, 4),
        "min_samples_split": sp_randint(2, 10),
        "min_samples_leaf": sp_randint(2, 10),
        'n_estimators': [1,3,5,10],
    }

# Random search object with a random forest classifier.
# return_train_score=True is required because 'mean_train_score' is read from
# cv_results_ below; recent scikit-learn releases default this flag to False,
# in which case that key is absent and the lookup raises KeyError.
clf = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=1),
        param_distributions=parameters,
        n_iter=10,
        cv=10,
        random_state=1,
        return_train_score=True,
    )
clf.fit(X_train, labels_train)

print("Best parameters set found on training set:")
print(clf.best_params_)
print()

# Per-candidate cross-validation statistics, aligned with cv_results_['params'].
means_valid = clf.cv_results_['mean_test_score']
stds_valid = clf.cv_results_['std_test_score']
means_train = clf.cv_results_['mean_train_score']

print("Grid scores:")
for mean_valid, std_valid, mean_train, params in zip(means_valid, stds_valid, means_train, clf.cv_results_['params']):
    print("Validation: %0.3f (+/-%0.03f), Training: %0.3f  for %r" % (mean_valid, std_valid, mean_train, params))
print()

# Score the fitted search object on the held-out test split.
labels_predicted = clf.predict(X_test)
print("Test Accuracy [%0.3f]" % ((labels_predicted == labels_test).mean()))

Best parameters set found on training set:
{'max_depth': 3, 'max_features': 1, 'min_samples_leaf': 2, 'min_samples_split': 9, 'n_estimators': 3}

Grid scores:
Validation: 0.942 (+/-0.068), Training: 0.958  for {'max_depth': 3, 'max_features': 1, 'min_samples_leaf': 2, 'min_samples_split': 9, 'n_estimators': 3}
Validation: 0.758 (+/-0.069), Training: 0.820  for {'max_depth': 3, 'max_features': 2, 'min_samples_leaf': 9, 'min_samples_split': 2, 'n_estimators': 1}
Validation: 0.783 (+/-0.144), Training: 0.870  for {'max_depth': 3, 'max_features': 1, 'min_samples_leaf': 9, 'min_samples_split': 7, 'n_estimators': 1}
Validation: 0.883 (+/-0.085), Training: 0.917  for {'max_depth': 2, 'max_features': 2, 'min_samples_leaf': 4, 'min_samples_split': 6, 'n_estimators': 5}
Validation: 0.925 (+/-0.082), Training: 0.944  for {'max_depth': 3, 'max_features': 3, 'min_samples_leaf': 6, 'min_samples_split': 5, 'n_estimators': 1}
Validation: 0.900 (+/-0.062), Training: 0.938  for {'max_depth': 2, 'max_fea

## Random Search vs. Grid Search

![title](randomVsgrid.png)


Image source and further reading: Bergstra, J., & Bengio, Y. (2012). Random search for hyper-parameter optimization. Journal of Machine Learning Research, 13(Feb), 281-305.

In [11]:
#Example Code: http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html
import numpy as np

from time import time
from scipy.stats import randint as sp_randint

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

# Load the digits dataset and pull out the feature matrix and the targets.
digits = load_digits()
X = digits.data
y = digits.target

# Base estimator handed to both the randomized search and the grid search.
clf = RandomForestClassifier(n_estimators=20)


# Utility function to report best scores
def report(results, n_top=3):
    """Print a short summary of the best-ranked candidates.

    For every rank from 1 to ``n_top`` (inclusive), each candidate whose
    ``results['rank_test_score']`` equals that rank is printed with its mean
    and standard deviation of the validation score and its parameters.
    Ranks that no candidate holds are silently skipped.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', all indexable by candidate position. The rank entry is
        compared element-wise with ``==``, so it is expected to be an array.
    n_top : int, default 3
        Highest rank to report.

    Returns
    -------
    None
        Output goes to stdout only.
    """
    rank_line = "Model with rank: {0}"
    score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"
    params_line = "Parameters: {0}"

    for rank in range(1, n_top + 1):
        # Positions of every candidate tied at this rank.
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print(rank_line.format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(score_line.format(mean_score, std_score))
            print(params_line.format(results['params'][idx]))
            print("")


# Search space for the randomized search: list entries are explicit candidate
# values, sp_randint entries are integer distributions to sample from.
param_dist = dict(
    max_depth=[3, None],
    max_features=sp_randint(1, 11),
    min_samples_split=sp_randint(2, 11),
    min_samples_leaf=sp_randint(1, 11),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

# Number of sampled candidates for the randomized search.
n_iter_search = 216
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
)

# Time the fit, then print the best-ranked candidates.
start = time()
random_search.fit(X, y)
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates"
    " parameter settings." % ((time() - start), n_iter_search)
)
report(random_search.cv_results_)

# Exhaustive grid: every combination of the values listed here is evaluated.
param_grid = dict(
    max_depth=[3, None],
    max_features=[1, 3, 10],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

# Build the grid search around the shared base estimator and time the fit.
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

# The candidate count is read back from the results, not hard-coded.
print(
    "GridSearchCV took %.2f seconds for %d candidate parameter settings."
    % (time() - start, len(grid_search.cv_results_['params']))
)
report(grid_search.cv_results_)

RandomizedSearchCV took 29.74 seconds for 216 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.934 (std: 0.005)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 5, 'min_samples_leaf': 2, 'min_samples_split': 8}

Model with rank: 1
Mean validation score: 0.934 (std: 0.011)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 2, 'min_samples_split': 3}

Model with rank: 3
Mean validation score: 0.932 (std: 0.008)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 6, 'min_samples_leaf': 1, 'min_samples_split': 5}

GridSearchCV took 27.85 seconds for 216 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.935 (std: 0.003)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 10}

Model with rank: 2
Mean validation score: 0.9