In [17]:
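# Hyperparameter tuning with cross-validation, using random search and grid search.
# Random search is done with scikit-learn (RandomizedSearchCV); hyperopt-sklearn is used as well,
# although it is driven by algo=tpe.suggest (TPE) below rather than by purely random sampling.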

from hpsklearn import HyperoptEstimator, any_classifier, any_preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_iris
from hyperopt import tpe
import numpy as np

import warnings

# Suppress every warning (all categories, all modules) from this point on.
warnings.simplefilter("ignore")

In [10]:
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

X = iris.data    # feature matrix (4 columns per sample, see the rows printed below)
y = iris.target  # integer class label for each sample

# Preview a few rows as a sanity check. print() is called with a single
# argument, so the output is identical on Python 2 and Python 3.
print(X[4:10])
print(y[4:10])

[[ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]
 [ 5.   3.4  1.5  0.2]
 [ 4.4  2.9  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]]
[0 0 0 0 0 0]


In [11]:
# Split the data into train and test through random shuffling

np.random.seed(410)                      # fixed seed so the permutation is repeatable
test_size = int(0.2 * len(y))            # hold out 20% of the samples for testing
indices = np.random.permutation(len(X))  # shuffled row indices 0 .. len(X) - 1

# Single-argument print(): same output on Python 2 and Python 3.
print(indices[4:10])

[112  86   2 131  15  79]


In [12]:
# Partition the shuffled indices: the last `test_size` entries form the test
# set, everything before them is the training set.
# The boundary is computed explicitly because slicing with `-test_size` breaks
# when test_size == 0 (`indices[:-0]` is empty and `indices[-0:]` is everything).
n_train = max(len(indices) - test_size, 0)
train_idx, test_idx = indices[:n_train], indices[n_train:]

X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Explicit formatting prints "(rows, cols) (rows, cols)" on both Python 2 and
# Python 3 (a multi-argument print() would print a tuple on Python 2).
print("{} {}".format(X_train.shape, X_test.shape))

(120, 4) (30, 4)


In [24]:
# hyperopt-sklearn search: hyperopt picks both the classifier and the
# preprocessing step. Note that `tpe.suggest` is hyperopt's Tree-structured
# Parzen Estimator algorithm, not plain random search.
estim = HyperoptEstimator(
    classifier=any_classifier('my_clf'),
    preprocessing=any_preprocessing('my_pre'),
    algo=tpe.suggest,
    max_evals=100,      # upper bound on the number of configurations tried
    trial_timeout=120,  # per-trial time limit (seconds, per the hpsklearn docs)
)

In [14]:
# Run the search on the training split using 10-fold cross-validation.
# (n_folds=-1 would select leave-one-out cross-validation instead.)
estim.fit(
    X_train,
    y_train,
    n_folds=10,
)

In [16]:
print(estim.score(X_test, y_test))  # score of the selected model on the held-out test split
print(estim.best_model())  # output the best model with its optimized params

0.766666666667
{'learner': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.38657085838, n_estimators=428, random_state=4), 'preprocs': (MinMaxScaler(copy=True, feature_range=(-1.0, 1.0)),), 'ex_preprocs': ()}


In [30]:
# scikit-learn random search over random-forest hyperparameters
from scipy.stats import randint as sp_randint  # discrete uniform integer distribution
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=20)
n_iter_search = 20  # number of parameter settings sampled by the search

# Lists are sampled uniformly; sp_randint(low, high) draws integers from
# low to high - 1 (the upper bound is exclusive).
param_dist = dict(
    max_depth=[3, None],
    max_features=sp_randint(1, 4),  # draws 1..3; an int value must lie in (0, n_features]
    min_samples_split=sp_randint(2, 11),
    min_samples_leaf=sp_randint(1, 11),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=10,
)
random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=1,
          param_distributions={'bootstrap': [True, False], 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1153a2f50>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1153a2550>, 'criterion': ['gini', 'entropy'], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1153a2dd0>, 'max_depth': [3, None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
     

In [33]:
# Score of the random-search result on the held-out test split
# (left as the last expression so the notebook displays the value).
random_search.score(X_test, y_test)

0.93333333333333335

In [39]:
# scikit-learn exhaustive grid search, reusing the random forest `clf`.
# 2 * 3 * 3 * 3 * 2 * 2 = 216 parameter combinations, each scored with 10-fold CV.
param_grid = dict(
    max_depth=[3, None],
    max_features=[1, 2, 4],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

# Fit every combination; the fitted search object is the cell's displayed value.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'bootstrap': [True, False], 'min_samples_leaf': [1, 3, 10], 'min_samples_split': [2, 3, 10], 'criterion': ['gini', 'entropy'], 'max_features': [1, 2, 4], 'max_depth': [3, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [40]:
# Score of the grid-search result on the held-out test split
# (left as the last expression so the notebook displays the value).
grid_search.score(X_test, y_test)

0.90000000000000002