### Import Libraries

In [1]:
import numpy as np

from xgboost.sklearn import XGBClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.cross_validation import StratifiedKFold

from scipy.stats import randint, uniform

### Set Random Seed

In [3]:
seed = 342
np.random.seed(seed)

### Prepare Data

In [4]:
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=8,
    n_classes=3,
    n_redundant=3,
    n_repeated=2,
    shuffle=True,
    random_state=seed)

## Select Params to Tune

### Tree Booster

**eta**=[0 .. 0.3 .. 1] - step shrinkage used in update to prevents overfitting. After each boosting step eta shrinks instance weights. Lower value makes learning process more conservative (slower learning)

**gamma**=[0 .. ∞] - minimum loss reduction required to make a further partition on a leaf node of the tree. Higher value makes the algorithm more conservative

**max_depth**=[1 .. 6 .. ∞] - maximum depth of each tree

**min_child_weight**=[0 .. 1 .. ∞] - minimum sum of instance weight needed in a tree node. Further partitioning from that node is abandoned when a sum is not obtained. Higher value makes the algorithm more conservative

**max_delta_step**=[0 .. ∞] - maximum delta step we allow each tree’s weight estimation to be. If the value is set to 0, it means there is no constraint. If it is set to a positive value it can help making the update step more conservative

**subsample**=[0,1] - subsample ratio of randomly selected training instances used to grow trees

**colsample_bytree**=[0,1] - subsample ratio of columns when construction each tree

**lambda**=[1] - L2 regularization term on weights

**alpha**=[0] - L1 regularization term on weights

In [8]:
params_fixed = {
    'objective': 'binary:logistic',
    'silent': 1
}

In [9]:
params_dist_grid = {
    'max_depth': [1, 2, 3, 4],
    'gamma': [0, 0.5, 1],
    'n_estimators': randint(1, 1001), # uniform discrete random distribution
    'learning_rate': uniform(), # guassian distribution
    'subsample': uniform(), # guassian distribution
    'colsample_bytree': uniform()
}

### Train and Tune

In [6]:
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=True)

In [10]:
rs_grid = RandomizedSearchCV(
    estimator=XGBClassifier(**params_fixed, seed=seed),
    param_distributions=params_dist_grid,
    n_iter=10,
    cv=cv,
    scoring='accuracy',
    random_state=seed
)

In [11]:
rs_grid.fit(X, y)

RandomizedSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[2 2 ... 0 2], n_folds=10, shuffle=True, random_state=True),
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=342, silent=1,
       subsample=1),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_depth': [1, 2, 3, 4], 'gamma': [0, 0.5, 1], 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A694D7DC18>, 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A694D7DA20>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001A694D7D6D8>, 'colsample_bytree': <sc

In [12]:
rs_grid.grid_scores_

[mean: 0.47300, std: 0.05105, params: {'colsample_bytree': 0.06503439684192913, 'gamma': 0, 'learning_rate': 0.82231421953113, 'max_depth': 3, 'n_estimators': 492, 'subsample': 0.11676744056370758},
 mean: 0.62900, std: 0.05771, params: {'colsample_bytree': 0.11848249237448605, 'gamma': 1, 'learning_rate': 0.13214054942810016, 'max_depth': 1, 'n_estimators': 689, 'subsample': 0.4325346125891868},
 mean: 0.67000, std: 0.05170, params: {'colsample_bytree': 0.37621772642449514, 'gamma': 0, 'learning_rate': 0.610870226429942, 'max_depth': 4, 'n_estimators': 392, 'subsample': 0.1523931947190449},
 mean: 0.61000, std: 0.05442, params: {'colsample_bytree': 0.20992824607318106, 'gamma': 1, 'learning_rate': 0.4089849433509952, 'max_depth': 1, 'n_estimators': 574, 'subsample': 0.7099300190073073},
 mean: 0.70000, std: 0.03058, params: {'colsample_bytree': 0.22187963515640408, 'gamma': 1, 'learning_rate': 0.829247179484142, 'max_depth': 2, 'n_estimators': 116, 'subsample': 0.936106086335447},
 me

In [13]:
rs_grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8058014316376573, gamma=0,
       learning_rate=0.4636309538821305, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=281, n_jobs=1,
       nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=342, silent=1,
       subsample=0.7652628330253548)

In [14]:
rs_grid.best_params_

{'colsample_bytree': 0.8058014316376573,
 'gamma': 0,
 'learning_rate': 0.4636309538821305,
 'max_depth': 4,
 'n_estimators': 281,
 'subsample': 0.7652628330253548}

In [15]:
rs_grid.best_score_

0.769

### Going Forward
- think about the number of models that will be created before starting a param search
- iteratively repeat searches refining the search space each time