In [8]:
from hyperopt import hp
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import xgboost as xgb

from xgbutils import model_selection

In [2]:
data = load_boston()
X = DataFrame(data["data"], columns=data["feature_names"])
y = Series(data["target"])

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25)

dtrain = xgb.DMatrix(data=X_tr, label=y_tr, feature_names=X_tr.columns)
dtest = xgb.DMatrix(data=X_te, label=y_te, feature_names=X_te.columns)

In [3]:
param_space = {"max_depth": hp.quniform("max_depth", 2, 20, 1),
               "subsample": hp.quniform("subsample", 0.5, 1.0, 0.05),
               "min_child_weight": hp.quniform("min_child_weight", 0.0, 10.0, 0.1),
               "colsample_by_tree": hp.quniform("colsample_by_tree", 0.25, 1.0, 0.05)}
 
common_params = {"params": {"eta": 0.1}, "num_boost_round": 5000, 
                 "early_stopping_rounds": 50, "verbose_eval": False}

fit_params = dict(common_params.items() + {"evals": [(dtrain, "train")]}.items())
cv_params = dict(common_params.items() + {"nfold": 8, "metrics": "mae"}.items())

In [4]:
pscv = model_selection.ParamSearchCV(param_space=param_space, num_evals=50, cv_metric="mae",
                                     fit_params=fit_params, cv_params=cv_params)
pscv.fit(dtrain)
print pscv.fit_params
y_hat_gbm = pscv.predict(dtest)

{'evals': [(<xgboost.core.DMatrix object at 0x7ff1fa464b10>, 'train')], 'num_boost_round': 192, 'params': {'subsample': 0.55, 'colsample_by_tree': 0.35000000000000003, 'eta': 0.1, 'max_depth': 4, 'min_child_weight': 1.3}, 'early_stopping_rounds': 50, 'verbose_eval': False}


In [5]:
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
rf.fit(X_tr, y_tr)
y_hat_rf = rf.predict(X_te)

In [9]:
print "GBM loss: %0.2f" % mean_absolute_error(y_te, y_hat_gbm)
print "RF loss: %0.2f" % mean_absolute_error(y_te, y_hat_rf)

GBM loss: 2.35
RF loss: 2.39
