# Skopt vs Hyperopt

## Importing and preprocessing data

In [1]:
import pandas as pd
import numpy as np
import pickle
import lightgbm as lgb
import warnings

from time import time
from hyperopt import hp, tpe, fmin, Trials
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt import gbrt_minimize
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import log_loss
from utils import FeatureTools

warnings.filterwarnings("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read data/adult.data and derive a binary `target` column:
# 1 when the income bracket string contains '>50K', otherwise 0.
df = pd.read_csv("data/adult.data")
is_high_income = df['income_bracket'].map(lambda bracket: '>50K' in bracket)
df = df.assign(target=is_high_income.astype(int)).drop(columns='income_bracket')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,target
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


I have coded a preprocessor class before that does the work for us.

In [3]:
# Load the pickled preprocessor. The context manager guarantees the file handle
# is closed as soon as the object has been read.
# NOTE(security): pickle.load can execute arbitrary code -- only unpickle files
# that come from a trusted source.
with open("data/dataprocessors/dataprocessor_0_.p", "rb") as f:
    dataprocessor = pickle.load(f)

# Feature names, and the subset LightGBM should treat as categorical
# (plain categorical columns plus the crossed columns).
all_features = dataprocessor.colnames
categorical_features = dataprocessor.cat_cols + dataprocessor.crossed_columns

print("the features column names are: {}".format(all_features))
print("the categorical columns are: {}".format(categorical_features))

the features column names are: ['age', 'workclass', 'fnlwgt', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'education_occupation', 'native_country_occupation']
the categorical columns are: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country', 'education_occupation', 'native_country_occupation']


The `dataprocessor` is already trained, so we simply need to call `transform`.

In [4]:
# Run the loaded preprocessor's `transform` on the raw frame.
train_data = dataprocessor.transform(df)

# Preview the first rows of the transformed data.
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,target,education_occupation,native_country_occupation
0,0.30137,0,0.044302,0,0,0,0,0,0,0.02174,0.0,0.397959,0,0,0,0
1,0.452055,1,0.048238,0,1,1,1,0,0,0.0,0.0,0.122449,0,0,1,1
2,0.287671,2,0.138113,1,2,2,0,0,0,0.0,0.0,0.397959,0,0,2,2
3,0.493151,2,0.151068,2,1,2,1,1,0,0.0,0.0,0.397959,0,0,3,2
4,0.150685,2,0.221488,0,1,3,2,1,1,0.0,0.0,0.397959,1,0,4,3


In [6]:
# np arrays
# Keep every column except the label. Names are compared with `!=` (value
# equality); `is not` tests object identity, which is not guaranteed for equal
# strings and could silently leave 'target' inside the feature matrix.
feature_columns = [c for c in train_data.columns if c != 'target']
X_train = train_data[feature_columns].values
y_train = train_data['target'].values

# Fail early, with a readable message, if the matrix width and the feature
# names passed to LightGBM below disagree.
assert X_train.shape[1] == len(all_features), (
    "X_train has {} columns but {} feature names were provided".format(
        X_train.shape[1], len(all_features)))

# lgb Dataset object
# free_raw_data=False keeps the raw arrays attached to the Dataset.
lgtrain = lgb.Dataset(X_train,
    label=y_train,
    feature_name=all_features,
    categorical_feature=categorical_features,
    free_raw_data=False)

In [7]:
# model and fit params
# `params` is shared by the sklearn wrapper and by lgb.cv in the first experiment.
params = {
    'learning_rate': 0.01,
    'num_boost_round': 300,
    'num_leaves': 255,
    'verbose': -1,
    'is_unbalance': True,
}
# Extra keyword arguments forwarded to the estimator's fit call during CV.
fit_params = {
    'feature_name': all_features,
    'categorical_feature': categorical_features,
}

## 1. First experiment: Sklearn wrapper vs LightGBM methods

In [12]:
# Time a stratified cross-validation run through the scikit-learn wrapper.
clf = lgb.LGBMClassifier(**params, silent=True)

# shuffle=True is required for random_state to have any effect (recent
# scikit-learn versions raise a ValueError otherwise). n_splits is pinned to 3
# so the fold count does not depend on the installed version's default and
# matches the nfold=3 used with lgb.cv.
cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=1981)

start = time()
score = cross_val_score(clf,
    X_train, y_train,
    scoring='neg_log_loss',
    cv=cv_folds,
    fit_params=fit_params)
sklearn_runtime = time() - start
print(sklearn_runtime)

7.932429075241089


In [13]:
# Time the same experiment with LightGBM's own cross-validation routine:
# 3 stratified folds, scored with binary log-loss.
start = time()
cv_result = lgb.cv(
    params,
    lgtrain,
    nfold=3,
    stratified=True,
    metrics='binary_logloss',
    seed=1981,
)
lightgbm_runtime = time() - start
print(lightgbm_runtime)

7.038502931594849


LightGBM methods seem to be a bit faster. Let's now compare `Hyperopt` and `Skopt`

## Hyperopt vs Skopt

The first thing to note is that while Hyperopt offers the `hp.quniform(label, low, high, q)` parameter expression, there is no such thing in Skopt. One has `Categorical`, but you have to pass all the values. In other words, when using Hyperopt one could use:

    'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20)

but when using Skopt one would have to do:

    Categorical(np.arange(50, 500, 20))
    
Because I want to keep the comparison as light and direct as possible, I will just use `Real` parameters with uniform distributions.

### 1. Hyperopt

With Hyperopt we will use the [TPE](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) algorithm.

In [14]:
# Hyperopt search space: four continuous hyper-parameters, each sampled
# uniformly between the given bounds. The label passed to hp.uniform matches
# the keyword name that is forwarded to LGBMClassifier.
hp_space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.3),
    min_child_weight=hp.uniform('min_child_weight', 0.1, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1.),
    subsample=hp.uniform('subsample', 0.5, 1.),
)

In [20]:
def objective(params):
    """Hyperopt loss: ``1 - mean cross-validated F1`` of an LGBMClassifier.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``lgb.LGBMClassifier``, as sampled from the
        search space handed to ``fmin``.

    Returns
    -------
    float
        One minus the mean F1 score over the CV folds (hyperopt minimises).
    """
    clf = lgb.LGBMClassifier(**params, is_unbalance=True, verbose=-1, silent=True)
    # shuffle=True is needed for random_state to take effect (recent
    # scikit-learn raises a ValueError otherwise); n_splits is pinned so the
    # fold count does not depend on the installed version's default.
    cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=3)
    score = cross_val_score(clf,
        X_train, y_train,
        scoring='f1',
        cv=cv_folds,
        fit_params=fit_params).mean()
    return 1-score
# Run 50 TPE evaluations and record the wall-clock time, so the runtime can be
# compared with the other experiments from a measured value rather than from
# the progress bar.
# NOTE(review): fmin is unseeded here; pass `rstate` for reproducible runs (the
# expected type depends on the installed hyperopt version -- confirm).
trials = Trials()
start = time()
best = fmin(fn=objective,
            space=hp_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
hyperopt_runtime = time() - start
print(hyperopt_runtime)

100%|██████████| 50/50 [00:39<00:00,  1.18it/s, best loss: 0.28343034788381305]


### 2. SKopt

Since TPE is a Bayesian method we will first compare with the `BayesSearchCV` method in `Skopt`

In [16]:
# Skopt search space for BayesSearchCV: the same four continuous
# hyper-parameters and bounds as the Hyperopt space.
hh_space = {
    'learning_rate': Real(0.01, 0.3),
    'min_child_weight': Real(0.1, 10),
    'colsample_bytree': Real(0.5, 1.),
    'subsample': Real(0.5, 1.),
}

In [17]:
# Time a 50-iteration Bayesian search with skopt's BayesSearchCV.
clf = lgb.LGBMClassifier(is_unbalance=True, verbose=-1, silent=True)

# shuffle=True is needed for random_state to take effect (recent scikit-learn
# raises a ValueError otherwise); n_splits is pinned so the fold count does not
# depend on the installed version's default.
cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=3)

start = time()
opt = BayesSearchCV(clf,
    search_spaces=hh_space,
    scoring='f1',
    cv=cv_folds,
    fit_params=fit_params,
    n_iter=50,
    n_jobs=-1,
    random_state=3)  # seed the optimiser so the search is reproducible
opt.fit(X_train, y_train)
skopt_bayes_runtime = time()-start
print(skopt_bayes_runtime)

63.14497208595276


`Skopt` seems to be significantly slower than `Hyperopt`, even with no verbosity. Let's see if it performs better:

In [19]:
# Best mean cross-validated F1 found by the BayesSearchCV run.
best_skopt_f1 = opt.best_score_
print('best SKOPT F1 score: {}'.format(best_skopt_f1))

best SKOPT F1 score: 0.7174372197995806


which is almost identical to the one obtained with `Hyperopt`

In [25]:
# Hyperopt minimises 1 - F1, so convert the best loss back into an F1 score.
best_hyperopt_loss = trials.best_trial['result']['loss']
print('best HYPEROPT F1 score: {}'.format(1 - best_hyperopt_loss))

best HYPEROPT F1 score: 0.716569652116187


The conclusion at this stage is that `Hyperopt` is faster than `Skopt` with the same performance. However, the `TPE` algorithm is a tree based algorithm, so let's also compare with the `gbrt_minimize` method (Sequential optimization using gradient boosted trees) in `Skopt`. Here the syntax is a bit different to that of `BayesSearchCV`. 

In [26]:
# gbrt_minimize takes the space as a list of dimensions; each one is named
# after the LGBMClassifier keyword it controls, and the list order is the order
# in which values reach the objective.
hh_space_gbrt = [
    Real(0.01, 0.3, prior='uniform', name='learning_rate'),
    Real(0.1, 10, prior='uniform', name='min_child_weight'),
    Real(0.5, 1., prior='uniform', name='colsample_bytree'),
    Real(0.5, 1., prior='uniform', name='subsample'),
]

In [30]:
# Let's adapt the objective
def gbrt_objective(params):
    """Skopt loss: ``1 - mean cross-validated F1`` of an LGBMClassifier.

    Parameters
    ----------
    params : sequence
        Positional hyper-parameter values in the order ``learning_rate``,
        ``min_child_weight``, ``colsample_bytree``, ``subsample``.

    Returns
    -------
    float
        One minus the mean F1 score over the CV folds (skopt minimises).
    """
    param_names = ('learning_rate', 'min_child_weight',
                   'colsample_bytree', 'subsample')
    # Pair each positional value with its keyword name.
    tmp_params = dict(zip(param_names, params))
    clf = lgb.LGBMClassifier(**tmp_params, is_unbalance=True, verbose=-1, silent=True)
    # shuffle=True is needed for random_state to take effect (recent
    # scikit-learn raises a ValueError otherwise); n_splits is pinned so the
    # fold count does not depend on the installed version's default.
    cv_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=3)
    score = cross_val_score(clf,
        X_train, y_train,
        scoring='f1',
        cv=cv_folds,
        fit_params=fit_params).mean()
    return 1-score

In [31]:
# Time 50 calls of sequential optimisation with gradient boosted trees.
start = time()
sk_best = gbrt_minimize(gbrt_objective,
    hh_space_gbrt,
    n_calls=50,
    verbose=False,
    n_jobs=-1,
    random_state=3)  # seed the optimiser so the run is reproducible
skopt_gbrt_runtime = time() - start
print(skopt_gbrt_runtime)

54.64228296279907


Faster than `BayesSearchCV`, but still, slower than `Hyperopt`. Let's see if the results are any better

In [36]:
# The objective returns 1 - F1, so convert the minimum found back into an F1 score.
best_gbrt_loss = sk_best.fun
print('best SKOPT GBRT F1 score: {}'.format(1 - best_gbrt_loss))

best SKOPT GBRT F1 score: 0.7173483496134895


## CONCLUSION

`Hyperopt`'s TPE performs as well as Skopt's `gbrt_minimize` and `BayesSearchCV` methods and is significantly faster.