In [1]:
'''
Based on: https://github.com/fmfn/BayesianOptimization/blob/master/examples/xgboost_example.py
Uses Bayesian optimization to search for the XGBoost hyperparameters that give the lowest cross-validated MAE
'''

# imports
import pandas as pd
import numpy as np
import os
from bayes_opt import BayesianOptimization
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit

# Names of the data columns handed to the booster (used both to select the
# columns from the frame and as the DMatrix feature names).
factors = [
    'property_id', 'bedrooms', 'bathrooms', 'sqft',
    'longitude', 'latitude', 'zipcode', 'elevation', 'garage',
    'year_built', 'level', 'dist_to_park', 'dist_to_golf_course', 'has_pool',
    'date_closed', 'multifamily', 'hoa_fees', 'lot',
]

In [4]:
def XGBcv(max_depth, gamma, min_child_weight, max_delta_step, subsample, colsample_bytree, alpha):
    """Objective for the Bayesian optimiser: cross-validate one XGBoost configuration.

    The optimiser proposes continuous values, so integer-valued settings are
    truncated with ``int()`` and bounded settings are clamped before use.
    Reads the module-level ``dtrain`` and ``tscv``; both must be defined
    before this function is called.

    Args:
        max_depth: tree depth; truncated to an int.
        gamma: clamped to be >= 0.
        min_child_weight: truncated to an int.
        max_delta_step: truncated to an int.
        subsample: clamped to [0, 1].
        colsample_bytree: clamped to [0, 1].
        alpha: clamped to be >= 0.

    Returns:
        The negated last value of the 'test-mae-mean' column returned by
        ``xgb.cv``, so that maximising this objective minimises the MAE.
    """
    # NOTE(review): 'reg:linear', 'silent', 'grow_gpu' and
    # xgb.callback.early_stop are kept exactly as before; they look like the
    # legacy xgboost API - confirm against the installed xgboost version.
    paramt = {
        'alpha': max(alpha, 0),
        'gamma': max(gamma, 0),
        'max_depth': int(max_depth),
        'eval_metric': 'mae',
        'eta': 0.1,
        'objective': 'reg:linear',
        'silent': True,
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'min_child_weight': int(min_child_weight),
        # int() instead of .astype(int): works for plain Python numbers as
        # well as numpy scalars, and matches the other integer parameters.
        'max_delta_step': int(max_delta_step),
        'seed': 2017,
        'updater': 'grow_gpu'
    }

    # verbose_eval is an argument of xgb.cv, not a booster parameter, so it is
    # passed here rather than inside the parameter dict.
    out = xgb.cv(paramt,
                 dtrain,
                 num_boost_round=3000,
                 folds=tscv.split(dtrain),
                 verbose_eval=None,
                 callbacks=[xgb.callback.early_stop(50)])

    return -out['test-mae-mean'].values[-1]


In [None]:
# Search space: (lower, upper) bounds for each hyperparameter that XGBcv accepts.
params = {
    'max_depth': (5, 15),
    'gamma': (0.0, 10.0),
    'min_child_weight': (1, 20),
    'max_delta_step': (0, 5),
    'subsample': (0.5, 1.0),
    'colsample_bytree': (0.1, 1.0),
    'alpha': (0, 10),
}

# Share of rows kept for a quick run; 1.0 would use the entire dataset.
SAMPLE_FRACTION = 0.03
# Fixed seed so the same rows are drawn on every run (same value as the
# booster 'seed' used in XGBcv); without it each run optimises different data.
SAMPLE_SEED = 2017

XGB_BOpt = BayesianOptimization(XGBcv, params)

df = pd.read_csv('CSV_backups/PH-sales.csv')

# A dedicated RandomState keeps the draw reproducible without touching the
# global numpy random state. The boolean mask preserves the original row order.
msk = np.random.RandomState(SAMPLE_SEED).rand(len(df)) < SAMPLE_FRACTION
df = df[msk]

dtrain = xgb.DMatrix(df[factors].values, label=df.price, feature_names=factors)
# NOTE(review): TimeSeriesSplit splits by row position, so this presumes the
# CSV rows are already in chronological order - confirm.
tscv = TimeSeriesSplit(n_splits=5)


XGB_BOpt.maximize(init_points=5, n_iter=25)

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_delta_step |   max_depth |   min_child_weight |   subsample | 




Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
Stopping. Best iteration:
[206]	train-mae:13483.7+190.233	test-mae:32287.3+321.554

    1 | 00m22s | [35m-32287.31901[0m | [32m   8.2713[0m | [32m            0.3191[0m | [32m   7.2275[0m | [32m          0.3052[0m | [32m    11.5806[0m | [32m            7.0324[0m | [32m     0.5735[0m | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
Stopping. Best iteration:
[311]	train-mae:1964.77+111.847	test-mae:34171.8+392.018

    2 | 00m33s | -34171.81641 |    3.4938 |             0.2910 |    2.4727 |           0.9579 |     12.4056 |             1.9743 |      0.8772 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
    3 | 02m09s | -194486.71875 |    9.6910 | 