In [1]:
# imports
import pandas as pd
import numpy as np
import datetime as dt
import contextlib

import os
from bayes_opt import BayesianOptimization

# this allows plots to appear directly in the notebook
%matplotlib inline

import xgboost as xgb

# data columns used for the booster
factors = ['property_id', 'bedrooms', 'bathrooms', 'sqft','longitude', 'latitude','zipcode', 'elevation', 'garage'
                          ,'year_built', 'level','dist_to_park','dist_to_golf_course', 'has_pool'
                          ,'date_closed','multifamily', 'hoa_fees', 'lot']

In [3]:
def XGBcv(max_depth, gamma, min_child_weight, max_delta_step, subsample,
          colsample_bytree, folds):
    paramt = {
        'gamma': gamma,
        'booster': 'gbtree',
        'max_depth': max_depth.astype(int),
        'eta': 0.1,
        # Use the line below for classification
        'objective': 'reg:linear',
        # DO NOT use the line below when doing classification
        #'num_class': 12,
        'silent': True,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'min_child_weight': min_child_weight,
        'max_delta_step': max_delta_step.astype(int),
        'seed': 101,
        'updater': 'grow_gpu' 
    }

    cv_score = 0

    print(" Search parameters (%d-fold validation):\n %s" % (folds, paramt),
          file=log_file)
    log_file.flush()

    # Do not optimize the number of boosting rounds, as early stopping will take care of that

    out = xgb.cv(paramt,
           dtrain,
           num_boost_round=20000,
           stratified=True,
           nfold=folds,
           verbose_eval=None,
           early_stopping_rounds=20,
           metrics="mae",
           show_stdv=True)
    
    print(out, file=log_file)
    # All relevant things in XGboost output are in stdout, so we screen result[1]
    # for a line with "cv-mean". This line signifies the end of output and contains CV values.
    # Next we split the line to extract CV values. We also print the whole CV run into file

    print('', file=log_file)
    cv_score = out.iloc[-1]['train-mae-mean']


    # The CV metrics function in XGboost can be lots of things. Some of them need to be maximized, like AUC.
    # If the metrics needs to be minimized, e.g, logloss, the return line below should be a negative number
    # as Bayesian Optimizer only knows how to maximize the function

    return (-1.0 * cv_score)

def run_bayes_search_for_best_params(city, speed='fast'):
    # Create a file to store XGBoost output
    # New lines are added to this file rather than overwriting it
    log_file = open("XGBoost-output-from-BOpt.txt", 'a')

    fast_params = {'max_depth': (4, 6),
                    'gamma': (0.0001, 0.005),
                    'min_child_weight': (1, 2),
                    'max_delta_step': (0, 1),
                    'subsample': (0.2, 0.4),
                    'colsample_bytree': (0.2, 0.4),
                    'folds': 5 }
    
    slow_params = { 'max_depth': (4, 15),
                     'gamma': (0.0001, 2.0),
                     'min_child_weight': (1, 10),
                     'max_delta_step': (0, 5),
                     'subsample': (0.2, 1.0),
                     'colsample_bytree' :(0.2, 1.0),
                     'folds': 10
                   }
    
    if speed == 'fast':
        params = fast_params
    else:
        params = slow_params
    
    XGB_BOpt = BayesianOptimization(XGBcv, params)

    df = pd.read_csv('CSV_backups/' + city + '-sales.csv')
    dtrain = xgb.DMatrix(df[factors].values, label=df.price, feature_names=factors)


    print('\n', file=log_file)
    log_file.flush()

    print('Running Bayesian Optimization ...\n')
    XGB_BOpt.maximize(init_points=5, n_iter=5)

    print('\nFinal Results')
    print('XGBOOST: %f' % XGB_BOpt.res['max']['max_val'])
    print('\nFinal Results', file=log_file)
    print('XGBOOST: %f' % XGB_BOpt.res['max']['max_val'], file=log_file)
    log_file.flush()
    log_file.close()
    


In [4]:
run_bayes_search_for_best_params('PH', 'fast')

Running Bayesian Optimization ...

[31mInitialization[0m
[94m----------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     folds |     gamma |   max_delta_step |   max_depth |   min_child_weight |   subsample | 


TypeError: 'int' object is not subscriptable