In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import config
from sklearn.model_selection import StratifiedKFold, train_test_split

df = pd.read_csv(config.CLEAN_FILE)
targets = df['response']
features = df.drop('response', axis=1)

# kfold = StratifiedKFold(n_splits=5, shuffle=True)

x_train, x_test, y_train, y_test = train_test_split(features, targets, test_size=0.2)

space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'eta':  hp.uniform('eta', 0.05, 0.30),
        'base_score': hp.uniform('base_score', 0.06, 0.95),
    }

def objective(space):
    clf=xgb.XGBClassifier(objective='binary:logistic',
                    max_depth = int(space['max_depth']), 
                    gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),
                    reg_lambda = float(space['reg_lambda']),
                    min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']),
                    eta=float(space['eta']),
                    base_score=float(space['base_score'])
    )
    
    evaluation = [( x_train, y_train), ( x_test, y_test)]
    
    clf.fit(x_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 5,
                        trials = Trials())

print(best_hyperparams)


  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]




SCORE:
0.9085801436583686
 20%|██        | 1/5 [00:20<01:20, 20.11s/trial, best loss: -0.9085801436583686]




SCORE:
0.9085801436583686
 40%|████      | 2/5 [00:39<00:59, 19.81s/trial, best loss: -0.9085801436583686]




SCORE:
0.9085801436583686
 60%|██████    | 3/5 [01:00<00:40, 20.11s/trial, best loss: -0.9085801436583686]




SCORE:
0.9085801436583686
 80%|████████  | 4/5 [01:04<00:13, 13.91s/trial, best loss: -0.9085801436583686]




SCORE:
0.9085801436583686
100%|██████████| 5/5 [01:20<00:00, 16.13s/trial, best loss: -0.9085801436583686]
{'base_score': 0.5074025941621864, 'colsample_bytree': 0.7805503460343888, 'eta': 0.24528128897339257, 'gamma': 8.815330570605372, 'max_depth': 7.0, 'min_child_weight': 1.0, 'reg_alpha': 170.0, 'reg_lambda': 0.8244076191380255}
