In [None]:
pip install --user ax-platform

In [None]:
import lightgbm as lgb
import numpy as np
import scipy
import random
import os
import shutil
import gc
import sys
import uuid
import functools
import gc
import json
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

from ax import *
from ax.plot.scatter import plot_fitted
from ax.utils.notebook.plotting import render, init_notebook_plotting
from ax.utils.stats.statstools import agresti_coull_sem
from sklearn.metrics import roc_auc_score, roc_curve

# Feature names, in column order, passed as `feature_name` to the LightGBM
# datasets. "isEnr" and "channel" are declared categorical in run_trial, and
# column 3 ("avse") is used there as the baseline discriminator.
fname = ["isEnr", "channel", "tDrift", "avse", "dcr","noise","active mass"]

In [None]:
# Reproducibility: one seed for NumPy's global RNG and Python's `random` module.
# The same value is also used for the LightGBM `seed` and `bagging_seed` parameters.
manualSeed = 158138

np.random.seed(manualSeed)
random.seed(manualSeed)

# Number of arms generated for the initial Sobol batch (trial 0).
FIRST_ARM=500
# Number of arms generated for each subsequent GPEI batch.
IT_ARM=20
# Upper bound of the search loop `range(1, ITRATION)`: ITRATION - 1 GPEI batches
# follow the Sobol batch.
ITRATION=40
# Not referenced by any code visible in this notebook.
BKG_FACTOR = 1.0

In [None]:
def run_trial(params, dataset_tuple):
    """Train one LightGBM model and compare it with a plain cut on the avse column.

    Parameters
    ----------
    params : dict
        LightGBM parameters. Must contain "pos_bagging_fraction" and
        "neg_bagging_fraction"; the latter is scaled by the former before
        training. The caller's dict is left untouched.
    dataset_tuple : tuple
        ``(lgb_train, lgb_eval, X_test, Y_test)``: the training and validation
        ``lgb.Dataset`` objects plus the held-out feature array and its 0/1 labels.

    Returns
    -------
    float
        ``(1 - FPR_bdt) - (1 - FPR_avse)``, where the avse working point is the
        ROC threshold closest to -1.0 and the BDT working point is the one whose
        true-positive rate is closest to the avse one. Positive values mean the
        BDT has the lower false-positive rate at that working point.

    Side effects
    ------------
    Overwrites "BDT_output.png" in the current directory and prints the
    improvement in percent.
    """
    # Work on a copy: the original in-place `*=` modified the caller's dict, so
    # reusing the same dict for a second call scaled the fraction twice.
    params = dict(params)
    params["neg_bagging_fraction"] *= params["pos_bagging_fraction"]

    lgb_train, lgb_eval, X_test, Y_test = dataset_tuple

    # NOTE(review): the `early_stopping_rounds` keyword is not accepted by newer
    # LightGBM releases (they use callbacks=[lgb.early_stopping(5)]) - confirm
    # against the installed version before upgrading.
    gbm = lgb.train(params,lgb_train,valid_sets=lgb_eval,early_stopping_rounds=5, categorical_feature=["isEnr","channel"])

    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

    # Diagnostic histogram of the classifier output for both classes. An explicit
    # figure is used so it can be closed deterministically after saving.
    rg = np.arange(0.0, 1.0, 0.01)
    fig, ax = plt.subplots()
    ax.hist(y_pred[Y_test==1], label="Signal", bins=rg, histtype="step", density=True)
    ax.hist(y_pred[Y_test==0], label="Background", bins=rg, histtype="step", density=True)
    ax.legend()
    ax.set_xlabel("BDT output")
    ax.set_ylabel("% per 0.01 bin(a.u.)")
    fig.savefig("BDT_output.png")
    plt.close(fig)

    # Column 3 is "avse" in the notebook's feature list; its raw value is used
    # directly as the score of the baseline ROC curve.
    avse_column = 3
    # Reference cut value on the avse score that defines the baseline working point.
    avse_threshold = -1.0

    fpr, tpr, _ = roc_curve(Y_test, y_pred)
    fpra, tpra, thra = roc_curve(Y_test, X_test[:, avse_column])

    # Baseline working point: ROC threshold closest to the reference cut.
    avsecut = np.argmin(np.abs(thra - avse_threshold))
    # BDT working point with (approximately) the same true-positive rate.
    bdtcut = np.argmin(np.abs(tpr - tpra[avsecut]))
    performance_improvement = (1-fpr[bdtcut]) - (1-fpra[avsecut])
    print(performance_improvement*100.0)
    return performance_improvement

In [None]:
# Load the signal and background samples from the working directory.
# Presumably 2-D arrays with one row per event and columns in `fname` order - TODO confirm.
signaldata = np.load("sig.npy")
bkgdata = np.load("bkg.npy")

# Split the signal dataset: after shuffling the row indices, the first
# `test_split` fraction becomes the hold-out part and the rest the training part.
test_split = 0.3
indices = np.arange(signaldata.shape[0])
np.random.shuffle(indices)
train_index = indices[int(len(indices)*test_split):]
test_index = indices[:int(len(indices)*test_split)]
signal_train = signaldata[train_index]
signal_test = signaldata[test_index]
# Signal events carry label 1.
siglabel_train = np.ones(signal_train.shape[0])
siglabel_test = np.ones(signal_test.shape[0])

# Split the background dataset the same way (same hold-out fraction).
indices = np.arange(bkgdata.shape[0])
np.random.shuffle(indices)
train_index = indices[int(len(indices)*test_split):]
test_index = indices[:int(len(indices)*test_split)]
bkg_train = bkgdata[train_index]
bkg_test = bkgdata[test_index]
# Background events carry label 0.
bkglabel_train = np.zeros(bkg_train.shape[0])
bkglabel_test = np.zeros(bkg_test.shape[0])

# Merge signal and background, then shuffle the training set; features and
# labels are permuted with the same index array so they stay aligned.
X_train = np.concatenate([signal_train, bkg_train],axis = 0)
Y_train = np.concatenate([siglabel_train, bkglabel_train],axis = 0)
train_index = np.arange(len(X_train))
np.random.shuffle(train_index)
X_train = X_train[train_index]
Y_train = Y_train[train_index]
X_test = np.concatenate([signal_test, bkg_test],axis = 0)
Y_test = np.concatenate([siglabel_test, bkglabel_test],axis = 0)

# Shuffle the hold-out part, then cut it into two equal halves:
# the first half is the validation set, the second half the test set.
test_index = np.arange(len(X_test))
np.random.shuffle(test_index)
X_test = X_test[test_index]
Y_test = Y_test[test_index]
# np.split into 2 sections needs an even length, so drop the last event if odd.
if len(X_test)%2 == 1:
    X_test = X_test[:-1]
    Y_test = Y_test[:-1]
X_val, X_test = np.split(X_test,2)
Y_val, Y_test = np.split(Y_test,2)

In [None]:
# Wrap the training and validation arrays as LightGBM datasets. The validation
# set references the training set, and both keep their raw data so the same
# objects can be reused for every trial.
lgb_train = lgb.Dataset(
    data=X_train,
    label=Y_train,
    feature_name=fname,
    free_raw_data=False,
)
lgb_eval = lgb.Dataset(
    data=X_val,
    label=Y_val,
    reference=lgb_train,
    feature_name=fname,
    free_raw_data=False,
)
# Everything run_trial needs: both datasets plus the held-out test arrays.
dataset_tuple = (lgb_train, lgb_eval, X_test, Y_test)


In [None]:
# Parameters of the LightGBM search space.
# Choice/Range parameters are explored by the optimiser; Fixed parameters are
# handed to LightGBM with the same value in every arm.

# Booster type and overall model size.
p1 = ChoiceParameter(name="boosting", parameter_type=ParameterType.STRING, values=["gbdt", "dart", "goss"])
p2 = RangeParameter(name="num_iterations", parameter_type=ParameterType.INT, lower=20, upper=1000)
p3 = RangeParameter(name="learning_rate", parameter_type=ParameterType.FLOAT, lower=1e-4, upper=0.5)
p4 = RangeParameter(name="num_leaves", parameter_type=ParameterType.INT, lower=2, upper=300)

# Fixed task definition, verbosity and seed.
p5 = FixedParameter(name="objective", parameter_type=ParameterType.STRING, value="binary")
p6 = FixedParameter(name="metric", parameter_type=ParameterType.STRING, value="binary_logloss")
p7 = FixedParameter(name="verbose", parameter_type=ParameterType.INT, value=0)
p8 = FixedParameter(name="seed", parameter_type=ParameterType.INT, value=manualSeed)

# Bagging. The fixed negative fraction is the signal/background ratio of the
# training sample; run_trial multiplies it by pos_bagging_fraction before training.
p9 = RangeParameter(name="bagging_freq", parameter_type=ParameterType.INT, lower=3, upper=100)
p10 = RangeParameter(name="pos_bagging_fraction", parameter_type=ParameterType.FLOAT, lower=0.4, upper=1.0)
p11 = FixedParameter(
    name="neg_bagging_fraction",
    parameter_type=ParameterType.FLOAT,
    value=float(len(signal_train)) / float(len(bkg_train)),
)

# Remaining tuned tree-growing and binning parameters.
p12 = RangeParameter(name="min_data_in_leaf", parameter_type=ParameterType.INT, lower=5, upper=100)
p13 = RangeParameter(name="drop_rate", parameter_type=ParameterType.FLOAT, lower=0.0, upper=0.9)
p14 = RangeParameter(name="min_gain_to_split", parameter_type=ParameterType.FLOAT, lower=0.0, upper=1.0)
p15 = FixedParameter(name="bagging_seed", parameter_type=ParameterType.INT, value=manualSeed)
p16 = RangeParameter(name="max_bin", parameter_type=ParameterType.INT, lower=24, upper=1024)

In [None]:
# Collect all sixteen parameters into the search space of the experiment.
search_space = SearchSpace(
    parameters=[
        p1, p2, p3, p4, p5, p6, p7, p8,
        p9, p10, p11, p12, p13, p14, p15, p16,
    ],
)

experiment = Experiment(
    search_space=search_space,
    name="hyper_parameter_optimization",
)

# Initial exploration: a single Sobol generator run with FIRST_ARM arms.
sobol = Models.SOBOL(search_space=experiment.search_space)
generator_run = sobol.gen(FIRST_ARM)

In [None]:
class cd:
    '''
    Context manager that temporarily changes the current working directory.

    On entry the process switches to ``newPath``; on exit, whether the body
    finished normally or raised, it switches back to the directory that was
    current at entry. Exceptions from the body are never suppressed.

    Usage:
        with cd("some/dir") as ctx:
            ...  # runs inside some/dir; ctx.savedPath is the previous directory
    '''
    def __init__(self, newPath):
        # Directory to switch into when the context is entered.
        self.newPath = newPath

    def __enter__(self):
        # Remember where we came from so __exit__ can restore it.
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)
        # Return the manager itself so `with cd(p) as ctx:` binds a usable
        # object (previously the `as` target was always None).
        return self

    def __exit__(self, etype, value, traceback):
        os.chdir(self.savedPath)
        # Explicitly falsy: let any exception from the with-body propagate.
        return False

class MyRunner(Runner):
    '''
    Runner that evaluates every arm of a trial in-process by calling run_trial
    on the module-level dataset_tuple.
    '''
    def __init__(self):
        '''
        No state to initialise.
        '''

    def run(self, trial):
        '''
        Train and score one model per arm of ``trial``.

        Returns the run metadata dict: "name" holds the trial index as a string
        and "auc" holds the run_trial results as floats, in the iteration order
        of ``trial.arms_by_name``.
        '''
        scores = []
        for arm in trial.arms_by_name.values():
            arm_params = arm.parameters
            # Log the configuration that is about to be trained.
            print(arm.parameters)
            scores.append(float(run_trial(arm_params, dataset_tuple)))
        return {"name": str(trial.index), "auc": scores}

class BoothMetric(Metric):
    '''
    Metric that reads the per-arm scores stored in the trial's run metadata
    under the key "auc".
    '''
    def fetch_trial_data(self, trial):
        '''
        Build a ``Data`` object with one record per arm of ``trial``.

        Scores are matched to arms by position: the n-th entry of
        ``trial.run_metadata["auc"]`` belongs to the n-th arm in
        ``trial.arms_by_name``. Every record gets a standard error of 0.0.
        '''
        scores = trial.run_metadata["auc"]
        rows = [
            {
                "arm_name": arm_name,
                "metric_name": self.name,
                "mean": scores[position],
                "sem": 0.0,
                "trial_index": trial.index,
            }
            for position, arm_name in enumerate(trial.arms_by_name)
        ]
        return Data(df=pd.DataFrame.from_records(rows))


In [None]:
# Run the search: one Sobol batch (trial 0) followed by GPEI batches.
#
# The block below is disabled. It would create a fresh per-version output
# directory; note that the last disabled line refers to `hpsearch`, which is
# not the name assigned above it (`hpsearch_dir`).
# VERSION="bdt_sepdep"
# hpsearch_dir = 'hpsearch_' + str(VERSION)
# if os.path.exists(hpsearch_dir):
#     shutil.rmtree(hpsearch_dir)
# os.mkdir(hpsearch_dir)
# hpsearch_dir = os.getcwd() + '/' + hpsearch_dir

# with cd(hpsearch)
# Attach the local runner and evaluate the Sobol arms as batch trial 0.
experiment.runner = MyRunner()
experiment.new_batch_trial(generator_run=generator_run)

experiment.trials[0].run()

# Objective: maximise the value reported by BoothMetric (the score returned by run_trial).
optimization_config = OptimizationConfig(
    objective = Objective(
        metric=BoothMetric(name="booth"), 
        minimize=False,
    )
)

experiment.optimization_config = optimization_config
# Trial indices 1 .. ITRATION-1: each pass fits GPEI on all data collected so
# far, generates IT_ARM new arms and evaluates them as batch trial `i`.
for i in range(1, ITRATION):

    data = experiment.fetch_data()
    gpei = Models.GPEI(experiment=experiment, data=data)
    generator_run = gpei.gen(IT_ARM)
    experiment.new_batch_trial(generator_run=generator_run)
    experiment.trials[i].run()
    # Fetch again so the table includes the batch that has just been run.
    data = experiment.fetch_data()
    df = data.df
    print(df)
    # Best arm so far: the first row whose mean equals the maximum mean.
    best_arm_name = df.arm_name[df['mean'] == df['mean'].max()].values[0]
    best_arm = experiment.arms_by_name[best_arm_name]
    print(best_arm)
    # Checkpoint after every batch: data.json holds the best arm's parameters
    # plus its score times 100; arms.json holds the full results table.
    # Both files are overwritten on each pass.
    json_field = best_arm.parameters
    json_field["improvement"] = df['mean'].max() * 100
    with open('data.json', 'w') as fp:
        json.dump(json_field, fp)
    df.to_json(r'arms.json')