In [5]:
import sys
sys.path.append("../..")
import X_py_boost

### Datasets

In [6]:
# Dataset name -> integer dataset id for the "NC" group.
# NOTE(review): the ids look like OpenML dataset ids and "NC" presumably stands
# for numerical-feature classification — TODO confirm.
datasets_NC = {
    "electricity": 44120,
    "covertype": 44121,
    "pol": 44122,
    "house_16H": 44123,
    "kdd_ipums_la_97-small": 44124,
    "MagicTelescope": 44125,
    "bank-marketing": 44126,
    "phoneme": 44127,
    "MiniBooNE": 44128,
    "Higgs": 44129,
    "eye_movements": 44130,
    "jannis": 44131,
    "credit": 44089,
    "california": 44090,
    "wine": 44091,
}

# Dataset name -> integer dataset id for the "NR" group.
# NOTE(review): the ids look like OpenML dataset ids and "NR" presumably stands
# for numerical-feature regression — TODO confirm.
datasets_NR = {
    "cpu_act": 44132,
    "pol": 44133,
    "elevators": 44134,
    "isolet": 44135,
    "wine_quality": 44136,
    "Ailerons": 44137,
    "houses": 44138,
    "house_16H": 44139,
    "diamonds": 44140,
    "Brazilian_houses": 44141,
    "Bike_Sharing_Demand": 44142,
    "nyc-taxi-green-dec-2016": 44143,
    "house_sales": 44144,
    "sulfur": 44145,
    "medical_charges": 44146,
    "MiamiHousing2016": 44147,
    "superconduct": 44148,
    "california": 44025,
    "fifa": 44026,
    "year": 44027,
}

# Dataset name -> integer dataset id for the "CC" group.
# NOTE(review): the ids look like OpenML dataset ids and "CC" presumably stands
# for categorical-feature classification — TODO confirm.
datasets_CC = {
    "electricity": 44156,
    "eye_movements": 44157,
    "KDDCup09_upselling": 44158,
    "covertype": 44159,
    "rl": 44160,
    "road-safety": 44161,
    "compass": 44162,
}

# Dataset name -> integer dataset id for the "CR" group.
# NOTE(review): the ids look like OpenML dataset ids and "CR" presumably stands
# for categorical-feature regression — TODO confirm.
datasets_CR = {
    "yprop_4_1": 44054,
    "analcatdata_supreme": 44055,
    "visualizing_soil": 44056,
    "black_friday": 44057,
    "diamonds": 44059,
    "Mercedes_Benz_Greener_Manufacturing": 44061,
    "Brazilian_houses": 44062,
    "Bike_Sharing_Demand": 44063,
    "OnlineNewsPopularity": 44064,
    "nyc-taxi-green-dec-2016": 44065,
    "house_sales": 44066,
    "particulate-matter-ukair-2017": 44068,
    "SGEMM_GPU_kernel_performance": 44069,
}


### Callbacks

In [7]:
from X_py_boost.callbacks.callback import Callback

class MultipleMetricsHistory(Callback):
    """Callback that records several metrics on train and validation data after each iteration.

    While training, ``history`` is a list with one entry per iteration; each entry
    holds the list of train metrics first, followed by one list of metrics per
    validation set. After training it is converted to a numpy array.
    """

    def __init__(self, metric_list):
        """Remember the metrics to evaluate.

        Args:
            metric_list: list of callables invoked as ``fn(y_true, y_pred)``.
        """
        self.metric_list = metric_list
        self.history = None

    def before_train(self, build_info):
        """Start an empty history and pick up the output post-processing of the model's loss.

        Args:
            build_info: dict; ``build_info['model'].loss.postprocess_output`` is read here.
        """
        self.history = []
        # raw ensemble outputs are passed through this before any metric sees them
        self.postprocess_fn = build_info['model'].loss.postprocess_output

    def _score(self, y_true, raw_pred):
        """Post-process ``raw_pred`` and return every metric as a plain float."""
        y_pred = self.postprocess_fn(raw_pred)
        return [float(metric(y_true, y_pred)) for metric in self.metric_list]

    def after_iteration(self, build_info):
        """Evaluate all metrics on the train set and on each validation set.

        Args:
            build_info: dict; ``build_info['data']['train']`` is a single dataset and
                ``build_info['data']['valid']`` holds parallel lists of validation
                targets and ensemble outputs.

        Returns:
            Always False (presumably "do not stop training" — confirm against Callback).
        """
        data = build_info['data']
        train_part, valid_part = data['train'], data['valid']

        # train metrics come first, then one list per validation set
        snapshot = [self._score(train_part['target'], train_part['ensemble'])]
        snapshot.extend(
            self._score(y_true, raw_pred)
            for y_true, raw_pred in zip(valid_part['target'], valid_part['ensemble'])
        )

        self.history.append(snapshot)
        return False

    def after_train(self, build_info):
        """Freeze the collected history into a numpy array."""
        self.history = np.array(self.history)

class TimeHistory(Callback):
    """Callback that records the elapsed training time after every iteration.

    While training, ``history`` is a list of seconds elapsed since ``before_train``
    was called, one value per iteration. After training it is converted to a
    numpy array.
    """

    def __init__(self):
        """Create the callback with no timing data yet."""
        # wall-clock timestamp of the training start (set in before_train)
        self.start_time = None
        self.history = None
        # perf_counter reading taken at training start; used for elapsed time
        self._start_tick = None

    def before_train(self, build_info):
        """Reset the history and start the clock.

        Args:
            build_info: unused here.
        """
        self.history = []
        self.start_time = time.time()
        # Durations are measured with perf_counter rather than time.time():
        # the wall clock can jump (NTP sync, manual change) during a long run.
        self._start_tick = time.perf_counter()

    def after_iteration(self, build_info):
        """Append the seconds elapsed since training start.

        Args:
            build_info: unused here.

        Returns:
            Always False (presumably "do not stop training" — confirm against Callback).
        """
        self.history.append(time.perf_counter() - self._start_tick)

        return False

    def after_train(self, build_info):
        """Freeze the collected timings into a numpy array."""
        self.history = np.array(self.history)


### Training Functions

In [9]:
# Values swept as `lambda_l2` in the training loop: 10**1 ... 10**5.
lambdas = [10 ** exponent for exponent in range(1, 6)]
# Datasets to run; each is loaded from 'data/<name>.npz' by the training loop.
ds_names = [
    "Higgs",
    "Epsilon",
    "covertype",
    "MiniBooNE",
    "electricity",
]

In [3]:
import numpy as np
import pickle
import time

In [None]:
# Result containers filled by the training loop:
# metrics[ds_name][lambda_l2] and times[ds_name][lambda_l2].
metrics, times = dict(), dict()

In [None]:
# Sweep `lambda_l2` over `lambdas` for every dataset in `ds_names`, storing the
# per-iteration metric and timing histories collected by the two callbacks in
# metrics[ds_name][lambda_l2] and times[ds_name][lambda_l2].
# NOTE(review): GradientBoosting, AccuracyMetric, BCEMetric and ntrees are not
# defined or imported in the visible notebook; they must be in scope before this
# cell runs on a fresh kernel.
for ds_name in ds_names:

    metrics[ds_name] = {}
    times[ds_name] = {}

    # np.load on an .npz returns an NpzFile that keeps the archive open until it
    # is closed; the arrays are read into memory on access, so they remain valid
    # after the `with` block exits.
    # NOTE: allow_pickle=True can execute arbitrary code — only load trusted files.
    with np.load('data/' + ds_name + '.npz', allow_pickle=True) as npzfile:
        X_train, X_test = npzfile['X_train'], npzfile['X_test']
        y_train, y_test = npzfile['y_train'], npzfile['y_test']

    # a single validation set: the held-out test split
    eval_sets = [{'X': X_test, 'y': y_test}]

    for lambda_l2 in lambdas:

        # fresh callbacks per run so histories do not mix between settings
        metrics_history = MultipleMetricsHistory([AccuracyMetric(), BCEMetric()])
        time_history = TimeHistory()
        model = GradientBoosting(
            loss='bce',
            order=3,
            ntrees=ntrees,
            max_depth=100000,
            lambda_l2=lambda_l2,
            es=1000,
            callbacks=[metrics_history, time_history],
            verbose=1000,
        )
        model.fit(X_train, y_train, eval_sets=eval_sets)
        metrics[ds_name][lambda_l2] = metrics_history.history
        times[ds_name][lambda_l2] = time_history.history


In [None]:
# Pickles `callbacks` to results/<ds_name>/callbacks/<order>.json, where both path
# parts are read from `self.params`.
# NOTE(review): neither `self` nor `callbacks` is defined anywhere in the visible
# notebook, so this cell raises NameError as written. It looks like a fragment
# pasted from a class method — confirm what should be saved here (perhaps the
# `metrics` / `times` dicts filled by the training loop) and under which path.
# NOTE(review): the payload is a binary pickle although the file extension is
# .json — confirm the intended format.
with open(f'results/{self.params["ds_name"]}/callbacks/{self.params["order"]}.json', 'wb') as f:
            pickle.dump(callbacks, f)