In [7]:
%load_ext autoreload
%autoreload 

import sys
sys.path.append("../..")
import X_py_boost

import joblib
import cupy as cp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import numpy as np

init_notebook_mode(connected=True)


def get_results(model):
    """Return ``model.history`` flattened into a plain Python list.

    The history is converted to a NumPy array and reshaped to a 1-D vector of
    length ``len(model.history)``, so each history entry must contribute exactly
    one value (otherwise the reshape raises).
    """
    history = model.history
    flat = np.array(history).reshape(len(history))
    return list(flat)

In [9]:
# Datasets iterated by the training and plotting helpers. The keys are the
# dataset names (also used as the file stem of 'data/<name>.npz').
# NOTE(review): the integer values look like OpenML dataset ids — confirm; they
# are not read by any code visible in this notebook.
datasets_NC = dict([
    ('electricity', 44120),
    ('covertype', 44121),
    ('pol', 44122),
    ('house_16H', 44123),
    ('kdd_ipums_la_97-small', 44124),
    ('MagicTelescope', 44125),
    ('bank-marketing', 44126),
    ('phoneme', 44127),
    ('MiniBooNE', 44128),
    ('Higgs', 44129),
    ('eye_movements', 44130),
    ('jannis', 44131),
    ('credit', 44089),
    ('california', 44090),
    ('wine', 44091),
])

### Datasets

### Training Functions

In [10]:
def import_dataset_NC(ds_name):
    """Load the pre-split arrays of a dataset from ``data/<ds_name>.npz``.

    Parameters
    ----------
    ds_name : str
        File stem of the archive inside the ``data/`` directory (relative to
        the current working directory).

    Returns
    -------
    tuple
        ``(X_train, X_test, X_valid, y_train, y_test, y_valid)``. Note that the
        test split comes *before* the validation split.

    Raises
    ------
    FileNotFoundError
        If the archive does not exist.
    KeyError
        If one of the six expected arrays is missing from the archive.
    """
    keys = ('X_train', 'X_test', 'X_valid', 'y_train', 'y_test', 'y_valid')
    # np.load on an .npz returns a lazily-reading NpzFile that holds the file
    # open; read every array inside the `with` so the handle is released.
    with np.load('data/' + ds_name + '.npz') as npzfile:
        return tuple(npzfile[key] for key in keys)
    

In [11]:
from sklearn.model_selection import train_test_split

## Split the dataset into train / validation / test sets

def split_X_y(X, y):
    """Split ``X``/``y`` into train, validation and test parts.

    30 % of the samples are held out first (``random_state=42``); that hold-out
    is then cut in half (``random_state=42``) into a test part and a validation
    part. The shapes of the train and test features are printed.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # First cut: 70 % train, 30 % held out.
    X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.30, random_state=42)
    # Second cut: the hold-out is halved into test and validation.
    X_test, X_val, y_test, y_val = train_test_split(X_rest, y_rest, test_size=0.50, random_state=42)

    for label, part in (("X_train", X_train), ("X_test", X_test)):
        print(f"{label} shape:", part.shape)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [12]:


def train_models(X_train, X_val, X_test, y_train, y_val, y_test, loss='bce', ntrees=1500, max_depth=6, lambda_l2=0.1):
    """Fit three ``GradientBoosting`` models (order 2, 3 and 4) and time each fit.

    All three models share the same hyper-parameters and are evaluated on the
    validation split during training.

    Parameters
    ----------
    X_train, y_train
        Training data passed to ``fit``.
    X_val, y_val
        Validation data passed to ``fit`` as the single entry of ``eval_sets``.
    X_test, y_test
        Accepted for signature compatibility; not used by this function.
    loss : str
        Only ``'bce'`` is implemented.
    ntrees, max_depth, lambda_l2
        Forwarded to every ``GradientBoosting`` constructor.

    Returns
    -------
    tuple
        ``(model2, model3, model4, fit_times)`` where ``fit_times`` is a 3-tuple
        of fit durations in seconds, in the same order as the models.

    Raises
    ------
    ValueError
        If ``loss`` is not ``'bce'``. (Previously this fell through to the
        ``return`` with the models unbound and raised ``UnboundLocalError``.)
    """
    # Validate before doing any import or work so bad input fails fast.
    if loss != 'bce':
        raise ValueError(f"Unsupported loss {loss!r}: only 'bce' is implemented")

    import time
    from X_py_boost import GradientBoosting

    eval_sets = [{'X': X_val, 'y': y_val}]

    # Build all models first, then fit them one after another (same order as before).
    # NOTE(review): es=500 / verbose=1000 are presumably the early-stopping
    # patience and the logging period — confirm against X_py_boost.
    models = [
        GradientBoosting(loss='bce', order=order, ntrees=ntrees, es=500, max_depth=max_depth, lambda_l2=lambda_l2, verbose=1000)
        for order in (2, 3, 4)
    ]

    fit_times = []
    for model in models:
        # perf_counter is a monotonic clock meant for measuring intervals.
        started = time.perf_counter()
        model.fit(X_train, y_train, eval_sets=eval_sets)
        fit_times.append(time.perf_counter() - started)

    model2, model3, model4 = models
    return model2, model3, model4, tuple(fit_times)

In [13]:
def train_model_NC(ds_name, params=None):
    """Load the pre-split dataset ``ds_name`` and train the three boosting models on it.

    Parameters
    ----------
    ds_name : str
        Dataset name forwarded to ``import_dataset_NC``.
    params : dict, optional
        May contain ``'loss'`` (or ``'loss_function'``), ``'ntrees'`` and
        ``'max_depth'``. Missing keys, or ``params=None``, fall back to
        ``'bce'``, ``1500`` and ``6`` — the defaults of ``train_models``.

    Returns
    -------
    tuple
        ``(model2, model3, model4, time)`` exactly as returned by ``train_models``.
    """
    params = params or {}
    loss = params.get('loss', params.get('loss_function', 'bce'))
    ntrees = params.get('ntrees', 1500)
    max_depth = params.get('max_depth', 6)

    # import_dataset_NC returns six pre-split arrays with *test before validation*;
    # train_models expects validation before test, so reorder when forwarding.
    X_train, X_test, X_val, y_train, y_test, y_val = import_dataset_NC(ds_name=ds_name)
    return train_models(X_train, X_val, X_test, y_train, y_val, y_test, loss=loss, ntrees=ntrees, max_depth=max_depth)




In [14]:
import plotly.graph_objects as go

# Default settings; plot_models reads 'ds_name' and 'max_depth' from this dict
# for the figure title.
params = dict(
    ds_name='electricity',
    loss_function='bce',
    ntrees=1500,
    max_depth=8,
)

def plot_models(model2, model3, model4, params=params):
    """Plot the training histories of the 2nd-, 3rd- and 4th-order models in one figure.

    Parameters
    ----------
    model2, model3, model4
        Fitted models whose ``history`` is read through ``get_results``.
    params : dict
        Must provide ``'ds_name'`` and ``'max_depth'``; both appear in the title.

    Notes
    -----
    The title labels the y axis as accuracy; the plotted values are whatever
    ``model.history`` holds. Each trace gets its own x axis
    ``0 .. len(history) - 1``, because the histories can differ in length.
    """
    curves = (
        ('2nd order', get_results(model2)),
        ('3rd order', get_results(model3)),
        ('4th order', get_results(model4)),
    )

    # Title now names all three plotted orders (it used to say "2nd and 4th").
    fig1 = go.Figure(layout_title_text=f"2nd, 3rd and 4th order training: x=#iter, y=accuracy, ds={params['ds_name']}, loss=BCE, max_depth={params['max_depth']}")
    for label, values in curves:
        fig1.add_trace(go.Scatter(x=list(range(len(values))), y=values,
                                  mode='lines',
                                  name=label))
    fig1.show()

def plot_models_NC(models_NC):
    """Show one training-curve figure per dataset listed in ``datasets_NC``.

    Parameters
    ----------
    models_NC : dict
        Maps each dataset name to a ``(model2, model3, model4, time)`` tuple.

    Raises
    ------
    KeyError
        If a dataset from ``datasets_NC`` is missing in ``models_NC``.
    """
    for ds_name in datasets_NC:
        model2, model3, model4, _fit_times = models_NC[ds_name]
        # Pass a per-dataset copy instead of mutating the shared module-level
        # `params`, so plotting leaves no hidden state behind.
        plot_models(model2, model3, model4, {**params, 'ds_name': ds_name})

In [15]:
def get_models_NC(params=None):
    """Train the models for every dataset in ``datasets_NC``.

    Returns a dict mapping each dataset name to the result of
    ``train_model_NC(ds_name, params)``, in the iteration order of ``datasets_NC``.
    """
    return {ds_name: train_model_NC(ds_name, params) for ds_name in datasets_NC}


In [16]:
def get_speed_acceleration(model1, model2):
    """Compare how many iterations two models need to reach the same metric level.

    Both histories are turned into ``-log(1 - history)`` curves, truncated to
    their common length, and inverted by linear interpolation (curve value ->
    iteration index). The inverse curves are sampled on 10000 evenly spaced
    levels that both models reach, and the relative difference
    ``iter1 / iter2 * 100 - 100`` is plotted against those levels.

    Returns
    -------
    tuple
        ``(iter1, iter2, percent_better)`` — interpolated iteration counts of
        ``model1`` and ``model2`` and their relative difference in percent.

    Notes
    -----
    NOTE(review): assumes the history values are accuracy-like and below 1;
    a value of exactly 1 would feed 0 into the logarithm.
    NOTE(review): the trace label is hard-coded and mentions "2nd vs 4th order"
    and the covertype dataset regardless of which models are passed in.
    """
    from scipy.interpolate import interp1d
    import numpy as np

    # Error curves (1 - metric), cut to the length both models share.
    err_a = 1 - np.array(get_results(model1))
    err_b = 1 - np.array(get_results(model2))
    n_common = min(len(err_a), len(err_b))
    err_a, err_b = err_a[:n_common], err_b[:n_common]

    iterations = list(range(n_common))

    neg_log_a = -np.log(err_a)
    neg_log_b = -np.log(err_b)

    # Inverse curves: level on the -log(error) scale -> iteration index.
    level_to_iter_a = interp1d(neg_log_a, iterations)
    level_to_iter_b = interp1d(neg_log_b, iterations)

    # Restrict to the levels covered by both curves.
    lowest_shared = max(min(neg_log_a), min(neg_log_b))
    highest_shared = min(max(neg_log_a), max(neg_log_b))
    xnew = np.linspace(lowest_shared, highest_shared, 10000)

    iter1 = level_to_iter_a(xnew)
    iter2 = level_to_iter_b(xnew)
    percent_better = iter1 / iter2 * 100 - 100

    fig = go.Figure(layout_title_text="-")
    comparison = go.Scatter(
        x=xnew,
        y=percent_better,
        mode='lines',
        name="2nd vs 4th order on covertype dataset, x=log(1-accuracy), y=min{iter | metric(iter) > accuracy}",
    )
    fig.add_trace(comparison)
    fig.show()

    return iter1, iter2, percent_better

### Training - NC

In [17]:
# Training settings for this run. get_models_NC trains on every entry of
# datasets_NC, not only on params['ds_name'].
params = dict(
    ds_name='electricity',
    loss='bce',
    ntrees=1500,
    max_depth=6,
)

models = get_models_NC(params=params)

NameError: name 'import_dataset_NC' is not defined

#### Training Curves

In [None]:
# One figure per dataset in datasets_NC, built from the models trained above.
plot_models_NC(models)

NameError: name 'plot_models_NC' is not defined

In [None]:

# Collect the fit-duration tuple (element 3 of each entry in `models`) per dataset.
# Built in one expression so the cell no longer depends on a pre-existing `times`.
times = {ds_name: models[ds_name][3] for ds_name in datasets_NC}

In [None]:
# Show {dataset name: (order-2, order-3, order-4) fit durations in seconds}.
times

{'electricity': (11.214192390441895, 11.939833879470825, 5.3503577709198),
 'covertype': (19.744782209396362, 21.480321884155273, 23.28338360786438),
 'pol': (7.932242393493652, 8.13894772529602, 4.260971546173096),
 'house_16H': (5.342682361602783, 5.666813611984253, 4.230088949203491),
 'kdd_ipums_la_97-small': (6.141393184661865,
  6.374958038330078,
  5.4900829792022705),
 'MagicTelescope': (6.372896432876587, 5.932502508163452, 4.156835079193115),
 'bank-marketing': (6.492688894271851, 6.565255165100098, 4.467662572860718),
 'phoneme': (7.130889177322388, 6.748611688613892, 5.367152690887451),
 'MiniBooNE': (8.194819450378418, 8.495051860809326, 4.446772336959839),
 'Higgs': (22.485391855239868, 27.639577388763428, 12.016436338424683),
 'eye_movements': (8.360252380371094, 8.574718713760376, 5.382624626159668),
 'jannis': (7.611664533615112, 8.11400818824768, 4.284156560897827),
 'credit': (4.648128271102905, 4.726266145706177, 4.099913597106934),
 'california': (8.951210737228394