In [1]:
%load_ext autoreload
%autoreload 
"""
import sys
sys.path.append("../")
import X_py_boost
"""
import cupy as cp

In [10]:
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import numpy as np

init_notebook_mode(connected=True)


def get_results(model):
    """Return ``model.history`` as a flat Python list.

    The history is converted to a NumPy array and reshaped to a single
    dimension of length ``len(model.history)``, so every history entry has
    to contribute exactly one value.
    """
    n_entries = len(model.history)
    flat_history = np.array(model.history).reshape(n_entries)
    return list(flat_history)

### Datasets

In [3]:
# Dataset name -> OpenML data id for the "NC" group; import_dataset_NC passes
# these ids to fetch_openml(data_id=...).
# NOTE(review): "NC" presumably means numerical-features classification — confirm.
datasets_NC = {
    'electricity': 44120,
    'covertype': 44121,
    'pol': 44122,
    'house_16H': 44123,
    'kdd_ipums_la_97-small': 44124,
    'MagicTelescope': 44125,
    'bank-marketing': 44126,
    'phoneme': 44127,
    'MiniBooNE': 44128,
    'Higgs': 44129,
    'eye_movements': 44130,
    'jannis': 44131,
    'credit': 44089,
    'california': 44090,
    'wine': 44091
}

# Dataset name -> integer id for the "NR" group.
# NOTE(review): presumably OpenML data ids for numerical-features regression
# tasks, by analogy with datasets_NC — no code in this part of the notebook
# reads this dict, so confirm before relying on it.
datasets_NR = {
    'cpu_act': 44132,
    'pol': 44133,
    'elevators': 44134,
    'isolet': 44135,
    'wine_quality': 44136,
    'Ailerons': 44137,
    'houses': 44138,
    'house_16H': 44139,
    'diamonds': 44140,
    'Brazilian_houses': 44141,
    'Bike_Sharing_Demand': 44142,
    'nyc-taxi-green-dec-2016': 44143,
    'house_sales': 44144,
    'sulfur': 44145,
    'medical_charges': 44146,
    'MiamiHousing2016': 44147,
    'superconduct': 44148,
    'california': 44025,
    'fifa': 44026,
    'year': 44027

}

# Dataset name -> integer id for the "CC" group.
# NOTE(review): presumably OpenML data ids for categorical-features
# classification tasks — not read by any code visible here; confirm.
datasets_CC = {
    'electricity': 44156,
    'eye_movements': 44157,
    'KDDCup09_upselling': 44158,
    'covertype': 44159,
    'rl': 44160,
    'road-safety': 44161,
    'compass': 44162,
}

# Dataset name -> integer id for the "CR" group.
# NOTE(review): presumably OpenML data ids for categorical-features
# regression tasks — not read by any code visible here; confirm.
datasets_CR = {
    'yprop_4_1': 44054,
    'analcatdata_supreme': 44055,
    'visualizing_soil': 44056,
    'black_friday': 44057,
    'diamonds': 44059,
    'Mercedes_Benz_Greener_Manufacturing': 44061,
    'Brazilian_houses': 44062,
    'Bike_Sharing_Demand': 44063,
    'OnlineNewsPopularity': 44064,
    'nyc-taxi-green-dec-2016': 44065,
    'house_sales': 44066,
    'particulate-matter-ukair-2017': 44068,
    'SGEMM_GPU_kernel_performance': 44069
}


### Training Functions

In [4]:
from sklearn.datasets import fetch_openml

## Get X, y values given the dataset's name

# Label rule per NC dataset. A string means "the positive class is the rows
# whose raw label equals this string"; None means the raw label is cast to
# int32 as-is.
_NC_POSITIVE_LABEL = {
    'electricity': 'UP',
    'pol': 'P',
    'house_16H': 'P',
    'kdd_ipums_la_97-small': 'P',
    'MagicTelescope': 'h',
    'MiniBooNE': 'True',
    'california': 'True',
    'wine': 'True',
    'bank-marketing': '2',
    'phoneme': '2',
    'covertype': None,
    'Higgs': None,
    'eye_movements': None,
    'jannis': None,
    'credit': None,
}


def import_dataset_NC(ds_name=None):
    """Download one NC dataset from OpenML and return it as ``(X, y)``.

    Parameters
    ----------
    ds_name : str
        Key of ``datasets_NC`` selecting the dataset to fetch.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the feature table cast to ``float32`` and
        ``y`` is an ``int32`` target vector. For datasets with a string rule
        in ``_NC_POSITIVE_LABEL`` the target is 1 where the raw label equals
        that string and 0 elsewhere; otherwise the raw label is cast directly.

    Raises
    ------
    KeyError
        If ``ds_name`` is not a key of ``datasets_NC``.
    ValueError
        If ``ds_name`` has an id in ``datasets_NC`` but no label rule. This
        is raised before the download instead of failing afterwards with an
        unbound ``y``.
    """
    print(f"loading {ds_name}")

    data_id = datasets_NC[ds_name]
    if ds_name not in _NC_POSITIVE_LABEL:
        raise ValueError(
            f"no label encoding rule defined for NC dataset {ds_name!r}"
        )

    dataset = fetch_openml(data_id=data_id)
    data = dataset['data']
    label = dataset['target']
    X = data.values.astype('float32')

    positive_label = _NC_POSITIVE_LABEL[ds_name]
    if positive_label is None:
        y = label.values.astype('int32')
    else:
        y = (label == positive_label).values.astype('int32')

    return (X, y)

In [5]:
from sklearn.model_selection import train_test_split

## Split the dataset into train / validation / test parts

def split_X_y(X, y):
    """Split ``X`` and ``y`` into train, validation and test parts.

    30% of the rows are first set aside as a hold-out pool, which is then cut
    in half: the first half becomes the test part and the second half the
    validation part. Both splits use ``random_state=42``. The shapes of the
    train and test feature parts are printed.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.30, random_state=42
    )
    X_test, X_val, y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=0.50, random_state=42
    )

    for part_name, part in (("X_train", X_train), ("X_test", X_test)):
        print(f"{part_name} shape:", part.shape)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [6]:
from X_py_boost import GradientBoosting

def train_models(X_train, X_val, X_test, y_train, y_val, y_test, loss='bce', ntrees=1500, max_depth=8):
    """Train 2nd-, 3rd- and 4th-order boosting models on the same data.

    Every model is fitted on ``(X_train, y_train)`` with ``(X_val, y_val)``
    as its only evaluation set. ``X_test`` and ``y_test`` are accepted for
    signature compatibility but are not used.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_val, y_val : evaluation-set features and targets passed to ``fit``.
    X_test, y_test : unused.
    loss : str
        Loss name; only ``'bce'`` is implemented.
    ntrees : int
        Passed to ``GradientBoosting(ntrees=...)``.
    max_depth : int
        Passed to ``GradientBoosting(max_depth=...)``.

    Returns
    -------
    tuple
        A 1-tuple whose single element is ``(model2, model3, model4)``.

    Raises
    ------
    ValueError
        If ``loss`` is not ``'bce'``. Previously this left the models
        undefined and failed at the return statement.
    """
    # Reject unsupported losses before any (expensive) training starts.
    if loss != 'bce':
        raise ValueError(f"unsupported loss {loss!r}: only 'bce' is implemented")

    eval_sets = [{'X': X_val, 'y': y_val}]

    # The return value always referenced a 4th-order model, but it was never
    # created; build all three orders with identical hyper-parameters.
    models = []
    for order in (2, 3, 4):
        model = GradientBoosting(loss='bce', order=order, ntrees=ntrees, es=500, max_depth=max_depth, lambda_l2=0.01, verbose=1000)
        model.fit(X_train, y_train, eval_sets=eval_sets)
        models.append(model)

    return (tuple(models),)

In [7]:
def train_model_NC(ds_name, params=None):
    """Load one NC dataset, split it and train the boosting models on it.

    Parameters
    ----------
    ds_name : str
        Dataset name forwarded to ``import_dataset_NC``.
    params : dict or None
        Optional settings. Recognised keys: ``'loss'`` (or
        ``'loss_function'``, the spelling used by the notebook's ``params``
        dict), ``'ntrees'`` and ``'max_depth'``. Missing keys fall back to
        ``'bce'``, ``1500`` and ``8``, the defaults of ``train_models``.

    Returns
    -------
    tuple
        ``(model2, model4)``: the first and third models of the group
        returned by ``train_models``.
    """
    settings = params or {}
    loss = settings.get('loss', settings.get('loss_function', 'bce'))
    ntrees = settings.get('ntrees', 1500)
    max_depth = settings.get('max_depth', 8)

    X, y = import_dataset_NC(ds_name=ds_name)
    # split_X_y returns six parts, in exactly the order train_models expects.
    X_train, X_val, X_test, y_train, y_val, y_test = split_X_y(X, y)

    # train_models returns a 1-tuple wrapping (model2, model3, model4).
    trained = train_models(
        X_train, X_val, X_test, y_train, y_val, y_test,
        loss=loss, ntrees=ntrees, max_depth=max_depth,
    )[0]
    model2, model4 = trained[0], trained[2]
    return (model2, model4)




In [8]:
import plotly.graph_objects as go

# Default run settings. plot_models reads 'ds_name', 'loss_function' and
# 'max_depth' from this dict to build the figure title, and uses this dict as
# the default value of its `params` argument.
# NOTE(review): train_model_NC looks the loss up under the key 'loss' —
# confirm the key naming is consistent across the notebook.
params = {
    'ds_name': 'electricity',
    'loss_function': 'bce',
    'ntrees': 1500,
    'max_depth': 8
}

def plot_models(model2, model4, params=params):
    """Plot the recorded histories of two models as line traces in one figure.

    Parameters
    ----------
    model2, model4 : models whose ``history`` is read through ``get_results``;
        they are drawn as the '2nd order' and '4th order' traces.
    params : dict
        Must provide 'ds_name', 'loss_function' and 'max_depth', which are
        interpolated into the figure title.
    """
    curve_2nd = get_results(model2)
    curve_4th = get_results(model4)

    # One shared x axis, long enough for the longer of the two curves.
    n_points = max(len(curve_4th), len(curve_2nd))
    xs = list(range(n_points))

    title = f"2nd and 4th order training: x=#iter, y=accuracy, ds={params['ds_name']}, loss={params['loss_function']}, max_depth={params['max_depth']}"
    fig1 = go.Figure(layout_title_text=title)
    for curve, label in ((curve_2nd, '2nd order'), (curve_4th, '4th order')):
        fig1.add_trace(go.Scatter(x=xs, y=curve, mode='lines', name=label))
    fig1.show()

def plot_models_NC(models_NC):
    """Draw one comparison figure per dataset listed in ``datasets_NC``.

    Parameters
    ----------
    models_NC : dict
        Maps every dataset name in ``datasets_NC`` to a ``(model2, model4)``
        pair.

    Raises
    ------
    KeyError
        If ``models_NC`` has no entry for one of the ``datasets_NC`` names.
    """
    for ds_name in datasets_NC:
        model2, model4 = models_NC[ds_name]
        # Use a per-dataset copy instead of writing into the module-level
        # `params` dict, which would leave the last dataset name behind as
        # the shared default for every later plot.
        plot_params = {**params, 'ds_name': ds_name}
        plot_models(model2, model4, plot_params)

In [9]:
def get_models_NC(params=None):
    """Train models for every dataset listed in ``datasets_NC``.

    ``params`` is forwarded unchanged to ``train_model_NC``.

    Returns
    -------
    dict
        Dataset name -> value returned by ``train_model_NC`` for that dataset.
    """
    return {
        ds_name: train_model_NC(ds_name, params)
        for ds_name in datasets_NC
    }


In [10]:
def get_speed_acceleration(model1, model2):
    """Compare how many iterations two models need to reach the same level.

    Each history ``h`` is turned into ``-log(1 - h)``. For 10000 evenly spaced
    levels inside the range covered by both transformed curves, the iteration
    index at which each model sits at that level is obtained by linear
    interpolation. The relative difference ``iter1 / iter2 * 100 - 100`` is
    plotted against the level and returned.

    Returns
    -------
    tuple
        ``(iter1, iter2, percent_better)`` as arrays over the common levels.
    """
    from scipy.interpolate import interp1d
    import numpy as np

    err1 = 1 - np.array(get_results(model1))
    err2 = 1 - np.array(get_results(model2))

    # Only compare the iterations that both histories actually contain.
    n_common = min(len(err1), len(err2))
    err1, err2 = err1[:n_common], err2[:n_common]
    iterations = list(range(n_common))

    level1 = -np.log(err1)
    level2 = -np.log(err2)

    # Inverse lookups: transformed level -> iteration index.
    iteration_at_level1 = interp1d(level1, iterations)
    iteration_at_level2 = interp1d(level2, iterations)

    # Restrict the grid to the levels covered by both curves.
    lowest_common = max(min(level1), min(level2))
    highest_common = min(max(level1), max(level2))
    xnew = np.linspace(lowest_common, highest_common, 10000)

    iter1 = iteration_at_level1(xnew)
    iter2 = iteration_at_level2(xnew)

    percent_better = iter1/iter2 * 100 - 100

    fig = go.Figure(layout_title_text="-")
    trace = go.Scatter(
        x=xnew,
        y=percent_better,
        mode='lines',
        name="2nd vs 4th order on covertype dataset, x=log(1-accuracy), y=min{iter | metric(iter) > accuracy}",
    )
    fig.add_trace(trace)
    fig.show()

    return (iter1, iter2, percent_better)

### Training - NC

#### Training Curves

In [6]:
# Load the covertype dataset and split it into train / validation / test parts.
X, y = import_dataset_NC('covertype')
X_train, X_val, X_test, y_train, y_val, y_test = split_X_y(X, y)
# The evaluation set handed to fit() below is the training data itself.
# NOTE(review): X_val / y_val are produced but not used here — presumably
# intentional for plotting training curves; confirm.
eval_sets=[{'X': X_train, 'y': y_train},]

loading covertype
X_train shape: (396621, 10)
X_test shape: (84990, 10)


In [7]:
from py_boost import GradientBoosting

# Baseline model; no `order` argument is passed.
# NOTE(review): GradientBoosting in this cell is imported from `py_boost`,
# not from `X_py_boost` as in the training-functions cell — confirm which
# package is intended.
model2 = GradientBoosting(loss='bce', ntrees=1000, es=500, max_depth=6, lambda_l2=0.01, verbose=1000)
# The triple-quoted strings below hold disabled code for the 3rd- and
# 4th-order models; they are plain string expressions and create no models.
"""model3 = GradientBoosting(loss='bce', order=3, ntrees=1000, es=500, max_depth=6, lambda_l2=0.01, verbose=1000)
model4 = GradientBoosting(loss='bce', order=4, ntrees=1000, es=500, max_depth=6, lambda_l2=0.01, verbose=1000)
"""
model2.fit(X_train, y_train, eval_sets=eval_sets)
# This string is the last expression of the cell, so the notebook echoes it
# as the cell's output value.
"""model3.fit(X_train, y_train, eval_sets=eval_sets)
model4.fit(X_train, y_train, eval_sets=eval_sets)"""

[08:40:02] Stdout logging level is INFO.
[08:40:02] GDBT train starts. Max iter 1000, early stopping rounds 500
[08:40:02] Iter 0; Sample 0, BCE = 0.6769835150414101; 
[08:40:14] Iter 999; Sample 0, BCE = 0.2803655520293805; 


'model3.fit(X_train, y_train, eval_sets=eval_sets)\nmodel4.fit(X_train, y_train, eval_sets=eval_sets)'

In [9]:
# Plot the recorded history of model2 against the iteration index.
y_G2 = get_results(model2)
# Disabled (string-quoted) code: histories of the 3rd/4th-order models.
"""y_G3 = get_results(model3)
y_G4 = get_results(model4)"""
xs = [i for i in range(len(y_G2))]

fig1 = go.Figure(layout_title_text=f"-")
fig1.add_trace(go.Scatter(x=xs, y=(y_G2),
                    mode='lines',
                    name='2nd order'))
# Disabled (string-quoted) code: traces for the 3rd/4th-order models.
"""fig1.add_trace(go.Scatter(x=xs, y=(y_G3),
                    mode='lines',
                    name='3rd order'))
fig1.add_trace(go.Scatter(x=xs, y=(y_G4),
                    mode='lines',
                    name='4th order'))"""
fig1.show()

In [35]:
from X_py_boost.gpu.losses import BCEMetric

# NOTE(review): calling the metric with no arguments raises TypeError — per
# the recorded traceback it requires `y_true` and `y_pred`. Pass both, or
# remove this exploratory cell so the notebook runs top to bottom.
loss = BCEMetric()
print(loss())

TypeError: __call__() missing 2 required positional arguments: 'y_true' and 'y_pred'

In [None]:
from X_py_boost.gpu.losses import BCEMetric

# NOTE(review): this cell repeats the previous one and fails the same way:
# the metric call requires `y_true` and `y_pred` (see the recorded TypeError).
loss = BCEMetric()
print(loss())

TypeError: __call__() missing 2 required positional arguments: 'y_true' and 'y_pred'