In [1]:
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import numpy as np

init_notebook_mode(connected=True)


def get_results(model):
    """Return ``model.history`` flattened into a plain Python list.

    The history is converted to a NumPy array and reshaped to one dimension
    with exactly ``len(model.history)`` elements, so it must hold one scalar
    per entry (e.g. ``[[a], [b], ...]`` or ``[a, b, ...]``); any other layout
    makes the reshape raise ``ValueError``.
    """
    history = model.history
    n_entries = len(history)
    flat_scores = np.array(history).reshape(n_entries)
    return list(flat_scores)

In [2]:
## Higgs dataset

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Column names: the class label followed by the 28 input features.
higgs_columns = ['label', 'lepton pT', 'lepton eta', 'lepton phi', 'missing energy magnitude', 'missing energy phi', 'jet 1 pt', 'jet 1 eta', 'jet 1 phi', 'jet 1 b-tag', 'jet 2 pt', 'jet 2 eta', 'jet 2 phi', 'jet 2 b-tag', 'jet 3 pt', 'jet 3 eta', 'jet 3 phi', 'jet 3 b-tag', 'jet 4 pt', 'jet 4 eta', 'jet 4 phi', 'jet 4 b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']

# header=None: the file is assumed to have no header row (as in the public UCI
# HIGGS release — TODO confirm for this copy). With the default header=0 the
# first sample would be consumed as column names and then silently discarded
# when the columns are renamed.
df = pd.read_csv('HIGGS.csv', header=None, names=higgs_columns, nrows=300000)
# Labels are stored as floats in the file; cast them to integers in one vectorized step.
df["label"] = df["label"].astype("int64")

df_train = df

# Features are every column after the label; the label is the target.
X = df_train.iloc[:, 1:].to_numpy()
y = df_train['label'].to_numpy()

# Pre-processing

# NOTE(review): the scaler is fit on the full dataset before splitting, so the
# validation/test rows contribute to the scaling statistics. Kept as-is so that
# X_std stays available to other cells; consider fitting on X_train only.
sc = StandardScaler()
X_std = sc.fit_transform(X)

# 80/20 train+val / test split, then 80/20 train / val split of the remainder.
X_t, X_test, y_t, y_test = train_test_split(X_std, y, test_size=0.20, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_t, y_t, test_size=0.20, random_state=42)

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

(192000, 28) (48000, 28) (60000, 28)
(192000,) (48000,) (60000,)


In [4]:
%load_ext autoreload
%autoreload 

import sys
sys.path.append("../")
import X_py_boost

import cupy as cp

In [8]:
from X_py_boost import GradientBoosting
import time

# Benchmark: fit a boosted model with the 'bce4' loss and report the mean
# elapsed time per recorded iteration.
# NOTE(review): the original comment here read "Order 2", apparently copied from
# the 'bce2' cell; 'bce4' is presumably the order-4 variant — confirm against
# X_py_boost. The names model2/time2 are kept (and overwritten by the sibling
# benchmark cells) so that any other cell referring to them still works.
m_d = 8  # maximum tree depth passed to GradientBoosting
# The training set itself is used as the evaluation set, so the BCE values
# logged during fitting are training losses.
eval_sets = [{'X': X_train, 'y': y_train},]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

model2 = GradientBoosting(loss='bce4', ntrees=10000, max_depth=m_d, lambda_l2=0.001, verbose=1000)
model2.fit(X_train, y_train, eval_sets=eval_sets)

end = time.perf_counter()
time2 = end - start
# Assumes model2.history holds one entry per boosting iteration actually run
# (fewer than ntrees if early stopping triggers) — TODO confirm.
time_per_iter = time2 / len(get_results(model2))

print(f"time per iteration: {time_per_iter}")

[13:26:58] Stdout logging level is INFO.
[13:26:58] GDBT train starts. Max iter 10000, early stopping rounds 100
[13:26:58] Iter 0; Sample 0, BCE = 0.6808722394887633; 
[13:27:20] Iter 1000; Sample 0, BCE = 0.2979169106630678; 
[13:27:42] Iter 2000; Sample 0, BCE = 0.1775872988357166; 
[13:28:16] Iter 3000; Sample 0, BCE = 0.1022526836127296; 
[13:28:37] Iter 4000; Sample 0, BCE = 0.0579561273764996; 
[13:28:56] Iter 5000; Sample 0, BCE = 0.03175186578447297; 
[13:29:16] Iter 6000; Sample 0, BCE = 0.01748727604656775; 
[13:29:45] Iter 7000; Sample 0, BCE = 0.009626502441611881; 
[13:30:04] Iter 8000; Sample 0, BCE = 0.005275171896412787; 
[13:30:24] Iter 9000; Sample 0, BCE = 0.0028938717238380777; 
[13:30:43] Iter 9999; Sample 0, BCE = 0.0015892086136149082; 
time per iteration: 0.022440025353431703


In [9]:
from X_py_boost import GradientBoosting
import time

# Benchmark: fit a boosted model with the 'bce2' loss (labelled "Order 2" by
# the author) and report the mean elapsed time per recorded iteration.
m_d = 8  # maximum tree depth passed to GradientBoosting
# The training set itself is used as the evaluation set, so the BCE values
# logged during fitting are training losses.
eval_sets = [{'X': X_train, 'y': y_train},]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

model2 = GradientBoosting(loss='bce2', ntrees=10000, max_depth=m_d, lambda_l2=0.001, verbose=1000)
model2.fit(X_train, y_train, eval_sets=eval_sets)

end = time.perf_counter()
time2 = end - start
# Assumes model2.history holds one entry per boosting iteration actually run
# (fewer than ntrees if early stopping triggers) — TODO confirm.
time_per_iter = time2 / len(get_results(model2))

print(f"time per iteration: {time_per_iter}")

[13:31:00] Stdout logging level is INFO.
[13:31:00] GDBT train starts. Max iter 10000, early stopping rounds 100
[13:31:00] Iter 0; Sample 0, BCE = 0.6807538053307209; 
[13:31:21] Iter 1000; Sample 0, BCE = 0.30178592673476917; 
[13:31:42] Iter 2000; Sample 0, BCE = 0.18780567407077722; 
[13:32:03] Iter 3000; Sample 0, BCE = 0.11453959955884514; 
[13:32:23] Iter 4000; Sample 0, BCE = 0.06943461362642313; 
[13:32:42] Iter 5000; Sample 0, BCE = 0.041780478410820034; 
[13:33:02] Iter 6000; Sample 0, BCE = 0.025383991734773275; 
[13:33:21] Iter 7000; Sample 0, BCE = 0.015454139176161355; 
[13:33:41] Iter 8000; Sample 0, BCE = 0.009437699747234136; 
[13:34:00] Iter 9000; Sample 0, BCE = 0.005750510658980616; 
[13:34:20] Iter 9999; Sample 0, BCE = 0.003545961907847208; 
time per iteration: 0.020005656361579895


In [10]:
from X_py_boost import GradientBoosting
import time

# Benchmark: fit a boosted model with the 'bce3' loss and report the mean
# elapsed time per recorded iteration.
# NOTE(review): the original comment here read "Order 2", apparently copied from
# the 'bce2' cell; 'bce3' is presumably the order-3 variant — confirm against
# X_py_boost. The names model2/time2 are kept (and overwrite the results of the
# earlier benchmark cells) so that any other cell referring to them still works.
m_d = 8  # maximum tree depth passed to GradientBoosting
# The training set itself is used as the evaluation set, so the BCE values
# logged during fitting are training losses.
eval_sets = [{'X': X_train, 'y': y_train},]

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

model2 = GradientBoosting(loss='bce3', ntrees=10000, max_depth=m_d, lambda_l2=0.001, verbose=1000)
model2.fit(X_train, y_train, eval_sets=eval_sets)

end = time.perf_counter()
time2 = end - start
# Assumes model2.history holds one entry per boosting iteration actually run
# (fewer than ntrees if early stopping triggers) — TODO confirm.
time_per_iter = time2 / len(get_results(model2))

print(f"time per iteration: {time_per_iter}")

[13:34:20] Stdout logging level is INFO.
[13:34:20] GDBT train starts. Max iter 10000, early stopping rounds 100
[13:34:20] Iter 0; Sample 0, BCE = 0.6807689830584379; 
[13:34:24] Early stopping at iter 179, best iter 79, best_score 0.4984669871993616
time per iteration: 0.021450765972030897


In [None]:
£