In [1]:
import py_boost

In [2]:
## loading the covertype dataset

from sklearn.datasets import fetch_openml

covertype = fetch_openml(data_id=44121)

data = covertype['data']
label = covertype['target']

X = data.values.astype('float32')
y = label.values.astype('int32')

In [3]:
from sklearn.model_selection import train_test_split

X_t, X_test, y_t, y_test = train_test_split(X, y, test_size=0.90, random_state=42)

eval_sets=[{'X': X_t, 'y': y_t},]
print("X_t shape:", X_t.shape)
print("X_test shape:", X_test.shape)

X_t shape: (56660, 10)
X_test shape: (509942, 10)


In [4]:
import matplotlib.pyplot as plt

plt.scatter(X[:, 1],X[:, 7], marker="o", c=y)


ModuleNotFoundError: No module named 'matplotlib'

In [5]:
from py_boost import GradientBoosting

eval_sets=[{'X': X_t, 'y': y_t},]

model = GradientBoosting(loss='bce', metric='bce', ntrees=100, max_depth=6, verbose=100)
model.fit(X_t, y_t, eval_sets=[{'X': X_t, 'y': y_t}])

[12:58:13] Stdout logging level is INFO.
[12:58:13] GDBT train starts. Max iter 100, early stopping rounds 100
[12:58:13] Iter 0; Sample 0, BCE = 0.6768789650395793; 
[12:58:14] Iter 99; Sample 0, BCE = 0.42705920119571367; 


<py_boost.gpu.boosting.GradientBoosting at 0x7f6239848f60>

In [6]:
def get_leaf_indices(model, x, iteration=0):
    """
    Returns a dictionnary, 
    """

    leaves = model.predict_leaves(x, iterations=[iteration])
    leaves = leaves.reshape((leaves.shape[1]))

    dic = {}

    for leaf in leaves:
        dic[leaf] = (leaves == leaf)

    return dic

def get_predictions(model, x, iteration=0):
    pred = model.predict_staged(X_t, iterations=[iteration])
    return( pred.reshape((pred.shape[1])) )



In [7]:
dic = get_leaf_indices(model, X_t, iteration=7)
len(dic)

61

In [8]:
import cupy as cp
import numpy as np

def sigmoid(y_pred):
    return(1 / (1 + np.exp(-y_pred)))

def inv_sigmoid(pred):
    return( np.log(pred) - np.log(1 - pred) )

def cross_entropy(y_true, y_pred):
    
    pred = sigmoid(y_pred)

    return - np.log(y_true * pred + (1 - y_true) * (1 - pred))


In [9]:
leaves = get_leaf_indices(model, X_t, iteration = 7)

preds = get_predictions(model, X_t, iteration = 6)

y_pred = inv_sigmoid(preds)

In [10]:
it = 50

leaves = get_leaf_indices(model, X_t, iteration = it)
preds6 = get_predictions(model, X_t, iteration = it)
preds7 = get_predictions(model, X_t, iteration = it + 1)




In [11]:
leaf = 50

preds6[leaves[leaf]] - preds7[leaves[leaf]]

array([ 0.00493321,  0.00142929, -0.00089429, -0.00147241,  0.00418118,
       -0.00159413,  0.00357674,  0.00126654,  0.00243431,  0.00190893,
       -0.00107524,  0.00293494,  0.00205462,  0.00609072,  0.00289176,
       -0.00085817,  0.00131726,  0.00189492,  0.00248629,  0.00210542,
        0.00126636, -0.00097604,  0.00240682, -0.0008129 ,  0.00229812,
       -0.0009609 ,  0.0016364 ,  0.00265109, -0.0009609 ,  0.0016364 ,
        0.00165163,  0.00154889,  0.00293594,  0.00122219,  0.00126654,
        0.00273364,  0.00626813,  0.00081439,  0.00260235, -0.00097604,
        0.00223951, -0.00159353,  0.00206877, -0.00139494], dtype=float32)

In [12]:
pred = get_predictions(model, X_t, iteration = 50)[leaves[20]]
y_pred = inv_sigmoid(pred)

y_true = y_t[leaves[20]]

def l(w):
    return(cross_entropy(y_true, y_pred + w).mean())

def get_grads(y, pred):
    grad = (pred - y).mean()
    grad2 = (pred * (1 - pred)).mean()
    grad3 = (pred * (1 - pred) * (1 - 2 * pred)).mean()
    grad4 = (pred * (1 - pred) * (1 - 2 * pred) * (1 - 3 * pred)).mean()
    return(grad, grad2, grad3, grad4)

In [13]:
y_true.mean()

0.6920685416817295

In [14]:
lambda_l2 = 0.00
grad, grad2, grad3, grad4 = get_grads(y=y_true, pred=pred)
print(grad, grad2, grad3, grad4)
w2 = - grad / (grad2 + lambda_l2)
w3 = w2 * (1 + grad * grad3 / (grad2 + lambda_l2) ** 2)

print(f'weight 2nd order:{w2}')
print(f'weight 3nd order:{w3}')


-0.030256929853894034 0.20366542 -0.06204669 0.08102973
weight 2nd order:0.14856193944232352
weight 3nd order:0.155285765243486


In [15]:
it = 50

leaves = get_leaf_indices(model, X_t, iteration = it)
li = list([])

for leaf in leaves:

    pred = get_predictions(model, X_t, iteration = it)[leaves[leaf]]
    y_pred = inv_sigmoid(pred)

    y_true = y_t[leaves[leaf]]
    def l(w):
        return(cross_entropy(y_true, y_pred + w).mean())

    grad, grad2, grad3, grad4 = get_grads(y=y_true, pred=pred)
    w2 = - grad / (grad2 + lambda_l2)
    w3 = w2 * (1 + grad * grad3 / (grad2 + lambda_l2) ** 2)
    li.append(l(w2) - l(w3))

np.sum(li)

    

0.2716940934341696

In [16]:
np.sum(cross_entropy(y_true, y_pred))

0.45289177928284885

In [None]:
import plotly.io as pio
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import numpy as np

leaf=5
pred = get_predictions(model, X_t, iteration = it)[leaves[leaf]]
y_pred = inv_sigmoid(pred)

y_true = y_t[leaves[leaf]]
def l(w):
    return(cross_entropy(y_true, y_pred + w).mean())

xs = np.linspace(-1, 1, 1000)

results = [l(w) for w in xs]


fig1 = go.Figure(layout_title_text="-")
fig1.add_trace(go.Scatter(x=xs, y=results,
                        mode='lines',
                        name='2nd order'))


