In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import train_test_split

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``. Here the exponential is only
    ever taken of a non-positive number, so it stays within ``(0, 1]``.

    Parameters
    ----------
    x : scalar or array-like
        Raw scores.

    Returns
    -------
    NumPy scalar or ndarray
        Element-wise sigmoid; a scalar input yields a NumPy scalar, an
        array input yields an array of the same shape.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # always in (0, 1], cannot overflow
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return result[()]

In [2]:
# Synthetic one-feature problem: the label is +1 when the feature is positive
# and -1 otherwise.
# The sample is drawn from a dedicated seeded generator so that a fresh
# kernel reproduces the same data; the train/test split was already seeded.
rng = np.random.RandomState(42)
X_all = rng.randn(5000, 1)
y_all = (X_all[:, 0] > 0)*2 - 1

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.5, random_state=42)
print(X_all[:5])
print(y_all[:5])

[[-1.30419271]
 [-2.09445903]
 [ 1.23369958]
 [ 0.50924743]
 [-2.15262101]]
[-1 -1  1  1 -1]


In [3]:
# Baseline: a depth-1 tree (decision stump) fitted on the training half.
clf = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)

stump_accuracy = clf.score(X_test, y_test)
print('Accuracy for a single decision stump: {}'.format(stump_accuracy))

Accuracy for a single decision stump: 1.0


In [4]:
# Gradient boosting with many small steps: 5000 depth-3 trees, learning rate 0.01.
gbm_params = dict(n_estimators=5000, learning_rate=0.01, max_depth=3, random_state=0)
clf = GradientBoostingClassifier(**gbm_params).fit(X_train, y_train)

# Only the second predict_proba column is passed to log_loss.
y_pred = clf.predict_proba(X_test)[:, 1]
test_logloss = log_loss(y_test, y_pred)
print("Test logloss: {}".format(test_logloss))

Test logloss: 9.992007221626413e-16


In [5]:
def compute_loss(y_true, scores_pred):
    '''
        Log loss for raw (pre-sigmoid) scores.

        `scores_pred` is first passed through `sigmoid`; the result is then
        handed to `log_loss` together with `y_true`.
    '''
    probabilities = sigmoid(scores_pred)
    return log_loss(y_true, probabilities)
    

'''
    Get the cumulative sum of the *decision function* over the trees. The i-th element is the sum of trees 0...i-1.
    We cannot use staged_predict_proba, since we want to manipulate raw scores
    (not probabilities) and only at the end convert the scores to probabilities using sigmoid.
'''
# One entry per boosting stage, stacked into a single array; index 0 is taken
# on the trailing axis to leave a 2-D (stages, samples) matrix.
staged_scores = list(clf.staged_decision_function(X_test))
cum_preds = np.array(staged_scores)[:, :, 0]

final_scores = cum_preds[-1, :]
loss_report = (
    ("Logloss using all trees:           {}", final_scores),
    ("Logloss using all trees but last:  {}", cum_preds[-2, :]),
    ("Logloss using all trees but first: {}", final_scores - cum_preds[0, :]),
)
for template, scores in loss_report:
    print(template.format(compute_loss(y_test, scores)))

Logloss using all trees:           9.992007221626413e-16
Logloss using all trees but last:  9.992007221626413e-16
Logloss using all trees but first: 9.992007221626413e-16


In [6]:
# Dimensions of the staged-score matrix.
print(np.shape(cum_preds))

(5000, 2500)


In [12]:
# Scores across all stages for the first test sample labelled +1.
first_positive = np.flatnonzero(y_test == 1)[0]
print(cum_preds[:, first_positive])

[2.31680538e-02 4.29390365e-02 6.25187344e-02 ... 3.67382171e+01
 3.67382171e+01 3.67382171e+01]


In [13]:
# Class probabilities from the fitted ensemble; preview the first five rows.
y_pred = clf.predict_proba(X_test)
preview_rows = y_pred[:5]
print(preview_rows)

[[1.00000000e+00 9.62557825e-23]
 [0.00000000e+00 1.00000000e+00]
 [0.00000000e+00 1.00000000e+00]
 [1.00000000e+00 9.62557825e-23]
 [0.00000000e+00 1.00000000e+00]]


In [14]:
# Second-to-last row of the staged-score matrix.
print(cum_preds[-2])

[-50.68503318  36.7382171   36.7382171  ... -50.68503318  36.7382171
 -50.68503318]


In [15]:
# Last row of the staged-score matrix.
print(cum_preds[-1])

[-50.69503318  36.7382171   36.7382171  ... -50.69503318  36.7382171
 -50.69503318]
