In [7]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline 
%config InlineBackend.figure_format = 'retina'
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import *
from sklearn.tree import export_graphviz

In [8]:
class SimpleGradientBoostingRegressor(BaseEstimator):
    """Small gradient boosting regressor built on ``DecisionTreeRegressor``.

    Every boosting round fits a regression tree to the antigradient
    (pseudo-residuals) of the selected loss, evaluated at the current
    prediction, and adds ``learning_rate`` times the tree's output to that
    prediction.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds, i.e. number of trees fitted.
    learning_rate : float, default=0.1
        Multiplier applied to each tree's prediction.
    loss : {'mse', 'mae', 'rmsle', 'huber'}, default='mse'
        Selects which ``*_antigradient`` method produces the pseudo-residuals.
    ath_quantile : float or None, default=None
        Quantile of the absolute residuals used as the Huber threshold.
        Must be set when ``loss='huber'``.
    constant : {'zero', 'mean', 'median'}, default='zero'
        Initial constant prediction: 0, ``np.mean(y)`` or ``np.median(y)``.
    max_depth : int, default=3
        ``max_depth`` passed to every tree.
    subsample : float, default=1.0
        Fraction of rows (in ``(0, 1]``) drawn without replacement to fit
        each tree.
    random_state : int, default=17
        ``random_state`` passed to every tree.
    debug : bool, default=True
        When true, per-round diagnostics are recorded (see Attributes).

    Attributes
    ----------
    trees : list of DecisionTreeRegressor
        Fitted trees, one per boosting round.
    obj_by_iter : list of float
        Training MSE after each round (MSE is tracked whatever ``loss`` is).
    init_pred_ : float
        Constant baseline learned in ``fit`` and reused by ``predict``.
    const_pred, pseudoresiduals, idx_sampled, tree_pred
        Only with ``debug=True``: the initial prediction vector and, per
        round, the pseudo-residuals, the sampled row indices and the tree's
        predictions on the full training set.
    """

    def mse_antigradient(self, y_hat, y_true):
        """Return ``2 * (y_true - y_hat)``, the negative gradient of squared error."""
        return 2 * (y_true - y_hat)

    def mae_antigradient(self, y_hat, y_true):
        """Return ``sign(y_true - y_hat)``, the negative gradient of absolute error."""
        return np.sign(y_true - y_hat)

    def rmsle_antigradient(self, y_hat, y_true):
        """Return ``log1p(y_true) - log1p(y_hat)`` with ``y_hat`` clipped from below.

        The argument order is ``(y_hat, y_true)`` like the other antigradients,
        because ``fit`` calls all of them positionally as ``(prediction, y)``.
        """
        # Clip the prediction to machine epsilon so log1p gets a positive value.
        eps = np.finfo(np.float64).eps
        return np.log1p(y_true) - np.log1p(np.clip(y_hat, eps, None))

    def huber_antigradient(self, y_hat, y_true):
        """Return the residual, capped in magnitude at a data-driven threshold.

        The threshold is the ``self.ath_quantile`` quantile of the absolute
        residuals; residuals above it are replaced by ``threshold * sign``.
        """
        residual = y_true - y_hat
        abs_residual = np.abs(residual)
        delta_huber = np.quantile(abs_residual, self.ath_quantile)
        return np.where(abs_residual <= delta_huber,
                        residual,
                        delta_huber * np.sign(residual))

    def __init__(self, n_estimators=100, learning_rate=.1, loss='mse',
                        ath_quantile=None, constant='zero', max_depth=3,
                        subsample=1.0, random_state=17, debug=True):

        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.loss = loss
        self.ath_quantile = ath_quantile
        self.constant = constant
        self.max_depth = max_depth
        self.subsample = subsample
        self.random_state = random_state
        self.debug = debug

        # Kept here so the attributes exist right after construction.
        self._reset_fit_state()

    def _reset_fit_state(self):
        """Clear everything a previous ``fit`` accumulated."""
        self.trees = []
        self.obj_by_iter = []

        if self.debug:
            self.pseudoresiduals = []
            self.idx_sampled = []
            self.tree_pred = []

    def fit(self, X, y):
        """Fit ``n_estimators`` trees to successive pseudo-residuals.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, rows are samples
        y : array-like of targets, one per row of ``X``

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``constant`` or ``loss`` is not a supported name, if
            ``subsample`` is outside ``(0, 1]``, or if ``loss='huber'`` is
            used without ``ath_quantile``.
        """
        antigradients = {
            'mse': self.mse_antigradient,
            'mae': self.mae_antigradient,
            'rmsle': self.rmsle_antigradient,
            'huber': self.huber_antigradient,
        }
        if self.loss not in antigradients:
            raise ValueError(
                "loss must be one of %s, got %r" % (sorted(antigradients), self.loss))
        if self.loss == 'huber' and self.ath_quantile is None:
            raise ValueError("ath_quantile must be set when loss='huber'")
        if not 0 < self.subsample <= 1:
            raise ValueError("subsample must be in (0, 1], got %r" % (self.subsample,))
        antigradient = antigradients[self.loss]

        y = np.asarray(y, dtype=float)
        n_samples = X.shape[0]

        if self.constant == 'zero':
            init_pred = 0.0
        elif self.constant == 'mean':
            init_pred = float(np.mean(y))
        elif self.constant == 'median':
            init_pred = float(np.median(y))
        else:
            raise ValueError(
                "constant must be 'zero', 'mean' or 'median', got %r" % (self.constant,))

        # A refit must start from scratch instead of appending to old trees.
        self._reset_fit_state()
        # Remembered so predict() does not need access to the training target.
        self.init_pred_ = init_pred

        prediction = np.full(n_samples, init_pred, dtype=float)
        if self.debug:
            self.const_pred = prediction.copy()

        # At least one row, so a very small subsample cannot yield an empty fit.
        n_sampled = max(1, int(self.subsample * n_samples))

        for iter_num in range(self.n_estimators):
            pseudoresid = antigradient(prediction, y)

            if self.debug:
                self.pseudoresiduals.append(pseudoresid)

            # Local generator seeded by the round number: draws the same rows
            # as seeding the global RNG would, without clobbering global state.
            rng = np.random.RandomState(iter_num)
            idx = rng.choice(n_samples, n_sampled, replace=False)
            X_sampled = np.take(X, idx, axis=0)
            pseudoresid_sampled = np.take(pseudoresid, idx, axis=0)

            if self.debug:
                self.idx_sampled.append(idx)

            tree = DecisionTreeRegressor(max_depth=self.max_depth,
                                         random_state=self.random_state)
            # Train on the subsample; the update below still covers every row.
            tree.fit(X_sampled, pseudoresid_sampled)

            pred = tree.predict(X)

            if self.debug:
                self.tree_pred.append(pred)

            self.trees.append(tree)

            prediction = prediction + self.learning_rate * pred

            # The tracked objective is always MSE, independent of `loss`.
            obj = mean_squared_error(y, prediction)
            self.obj_by_iter.append(obj)

        return self

    def predict(self, X):
        """Return the baseline constant plus the scaled sum of all fitted trees.

        Parameters
        ----------
        X : array-like with a ``shape`` attribute, rows are samples

        Returns
        -------
        numpy.ndarray of shape ``(X.shape[0],)``

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        if not hasattr(self, 'init_pred_'):
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This SimpleGradientBoostingRegressor instance is not fitted yet; "
                "call fit before predict.")

        pred = np.full(X.shape[0], self.init_pred_, dtype=float)

        # Iterate the trees actually fitted rather than range(n_estimators),
        # which may have been changed after fitting.
        for tree in self.trees:
            pred += self.learning_rate * tree.predict(X)

        return pred

In [9]:
# Toy dataset: one feature column ('income') and the regression target.
data = pd.DataFrame(
    dict(
        income=[1000, 30000, 20000, 95000, 10000, 5000, 1200, 0, 0, 35000],
        target=[0, 0, 0.15, 0.05, 0, 1.5, 1.2, 0, 1, 0.5],
    )
)

In [10]:
# Show the whole toy frame; it only has ten rows.
data

Unnamed: 0,income,target
0,1000,0.0
1,30000,0.0
2,20000,0.15
3,95000,0.05
4,10000,0.0
5,5000,1.5
6,1200,1.2
7,0,0.0
8,0,1.0
9,35000,0.5


In [11]:
# Split the frame into target vector and feature matrix.
# NOTE: `pop` removes 'target' from `data` in place, so this cell cannot be
# re-run without first re-creating `data`.
target_column = data.pop('target')
y = target_column.values
# After the pop only the feature columns remain in `data`.
X = data.values
feat_labels = data.columns

In [12]:
# Inspect the feature matrix: a single 'income' column, one row per sample.
X

array([[ 1000],
       [30000],
       [20000],
       [95000],
       [10000],
       [ 5000],
       [ 1200],
       [    0],
       [    0],
       [35000]])

In [13]:
# Two depth-1 trees (stumps) on top of the mean baseline, trained with MSE.
boost = SimpleGradientBoostingRegressor(
    loss='mse',
    constant='mean',
    n_estimators=2,
    learning_rate=.8,
    max_depth=1,
    random_state=42,
)

In [14]:
# Fit on the toy data; fit() returns the estimator, whose repr is the cell output.
boost.fit(X, y)

SimpleGradientBoostingRegressor(constant='mean', learning_rate=0.8, max_depth=1,
                                n_estimators=2, random_state=42)

In [15]:
# In-sample predictions: the same X the model was fitted on.
pred = boost.predict(X)
pred

array([-0.01866667,  0.36228571,  0.36228571,  0.36228571,  0.36228571,
        1.32228571,  1.32228571, -0.01866667, -0.01866667,  0.36228571])

In [16]:
# Tree from the first boosting round, i.e. the one fitted to the pseudo-residuals
# of the initial constant prediction.
tree_0 = boost.trees[0]

In [17]:
# Display the repr of the first fitted tree.
tree_0

DecisionTreeRegressor(max_depth=1, random_state=42)