In [19]:
import numpy as np
import pandas as pd
from dt import DecisionTree

In [20]:
class MeanBaseEstimator:
    def fit(self, X, y):
        self.avg = np.mean(y)

    def predict(self, X):
        pred = np.empty(X.shape[0], dtype=np.float64)
        pred.fill(self.avg)
        return pred

class MSELoss:
    def __call__(self, y, y_pred):
        return np.mean((y - y_pred) ** 2)

    def base_estimator(self):
        return MeanBaseEstimator()

    def grad(self, y, y_pred):
        # simple derivative
        return -2 / len(y) * (y - y_pred)

In [21]:
    
class GradientBoostedDecisionTree:
    def __init__(
        self,
        n_iter,
        max_depth=None,
        learning_rate=1,
        step_size="constant",
    ):
        """
        A gradient boosted ensemble of decision trees.
        Notes
        Parameters
        ----------
        n_iter : int
            The number of iterations / weak estimators to use when fitting each
            dimension / class of `Y`.
        """
        self.weights = None
        self.learners = None
        self.out_dims = None
        self.n_iter = n_iter
        self.base_estimator = None
        self.max_depth = max_depth
        self.step_size = step_size
        self.learning_rate = learning_rate
    
    
    def fit(self, X, Y):
        """
        Fit the gradient boosted decision trees on a dataset.
        """
        # set loss function
        loss = MSELoss()
        # if Y array has only one dimension then make sure to get array with two dimensions
        # where the first one can be whatever fits and the secon one has to be 1.
        Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y
        N, M = X.shape
        # this is usually 1
        self.out_dims = Y.shape[1]
        # each iteration is one row of learners with the same length as the output y
        # so basically matrix with n_iter rows and out_dims columns which is most often 1.
        self.learners = np.empty((self.n_iter, self.out_dims), dtype=object)
        # the same is valid for the weights
        # as we have a weight for each learner
        self.weights = np.ones((self.n_iter, self.out_dims))
        # all but the first (i.e. zero positioned) row
        self.weights[1:, :] *= self.learning_rate

        # fit the base estimator
        Y_pred = np.zeros((N, self.out_dims))
        # this usually sets the first learner to the value of the base estimator
        # for us this would be the average of the Y values
        # removed k loop from original
        
        # this calls the MeanBaseEstimator() in the case of MSELoss
        # or the ClassProbEstimator() in the case of CrossEntropyLoss
        # here we consider only MSE loss
        t = loss.base_estimator()
        # in our case this takes the mean/avg of Y's column k
        t.fit(X, Y[:, 0])
        # now we predict the values by adding the vector of means
        # onto the Y_pred column vector
        Y_pred[:, 0] += t.predict(X)
        # the prediction is just the avg value
        # now we save the base estimator to the zeroth row and k-th column of the learners
        # which contains one row for each iteration
        self.learners[0, 0] = t

        # incrementally fit each learner on the negative gradient of the loss
        # wrt the previous fit (pseudo-residuals)
        for i in range(1, self.n_iter):
            # out dims is usually 1 so removed it 

            y, y_pred = Y[:, 0], Y_pred[:, 0]
            # use derivative of MSE loss to obtain negative gradient
            neg_grad = -1 * loss.grad(y, y_pred)
            
            # take decision tree discussed in previous post
            # use MSE as the surrogate loss when fitting to negative gradients
            t = DecisionTree(
                classifier=False, max_depth=self.max_depth, criterion="mse"
            )

            # fit current learner to negative gradients
            t.fit(X, neg_grad)
            # save trained learner for each iteration
            self.learners[i, 0] = t

            # compute step size and weight for the current learner
            step = 1.0
            h_pred = t.predict(X)

            # update weights and our overall prediction for Y
            self.weights[i, 0] *= step
            Y_pred[:, 0] += self.weights[i, 0] * h_pred

    def predict(self, X):
        """
        Use the trained model to classify or predict the examples in `X`.
        Parameters 
        """
        Y_pred = np.zeros((X.shape[0], self.out_dims))
        for i in range(self.n_iter):
            # removed k loop from original
            Y_pred[:, 0] += self.weights[i, 0] * self.learners[i, 0].predict(X)

        return Y_pred

In [22]:
Y = np.random.uniform(0, 100, 100)
X = np.random.uniform(0, 100, (100,4))

In [23]:
t = GradientBoostedDecisionTree(n_iter=100)
t.fit(X,Y)
t.predict(X)

array([[31.93102011],
       [68.77078514],
       [75.75057859],
       [78.93915104],
       [29.70717605],
       [67.13328175],
       [68.05603813],
       [29.81417086],
       [27.70733316],
       [40.99513131],
       [ 6.68114214],
       [80.00674184],
       [10.08887845],
       [51.41639291],
       [41.49629209],
       [85.00323326],
       [20.45265419],
       [14.15023088],
       [23.30708667],
       [22.09649838],
       [10.46878876],
       [63.17523835],
       [28.11963647],
       [ 8.94342996],
       [68.25249424],
       [83.88226478],
       [10.04116925],
       [44.33899683],
       [48.5088378 ],
       [45.00627327],
       [15.80353042],
       [18.88602579],
       [19.93964957],
       [34.74207532],
       [35.54844761],
       [52.99488511],
       [38.81251851],
       [41.52990788],
       [ 9.0703191 ],
       [66.93066188],
       [86.67453959],
       [64.29881549],
       [82.03537414],
       [ 8.56565582],
       [11.73444802],
       [72