Building an AdaBoost model from scratch for a toy dataset.

The name AdaBoost stands for Adaptive Boosting. It refers to a particular boosting algorithm in which we fit a sequence of decision stumps (decision trees with a single split node and two leaves) and weight each stump's contribution to the final prediction.

Make a toy dataset

In [None]:
from sklearn.datasets import make_gaussian_quantiles
from sklearn.model_selection import train_test_split
import numpy as np

def make_toy_dataset(n: int = 100, random_seed: int = None):
    """Generate a toy two-class dataset for evaluating AdaBoost classifiers.

    Args:
        n: Total number of samples to generate.
        random_seed: Seed passed to ``np.random.seed``. ``None`` (the
            default) leaves NumPy's global generator untouched; any
            integer, including ``0``, seeds it.

    Returns:
        A tuple ``(X, y)``: ``X`` is the feature matrix produced by
        ``make_gaussian_quantiles`` with 2 features, and ``y`` is the
        label vector it returns for 2 classes, rescaled via ``y * 2 - 1``
        (so labels 0/1 become -1/+1).
    """
    # Compare against None explicitly: a plain truthiness test would
    # silently skip seeding when the caller asks for seed 0.
    if random_seed is not None:
        np.random.seed(random_seed)

    X, y = make_gaussian_quantiles(n_samples=n, n_features=2, n_classes=2)

    # Recode the labels so the rest of the notebook can work with +/-1.
    return X, y * 2 - 1

# Build the dataset used by the rest of the notebook: 10 samples, with the
# NumPy seed fixed to 10 so reruns produce the same data.
X, y = make_toy_dataset(n=10, random_seed=10)


In [None]:
class AdaBoost:
    """State holder for an AdaBoost ensemble built from decision stumps.

    Every attribute starts as ``None``. The ``fit`` and ``predict``
    behaviour is defined as module-level functions elsewhere in this file
    and attached to the class afterwards.
    """

    def __init__(self):
        # Fitted weak learners, one per boosting round.
        self.stumps = None
        # Voting weight of each weak learner.
        self.stump_weights = None
        # Weighted training error recorded for each round.
        self.errors = None
        # Sample-weight distribution used in each round.
        self.sample_weights = None

    def _check_X_y(self, X, y):
        """Validate assumptions about the format of the input data.

        Args:
            X: Feature data; returned unchanged.
            y: Iterable of labels; its set of distinct values must be
                exactly ``{-1, 1}``.

        Returns:
            The pair ``(X, y)``, unmodified.

        Raises:
            AssertionError: If the distinct values of ``y`` are not
                exactly ``{-1, 1}``.
        """
        # Raise explicitly instead of using an `assert` statement: asserts
        # are removed when Python runs with -O, which would disable this
        # validation. The exception type and message are unchanged.
        if set(y) != {-1, 1}:
            raise AssertionError('Response variable must be ±1')
        return X, y

Fit the model

In [None]:
from sklearn.tree import DecisionTreeClassifier

def fit(self, X: np.ndarray, y: np.ndarray, iters: int):
    """Fit an AdaBoost ensemble of decision stumps.

    Each round fits a depth-1 ``DecisionTreeClassifier`` on the current
    sample weights, records its weighted error and voting weight, and
    re-weights the samples for the next round.

    Args:
        X: Feature matrix; ``X.shape[0]`` is taken as the sample count.
        y: Labels, validated by ``self._check_X_y``.
        iters: Number of boosting rounds (stumps) to fit.

    Returns:
        ``self``, with ``stumps``, ``stump_weights``, ``errors`` and
        ``sample_weights`` filled in (one entry/row per round).
    """
    X, y = self._check_X_y(X, y)
    n = X.shape[0]

    # Per-round bookkeeping, pre-allocated for `iters` rounds.
    self.sample_weights = np.zeros(shape=(iters, n))
    self.stumps = np.zeros(shape=iters, dtype=object)
    self.stump_weights = np.zeros(shape=iters)
    self.errors = np.zeros(shape=iters)

    # Round 0 starts from a uniform distribution over the samples.
    self.sample_weights[0] = np.ones(shape=n) / n

    # Bound used to keep the weighted error strictly inside (0, 1).
    eps = 1e-10

    for t in range(iters):
        curr_sample_weights = self.sample_weights[t]

        stump = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2)
        stump = stump.fit(X, y, sample_weight=curr_sample_weights)
        stump_pred = stump.predict(X)

        # Weighted error: total weight of the misclassified samples.
        err = curr_sample_weights[(stump_pred != y)].sum()

        # A perfect (err == 0) or perfectly wrong (err == 1) stump would
        # make the log below infinite and turn the next round's sample
        # weights into NaN, so clamp the value used in the formula.
        safe_err = min(max(err, eps), 1 - eps)
        stump_weight = np.log((1 - safe_err) / safe_err) / 2

        # y * stump_pred is +1 where the stump is right and -1 where it is
        # wrong, so misclassified samples are scaled up and correct ones
        # scaled down; then renormalise to sum to 1.
        new_sample_weights = (
            curr_sample_weights * np.exp(-stump_weight * y * stump_pred)
        )
        new_sample_weights /= new_sample_weights.sum()

        # The last round has no successor row to store the weights in.
        if t + 1 < iters:
            self.sample_weights[t + 1] = new_sample_weights

        # Save the results of this round (the recorded error is the raw,
        # unclamped value).
        self.stumps[t] = stump
        self.stump_weights[t] = stump_weight
        self.errors[t] = err

    return self

Predict the values

In [None]:
def predict(self, X):
    """Make predictions using an already fitted model.

    Args:
        X: Samples to classify; passed as-is to each stump's ``predict``.

    Returns:
        Array with one entry per sample: the sign of the
        ``stump_weights``-weighted sum of the stump predictions. An exact
        tie (weighted sum of 0) is reported as ``+1``.
    """
    # One row of predictions per stump.
    stump_preds = np.array([stump.predict(X) for stump in self.stumps])

    # Weighted vote across stumps for every sample.
    labels = np.sign(np.dot(self.stump_weights, stump_preds))

    # np.sign yields 0 on an exact tie, which is not a valid +/-1 label;
    # break ties toward the positive class.
    labels[labels == 0] = 1
    return labels

Test error for the model

In [None]:
from sklearn.metrics import mean_squared_error


# Attach the module-level functions to the class so they work as methods.
AdaBoost.fit = fit
AdaBoost.predict = predict

# Hold out 20% of the samples for testing, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = AdaBoost().fit(X_train, y_train, iters=10)

# Predict on the held-out split once and reuse the result below.
y_test_pred = clf.predict(X_test)

# Fraction of held-out samples that are misclassified.
test_err = (y_test_pred != y_test).mean()
# This value was originally stored under the name `train_err` even though it
# is computed on the test split; the old name is kept as an alias so any
# later code that reads it keeps working.
train_err = test_err

# With +/-1 labels and predictions, every miss contributes (+/-2)**2 = 4,
# so this figure is 4x the misclassification rate.
print("mean square error",mean_squared_error(y_test, y_test_pred))

mean square error 2.0
