<a href="https://colab.research.google.com/github/jinisaweaklearner/DS-ML-Paper-Note-Code/blob/master/src/AdaboostClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Import Packages



In [2]:
from sklearn.ensemble import AdaBoostRegressor
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.metrics import mean_absolute_error
from sklearn.tree import export_graphviz
from pydotplus import graph_from_dot_data
from IPython.display import Image
from sklearn.tree import export_text
from sklearn.tree import _tree

from sklearn.datasets import make_gaussian_quantiles
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# https://geoffruddock.com/adaboost-from-scratch-in-python/

# Load Data

In [3]:
def make_toy_dataset(n: int = 100, random_seed: int = None):
    """Generate a two-feature, two-class toy dataset for AdaBoost classifiers.

    Args:
        n: Total number of samples to generate.
        random_seed: Seed applied to NumPy's global random generator before
            sampling. ``None`` (the default) leaves the generator untouched.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the feature matrix produced by
        ``make_gaussian_quantiles`` and ``y`` holds its labels rescaled by
        ``y * 2 - 1`` (so 0/1 labels become -1/+1).
    """
    # Compare against None explicitly so that a seed of 0 is honoured
    # instead of being treated as "no seed".
    if random_seed is not None:
        # make_gaussian_quantiles is called without random_state below, so
        # reproducibility relies on seeding the global generator here.
        np.random.seed(random_seed)

    X, y = make_gaussian_quantiles(n_samples=n, n_features=2, n_classes=2)

    return X, y * 2 - 1

# Build a small 10-sample dataset with a fixed seed (10) for the walkthrough below.
X, y = make_toy_dataset(n=10, random_seed=10) 

# Modelling

In [4]:
# Number of boosting rounds: passed as `iters` to the from-scratch model and
# as `n_estimators` to the scikit-learn model used for the cross-check.
n_trees = 3

In [5]:
class AdaBoostClassification:
    """AdaBoost ensemble classifier built from scratch.

    The ``fit`` and ``predict`` methods are defined as module-level functions
    and attached to this class after its definition.
    """

    def __init__(self):
        # All four attributes are filled in by fit(); None means "not fitted".
        self.stumps = None          # fitted weak learners, one per round
        self.stump_weights = None   # weight of each weak learner's vote
        self.errors = None          # weighted training error of each round
        self.sample_weights = None  # sample weights used in each round

    def _check_X_y(self, X, y):
        """Validate the training data and return it unchanged.

        Args:
            X: Feature matrix (not inspected here).
            y: Iterable of labels.

        Returns:
            The tuple ``(X, y)`` exactly as passed in.

        Raises:
            AssertionError: If the set of values in ``y`` is not exactly
                ``{-1, 1}``.
        """
        # Raised explicitly rather than with an `assert` statement so that the
        # check still runs when Python is started with -O (asserts stripped).
        if set(y) != {-1, 1}:
            raise AssertionError('Response variable must be ±1')
        return X, y

def fit(self, X: np.ndarray, y: np.ndarray, iters: int, *, verbose: bool = True):
    """Fit the boosted ensemble of decision stumps on the training data.

    Each round fits a depth-1 decision tree on the current sample weights,
    measures its weighted training error, derives the stump's vote weight
    from that error and re-weights the samples for the next round.

    Args:
        X: Feature matrix; ``X.shape[0]`` is taken as the number of samples.
        y: Labels, validated by ``self._check_X_y``.
        iters: Number of boosting rounds (one stump per round). Must be at
            least 1, because row 0 of the weight history is always written.
        verbose: When true (the default), print the per-round diagnostics
            (weights, predictions, error and stump weight).

    Returns:
        ``self``, so calls can be chained.
    """
    log = print if verbose else (lambda *args: None)

    X, y = self._check_X_y(X, y)
    n = X.shape[0]

    # Per-round history, one row/entry per boosting iteration.
    self.sample_weights = np.zeros(shape=(iters, n))
    self.stumps = np.zeros(shape=iters, dtype=object)
    self.stump_weights = np.zeros(shape=iters)
    self.errors = np.zeros(shape=iters)

    # Start from uniform sample weights.
    self.sample_weights[0] = np.ones(shape=n) / n

    # Bound used to keep the error strictly inside (0, 1) when computing the
    # stump weight: an error of exactly 0 or 1 would give an infinite weight
    # and turn the normalised sample weights into NaN.
    eps = 1e-10

    log('ground Truth:', y)

    for t in range(iters):

        log(t)

        # Fit the weak learner on the current sample weights.
        curr_sample_weights = self.sample_weights[t]
        log('weights: ', curr_sample_weights)
        stump = DecisionTreeClassifier(max_depth=1, max_leaf_nodes=2, random_state=42)
        stump = stump.fit(X, y, sample_weight=curr_sample_weights)

        # Weighted error = total weight of the misclassified samples.
        stump_pred = stump.predict(X)
        log('prediction: ', stump_pred)
        err = curr_sample_weights[(stump_pred != y)].sum()
        log('error: ', round(err, 2))

        # The raw error is what gets reported and stored; only the value fed
        # into the logarithm is clipped.
        safe_err = np.clip(err, eps, 1 - eps)
        stump_weight = np.log((1 - safe_err) / safe_err) / 2

        # Up-weight misclassified samples (y * stump_pred == -1) and
        # down-weight correctly classified ones, then renormalise to sum to 1.
        new_sample_weights = (
            curr_sample_weights * np.exp(-stump_weight * y * stump_pred)
        )
        new_sample_weights /= new_sample_weights.sum()

        # The weights computed in the final round have no next round to feed.
        if t + 1 < iters:
            self.sample_weights[t + 1] = new_sample_weights

        # The value printed under this label is this round's stump weight.
        log('total weights:', stump_weight)

        # Save the results of this iteration.
        self.stumps[t] = stump
        self.stump_weights[t] = stump_weight
        self.errors[t] = err

    return self

def predict(self, X):
    """Predict with the fitted ensemble.

    Args:
        X: Samples to score; passed unchanged to every stump's ``predict``.

    Returns:
        A tuple ``(pred_prob, labels)``. ``pred_prob`` is the sum of the
        stump predictions weighted by ``self.stump_weights`` (a vote total,
        one value per sample) and ``labels`` is ``np.sign(pred_prob)``, which
        is 0 wherever the weighted votes cancel out exactly.
    """
    # One row of predictions per stump.
    per_stump = [weak_learner.predict(X) for weak_learner in self.stumps]
    votes = np.array(per_stump)

    # Weight each stump's row by its vote weight and sum over the stumps.
    weighted_total = np.dot(self.stump_weights, votes)
    labels = np.sign(weighted_total)
    return weighted_total, labels

# Attach the module-level fit/predict functions defined above to the class,
# so they can be called as regular methods on its instances.
AdaBoostClassification.fit = fit
AdaBoostClassification.predict = predict      

In [7]:
# Fit the from-scratch model for n_trees rounds; fit() returns the instance.
clf = AdaBoostClassification().fit(X, y, iters=n_trees)

ground Truth: [-1  1  1 -1 -1 -1  1  1 -1  1]
0
weights:  [0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]
prediction:  [-1  1  1 -1 -1 -1 -1  1 -1 -1]
error:  0.2
total weights: 0.6931471805599453
1
weights:  [0.0625 0.0625 0.0625 0.0625 0.0625 0.0625 0.25   0.0625 0.0625 0.25  ]
prediction:  [1 1 1 1 1 1 1 1 1 1]
error:  0.31
total weights: 0.39422868018213514
2
weights:  [0.1        0.04545455 0.04545455 0.1        0.1        0.1
 0.18181818 0.04545455 0.1        0.18181818]
prediction:  [-1 -1 -1 -1 -1 -1 -1  1 -1  1]
error:  0.27
total weights: 0.49041462650586315


In [None]:
# predict() returns the weighted vote total per sample and its sign
# (the predicted label).
pred_prob,final_pred = clf.predict(X)
# Fraction of training samples whose predicted label differs from the truth.
train_err = (final_pred != y).mean()
print('final_pred',final_pred)
print('pred_prob',pred_prob)
print(f'Train error: {train_err:.1%}')

final_pred [-1.  1.  1. -1. -1. -1. -1.  1. -1.  1.]
pred_prob [-0.78933313  0.59696123  0.59696123 -0.78933313 -0.78933313 -0.78933313
 -0.78933313  1.57779049 -0.78933313  0.19149613]
Train error: 10.0%


## Cross-check the result against scikit-learn's `AdaBoostClassifier`

In [9]:
# Reference model with the same number of estimators as the from-scratch one.
# NOTE(review): the recorded repr shows algorithm='SAMME.R', while the
# from-scratch fit combines discrete ±1 stump votes, so the per-round weights
# are presumably not comparable — confirm whether algorithm='SAMME' was intended.
clf_package = AdaBoostClassifier(n_estimators=n_trees, random_state=42)
clf_package.fit(X, y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=3, random_state=42)

In [10]:
# Display the per-estimator weights of the fitted scikit-learn model.
clf_package.estimator_weights_

array([1., 1., 1.])

In [11]:
# Print each array yielded by staged_predict (per the scikit-learn docs, the
# ensemble's prediction on X after each boosting stage).
for i in clf_package.staged_predict(X):
    print(i)

[-1  1  1 -1 -1 -1 -1  1 -1 -1]
[-1  1  1 -1 -1 -1 -1  1 -1 -1]
[-1  1  1 -1 -1 -1 -1  1 -1  1]


In [8]:
# for i in clf_package.staged_predict_proba(X):
#     print(i)