# AdaBoost Algorithm from scratch


### 1. Implementation

In [1]:
# Weighted misclassification rate
def erreur(y, y_pred, w_i):
    """Return the weighted fraction of misclassified observations.

    Computes ``sum(w_i * 1[y != y_pred]) / sum(w_i)``.

    Args:
        y: True labels (array-like).
        y_pred: Predicted labels, same length as ``y``.
        w_i: Observation weights, same length as ``y``. They do not have
            to sum to one: the result is normalised by their total.

    Returns:
        The weighted error rate as a NumPy float. If the weights sum to
        zero the division yields ``nan`` (with a NumPy warning).
    """
    weights = np.asarray(w_i)
    missed = np.not_equal(y, y_pred)
    # np.sum reduces in compiled code; the builtin sum() would walk the
    # arrays element by element in Python on every boosting round.
    return np.sum(weights * missed) / np.sum(weights)

# Weak-classifier weight (alpha)
def alpha(error, eps=1e-10):
    """Return the AdaBoost weight ``log((1 - error) / error)`` of a weak learner.

    Args:
        error: Weighted error rate of the weak learner, expected in [0, 1].
        eps: Margin used to clip ``error`` into ``[eps, 1 - eps]`` before
            taking the log-odds.

    Returns:
        A finite weight: positive when ``error < 0.5``, zero at ``0.5``
        and negative above.
    """
    # Without clipping, error == 0 divides by zero and error == 1 takes
    # log(0); the resulting infinite alpha turns the next weight update
    # into nan (exp(inf * 0)).
    error = np.clip(error, eps, 1 - eps)
    return np.log((1 - error) / error)

# Observation-weight update
def poids(w_i, alpha, y, y_pred):
    """Re-weight the observations after one boosting round.

    Each weight is multiplied by ``exp(alpha * 1[y != y_pred])``: the
    factor is ``exp(alpha)`` where the prediction is wrong and
    ``exp(alpha * 0)`` where it is right. The result is not renormalised.

    Args:
        w_i: Current observation weights.
        alpha: Weight of the weak classifier that produced ``y_pred``.
        y: True labels.
        y_pred: Labels predicted by the weak classifier.

    Returns:
        The updated weights as a NumPy array.
    """
    missed = np.not_equal(y, y_pred).astype(int)
    boost = np.exp(alpha * missed)
    return w_i * boost

In [2]:
# Define AdaBoost class
class AdaBoost:
    """AdaBoost classifier built from depth-1 decision trees.

    The final prediction is the sign of the alpha-weighted sum of the weak
    predictions, so labels are expected to be encoded as -1 / 1.
    """

    def __init__(self):
        self.alphas = []  # weight alpha_m of each weak classifier
        self.G_M = []  # fitted weak classifiers, in boosting order
        self.M = None  # number of boosting iterations requested in fit()
        self.training_errors = []  # weighted training error of each round
        self.prediction_errors = []  # kept for compatibility; not filled here

    def fit(self, X, y, M=100):
        """Fit ``M`` weak classifiers on ``(X, y)``.

        Args:
            X: Feature matrix passed as-is to the weak learners.
            y: Target labels, one per row of ``X``.
            M: Number of boosting iterations.
        """
        # Reset every piece of fitted state. Previously G_M was not cleared,
        # so a second fit() stacked new learners on top of the old ones.
        self.alphas = []
        self.G_M = []
        self.training_errors = []
        self.M = M

        # Start from uniform observation weights.
        n_samples = len(y)
        w_i = np.ones(n_samples) / n_samples

        for m in range(M):
            if m > 0:
                # Re-weight the observations using the previous round's
                # weak classifier and its alpha.
                w_i = poids(w_i, alpha_m, y, y_pred)

            # (a) Fit the weak classifier on the weighted sample.
            G_m = DecisionTreeClassifier(max_depth=1)
            G_m.fit(X, y, sample_weight=w_i)
            y_pred = G_m.predict(X)

            # (b) Weighted training error of this round.
            error_m = erreur(y, y_pred, w_i)

            # (c) Weight of this weak classifier in the final vote.
            alpha_m = alpha(error_m)

            # Record the round in one place so the three lists stay aligned.
            self.G_M.append(G_m)
            self.training_errors.append(error_m)
            self.alphas.append(alpha_m)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: Feature matrix with the same columns as the training data.

        Returns:
            An integer ``pandas.Series`` indexed 0..len(X)-1 holding the sign
            of the alpha-weighted vote. An exactly tied vote yields 0.

        Raises:
            ValueError: If no weak classifier has been fitted yet.
        """
        if not self.G_M:
            raise ValueError(
                "This AdaBoost instance is not fitted yet; call fit() first."
            )

        n_samples = len(X)

        # Accumulate the weighted votes in a float vector rather than an
        # object-dtype DataFrame with one column per weak classifier.
        scores = np.zeros(n_samples)
        for G_m, alpha_m in zip(self.G_M, self.alphas):
            scores += alpha_m * np.asarray(G_m.predict(X), dtype=float)

        return pd.Series(np.sign(scores).astype(int), index=range(n_samples))

### 2. Tests

Dans ce qui suit, nous allons tester l'algorithme sur le jeu de données Spambase.

In [3]:
#importing libraries

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics      import f1_score, accuracy_score

In [4]:

# Raw Spambase table; the data file has no header row
df = pd.read_csv('./spambase.data', header=None)

# Feature names come from the companion .names file: skip its first 33
# lines and keep the text before the ':' on each remaining line
names = pd.read_csv('./spambase.names', sep=':', skiprows=range(0, 33), header=None)

# The last column of the data file is the target
col_names = [*names[0], 'Spam']
df.columns = col_names

# Re-encode the target so that 0 -> -1 and 1 -> 1
df['Spam'] = df['Spam'] * 2 - 1

# Reproducible split with 3065 training rows
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Spam').values,
    df['Spam'].values,
    train_size=3065,
    random_state=2,
)

In [5]:


# Train the boosted ensemble for 400 rounds
adb = AdaBoost()
adb.fit(X_train, y_train, M=400)

# Predictions on the held-out set
y_pred = adb.predict(X_test)

# Accuracy on the held-out set and on the training set
test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, adb.predict(X_train))

In [6]:
train_accuracy

0.9507340946166395

In [7]:
test_accuracy

0.9440104166666666