In [1]:
#https://www.kdnuggets.com/2020/12/implementing-adaboost-algorithm-from-scratch.html
#https://github.com/jinxin0924/multi-adaboost/blob/master/multi_AdaBoost.py

In [143]:
from sklearn.datasets import make_gaussian_quantiles
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score,mean_squared_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate


In [144]:
X, y = make_gaussian_quantiles(
    n_samples=13000, n_features=10, n_classes=2, random_state=1
)

y = np.where(y==0,-1,1)

n_split = 3000

X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]


In [145]:
#make a binary 
y 


array([ 1, -1, -1, ...,  1,  1, -1])

In [146]:
#------------------------------- BINARY CLASSIFICATIONS ---------------------- #

class BinaryClassAdaboost():
    """
    """
    
    def __init__(self, n_estimators:int):
        """
        Initialialisation of Adaboost class
        Parameters: 
            n_estimators: int:  number of weak learners 
        """
        self.n_estimators = n_estimators
        self.list_WL = [] #list with model
        self.list_alpha = [] #list with weight of model 
        self.estimator_errors = []

        
        
    def fit(self, X, y):
        """
        Fit model 
        Parameters: 
            X: array: data
            y: array: vector of class labels where yi E Y= {1,..., k} and k = 2
        """
        ## Step 1: Initialize the weights to a constant
        n_samples = X.shape[0]                
        w = []
        ##Weights are initialized to 1/Number of samples: 
        w_t = [1/n_samples for x in range(n_samples)]       
              
        
        ## Step 2: Classify with ramdom sampling of data using a weak learner
        #Construction des weaklearner
        
        #for each weak learner
        for t in range(self.n_estimators):

            #Choose and Call the Base/Weak learner
            #A decision tree with one depth has one node and is called a stump or weak learner
            WL = DecisionTreeClassifier(max_depth=1)
            #Fit the stump model with the ramdom samples
            WL.fit(X, y, sample_weight=w_t)
            #Get the predicted classes
            y_pred = WL.predict(X)
            
            ##Step 3: Compute error of weak learner
            eps = self.error_wl(w_t, y_pred, y)
        
            # if the error of the weak learner is higher then 0.5 (worse then random guess) 
            #don't take into account this learner weight
            if eps > 0.5:
                break
            
            #Step 4: Calculate the performance of the weak learner
            #Performance of the weak learner(α) = 0.5* ln (1 – error/error)
            #Calculate alpha for this weak learner
            
            alpha_t = 0.5 * np.log((1- eps) / eps)
            

            #Step 5: Update weight
            #With the alpha performance (α) the weights of the wrongly classified records are increased
            #and the weights of the correctly classified records decreased.
            y_temp = np.multiply(y, y_pred)
            y_temp2 = -alpha_t * y_temp 
            normalized_w_t = np.multiply(w_t, np.exp(y_temp2))

            #normalizing the weigths for the sum to be equal do 1
            w_t = normalized_w_t / sum(normalized_w_t)
            
            #store the alpha performance of each weak learner
            self.list_alpha.append(alpha_t)
            #store each weak learner
            self.list_WL.append(WL)
            self.estimator_errors.append(eps)

            
            
        return self

    def predict(self, X):
        """
        predict output of Adaboost 
        Paramters: 
            X: array: data
        Return: 
            y_pred: array: data
        """
        #The final prediction is a compromise between all the weak learners predictions
        list_y_pred = []
        
        #for each weak learner get their prediction

        for WL, w in zip(self.list_WL, self.list_alpha):
            #Final prediction is obtained by the weighted by alpha sum of each weak learner prediction
            list_y_pred.append(WL.predict(X) * w)
         
        #the array of all the predictions

        arr_y_pred = np.array(sum(list_y_pred))
 
        #get -1 if y_pred < 0 or 1 if y_pred > 0
        y_pred = np.sign(arr_y_pred)
        
        return y_pred 
        
    def error_wl(self, w_t, y_pred, y):
        """
        error of current weaklearner
        Parameters:
            w_t: array:  weight of observation
            y_pred: array: output of wl 
            y: array: labels
        Return: 
            eps: float: error of wl 
        """
        
        ind_err = []
        for i in range(y_pred.shape[0]):
            if y_pred[i] != y[i]:
                ind_err.append(1) 
            else: 
                ind_err.append(0) 
    
        w_ind_err = np.multiply(w_t,ind_err)
        
        eps = np.sum(w_ind_err)
    
        return eps
    
        

In [147]:
model = BinaryClassAdaboost(50)
model.fit(X_train, y_train)

<__main__.BinaryClassAdaboost at 0x1557d5ee760>

In [148]:
y_test.shape

(10000,)

In [149]:
model.estimator_errors

[0.4453333333333333,
 0.45503224320589164,
 0.44445759445665955,
 0.4556735130262076,
 0.44941358271011267,
 0.4580110239318191,
 0.4557613595596748,
 0.4618767897781757,
 0.4497835001423618,
 0.4591059982372616,
 0.442193423963022,
 0.4569609939511425,
 0.4516404681312472,
 0.45984163358780866,
 0.459888454839247,
 0.4644383851560971,
 0.45049811411202706,
 0.4595648556904214,
 0.45264116872913435,
 0.4604456926222529,
 0.45471180437785697,
 0.4615909526179792,
 0.4500300577869357,
 0.45936422023190715,
 0.4600126868903682,
 0.46461123573077373,
 0.4555243343892089,
 0.46195397025026275,
 0.4585606907194274,
 0.4636469591937208,
 0.4654182285082159,
 0.4683934484202562,
 0.4592611585794638,
 0.4644006169148185,
 0.46910109786572474,
 0.4715570444761682,
 0.4578928681950091,
 0.4643728597536746,
 0.45848266719688685,
 0.4647095102106533,
 0.45216495822969616,
 0.46234833072509707,
 0.4519100349028713,
 0.4625290597868963,
 0.4607495941890463,
 0.4659156432244539,
 0.46613355494224107,


In [150]:
y_pred = model.predict(X_test)

In [151]:
accuracy_score(y_test, y_pred)

0.7797

In [152]:
#-------------------------------Multiclass LASSIFICATIONS ---------------------- #


class MultiClassAdaBoost(object):
    '''
    Parameters
    -----------
    base_estimator: object
        The base model from which the boosted ensemble is built.
    n_estimators: integer, optional(default=50)
        The maximum number of estimators
    learning_rate: float, optional(default=1)
    Attributes
    -------------
    estimators_: list of base estimators
    estimator_weights_: array of floats
        Weights for each base_estimator
    estimator_errors_: array of floats
        Classification error for each estimator in the boosted ensemble.
    '''

    def __init__(self, n_estimators, learning_rate):
        self.n_estimators = n_estimators
        self.list_WL = [] #list with model
        self.list_alpha = [] #list with weight of model 
        self.learning_rate_ = learning_rate
        self.estimator_errors = []


    def fit(self, X, y):
        
        ## Step 1: Initialize the weights to a constant
        n_samples = X.shape[0]                
        w = []
        ##Weights are initialized to 1/Number of samples: 
        w_t = [1/n_samples for x in range(n_samples)]       
        
        # So in boost we have to ensure that the predict results have the same classes sort
        self.classes_ = np.array(sorted(list(set(y))))
        self.n_classes_ = len(self.classes_)
        
        
        ## Step 2: Classify with ramdom sampling of data using a weak learner
        #Construction des weaklearner
        
        #for each weak learner
        for t in range(self.n_estimators):
          
            #Choose and Call the Base/Weak learner
            #A decision tree with one depth has one node and is called a stump or weak learner
            WL = DecisionTreeClassifier(max_depth=1)
            #Fit the stump model with the ramdom samples
            WL.fit(X, y, sample_weight=w_t)
            
            y_pred = WL.predict(X)
            
            ##Step 3: Compute error of weak learner
            incorrect = y_pred != y
            estimator_error = np.dot(incorrect, w_t) / np.sum(w_t, axis=0)
            
            # if worse than random guess, stop boosting
            if estimator_error >= 1 - 1 / self.n_classes_:
                break

            # update alphe performance
            alpha_t = self.learning_rate_ * np.log((1 - estimator_error) / estimator_error) + np.log(
            self.n_classes_ - 1)
        

            # update sample weight
            w_t *= np.exp(alpha_t * incorrect)
            sample_weight_sum = np.sum(w_t, axis=0)

            # normalize sample weight
            w_t /= sample_weight_sum
            
            #store the alpha performance of each weak learner
            self.list_alpha.append(alpha_t)
            #store each weak learner
            self.list_WL.append(WL)
            # append error
            self.estimator_errors.append(estimator_error)

        return self


    def predict(self, X):
        n_classes = self.n_classes_
        classes = self.classes_[:, np.newaxis]

        
        pred = sum((estimator.predict(X) == classes).T * w
                   for estimator, w in zip(self.list_WL,
                                           self.list_alpha))

        pred /= sum(self.list_alpha)
        if n_classes == 2:
            pred[:, 0] *= -1
            pred = pred.sum(axis=1)
            return self.classes_.take(pred > 0, axis=0)

        return self.classes_.take(np.argmax(pred, axis=1), axis=0)

In [153]:
X, y = make_gaussian_quantiles(
    n_samples=13000, n_features=10, n_classes=3, random_state=1
)

n_split = 3000

X_train, X_test = X[:n_split], X[n_split:]
y_train, y_test = y[:n_split], y[n_split:]


In [154]:
model = MultiClassAdaBoost(50, 0.001)

In [155]:
model.fit(X_train,y_train)

<__main__.MultiClassAdaBoost at 0x1557d644e50>

In [156]:
model.estimator_errors

[0.6206666666666666,
 0.5607244464445126,
 0.5425086325243259,
 0.6135858385334544,
 0.5420652603391231,
 0.5753284241536961,
 0.5980185330265171,
 0.5228302785275332,
 0.6149390808877095,
 0.5830957803836047,
 0.528565635420615,
 0.621167852390974,
 0.558744002173177,
 0.5518668979561658,
 0.5880170445179811,
 0.5753958658886382,
 0.5648511884839151,
 0.574048731816833,
 0.5547125312859807,
 0.610022567747886,
 0.5683839167090066,
 0.5269065279833075,
 0.5941977070913176,
 0.5890595919048741,
 0.5308747449010472,
 0.5865884718972433,
 0.5716288377529896,
 0.566215104511706,
 0.5827911724637516,
 0.5568144831692023,
 0.5893321469449455,
 0.5688988919088552,
 0.5691204835172557,
 0.5934365368086099,
 0.5577232500786842,
 0.5573800302804418,
 0.5853415762196916,
 0.6032579124770993,
 0.5350394212701343,
 0.5668446094103218,
 0.5886818493781664,
 0.580121140895584,
 0.5514267565014686,
 0.574535257001408,
 0.5859925923893254,
 0.5623737841690859,
 0.5631126216695282,
 0.5725577156821924,


In [157]:
y_pred = model.predict(X_test)

In [158]:
accuracy_score(y_test, y_pred)

0.4919

In [None]:
## Explore Weak Learner (use different weak learners or with different parameters)


In [None]:
## Explore Number of Trees


In [159]:
## Optimizing 
start_time=time.time()

pipe = make_pipeline(RandomForestClassifier())
param_grid = [
  {'randomforestclassifier__bootstrap': [True, False], 
   'randomforestclassifier__max_depth': [80, 90, 100, 110],
   'randomforestclassifier__max_features': ['auto', 'log2'],
   'randomforestclassifier__n_estimators': [100, 200, 300, 1000]
  }
    
 ]
cv = KFold(n_splits=10, shuffle=True)
grid = GridSearchCV(pipe, param_grid, cv=cv, scoring='accuracy')

grid3 = grid.fit(X, labels)

end_time=time.time()
print("total time",end_time-start_time)

NameError: name 'time' is not defined

In [160]:
##Comparing with other algo

# Define dictionary with performance metrics

scores = ['accuracy','precision_weighted','recall_weighted','f1_weighted',]
# Instantiate the machine learning classifiers
log_model = LogisticRegression(max_iter=10000)
svc_model = LinearSVC(dual=False)
dtr_model = DecisionTreeClassifier()
rfc_model = RandomForestClassifier()
gnb_model = GaussianNB()
ada_model = AdaBoostClassifier()
#my_ada_model = MultiClassAdaBoost(100, 0.001)


# Define the models evaluation function
def models_evaluation(X, labels, folds):
    
    '''
    X : data set features
    labels : data set target
    folds : number of cross-validation folds
    
    '''
    
    # Perform cross-validation to each machine learning classifier
    log = cross_validate(log_model, X, labels, cv=folds, scoring=scores)
    svc = cross_validate(svc_model, X, labels, cv=folds, scoring=scores)
    dtr = cross_validate(dtr_model, X, labels, cv=folds, scoring=scores)
    rfc = cross_validate(rfc_model, X, labels, cv=folds, scoring=scores)
    gnb = cross_validate(gnb_model, X, labels, cv=folds, scoring=scores)
    ada = cross_validate(ada_model, X, labels, cv=folds, scoring=scores)
    #my_ada = cross_validate(my_ada_model, X, labels, cv=folds, scoring=scores)

    
    # Create a data frame with the models perfoamnce metrics scores
    models_scores_table = pd.DataFrame({'Logistic Regression':[log['test_accuracy'].mean(),
                                                               log['test_precision_weighted'].mean(),
                                                               log['test_recall_weighted'].mean(),
                                                               log['test_f1_weighted'].mean()],
                                       
                                      'Support Vector Classifier':[svc['test_accuracy'].mean(),
                                                                   svc['test_precision_weighted'].mean(),
                                                                   svc['test_recall_weighted'].mean(),
                                                                   svc['test_f1_weighted'].mean()],
                                       
                                      'Decision Tree':[dtr['test_accuracy'].mean(),
                                                       dtr['test_precision_weighted'].mean(),
                                                       dtr['test_recall_weighted'].mean(),
                                                       dtr['test_f1_weighted'].mean()],
                                       
                                      'Random Forest':[rfc['test_accuracy'].mean(),
                                                       rfc['test_precision_weighted'].mean(),
                                                       rfc['test_recall_weighted'].mean(),
                                                       rfc['test_f1_weighted'].mean()],
                                       
                                      'Gaussian Naive Bayes':[gnb['test_accuracy'].mean(),
                                                              gnb['test_precision_weighted'].mean(),
                                                              gnb['test_recall_weighted'].mean(),
                                                              gnb['test_f1_weighted'].mean()], 
                                       
                                       
                                       'Adaboost Classifier':[ada['test_accuracy'].mean(),
                                                              ada['test_precision_weighted'].mean(),
                                                              ada['test_recall_weighted'].mean(),
                                                              ada['test_f1_weighted'].mean()]
                                       },
                                      
                                      index=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    
    # Add 'Best Score' column
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)
    
    # Return models performance metrics scores data frame
    return(models_scores_table)
  
# Run models_evaluation function
models_evaluation(X, y, 10)

Unnamed: 0,Logistic Regression,Support Vector Classifier,Decision Tree,Random Forest,Gaussian Naive Bayes,Adaboost Classifier,Best Score
Accuracy,0.335,0.335,0.641923,0.793385,0.913077,0.734615,Gaussian Naive Bayes
Precision,0.335082,0.334982,0.645245,0.796442,0.928256,0.789048,Gaussian Naive Bayes
Recall,0.335,0.335,0.641923,0.793385,0.913077,0.734615,Gaussian Naive Bayes
F1 Score,0.330451,0.33005,0.643063,0.794242,0.914871,0.740526,Gaussian Naive Bayes
