In [6]:
import numpy as np 
import matplotlib.pyplot as plt 

In [7]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                test_size=0.3, shuffle=True, random_state=42)

In [65]:
from sklearn.tree import DecisionTreeClassifier
import random
from scipy import stats
from sklearn.metrics import classification_report

class Bagging :
    def __init__(self, B, bootstrap_ratio, max_depth, criterion, min_samples_split, replacement):
        self.B = B
        self.bootstrap_ratio = bootstrap_ratio
        self.replacement = replacement
        tree_params = {'max_depth': max_depth, 'criterion': criterion, 'min_samples_split': min_samples_split, 'max_features': 'sqrt'}
        self.models  = [DecisionTreeClassifier(**tree_params) for _ in range(B)]
    
    def fit(self, X, y) :
        sample_size = int(self.bootstrap_ratio * len(X))
        m, n = X.shape
        B = self.B
        xsamples = np.zeros((B, sample_size, n))
        ysamples = np.zeros((B, sample_size))
        #subsamples for each model
        oob = []
        y_oob = []
        for i in range(B):
            ##sampling with replacement; i.e., sample can occur more than once
            #for the same predictor
            idx_list = []
            for j in range(sample_size):
                idx = random.randrange(m)   #<----with replacement #change so no repetition
                if self.replacement == True:
                    if idx in idx_list :
                        #I don't know if there is a numpy command which can return an array like np.arange(m) without the values present in idx_list
                        #it would avoid to pick from an array with already forbidden values
                        idx_possibilities = np.arange(m)
                        idx_possibilities = np.delete(idx_possibilities, idx)
                        while idx in idx_list :
                            idx = random.choice(idx_possibilities)        
                xsamples[i, j, :] = X_train[idx]
                ysamples[i, j] = y_train[idx]
                idx_list.append(idx)
                #keep track of idx that i did not use for ith tree
            idx_list = np.array(idx_list)
            oob.append(X_train[~idx_list])
            y_oob.append(y_train[~idx_list])
        oob = np.array(oob)
        #calculate score for oob evaluation for bootstrapped dataset
        oob_predictions = []
        for i, model in enumerate(self.models):
            _X = xsamples[i, :]
            _y = ysamples[i, :]
            model.fit(_X, _y)
            predictions = np.zeros(oob[i].shape[0])
            yhat = model.predict(oob[i])
            oob_predictions.append(yhat)
        oob_acc = np.zeros(len(oob_predictions))
        for i in range(len(oob_predictions)):
            acc = (y_oob[i][y_oob[i] == oob_predictions[i]]).shape[0] / y_oob[i].shape[0]
            oob_acc[i] = acc
            
        print('Oob accuracy for each dataset : ', oob_acc)
        print('Oob average accuracy : ', np.mean(oob_acc))
        return model
              
    def predictions(self, X_train, y_train, X_test) :
        model = self.fit(X_train, y_train)
        predictions = np.zeros((self.B, X_test.shape[0]))
        for i, model in enumerate(self.models):
            yhat = model.predict(X_test)
            predictions[i, :] = yhat
        yhat = stats.mode(predictions)[0][0]
        return yhat

In [71]:
exp = Bagging(5, 1, 2, 'gini', 5, True)
yhat = exp.predictions(X_train, y_train, X_test)
print('Accuracy : ', y_test[y_test == yhat].shape[0] / y_test.shape[0])

Oob accuracy for each dataset :  [0.81904762 0.94285714 0.94285714 0.94285714 0.94285714]
Oob average accuracy :  0.9180952380952382
Accuracy :  0.9777777777777777
