Modify the Bagging scratch code in our lecture such that:
- Calculate the out-of-bag (OOB) score for each bootstrapped dataset, and also the average OOB score across all of them
- Change the sampling of each bootstrapped dataset to "without replacement"
- Put everything into a class <code>Bagging</code>.  It should have at least two methods, <code>fit(X_train, y_train)</code>, and <code>predict(X_test)</code>
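- Modify the code from above to randomize features.  Set the number of features used by each tree to <code>sqrt(n)</code>, where <code>n</code> is the total number of features, so that each tree works with a random subset of the features.  This can be done easily by setting the <code>max_features</code> parameter of our DecisionTreeClassifier to <code>'sqrt'</code> (note that scikit-learn then draws a fresh random subset of features at every split rather than once per tree).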

In [53]:
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import math
import random
from scipy import stats

# Load the iris dataset and hold out 25% of the rows as a test set.
# The fixed random_state keeps the split identical from run to run.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=27,
)

In [54]:
# Sanity-check the split: one shape per line, train/test features first,
# then train/test labels.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(112, 4)
(38, 4)
(112,)
(38,)


In [83]:
class Bagging:
    """Bagging ensemble of shallow decision trees with out-of-bag scoring.

    Each of the ``B`` trees is trained on its own random sample of the
    training rows, drawn either without replacement or with replacement
    (classic bootstrap).  The rows a tree did not see form its out-of-bag
    (OOB) set, on which that tree is scored right after it is fitted.
    Every tree is created with ``max_depth=2`` and ``max_features='sqrt'``.

    Class labels must be numeric: ``predict`` collects the votes in a
    float array, so the predicted labels are returned as floats.

    Parameters
    ----------
    B : int
        Number of trees in the ensemble; must be at least 1.
    bootstrap_ratio : float, default=1
        Size of each tree's sample as a fraction of the training set.
    with_no_replacement : bool, default=True
        If True, rows are sampled without replacement (so the ratio cannot
        exceed 1); otherwise rows are sampled with replacement.

    Attributes
    ----------
    models : list
        The ``B`` decision trees.
    oob_scores : list of float
        Set by ``fit``.  OOB accuracy of each tree; ``nan`` for a tree
        whose sample covered every training row.
    avg_oob_score : float
        Set by ``fit``.  Mean of the non-``nan`` entries of ``oob_scores``,
        or ``nan`` when no tree had any OOB rows.
    """

    def __init__(self, B, bootstrap_ratio=1, with_no_replacement=True):
        if B < 1:
            raise ValueError(f"B must be at least 1, got {B}")
        self.B = B
        self.bootstrap_ratio = bootstrap_ratio
        self.with_no_replacement = with_no_replacement
        self.tree_params = {'max_depth': 2, 'max_features': 'sqrt'}
        self.models = [DecisionTreeClassifier(**self.tree_params) for _ in range(B)]

    def _draw_indices(self, m, sample_size):
        """Return ``sample_size`` row indices drawn from ``range(m)``."""
        if self.with_no_replacement:
            # One call instead of a rejection loop: linear time and cannot
            # spin forever when the request is too large.
            return random.sample(range(m), sample_size)
        return [random.randrange(m) for _ in range(sample_size)]

    def fit(self, X_train, y_train):
        """Fit every tree on its own sample and report out-of-bag accuracy.

        Prints the OOB accuracy of each tree followed by the average, and
        stores them in ``self.oob_scores`` / ``self.avg_oob_score``.

        Parameters
        ----------
        X_train : array-like of shape (m, n)
            Training features.
        y_train : array-like of shape (m,)
            Training labels.

        Raises
        ------
        ValueError
            If the requested sample is empty, or larger than the training
            set while sampling without replacement.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        m = X_train.shape[0]
        sample_size = int(self.bootstrap_ratio * m)
        if sample_size < 1:
            raise ValueError(
                f"bootstrap_ratio={self.bootstrap_ratio} gives an empty "
                f"sample for {m} training rows"
            )
        if self.with_no_replacement and sample_size > m:
            raise ValueError(
                f"cannot draw {sample_size} rows without replacement "
                f"from {m} training rows"
            )

        self.oob_scores = []
        print("======Out of bag score for each tree======")
        for i, model in enumerate(self.models):
            idx = self._draw_indices(m, sample_size)
            model.fit(X_train[idx], y_train[idx])

            # Rows never drawn for this tree are its out-of-bag set.
            oob_mask = np.ones(m, dtype=bool)
            oob_mask[idx] = False
            if oob_mask.any():
                yhat = np.asarray(model.predict(X_train[oob_mask]))
                score = float(np.mean(yhat == y_train[oob_mask]))
                print(f"Tree {i}", score)
            else:
                # E.g. ratio 1 without replacement: every row was used, so
                # there is nothing left to score this tree on.
                score = float("nan")
                print(f"Tree {i}", "n/a (no out-of-bag samples)")
            self.oob_scores.append(score)

        scored = [s for s in self.oob_scores if not np.isnan(s)]
        self.avg_oob_score = sum(scored) / len(scored) if scored else float("nan")
        print("====Average out of bag score===")
        print(self.avg_oob_score)

    def predict(self, X_test):
        """Predict labels by majority vote over all trees.

        Parameters
        ----------
        X_test : array-like of shape (k, n)
            Samples to classify.

        Returns
        -------
        numpy.ndarray of shape (k,)
            The most frequent label per sample, as floats; ties resolve to
            the smallest label.
        """
        X_test = np.asarray(X_test)
        predictions = np.zeros((len(self.models), X_test.shape[0]))
        for i, model in enumerate(self.models):
            predictions[i, :] = model.predict(X_test)
        # Depending on the SciPy version the mode comes back with shape
        # (1, k) or (k,); flatten so callers always get one label per row.
        majority = stats.mode(predictions, axis=0)[0]
        return np.asarray(majority).reshape(-1)

In [84]:
# Train a 5-tree ensemble with each tree fitted on 70% of the training
# rows, then report per-class metrics on the held-out test split.
model = Bagging(5, 0.7)
model.fit(X_train, y_train)
yhat = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=yhat))

Tree 0 0.9117647058823529
Tree 1 1.0
Tree 2 0.9705882352941176
Tree 3 1.0
Tree 4 0.8823529411764706
====Average out of bag score===
0.9529411764705882
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       0.87      0.93      0.90        14
           2       0.93      0.87      0.90        15

    accuracy                           0.92        38
   macro avg       0.93      0.93      0.93        38
weighted avg       0.92      0.92      0.92        38

