# Bagging using decision trees

This notebook implements bagging (bootstrap aggregating), using decision trees as the base estimators.

## Training 

In [4]:
import numpy as np 
from scipy.stats import mode 
from sklearn.tree import DecisionTreeClassifier 

In [3]:
rng = np.random.RandomState(seed=4190)        # shared, seeded random generator (keeps the notebook reproducible)

def bagging_fit(X, y, n_estimators, max_depth=5, max_samples=200, random_state=None):
    """Train a bagging ensemble that uses decision trees as base estimators.

    Every tree is fitted on its own bootstrap sample: `max_samples` row
    indices drawn uniformly at random, with replacement, from the training set.

    Params:
    - X: training features, one row per example (indexed as X[rows, :]).
    - y: training labels, one entry per row of X.
    - n_estimators: the ensemble size, i.e, the number of base estimators.
    - max_depth: the maximum depth of decision trees.
    - max_samples: the size of the bootstrap subset.
    - random_state: source of randomness for the bootstrap draws and for the
      trees. None (default) uses the module-level seeded `rng`; an int is
      wrapped in a fresh np.random.RandomState; a RandomState is used as is.

    Returns:
    - a list with the `n_estimators` fitted DecisionTreeClassifier objects.
    """

    if random_state is None:
        random_state = rng                       # fall back to the notebook-wide seeded generator
    elif isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    y = np.asarray(y)                            # guarantees positional indexing with an index array
    n_examples = len(y)

    estimators = []
    for _ in range(n_estimators):
        # Draw from the seeded generator (not the global np.random state) so that
        # repeated runs build exactly the same ensemble.
        bag = random_state.choice(n_examples, max_samples, replace=True)   # bootstrap sample (row indices)

        # Trees shuffle candidate features internally; sharing the generator
        # makes tie-breaking between equally good splits reproducible too.
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
        tree.fit(X[bag, :], y[bag])                                         # fit a tree to the bootstrap sample
        estimators.append(tree)

    return estimators

In [5]:
def bagging_predict(X, estimators):
    """Predict labels for X by majority vote over a set of fitted base estimators.

    Params:
    - X: the samples to classify, one row per sample.
    - estimators: a non-empty collection of fitted estimators, each exposing
      a `predict(X)` method (e.g. the list returned by `bagging_fit`).

    Returns:
    - an array with one predicted label per sample. For estimators that
      return 1-D predictions its shape is (n_samples,), including the
      single-sample case, which yields shape (1,) rather than a 0-d array.

    Raises:
    - ValueError: if `estimators` is empty (there is nothing to vote with).
    """

    estimators = list(estimators)
    if not estimators:
        raise ValueError("bagging_predict requires at least one estimator")

    # Stack the individual predictions: one row per estimator, one column per sample.
    all_preds = np.array([tree.predict(X) for tree in estimators])

    # Majority vote: the most frequent label in each column. keepdims=False
    # already drops the estimator axis, so no further squeezing is needed
    # (squeezing would collapse a single-sample result to a scalar).
    y_pred, _ = mode(all_preds, axis=0, keepdims=False)

    return np.asarray(y_pred)

## Testing 

In [7]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Build a noisy 2D "two moons" dataset and hold out 33% of it for evaluation.
X, y = make_moons(n_samples=300, noise=0.25, random_state=rng)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=rng
)

# Train a bagging ensemble on the training split, then label the held-out points.
bag_ens = bagging_fit(
    X_train, y_train, n_estimators=500, max_depth=12, max_samples=300
)
y_pred = bagging_predict(X_test, bag_ens)

# Report the ensemble's accuracy on the held-out split.
print(accuracy_score(y_test, y_pred))

0.9292929292929293


In [15]:
# Collect the distinct class labels that occur in y.
cls = np.unique(y)
# Last expression of the cell: shows the feature-matrix shape together with the labels.
X.shape, cls

((300, 2), array([0, 1], dtype=int64))

# Compare the ensemble vs a single decision tree

In [9]:
# Baseline: a single decision tree with the same depth limit, trained on the
# whole training split instead of on bootstrap samples.
tree = DecisionTreeClassifier(max_depth=12)
tree.fit(X_train, y_train)
y_pred_single = tree.predict(X_test)

# Score both models on the same held-out split.
ens_acc = accuracy_score(y_test, y_pred)
tree_acc = accuracy_score(y_test, y_pred_single)

print(f'Bagging: test acc: {ens_acc}')
print(f'Single tree: test acc: {tree_acc}')

Bagging: test acc: 0.9292929292929293
Single tree: test acc: 0.9191919191919192


In [None]:
from plot_utils 