# 12. Bootstrap Aggregation (Bagging)

In [3]:
from random import seed, randrange

from functions import *

### Define functions
* Bootstrap aggregating decision trees

In [4]:
def bootstrap_subsample(dataset, ratio = 1.0):
    """Draw a bootstrap sample (random rows, with replacement) from a dataset.

    Input:
        dataset: sequence of observations (rows) to resample
        ratio: sample size as a fraction of len(dataset) (e.g. 70% = 0.7)
    Output:
        list of round(len(dataset) * ratio) rows; because rows are drawn with
        replacement, the same row can appear more than once
    """
    n_rows = len(dataset)
    # Number of draws needed to reach the requested fraction of the dataset
    n_draws = round(n_rows * ratio)
    # One randrange() call per draw; the chosen rows are referenced, not copied
    return [dataset[randrange(n_rows)] for _ in range(n_draws)]

def bagging_predict_mode(trees, row):
    """Predict the class of one row by majority vote over a list of bagged trees.

    Input:
        trees: list of bagged trees, each handed to cart_predict together with row
        row: observation to predict
    Output:
        the prediction made most often across the trees
    """
    # Collect one vote per tree
    votes = []
    for tree in trees:
        votes.append(cart_predict(tree, row))
    # Rank each distinct vote by how many times it was cast; on a tie, max()
    # keeps whichever tied value the set yields first
    distinct_votes = set(votes)
    return max(distinct_votes, key=votes.count)

def cart_bagging(train, test, max_depth, min_size, sample_size, n_trees):
    """Fit one decision tree per bootstrap sample, then predict by majority vote.

    Input:
        train: rows used to draw the bootstrap samples
        test: rows to predict
        max_depth: maximum tree depth, forwarded to cart_build_tree
        min_size: minimum rows per branch, forwarded to cart_build_tree
        sample_size: bootstrap sample size as a ratio of len(train)
        n_trees: number of bootstrap samples, i.e. number of trees to build
    Output:
        list with one bagged prediction per row of test, in the same order
    """
    # Each tree is fitted on its own freshly drawn bootstrap sample of train
    forest = [
        cart_build_tree(bootstrap_subsample(train, sample_size), max_depth, min_size)
        for _ in range(n_trees)
    ]
    # Every test row is voted on by the full set of trees
    predictions = []
    for row in test:
        predictions.append(bagging_predict_mode(forest, row))
    return predictions

### Testing bagging on contrived dataset

In [5]:
# Reproducible toy data: 20 single-column rows holding random integers from 0 to 9
# (each observation is a one-element list because most functions expect row === list)
seed(1)
dataset = [[randrange(10)] for _ in range(20)]
print("Dataset:", dataset)
true_mean = mean([row[0] for row in dataset])
print('True Mean: %.3f' % true_mean)

# Each bootstrap sample holds 10% of the rows
ratio = 0.10
# Compare the estimate of the mean when averaging over 1, 10 and 100 bootstrap samples
for size in [1, 10, 100]:
    # Draw `size` bootstrap samples and record the mean value of each one
    sample_means = [
        mean([row[0] for row in bootstrap_subsample(dataset, ratio)])
        for _ in range(size)
    ]
    # The average of the per-sample means is the estimate of the dataset mean
    print('Samples=%d, Estimated Mean: %.3f' % (size, mean(sample_means)))

Dataset: [[2], [9], [1], [4], [1], [7], [7], [7], [6], [3], [1], [7], [0], [6], [6], [9], [0], [7], [4], [3]]
True Mean: 4.500
Samples=1, Estimated Mean: 4.000
Samples=10, Estimated Mean: 4.700
Samples=100, Estimated Mean: 4.570


### Testing bagging on Sonar dataset

In [7]:
# Fix the random stream so the folds and bootstrap samples are repeatable
seed(1)

# Load the Sonar data
print("\nSonar Case Study:")
dataset = load_csv('data/sonar.all-data.csv')
# Every column except the last holds an attribute: convert those strings to floats
for col in range(len(dataset[0]) - 1):
    str_column_to_float(dataset, col)
# The last column holds the class: convert it to integers
str_column_to_int(dataset, len(dataset[0]) - 1)

# Cross-validation setup: 208 rows / 5 folds ~ 41 test rows and 167 train rows per fold
n_folds = 5

# Tree shape: fairly deep trees, and branches may hold as few as 2 rows
max_depth = 6
min_size = 2
# Bootstrap samples are drawn at half the size of the data they are sampled from,
# to force some variety between the subsamples each tree is trained on
# (the usual bagging default is a sample as large as the original training dataset)
sample_size = 0.50

# Repeat the evaluation with more and more bagged trees (one tree per bootstrap sample),
# primarily to demonstrate the behaviour of the algorithm
for n_trees in (1, 5, 10, 50):
    scores = evaluate_algorithm(
        dataset, cart_bagging, n_folds, accuracy_metric,
        max_depth, min_size, sample_size, n_trees,
    )
    mean_accuracy = sum(scores) / float(len(scores))

    print('\nTrees: %d' % n_trees)
    print('Scores: %s' % scores)
    print('Mean Accuracy: %.3f%%' % mean_accuracy)


Sonar Case Study:
Loaded data file data/sonar.all-data.csv with 208 rows and 61 columns.

Trees: 1
Scores: [60.97560975609756, 65.85365853658537, 58.536585365853654, 63.41463414634146, 65.85365853658537]
Mean Accuracy: 62.927%

Trees: 5
Scores: [65.85365853658537, 56.09756097560976, 68.29268292682927, 68.29268292682927, 53.65853658536586]
Mean Accuracy: 62.439%

Trees: 10
Scores: [56.09756097560976, 63.41463414634146, 68.29268292682927, 75.60975609756098, 60.97560975609756]
Mean Accuracy: 64.878%

Trees: 50
Scores: [73.17073170731707, 70.73170731707317, 68.29268292682927, 65.85365853658537, 73.17073170731707]
Mean Accuracy: 70.244%
