# 13. Random Forest

In [2]:
from random import seed, randrange
from math import sqrt

from functions import *

### Define functions

In [3]:
# Select the best Random Forest split for a dataset (subsample), with constrained feature candidates
# Input: dataset (subsample, like bagging; last column is the class value),
#        number of input features to (randomly) evaluate as splits
# Output: dictionary of the optimal split point with feature index ('index'), split value ('value'),
#         and the resulting pair of row groups ('groups')
# Note: n_features larger than the number of input columns is capped at that number, since
#       candidates are drawn without replacement and more unique indices cannot exist
def rf_get_split(dataset, n_features):
    # Get unique output values by looking through last dataset column
    class_values = list(set(row[-1] for row in dataset))
    # Number of input features available, i.e. every column except the trailing class column
    n_inputs = len(dataset[0]) - 1
    # Cap the number of candidates; without this the sampling loop below would spin forever
    # whenever more unique feature indices are requested than actually exist
    n_candidates = min(n_features, n_inputs)
    # Initialize best split feature index, split value, gini score, resulting split groups of records
        # Score starts at infinity so the first evaluated split always replaces the placeholders
    b_index, b_value, b_score, b_groups = 999, 999, float('inf'), None
    features = list()
    # Randomly select feature indices to test as splits, and add to list of candidates
        # I.e. same as creating subsamples for bagging, but for input features
    while len(features) < n_candidates:
        index = randrange(n_inputs)
        # Without replacement, e.g. if 8 features but n_features = 4, get 4 random but unique indices
        if index not in features:
            features.append(index)
    # Iterate over each randomly selected candidate feature
    for index in features:
        # Iterate over all rows in dataset, to try every observed feature value as split value
        for row in dataset:
            # Split entire dataset (subsample) using current feature and current feature value as split
            groups = cart_test_split(index, row[index], dataset)
            # Calculate Gini index of resulting groups for given class values
            gini = cart_gini_index(groups, class_values)
            # Strict comparison: on ties the earliest evaluated split is kept
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    # Return dictionary with best feature and value to split on, and resulting groups of dataset records
        # Difference from CART is we didn't evaluate every input feature, only a random subset
        # If no candidate was evaluated at all, the placeholders (999, 999, None) are returned
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Random Forest: recursively grow the children of an already-split node, or close them as terminal nodes
# Input: pre-split node (dictionary with 'groups'), max tree depth, min rows per node,
#        depth of the supplied node, num features to evaluate per split
# Output: None; the node dictionary is modified in place ('groups' removed, 'left'/'right' added)
def rf_split(node, max_depth, min_size, depth, n_features):
    # Take the two row groups out of the node; the node itself no longer needs to hold the data
    left_rows, right_rows = node['groups']
    node.pop('groups')

    # One side empty means the split separated nothing: both children share a single terminal node
    if not (left_rows and right_rows):
        shared_leaf = cart_to_terminal(left_rows + right_rows)
        node['left'] = shared_leaf
        node['right'] = shared_leaf
        return

    # Depth limit reached: stop growing this branch and turn both groups into terminal nodes
    if depth >= max_depth:
        left_leaf = cart_to_terminal(left_rows)
        right_leaf = cart_to_terminal(right_rows)
        node['left'] = left_leaf
        node['right'] = right_leaf
        return

    # Otherwise handle each child in turn, left first (depth first), right afterwards
    for side, rows in (('left', left_rows), ('right', right_rows)):
        if len(rows) <= min_size:
            # Too few rows to split any further, so this child becomes a terminal node
            node[side] = cart_to_terminal(rows)
        else:
            # Find a split for this child on a fresh random subset of features ...
            node[side] = rf_get_split(rows, n_features)
            # ... and keep growing below it, one level deeper
            rf_split(node[side], max_depth, min_size, depth + 1, n_features)

# Build a single Random Forest decision tree
# Input: training rows (subsample), max tree depth, min rows per node, num features to evaluate per split
# Output: root node dictionary of the finished tree
def rf_build_tree(train, max_depth, min_size, n_features):
    # The root node sits at depth 1
    root_depth = 1
    # First split over the whole training sample gives the root node
    tree = rf_get_split(train, n_features)
    # Grow everything below the root in place (left branches first, then right, depth first)
    rf_split(tree, max_depth, min_size, root_depth, n_features)
    # After growing, the root holds references to its two children rather than any rows
    return tree

# Random Forest Algorithm
# Input: train and test sets (rows whose last column is the class value), tree max depth,
#        min rows per node, subsample ratio, num trees, num features to evaluate per split
#        (the string "default" means the square root of the number of input features in train)
# Output: list of predictions corresponding to test set
def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features = "default"):
    # If using default number of random features to evaluate, use square root of total num features
    if(n_features == "default"):
        # Derive the count from the training rows passed in, never from a notebook-level variable,
        # so the function works on a fresh kernel and for any dataset it is given
        n_features = int(sqrt(len(train[0])-1))
    # Create list to hold the tree trained on each subsample
    trees = list()
    # For each tree, bootstrap a subsample, train the random forest tree on it, and append to list
    for _ in range(n_trees):
        sample = bootstrap_subsample(train, sample_size)
        tree = rf_build_tree(sample, max_depth, min_size, n_features)
        trees.append(tree)
    # For each row in test, get prediction by combining the trees' predictions
    predictions = [bagging_predict_mode(trees, row) for row in test]
    # Returns list of predictions for each row in test dataset
    return predictions

### Testing RF on Sonar dataset

In [5]:
# Fix the random seed so that repeated runs of this cell draw the same random numbers
seed(1)

# Load and prepare data
print("\nSonar Case Study:")
dataset = load_csv('data/sonar.all-data.csv')
# Position of the class column (the last one); every column before it is an input attribute
class_column = len(dataset[0]) - 1
# Input attributes are read as strings: convert each of them to floats
for i in range(class_column):
    str_column_to_float(dataset, i)
# Class labels are strings as well: convert them to integers
str_column_to_int(dataset, class_column)

# Evaluation settings
# 5 folds: 208/5 ~ 41 records held out as test (about 167 for training) on each of 5 iterations
n_folds = 5

# Relatively deep trees are permitted, with relatively narrow branches (few records) allowed
max_depth = 10
min_size = 1
# Ratio handed to the bootstrap subsampling step
sample_size = 1.0
# Number of features for random forest to evaluate at each split
    # Algorithm has this built-in by default, but it is spelled out here for clarity
n_features = int(sqrt(class_column))

# Run the algorithm for different forest sizes (one bagged tree per bootstrap sample)
    # Primarily to demonstrate the behaviour of the algorithm
for n_trees in (1, 5, 10):
    scores = evaluate_algorithm(
        dataset, random_forest, n_folds, accuracy_metric,
        max_depth, min_size, sample_size, n_trees, n_features,
    )

    print('\nTrees: %d' % n_trees)
    print('Scores: %s' % scores)
    print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))


Sonar Case Study:
Loaded data file data/sonar.all-data.csv with 208 rows and 61 columns.

Trees: 1
Scores: [51.21951219512195, 78.04878048780488, 58.536585365853654, 65.85365853658537, 53.65853658536586]
Mean Accuracy: 61.463%

Trees: 5
Scores: [63.41463414634146, 60.97560975609756, 56.09756097560976, 60.97560975609756, 56.09756097560976]
Mean Accuracy: 59.512%

Trees: 10
Scores: [65.85365853658537, 58.536585365853654, 68.29268292682927, 53.65853658536586, 75.60975609756098]
Mean Accuracy: 64.390%
