# 3. Evaluation harness

In [2]:
from random import seed, randrange
from csv import reader

# Importing our own functions
from functions import *

### Define functions
Helper functions to easily fit, evaluate, and compare algorithms.
* Train/test split evaluation harness
* Cross-validation split evaluation harness
* Dynamic evaluation harness with flexible choices of splits and metrics

In [8]:
# algorithm is the algorithm to test, in our testing we use baseline algorithms
# *args is for any additional configuration parameters required by the above algorithm
# Requires train_test_split and accuracy_metric to be defined
def split_evaluate_algorithm(dataset, algorithm, split, *args):
    train, test = train_test_split(dataset, split)
    test_set = list()
    for row in test:
        row_copy = list(row)
        # Assuming last column is output, clear outputs to avoid accidental cheating by baseline prediction
        row_copy[-1] = None
        test_set.append(row_copy)
    # Get baseline prediction using specified algorithm
    predicted = algorithm(train, test_set, *args)
    actual = [row[-1] for row in test]
    # Assumes it's a classification problem, should make accuracy algorithm a parameter
    accuracy = accuracy_metric(actual, predicted)
    return accuracy

def cross_evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    # List to store accuracy scores of each fold k used as test set
    scores = list()
    # For each fold:
    for fold in folds:
        # Use all folds except the current as training data
        train_set = list(folds) # At this point, a list of folds where each fold is also a list
        # Remove the current fold from training set, instead using it as the test set
        train_set.remove(fold)
        # Flattens the list of lists into one big list containing all the rows (= appends folds together)
        train_set = sum(train_set, [])
        # Initialize empty list to hold test data
        test_set = list()
        # Set current fold as test set, and nullify the output to avoid accidental cheating
        for row in fold:
            # row_copy = row   would reference the same object, instead of copying it
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        # Predict based on training with all other folds than the current fold, and specified algorithm
        predicted = algorithm(train_set, test_set, *args)
        # Set current fold outputs as test set
        actual = [row[-1] for row in fold]
        # Rate accuracy by comparing prediction to test set (i.e. current fold)
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

# folds_or_split is number of folds or train/test split. Train/test used if <= 1.0, otherwise cross-validation
# default metric to evaluate is accuracy, hence assumes classification unless specified otherwise
# algorithm is the ML algorithm to test
# Metric accuracy indirectly determines whether it's a classification or regression algorithm
def evaluate_algorithm(dataset, algorithm, folds_or_split = 0.6, metric = accuracy_metric, *args):
    scores = list()
    folds = list()

    if folds_or_split <= 1.0:
        # Then train/test split
        train, test = train_test_split(dataset, folds_or_split)
        # Append test set first, since that will be the first to be iterated over
        folds.append(test)
        folds.append(train)
    else:
        # Then cross-validation
        folds = cross_validation_split(dataset, folds_or_split)

    for fold in folds:
        # Training set is all folds but the current
        train_set = list(folds)
        train_set.remove(fold)  # Remove current fold, sine that is the test set
        train_set = sum(train_set, [])  # Flatten training data into one set/list

        # Nullify output values in test set to avoid accidental cheating
        test_set = list()
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)

        # Predict values of test set based on specified algorithm
        predicted = algorithm(train_set, test_set, *args)
        # Observed (real) data
        actual = [row[-1] for row in fold]

        # Calculate accuracy based on specified metric; depends on if classification or regression
        accuracy = metric(actual, predicted)

        # If train/test split, stop after first iteration since only one fold (i.e. the test set)
        if folds_or_split <= 1.0:
            # Return accuracy and exit function - breaking loop before reverse combination of "folds" is used
            return accuracy
        # If not, continue the iteration
        else:
            # Store score of this fold: will error if metric returns more than one value!
            scores.append(accuracy)
            continue

    return scores

### Testing train/test harness on zero rule baseline

In [9]:
# Testing train-test test harness
seed(1)
filename = 'data/pima-indians-diabetes.csv'
dataset = load_csv(filename)
# Convert all columns to floats
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
# evaluate algorithm
split = 0.6
accuracy = split_evaluate_algorithm(dataset, zero_rule_algorithm_classification, split)
print('Accuracy: %.3f%%' % (accuracy))

Loaded data file data/pima-indians-diabetes.csv with 768 rows and 9 columns.
Accuracy: 67.427%


### Testing cross validation harness on zero rule baseline

In [10]:
# Testing cross-validation split harness
seed(1)
print('')
filename = 'data/pima-indians-diabetes.csv'
dataset = load_csv(filename)
# Convert all columns to floats
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
# evaluate algorithm
folds = 5
scores = cross_evaluate_algorithm(dataset, zero_rule_algorithm_classification, folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/len(scores)))


Loaded data file data/pima-indians-diabetes.csv with 768 rows and 9 columns.
Scores: [62.091503267973856, 64.70588235294117, 64.70588235294117, 64.70588235294117, 69.28104575163398]
Mean Accuracy: 65.098%


### Testing dynamic evaluation harness on zero rule baseline

In [12]:
# Testing dynamic evaluation method
seed(1)
print('')
dataset = load_csv('data/pima-indians-diabetes.csv')
# Convert all data in dataset to float
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
# Test method using zero_rule; coincidence that this is a baseline algo, can test any algo!
scores = evaluate_algorithm(dataset, zero_rule_algorithm_classification, 5, rmse_metric)
print(scores)
print(sum(scores)/len(scores))

print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (100*sum(scores)/len(scores)))


Loaded data file data/pima-indians-diabetes.csv with 768 rows and 9 columns.
[0.6156987634551992, 0.5940885257860046, 0.5940885257860046, 0.5940885257860046, 0.5542468245138262]
0.5904422330654079
Scores: [0.6156987634551992, 0.5940885257860046, 0.5940885257860046, 0.5940885257860046, 0.5542468245138262]
Mean Accuracy: 59.044%
