In [1]:
# Example of a Train-Test Test Harness
from random import seed
from random import randrange
from csv import reader

In [2]:
# Load a CSV file
def load_csv(filename):
	"""Read a CSV file into a list of rows, skipping blank lines.

	Args:
		filename: Path of the CSV file to read.

	Returns:
		A list with one entry per non-empty record. Each entry is the list
		of field strings produced by csv.reader; no type conversion is done.

	Raises:
		OSError: If the file cannot be opened.
	"""
	# newline='' leaves line-ending handling to the csv module, as its
	# documentation requires. Without it, universal-newline translation
	# rewrites '\r\n' sequences that appear inside quoted fields.
	with open(filename, 'r', newline='') as file:
		# csv.reader yields an empty list for a blank line; drop those.
		return [row for row in reader(file) if row]

In [3]:
# Convert string column to float
def str_column_to_float(dataset, column):
	"""Convert one column of every row from a string to a float, in place.

	Args:
		dataset: Iterable of mutable rows (e.g. lists) whose entry at
			`column` is a string.
		column: Index of the entry to convert in each row.

	Returns:
		None. Each row is modified directly.
	"""
	for record in dataset:
		raw_value = record[column]
		# Surrounding whitespace is stripped before parsing.
		record[column] = float(raw_value.strip())

In [4]:
# Split a dataset into a train and test set
def train_test_split(dataset, split):
	"""Randomly partition a dataset into a training list and a test list.

	Rows are drawn without replacement using random.randrange, so the
	result is repeatable after a call to random.seed. The input list is
	not modified; the returned lists hold the same row objects.

	Args:
		dataset: Sequence of rows.
		split: Fraction of the rows to place in the training set, between
			0 and 1 inclusive. Rows are drawn until the training count
			reaches split * len(dataset), so a fractional target is
			effectively rounded up.

	Returns:
		A (train, test) tuple of lists. `test` holds the rows that were
		not drawn, in their original relative order.

	Raises:
		ValueError: If split is outside the range [0, 1].
	"""
	# Reject bad fractions up front: a value above 1 would drain the pool
	# and fail inside randrange, and a negative value would silently
	# produce an empty training set.
	if not 0 <= split <= 1:
		raise ValueError('split must be between 0 and 1, got %r' % (split,))
	train = list()
	train_size = split * len(dataset)
	# Work on a shallow copy so the caller's list keeps all of its rows.
	dataset_copy = list(dataset)
	while len(train) < train_size:
		index = randrange(len(dataset_copy))
		train.append(dataset_copy.pop(index))
	return train, dataset_copy

In [5]:
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
	"""Return the percentage of positions where the two sequences agree.

	Args:
		actual: Sequence of true values.
		predicted: Sequence of predicted values, indexed in step with
			`actual`.

	Returns:
		A float between 0.0 and 100.0.
	"""
	total = len(actual)
	hits = sum(
		1 for position in range(total)
		if actual[position] == predicted[position]
	)
	return hits / float(total) * 100.0

In [6]:
# Evaluate an algorithm using a train/test split
def evaluate_algorithm_simple_split(dataset, algorithm, split, *args):
	"""Score an algorithm on a single random train/test split.

	Args:
		dataset: Sequence of rows whose last entry is the value to predict.
		algorithm: Callable invoked as algorithm(train, test, *args) that
			returns one prediction per test row.
		split: Passed to train_test_split to size the training set.
		*args: Extra positional arguments forwarded to `algorithm`.

	Returns:
		The value returned by accuracy_metric for the held-out rows.
	"""
	def hide_label(row):
		# Copy the row so the original keeps its label, then blank the
		# last entry so the algorithm cannot read the answer.
		masked = list(row)
		masked[-1] = None
		return masked

	train, test = train_test_split(dataset, split)
	unlabeled = [hide_label(row) for row in test]
	predicted = algorithm(train, unlabeled, *args)
	expected = [row[-1] for row in test]
	return accuracy_metric(expected, predicted)

In [7]:
# zero rule algorithm for classification
def zero_rule_algorithm_classification(train, test):
	"""Predict the most frequent training label for every test row.

	Args:
		train: Sequence of rows; the last entry of each row is its label.
		test: Sequence of rows to predict for; only its length is used.

	Returns:
		A list with len(test) entries, each the most common label found
		in `train`.
	"""
	labels = [row[-1] for row in train]
	# Choose the distinct label with the highest occurrence count.
	majority = max(set(labels), key=labels.count)
	return [majority] * len(test)

In [8]:
# Test the train/test harness
# Seed the random module so the randomized split below is repeatable.
seed(1)
# load and prepare data
# NOTE: os is imported here rather than in the first cell; the later
# cross-validation loading cell also uses os.path, so this cell must run first.
import os
# Path is relative to the notebook's working directory.
filename = os.path.join('data', 'pima-indians-diabetes.csv')
dataset = load_csv(filename)

# load_csv returns rows of strings; convert every column to float in place.
# The column count is taken from the first row.
for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)

In [15]:
# evaluate algorithm
# split is the fraction of rows drawn into the training set; the rest are
# held out as the test set.
split = 0.6
accuracy = evaluate_algorithm_simple_split(dataset, zero_rule_algorithm_classification, split)

# accuracy is already a percentage, so the format string only appends a
# literal percent sign (%%).
print('Accuracy: %.3f%%' % (accuracy))

Accuracy: 65.798%


In [10]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	"""Randomly deal the rows of a dataset into n_folds equally sized folds.

	Rows are drawn without replacement using random.randrange. Each fold
	receives int(len(dataset) / n_folds) rows; any rows left over after
	that are not placed in a fold. The input list is not modified.

	Args:
		dataset: Sequence of rows.
		n_folds: Number of folds to create.

	Returns:
		A list of n_folds lists of rows.
	"""
	remaining = list(dataset)
	rows_per_fold = int(len(dataset) / n_folds)
	folds = []
	for _fold_number in range(n_folds):
		current = []
		for _draw in range(rows_per_fold):
			pick = randrange(len(remaining))
			current.append(remaining.pop(pick))
		folds.append(current)
	return folds

In [11]:
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm_cross_validation(dataset, algorithm, n_folds, *args):
	"""Score an algorithm with k-fold cross validation.

	Each fold in turn is held out as the test set while the rows of all
	other folds form the training set.

	Args:
		dataset: Sequence of rows whose last entry is the value to predict.
		algorithm: Callable invoked as algorithm(train, test, *args) that
			returns one prediction per test row.
		n_folds: Number of folds, passed to cross_validation_split.
		*args: Extra positional arguments forwarded to `algorithm`.

	Returns:
		A list with one accuracy_metric result per fold, in fold order.
	"""
	folds = cross_validation_split(dataset, n_folds)
	scores = list()
	for fold_index, fold in enumerate(folds):
		# Exclude the held-out fold by position. list.remove() matches by
		# equality, so two folds with equal contents could cause a
		# different fold to be dropped and leave the held-out fold's own
		# rows in the training set. Flattening with a comprehension also
		# avoids the repeated list copying done by sum(list_of_lists, []).
		train_set = [
			row
			for other_index, other_fold in enumerate(folds)
			if other_index != fold_index
			for row in other_fold
		]
		test_set = list()
		for row in fold:
			# Copy each test row and blank its label so the algorithm
			# cannot read the answer; the fold keeps the true label.
			row_copy = list(row)
			row_copy[-1] = None
			test_set.append(row_copy)
		predicted = algorithm(train_set, test_set, *args)
		actual = [row[-1] for row in fold]
		scores.append(accuracy_metric(actual, predicted))
	return scores

In [12]:
# Test cross validation test harness
# Re-seed the random module so the fold assignment below is repeatable.
seed(1)
# load and prepare data
# Uses os, which is imported in the earlier train/test harness cell.
filename = os.path.join('data', 'pima-indians-diabetes.csv')
# Reload from disk so this cell starts from freshly parsed string rows.
dataset = load_csv(filename)

# Convert every column to float in place; the column count is taken from
# the first row.
for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)

In [13]:
# evaluate algorithm
# Number of folds; each fold is held out once as the test set.
n_folds = 5
scores = evaluate_algorithm_cross_validation(dataset, zero_rule_algorithm_classification, n_folds)

# scores holds one accuracy percentage per fold; report them and their mean.
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/len(scores)))

Scores: [62.091503267973856, 64.70588235294117, 64.70588235294117, 64.70588235294117, 69.28104575163398]
Mean Accuracy: 65.098%
