In [296]:
from math import sqrt

In [297]:
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

dataset = [[2.7810836,2.550537003,0],
           [1.465489372,2.362125076,0],
           [3.396561688,4.400293529,0],
           [1.38807019,1.850220317,0],
           [3.06407232,3.005305973,0],
           [7.627531214,2.759262235,1],
           [5.332441248,2.088626775,1],
           [6.922596716,1.77106367,1],
           [8.675418651,-0.242068655,1],
           [7.673756466,3.508563011,1]]

row0 = dataset[0]
for row in dataset:
    distance = euclidean_distance(row0, row)
    print(distance)

0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795


In [298]:
def get_neighbors(train, test_row, num_neighbors):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors

neighbors = get_neighbors(dataset, dataset[0], 3)
for neighbor in neighbors:
    print(neighbor)

[2.7810836, 2.550537003, 0]
[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]


In [299]:
def predict_classification(train, test_row, num_neighbrs):
    neighbors = get_neighbors(train, test_row, num_neighbrs)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction

prediction = predict_classification(dataset, dataset[0], 3)
print('Expexted {}, Got {}.'.format(dataset[0][-1], prediction))

Expexted 0, Got 0.


In [300]:
num_neighbors = 5
row = [5.7, 2.9, 4.2, 1.3]
label = predict_classification(dataset, row, num_neighbors)
print('Data={}, Predicted: {}'.format(row, label))

Data=[5.7, 2.9, 4.2, 1.3], Predicted: 1


In [301]:
from csv import reader

def load_csv(filename):
    filename = 'iris.txt'
    file = open(filename, 'r')
    lines = reader(file)
    dataset = list(lines)
    return dataset

dataset = load_csv('iris.txt')
for row in dataset[:3]:
    print(row)

['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
['4.9', '3.0', '1.4', '0.2', 'Iris-setosa']
['4.7', '3.2', '1.3', '0.2', 'Iris-setosa']


In [302]:
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())
        
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)

In [303]:
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

str_column_to_int(dataset, 4)
print(lookup)

{'Iris-versicolor': 0, 'Iris-setosa': 1, 'Iris-virginica': 2}


In [304]:
from random import seed
from random import randrange

def train_test_split(dataset, split=0.6):
    train = list()
    train_size = len(dataset)*split
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

seed(1)
train, test = train_test_split(dataset)
print(train[:3])
print(test[:3])

[[4.9, 3.1, 1.5, 0.1, 1], [6.3, 2.5, 5.0, 1.9, 2], [5.4, 3.9, 1.3, 0.4, 1]]
[[4.9, 3.0, 1.4, 0.2, 1], [4.6, 3.1, 1.5, 0.2, 1], [5.4, 3.7, 1.5, 0.2, 1]]


In [305]:
def cross_validation_split(dataset, folds=3):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset)/folds)
    for i in range(folds):
        fold = list()
        while (len(fold) < fold_size):
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

seed(1)
folds = cross_validation_split(dataset, 5)
for i in range(len(folds)):
    print(folds[i][:3])

[[4.9, 3.1, 1.5, 0.1, 1], [6.3, 2.5, 5.0, 1.9, 2], [5.4, 3.9, 1.3, 0.4, 1]]
[[6.7, 2.5, 5.8, 1.8, 2], [4.4, 3.0, 1.3, 0.2, 1], [5.9, 3.2, 4.8, 1.8, 0]]
[[7.6, 3.0, 6.6, 2.1, 2], [6.1, 3.0, 4.6, 1.4, 0], [6.5, 3.2, 5.1, 2.0, 2]]
[[4.6, 3.1, 1.5, 0.2, 1], [6.3, 2.5, 4.9, 1.5, 0], [4.8, 3.4, 1.6, 0.2, 1]]
[[7.7, 3.0, 6.1, 2.3, 2], [6.3, 2.8, 5.1, 1.5, 2], [5.6, 3.0, 4.1, 1.3, 0]]


In [306]:
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax

minmax = dataset_minmax(dataset)
minmax

[[4.3, 7.9], [2.0, 4.4], [1.0, 6.9], [0.1, 2.5], [0, 2]]

In [307]:
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i]-minmax[i][0])/(minmax[i][1]-minmax[i][0])

# normalize_dataset(dataset, minmax)
# print(dataset[:3])

In [308]:
num_neighbors = 5
def k_nearest_neighbors(train, test, num_neighbors):
    predictions = list()
    for row in test:
        output = predict_classification(train, row, num_neighbors)
        predictions.append(output)
    return(predictions)

In [315]:
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual))*100.0

In [316]:
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = k_nearest_neighbors(train_set, test_set, 5)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

In [324]:
n_folds = 5
num_neighbors = 5
scores = evaluate_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Scores: {}'.format(scores))
print('Mean Accuracy: {:,.3f}%'.format(sum(scores)/float(len(scores))))

Scores: [96.66666666666667, 100.0, 96.66666666666667, 96.66666666666667, 100.0]
Mean Accuracy: 98.000%
