<a href="https://colab.research.google.com/github/pertvirt/jupyter_notebook/blob/master/kNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!tar xzvf data.tar.gz

In [3]:
# k-nearest neighbors
from random import seed
from random import randrange
from csv import reader
from math import sqrt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Load a file
def load_data(filename):
    dataset = list()
    with open(filename, 'r') as file:
        for row in file:
            if not row:
                continue
            dataset.append(row.rstrip('\n').split(','))
    return dataset


# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())


# Find the min and max values for each column
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax


# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split


# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0


# version 2 - Evaluate an algorithm in general
def evaluate_algorithm(train_set, test_set, algorithm, *args):
    results = {}
    predicted = algorithm(train_set, test_set, *args)
    results['predicted'] = predicted
    actual = [row[-1] for row in test_set]
    results['actual'] = actual
    accuracy = accuracy_metric(actual, predicted)
    return (results, accuracy)


# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1) - 1):
        distance += (row1[i] - row2[i]) ** 2
    return sqrt(distance)


# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    distances = list()
    for train_row in train:
        dist = euclidean_distance(test_row, train_row)
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = list()
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors


# Make a prediction with neighbors
def predict_classification(train, test_row, num_neighbors):
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    prediction = max(set(output_values), key=output_values.count)
    return prediction


# kNN Algorithm
def k_nearest_neighbors(train, test, num_neighbors):
    predictions = list()
    for row in test:
        output = predict_classification(train, row, num_neighbors)
        predictions.append(output)
    return (predictions)


# load training set and preprocess data
seed(1)
train_set = 'data/iris/iris.trn'
loaded_train_set = load_data(train_set)

for i in range(len(loaded_train_set[0]) - 1):
    str_column_to_float(loaded_train_set, i)

# load test set and preprocess data
test_set = 'data/iris/iris.tst'
loaded_test_set = load_data(test_set)

for i in range(len(loaded_test_set[0]) - 1):
    str_column_to_float(loaded_test_set, i)

num_neighbors = 5
(confusions, scores) = evaluate_algorithm(loaded_test_set, loaded_test_set, k_nearest_neighbors, num_neighbors)
# print('Scores: %s' % scores)
print('Accuracy: %.3f%%' % (scores))

# Python script for confusion matrix creation 
actual = confusions['actual']
predicted = confusions['predicted']
results = confusion_matrix(actual, predicted)
print('Confusion Matrix :')
print(results)


Accuracy: 98.000%
Confusion Matrix :
[[17  0  0]
 [ 0 15  0]
 [ 0  1 17]]
