In [1]:
import random
from math import sqrt
from random import randrange

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Toy dataset: four rows of ten numeric values each.
# The last column is treated as the class label by the rest of the notebook
# (it is read with row[-1] and integer-encoded by convert_to_int).
# Each of the first three rows contains a single 100.0 among otherwise small
# values; the fourth row is constant at 20.0.
dataset =[
    [1.1, 100.0, 1.2, 1.6, 1.6, 1.1, 1.2, 1.2, 1.0, 1.0],
    [1.4, 1.4, 1.4, 1.5, 100.0, 1.4, 1.2, 1.2, 1.0, 1.0],
    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 100.0, 2.0, 2.0],
    [20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0]
]

In [3]:
def convert_to_float(dataset, column):
    """Convert one column of ``dataset`` to ``float``, in place.

    Cells that offer a ``strip`` method (strings, bytes) have surrounding
    whitespace removed before conversion. Any other cell, for example a
    value that is already numeric, is handed to ``float`` directly, so
    running the conversion a second time on the same column is harmless
    instead of raising ``AttributeError``.

    Args:
        dataset: Iterable of mutable rows (e.g. a list of lists).
        column: Index of the column to convert in every row.

    Returns:
        None. The rows are modified in place.

    Raises:
        ValueError: If a cell cannot be parsed as a float.
    """
    for row in dataset:
        value = row[column]
        # Only text-like cells need stripping; numbers have no .strip().
        if hasattr(value, "strip"):
            value = value.strip()
        row[column] = float(value)

def convert_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Codes are assigned in sorted order of the distinct values, so the
    mapping is the same on every run and does not depend on set iteration
    order. As a consequence, applying the function again to a column that
    already holds the codes 0..k-1 leaves it unchanged. If the distinct
    values cannot be compared with each other (mixed types), codes are
    assigned in order of first appearance instead.

    Args:
        dataset: Sequence of mutable rows (e.g. a list of lists). It is
            traversed more than once.
        column: Index of the column to encode in every row.

    Returns:
        dict mapping each original value to the integer code that replaced it.
    """
    distinct = {row[column] for row in dataset}
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Values of unrelated types have no natural order; first-seen order
        # is still deterministic for a given dataset.
        ordered = list(dict.fromkeys(row[column] for row in dataset))
    lookup = {value: code for code, value in enumerate(ordered)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [67]:
def calc_euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows.

    Every position of ``row1`` except the last one is compared with the
    same position of ``row2``; the final position (where this notebook keeps
    the class label) does not contribute to the distance.

    Raises:
        IndexError: If ``row2`` has fewer than ``len(row1) - 1`` entries.
    """
    n_features = len(row1) - 1
    squared_total = 0.0
    position = 0
    while position < n_features:
        delta = row1[position] - row2[position]
        squared_total += delta ** 2
        position += 1
    return sqrt(squared_total)

def get_neighbors(training_set, testing_set, num_neighbors):
    """Return the ``num_neighbors`` training rows closest to a query row.

    Args:
        training_set: Iterable of training rows.
        testing_set: Despite the name, a single query row; it is passed as
            the first argument of ``calc_euclidean_distance``.
        num_neighbors: How many of the closest rows to return.

    Returns:
        List of training rows ordered from nearest to farthest. Rows at the
        same distance keep their relative order from ``training_set``.

    Raises:
        IndexError: If ``num_neighbors`` exceeds the number of training rows.
    """
    scored = [
        (candidate, calc_euclidean_distance(testing_set, candidate))
        for candidate in training_set
    ]
    ranked = sorted(scored, key=lambda pair: pair[1])
    return [ranked[rank][0] for rank in range(num_neighbors)]
    
def predict_classification(train, test_row, num_neighbors, verbose=True):
    """Predict the class of ``test_row`` by majority vote of its neighbors.

    Args:
        train: Training rows; the last entry of each row is its class label.
        test_row: Query row to classify.
        num_neighbors: Number of nearest neighbors that take part in the vote.
        verbose: When true (the default), print each selected neighbor
            followed by the list of their labels. Pass ``False`` to silence
            the diagnostic output.

    Returns:
        The most frequent label among the neighbors. When several labels
        are equally frequent, the one belonging to the closest neighbor wins.

    Raises:
        ValueError: If no neighbors are selected (``num_neighbors`` <= 0).
    """
    neighbors = get_neighbors(train, test_row, num_neighbors)
    output_values = [row[-1] for row in neighbors]
    # Vote over the distance-ordered list instead of a set: max() returns
    # the first maximal item, so a tie is resolved in favor of the nearest
    # neighbor rather than by arbitrary set iteration order.
    prediction = max(output_values, key=output_values.count)

    if verbose:
        for neighbor in neighbors:
            print(neighbor)
        # Printed once, after the neighbors, rather than once per neighbor.
        print(output_values)

    return prediction

def k_nearest_neighbors(train, test, num_neighbors):
    """Classify every row of ``test`` with k-nearest-neighbors.

    Each query row is passed, together with ``train`` and ``num_neighbors``,
    to ``predict_classification``.

    Returns:
        List of predicted labels, one per row of ``test``, in the same order.
    """
    return [
        predict_classification(train, query, num_neighbors)
        for query in test
    ]

In [59]:
def calc_accuracy_metric(actual, predicted):
    """Return the percentage of positions where the two sequences agree.

    Positions are taken from ``actual``; ``predicted`` must be at least as
    long. The result is a float between 0.0 and 100.0.

    Raises:
        ZeroDivisionError: If ``actual`` is empty.
    """
    total = len(actual)
    hits = sum(1 for pos in range(total) if actual[pos] == predicted[pos])
    return hits / float(total) * 100.0
    
def cross_validation_split(dataset, n_folds, seed=None):
    """Randomly partition ``dataset`` into ``n_folds`` folds of equal size.

    Each fold holds ``len(dataset) // n_folds`` rows drawn without
    replacement. Rows left over when the size is not an exact multiple are
    not placed in any fold. The folds reference the original row objects;
    ``dataset`` itself is not modified.

    Args:
        dataset: Sized sequence of rows.
        n_folds: Number of folds to create.
        seed: Optional seed. When given, the split is drawn from a private
            generator seeded with it, so the same arguments always produce
            the same folds. When ``None`` (the default), rows are drawn
            from the module-level ``random`` generator exactly as before,
            so an earlier ``random.seed(...)`` call still applies.

    Returns:
        List of ``n_folds`` lists of rows.

    Raises:
        ZeroDivisionError: If ``n_folds`` is 0.
    """
    pick = randrange if seed is None else random.Random(seed).randrange
    remaining = list(dataset)
    fold_size = len(dataset) // n_folds
    dataset_split = []
    for _ in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            fold.append(remaining.pop(pick(len(remaining))))
        dataset_split.append(fold)
    return dataset_split

def evaluating_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross-validation.

    The dataset is split with ``cross_validation_split``. Each fold in turn
    serves as the test set while the rows of all other folds form the
    training set.

    Args:
        dataset: Rows whose last entry is the class label.
        algorithm: Callable invoked as ``algorithm(train_set, test_set, *args)``
            that returns one prediction per test row.
        n_folds: Number of folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        List with one accuracy percentage per fold, as computed by
        ``calc_accuracy_metric``.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for fold in folds:
        # Pick the other folds by identity. Comparing folds by equality (as
        # list.remove does) is wasted work and fails for rows whose ==
        # does not yield a plain bool, e.g. numpy arrays. Flattening in a
        # single pass also avoids the repeated copying of sum(lists, []).
        train_set = [row for other in folds if other is not fold for row in other]
        # The algorithm receives copies with the label blanked out, so it
        # cannot read the answer and the original rows stay intact.
        test_set = []
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(calc_accuracy_metric(actual, predicted))
    return scores


In [12]:
# Display the dataset rows.
# NOTE(review): the saved output shows integer class labels in the last
# column, so this cell was presumably executed after convert_to_int had
# already rewritten `dataset` in place — confirm the intended cell order.
dataset

[[1.1, 100.0, 1.2, 1.6, 1.6, 1.1, 1.2, 1.2, 1.0, 0],
 [1.4, 1.4, 1.4, 1.5, 100.0, 1.4, 1.2, 1.2, 1.0, 0],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 100.0, 2.0, 1],
 [20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 2]]

In [73]:
# Encode the class label (last column) as an integer code, in place.
label_column = len(dataset[0]) - 1
convert_to_int(dataset, label_column)

# The folds are drawn with the module-level random generator. Seeding it
# makes the split, and therefore the reported accuracy, the same on every
# Restart & Run All.
RANDOM_SEED = 1
random.seed(RANDOM_SEED)

n_folds = 2
num_neighbors = 1
scores = evaluating_algorithm(dataset, k_nearest_neighbors, n_folds, num_neighbors)
print('Mean Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))

[1.4, 1.4, 1.4, 1.5, 100.0, 1.4, 1.2, 1.2, 1.0, 0]
[0]
[1.1, 100.0, 1.2, 1.6, 1.6, 1.1, 1.2, 1.2, 1.0, 0]
[0]
[20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 2]
[2]
[20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 2]
[2]
Mean Accuracy: 0.00%


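From the test above we can see that the testing dataset results in 25% accuracy. Note that the folds are drawn at random, so the figure depends on the random state of the run; the saved output above comes from a run that scored 0.00%.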

In [74]:
# Query row of ten 1.0 values. The last entry sits in the label position and
# is not used by the distance computation.
row = [1.0] * 10
label = predict_classification(dataset, row, num_neighbors)

[20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 2]
[2]


In [75]:
# Report the query row together with the class code predicted for it.
print(f'Data={row}, Predicted: {label}')

Data=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], Predicted: 2


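In the test run with a query consisting of a series of 1.0 values, the nearest neighbor returned is the row that contains only 20s, so that row is the object most similar to the query object. Each of the other rows contains a single 100.0 value, which puts it farther from the query in Euclidean distance than the all-20 row.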