In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Loading data
# Features and target are selected by column name rather than by position:
# read_csv returns columns in the order they appear in the file (the order of
# the usecols list is ignored), so positional slicing would silently mix the
# label into the features if the CSV layout ever changed.
FEATURE_COLUMNS = ['Age', 'EstimatedSalary']
TARGET_COLUMN = 'Purchased'

df = pd.read_csv('LOR.csv', usecols = FEATURE_COLUMNS + [TARGET_COLUMN])
x = df[FEATURE_COLUMNS].values  # feature matrix, one column per entry in FEATURE_COLUMNS
y = df[TARGET_COLUMN].values    # 1-D array of labels

In [3]:
# Feature scaling: bring every feature column to zero mean and unit variance
# so that no single column dominates a distance computed across columns.
# NOTE(review): the scaler is fitted on every row of x, i.e. before any
# train/test split has been made in this notebook.
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

In [4]:
# Splitting data into train & test
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.25, random_state = 0)

# Standardise with statistics taken from the training rows only, so that
# nothing about the held-out rows leaks into the features the model sees.
# Standardisation is an affine map, so re-fitting here yields train-only
# scaling even when x has already been standardised on all rows beforehand.
train_scaler = StandardScaler().fit(train_x)
train_x = train_scaler.transform(train_x)
test_x = train_scaler.transform(test_x)

# Data processing to feed into the model: turn the labels into a column
# vector and attach it as the last column of the training features.
train_y = train_y.reshape((-1, 1))

dataset_train = np.hstack((train_x, train_y))

In [5]:
def get_distance(default_row, dataset_row):
    """Euclidean distance between a query point and one labelled dataset row.

    Parameters
    ----------
    default_row : array supporting element-wise arithmetic
        Feature values of the query point.
    dataset_row : array
        Feature values followed by the category label in the last position.

    Returns
    -------
    tuple
        ``(distance, label)`` where ``label`` is ``dataset_row[-1]``.
    """
    # The last entry is the label; everything before it is a feature.
    features, label = dataset_row[:-1], dataset_row[-1]
    offsets = default_row - features
    squared_length = sum(offsets ** 2)
    return (sqrt(squared_length), label)

def get_belonging_category(default_row, dataset, neighbors_number):
    """Predict the category of ``default_row`` by k-nearest-neighbour vote.

    Parameters
    ----------
    default_row : array-like
        Feature values of the point to classify.
    dataset : 2-D array-like
        Labelled rows; every column but the last holds a feature, the last
        column holds the category label.
    neighbors_number : int
        How many of the closest rows take part in the vote. Values larger
        than the number of rows simply use every row.

    Returns
    -------
    The most frequent label among the nearest rows. On a tied vote the
    smallest label wins.

    Raises
    ------
    ValueError
        If ``dataset`` is not a non-empty 2-D array or ``neighbors_number``
        is smaller than 1 (a negative value would otherwise slice from the
        wrong end and silently vote with the wrong neighbours).
    """
    dataset = np.asarray(dataset)
    if dataset.ndim != 2 or dataset.shape[0] == 0:
        raise ValueError("dataset must be a non-empty 2-D array")
    if neighbors_number < 1:
        raise ValueError("neighbors_number must be a positive integer")

    features, labels = dataset[:, :-1], dataset[:, -1]

    # Euclidean distance from the query point to every dataset row at once.
    offsets = features - np.asarray(default_row)
    distances = np.sqrt(np.sum(offsets ** 2, axis=1))

    # A stable sort keeps the dataset's own order among equidistant rows.
    nearest = np.argsort(distances, kind="stable")[:neighbors_number]

    categories, categories_count = np.unique(labels[nearest], return_counts=True)
    # np.unique returns sorted labels and argmax returns the first maximum,
    # which is what makes a tie resolve to the smallest label.
    return categories[np.argmax(categories_count)]

# Classify every held-out row by a vote among its 9 nearest training rows
y_pred_algo = [
    get_belonging_category(sample, dataset_train, 9)
    for sample in test_x
]

# Compare the predictions with the true test labels
score = accuracy_score(test_y, y_pred_algo)
mat = confusion_matrix(test_y, y_pred_algo)
print(mat, score)

[[64  4]
 [ 3 29]] 0.93
