In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the preprocessed dataset; the path is relative to the notebook's
# working directory.
data = pd.read_csv("../preprocessed_data.csv")

In [4]:
from sklearn.model_selection import train_test_split

# Columns used as model inputs; 'class' is the target label.
feature_cols = ['age','height_cm','weight_kg','body_fat_%','diastolic','systolic','gripforce','sit_and_bend_forward_cm','sit_ups_counts','broad_jump_cm','bmi']
x = data[feature_cols]
y = data['class']

# Split dataset into training set and test set.
# stratify=y keeps the class proportions the same in both splits, and
# random_state=1 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1, stratify=y) # 80% training and 20% test

# Show the per-class row counts of the training split.
y_train.value_counts()

C    2661
A    2658
B    2657
D    2208
Name: class, dtype: int64

In [5]:
from collections import Counter
import math
import pandas as pd

def knn(data, query, k, distance_fn, choice_fn, attrNumber):
    """Find the ``k`` rows of ``data`` closest to ``query`` and aggregate their labels.

    Parameters
    ----------
    data : sequence of rows
        Each row must be indexable; its last element (``row[-1]``) is taken
        as the label.
    query : object
        The example to predict for; passed unchanged to ``distance_fn``.
    k : int
        Number of neighbours to keep. Must be at least 1. When ``k`` is
        larger than ``len(data)`` every row is used.
    distance_fn : callable
        Called as ``distance_fn(row, query, attrNumber)``; must return a
        value that can be ordered.
    choice_fn : callable
        Receives the list of the ``k`` nearest labels (nearest first) and
        returns the prediction, e.g. ``mode`` for classification or a mean
        for regression.
    attrNumber : int
        Forwarded to ``distance_fn`` as its third argument.

    Returns
    -------
    tuple
        ``(neighbours, prediction)`` where ``neighbours`` is a list of
        ``(distance, index)`` pairs in ascending order (ties are broken by
        the smaller row index) and ``prediction`` is ``choice_fn``'s result.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``data`` contains no rows.
    """
    # Local import so the function stays self-contained within its cell.
    import heapq

    # A negative k would silently slice off the *farthest* rows instead of
    # failing, and k == 0 would hand an empty label list to choice_fn.
    if k < 1:
        raise ValueError("k must be >= 1, got %r" % (k,))

    # Lazily pair every row's distance with its index; nothing is
    # materialised beyond the k best candidates.
    distances_and_indices = (
        (distance_fn(example, query, attrNumber), index)
        for index, example in enumerate(data)
    )

    # Equivalent to sorted(distances_and_indices)[:k] but without sorting
    # the whole collection.
    k_nearest_distances_and_indices = heapq.nsmallest(k, distances_and_indices)
    if not k_nearest_distances_and_indices:
        raise ValueError("data must contain at least one example")

    # The label is stored as the last element of each row.
    k_nearest_labels = [data[index][-1] for _, index in k_nearest_distances_and_indices]

    return k_nearest_distances_and_indices, choice_fn(k_nearest_labels)

def mode(labels):
    """Return the label that occurs most often in ``labels``.

    If several labels share the highest count, the one that
    ``Counter.most_common`` ranks first is returned. An empty ``labels``
    raises ``IndexError``.
    """
    ranking = Counter(labels).most_common(1)
    winner, _occurrences = ranking[0]
    return winner

def euclidean_distance(sample, target, attrNumber):
    """Euclidean distance between ``sample`` and ``target`` over their first
    ``attrNumber`` positions.

    Elements at index ``attrNumber`` and beyond (such as a trailing label)
    are ignored. With ``attrNumber == 0`` the result is ``0.0``.
    """
    squared_total = 0
    for idx in range(attrNumber):
        gap = sample[idx] - target[idx]
        # Accumulate one term at a time, in index order.
        squared_total = squared_total + math.pow(gap, 2)
    return math.sqrt(squared_total)



In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

# Number of leading attributes the distance function compares. The label is
# appended after them, so it ends up as the last element of every row.
attrNumber = len(feature_cols)

# Build [features..., label] rows from *copies* of the split frames.
# Assigning the "class" column directly on X_train / X_test would write the
# label into the feature frames themselves (leaking the target into anything
# that later uses them) and can trigger pandas' SettingWithCopyWarning.
# "class" is a Python keyword, hence the dict-unpacking form of assign().
train_data = X_train.assign(**{"class": y_train}).values.tolist()
test_data = X_test.assign(**{"class": y_test}).values.tolist()

# Classify every test row with the k-NN implementation defined in this
# notebook; only the predicted label is kept.
predict = []
for test in test_data:
    nearest_neighbors, predicted_label = knn(
        train_data, test, k=15, distance_fn=euclidean_distance, choice_fn=mode, attrNumber=attrNumber
    )
    predict.append(predicted_label)

In [13]:
# Ground-truth labels in the same order as the predictions.
real = y_test.to_list()

# Dump both label sequences for side-by-side inspection.
print("predicted class in array\n",predict)

print("real class in array\n",real)

# Count the positions where the prediction matches the true label.
count = sum(1 for idx in range(len(real)) if predict[idx] == real[idx])

print("\ncorrect prediction: ",str(count))
print("from total record: ",len(real))

from sklearn.metrics import confusion_matrix, classification_report

# Text summary of the classification metrics computed by scikit-learn.
print(classification_report(real, predict))

predicted class in array
 ['C', 'B', 'C', 'B', 'A', 'B', 'C', 'A', 'A', 'B', 'A', 'C', 'B', 'C', 'C', 'B', 'C', 'B', 'B', 'B', 'A', 'B', 'B', 'B', 'A', 'B', 'D', 'D', 'A', 'B', 'D', 'A', 'B', 'D', 'B', 'D', 'C', 'C', 'C', 'A', 'C', 'B', 'C', 'C', 'B', 'D', 'A', 'B', 'C', 'B', 'A', 'D', 'A', 'C', 'B', 'A', 'A', 'A', 'C', 'A', 'A', 'A', 'B', 'A', 'D', 'D', 'A', 'C', 'A', 'C', 'D', 'B', 'B', 'A', 'A', 'B', 'B', 'C', 'D', 'A', 'C', 'A', 'C', 'B', 'B', 'B', 'C', 'C', 'C', 'C', 'A', 'B', 'A', 'A', 'C', 'D', 'A', 'C', 'D', 'A', 'C', 'C', 'A', 'C', 'C', 'C', 'B', 'C', 'D', 'B', 'A', 'B', 'C', 'C', 'B', 'A', 'A', 'A', 'A', 'D', 'C', 'D', 'C', 'A', 'C', 'B', 'B', 'C', 'A', 'B', 'B', 'A', 'A', 'B', 'B', 'D', 'B', 'A', 'C', 'C', 'B', 'C', 'C', 'A', 'A', 'B', 'B', 'C', 'C', 'D', 'C', 'B', 'C', 'C', 'C', 'B', 'D', 'C', 'B', 'B', 'A', 'A', 'D', 'B', 'B', 'C', 'B', 'B', 'C', 'B', 'C', 'D', 'B', 'A', 'A', 'A', 'C', 'B', 'B', 'C', 'A', 'D', 'D', 'A', 'A', 'A', 'A', 'B', 'B', 'D', 'A', 'D', 'B', 'A', 'B'

In [15]:
# Number of training rows, i.e. how many candidates each k-NN query scans.
print(len(train_data))

10184
