In [1038]:
import pandas as pd
from operator import itemgetter

In [1039]:
# Load the 'train' and 'test' sheets of the workbook into two DataFrames.
df_train, df_test = (
    pd.read_excel('./traintest.xlsx', sheet_name=sheet)
    for sheet in ('train', 'test')
)

In [1040]:
# Preview the first rows of the training sheet (columns: id, x1, x2, x3, y).
df_train.head()

Unnamed: 0,id,x1,x2,x3,y
0,1,60,64,0,1
1,2,54,60,11,0
2,3,65,62,22,0
3,4,34,60,0,1
4,5,38,69,21,0


In [1041]:
# Preview the first rows of the test sheet; its y column is still the '?' placeholder.
df_test.head()

Unnamed: 0,id,x1,x2,x3,y
0,297,43,59,2,?
1,298,67,66,0,?
2,299,58,60,3,?
3,300,49,63,3,?
4,301,45,60,0,?


In [1042]:
# Turn each DataFrame into a plain list of rows (one inner list per record).
train, test = (frame.values.tolist() for frame in (df_train, df_test))

In [1043]:
# euclidean distance
def euclidean_distance(x, y):
    """Return the Euclidean distance between the points ``x`` and ``y``.

    Only the first ``len(y)`` components of ``x`` are used, so ``x`` may be
    longer than ``y``; its surplus trailing entries are ignored. An ``x``
    shorter than ``y`` raises ``IndexError``.
    """
    squared_total = 0
    for idx, y_value in enumerate(y):
        squared_total += (x[idx] - y_value) ** 2
    return squared_total ** 0.5

# get k nearest neighbors
def get_neighbors(x, y, k):
    """Return the ``k`` rows of ``x`` that lie closest to the query point ``y``.

    Args:
        x: list of dataset rows. The first element of every row is skipped
           before measuring; the rest is compared with ``y`` component by
           component (anything beyond ``len(y)`` is ignored by the metric).
        y: feature values of the query point.
        k: number of neighbours to return.

    Returns:
        The ``k`` nearest rows (complete rows, not just features), ordered
        from nearest to farthest. Rows at equal distance keep their original
        relative order because the sort is stable.

    Raises:
        IndexError: if ``k`` is larger than the number of rows in ``x``.
    """
    # pair every row with its distance to the query, then order by distance
    ranked = sorted(
        ((row, euclidean_distance(row[1:], y)) for row in x),
        key=itemgetter(1),
    )

    # keep only the rows of the k closest pairs
    return [ranked[position][0] for position in range(k)]

# knn prediction. default k is set to 5.
def knn(x, y, k = 5):
    """Predict a binary label for ``y`` by majority vote of its ``k`` nearest rows.

    Args:
        x: list of labelled rows; the label is the last element of each row.
        y: feature values of the point to classify.
        k: number of neighbours that vote (defaults to 5).

    Returns:
        0 when the neighbours labelled 0 strictly outnumber those labelled 1,
        otherwise 1 (so a tie resolves to 1).
    """
    # the label of each neighbour is stored in its last position
    votes = [neighbor[-1] for neighbor in get_neighbors(x, y, k)]

    zeros = votes.count(0)
    ones = votes.count(1)

    return 0 if zeros > ones else 1

# accuracy test
def accuracy_metric(correct_answers, predicted_answers):
    """Return the percentage of predictions that match the correct answers.

    The two sequences are compared position by position over the length of
    ``correct_answers``.

    Raises:
        ZeroDivisionError: if ``correct_answers`` is empty.
        IndexError: if ``predicted_answers`` is shorter than ``correct_answers``.
    """
    total = len(correct_answers)

    # one hit for every position where prediction and truth agree
    hits = sum(
        1
        for position in range(total)
        if correct_answers[position] == predicted_answers[position]
    )

    # share of hits expressed as a percentage
    return hits / float(total) * 100.0

In [1044]:
# Split the labelled rows into training and validation sets (75:25).
# Both lists are rebuilt from df_train on every run, so re-executing this cell
# always yields the same split instead of slicing an already-shortened `train`
# (which would leave `validation` empty on a second run).
labelled_rows = df_train.values.tolist()
split_at = len(labelled_rows) * 3 // 4  # 222 when the sheet holds 296 rows

train = labelled_rows[:split_at]        # first 75% of the rows
validation = labelled_rows[split_at:]   # remaining 25% of the rows

In [1045]:
# Ground-truth labels of the validation rows (the y value at index 4),
# kept aside to score the predictions against.
correct_answers = [row[4] for row in validation]

In [1046]:
# Search k = 1..99 for the value with the highest validation accuracy.
# Only a strictly higher accuracy replaces the current best, so ties keep
# the smallest k that reached that accuracy.
best_k, best_acc = 0, 0

for k in range(1, 100):
    # classify every validation row from its feature values (indices 1-3)
    output = [knn(train, row[1:4], k) for row in validation]
    acc = accuracy_metric(correct_answers, output)

    if acc > best_acc:
        best_k, best_acc = k, acc

print(best_k, best_acc)

18 72.97297297297297


In [1047]:
# Classify every test row from its feature values (indices 1-3) using the
# k selected on the validation set.
output_y = [knn(train, row[1:4], best_k) for row in test]

In [1048]:
# Store the predictions as the y column of the test DataFrame, then display it.
df_test = df_test.assign(y=output_y)
df_test

In [None]:
# Write the test rows with their predicted y to an Excel file; the column
# header row is written but the DataFrame index is not.
df_test.to_excel("jawaban_test.xlsx", index = False, header=True)