<a href="https://colab.research.google.com/github/jsdysw/knn-classifier/blob/master/k_nn_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset
feature vector X = {f1, f2, f3, f4, f5, f6}

class vector C = {"satisfied", "unsatisfied"}

In [15]:
import pandas as pd
import os

# Optional: load satisfaction_data.csv straight from the GitHub repository.
# The access token that used to be appended to this URL has been removed;
# credentials must never be committed inside a notebook.
# url = 'https://raw.githubusercontent.com/jsdysw/knn-classifier/master/satisfaction_data.csv'
# df = pd.read_csv(url, header=None)

# Load satisfaction_data.csv from the local path (the file has no header row).
directory_data = './'
filename_data = 'satisfaction_data.csv'
df = pd.read_csv(os.path.join(directory_data, filename_data), header=None)
dataset = df.to_numpy() # pandas dataframe -> numpy

print("Examples of dataset : \n", dataset[0:3])

# Split the rows 9:1 into train and test, in file order (no shuffling).
# Integer arithmetic avoids float rounding when computing the boundary.
n_train = len(dataset) * 9 // 10
train_data = dataset[:n_train]
test_data = dataset[n_train:]

# Columns 0..5 hold the features f1..f6; the class label is the column right after them.
n_features = 6
X_train = train_data[:, :n_features]
y_train = train_data[:, n_features]
X_test = test_data[:, :n_features]
y_test = test_data[:, n_features]

print("\nShape of X_train : ", X_train.shape)
print("Shape of y_train : ", y_train.shape)
print("X_train example : ", X_train[0])
print("y_train example : ", y_train[0])

print("\nShape of X_test : ", X_test.shape)
print("Shape of y_test : ", y_test.shape)
print("X_test example : ", X_test[0])
print("y_test example : ", y_test[0])


Examples of dataset : 
 [[40 2 1 86872 25 9 'unsatisfied']
 [40 2 1 259323 54 10 'satisfied']
 [40 2 1 256813 43 14 'satisfied']]

Shape of X_train :  (18000, 6)
Shape of y_train :  (18000,)
X_train example :  [40 2 1 86872 25 9]
y_train example :  unsatisfied

Shape of X_test :  (2000, 6)
Shape of y_test :  (2000,)
X_test example :  [33 1 0 46094 42 13]
y_test example :  unsatisfied


# kNN model
The distance between two samples is defined as the Euclidean distance (L2 norm).

In [40]:
import numpy as np
from collections import Counter

class KNN:
  """k-nearest-neighbours classifier using Euclidean (L2) distance.

  A query sample is labelled by majority vote among the `k` training
  samples closest to it.
  """

  def __init__(self, k):
    """Store `k`, the number of nearest neighbours that vote on a prediction."""
    self.k = k

  def fit(self, X, y):
    """Remember the training samples `X` and their labels `y`.

    Nothing is computed here; all the work happens in `predict`.
    `X[j]` must be the feature vector whose label is `y[j]`.
    """
    self.X_train = X
    self.y_train = y

  def distance(self, data1, data2):
    """Return the Euclidean (L2) distance between two feature vectors."""
    sub = data1 - data2
    return np.sum(np.square(sub)) ** 0.5

  def predict(self, X_test):
    """Predict a label for every sample in `X_test`.

    Uses the data passed to `fit` (not any notebook-level variables), so
    the classifier behaves the same wherever it is instantiated.

    Returns a list with one predicted label per sample, in input order.
    Each prediction compares the sample against every training sample, so
    the cost is len(X_test) * len(X_train) calls to `distance`.
    """
    final_output = []
    n_train = len(self.X_train)

    for sample in X_test:
        # Pair every distance with its training index; sorting the pairs
        # orders by distance first and by index second, so equidistant
        # neighbours are taken in training order.
        ranked = sorted(
            (self.distance(self.X_train[j], sample), j) for j in range(n_train)
        )
        nearest = ranked[:self.k]

        # Majority vote; on a tied count, the label met first (i.e. the one
        # belonging to the closer neighbour) wins.
        votes = [self.y_train[j] for _, j in nearest]
        final_output.append(Counter(votes).most_common(1)[0][0])

    return final_output


# Result

In [45]:
# Build the k-NN classifier (k = window_size) and hand it the training split.
window_size = 3
clf = KNN(k=window_size)
clf.fit(X_train, y_train)

# Classify a ten-row slice of the test split and fetch the matching labels.
eval_rows = slice(10, 20)
prediction = clf.predict(X_test[eval_rows])
ground_truth = y_test[eval_rows]

# Loss is the misclassification rate: 1 - (correct predictions / samples).
n_correct = (prediction == ground_truth).sum()
loss = 1 - n_correct / len(ground_truth)
print(loss)



['unsatisfied', 'unsatisfied', 'unsatisfied', 'unsatisfied', 'unsatisfied', 'unsatisfied', 'satisfied', 'satisfied', 'unsatisfied', 'satisfied']
['unsatisfied' 'unsatisfied' 'unsatisfied' 'unsatisfied' 'satisfied'
 'unsatisfied' 'satisfied' 'satisfied' 'satisfied' 'satisfied']
0.19999999999999996
