<a href="https://colab.research.google.com/github/jsdysw/knn-classifier/blob/master/k_nn_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [34]:
import pandas as pd
import os
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read dataset
feature vector X = {f1, f2, f3, f4, f5, f6}

class vector C = {"satisfied", "unsatisfied"}

In [35]:
# Locate satisfaction_data.csv relative to the notebook's working directory.
directory_data = './'
filename_data = 'satisfaction_data.csv'
dataset_path = os.path.join(directory_data, filename_data)

# The file is read without a header row, so columns are numbered 0..N-1.
df = pd.read_csv(dataset_path, header=None)

# Downstream cells slice the table positionally, so work on a plain array.
dataset = df.to_numpy()  # pandas dataframe -> numpy array

print("Examples of dataset : \n", dataset[:3])

Examples of dataset : 
 [[40 2 1 86872 25 9 'unsatisfied']
 [40 2 1 259323 54 10 'satisfied']
 [40 2 1 256813 43 14 'satisfied']]


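# Generate 10 different train/test dataset pairs
Each pair holds out one contiguous tenth of the rows (in file order, without shuffling) as test data and trains on the remaining rows, so train data : test data = 9 : 1.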

In [43]:
fold_num = 10
# Derive the fold size from fold_num so the two can never disagree. When
# len(dataset) is not a multiple of fold_num, the trailing remainder rows are
# never held out and therefore stay in every training split.
fold_size = len(dataset) // fold_num

X_dataset = dataset[:, :6]  # first six columns: feature values
y_dataset = dataset[:, 6]   # seventh column: class label

# One entry per fold; index i of each list belongs to the same train/test pair.
X_train = []
y_train = []
X_test = []
y_test = []

for i in range(fold_num):
    start = i * fold_size
    stop = (i + 1) * fold_size

    # Fold i holds out the contiguous row range [start, stop) for testing.
    X_test.append(X_dataset[start:stop])
    y_test.append(y_dataset[start:stop])

    # Everything before and after the held-out range forms the training split.
    X_train.append(np.concatenate((X_dataset[:start], X_dataset[stop:])))
    y_train.append(np.concatenate((y_dataset[:start], y_dataset[stop:])))


# Report the shapes of the last pair as a sanity check.
last_fold = fold_num - 1
print("Shape of X_test", X_test[last_fold].shape)
print("Shape of y_test", y_test[last_fold].shape)

print("Shape of X_train", X_train[last_fold].shape)
print("Shape of y_train", y_train[last_fold].shape)

Shape of X_test (2000, 6)
Shape of y_test (2000,)
Shape of X_train (18000, 6)
Shape of y_train (18000,)


# Data preprocessing (normalize)

In [None]:
col_means = []  # per-fold column means of the training split
col_std = []    # per-fold column standard deviations actually used for scaling

# Standardize each pair with statistics taken from its training split only and
# apply the same shift/scale to the matching test split.
for i in range(len(X_train)):
    # The feature arrays are sliced from a table that also holds string
    # labels, so they may arrive as object dtype; cast to float64 so the
    # arithmetic here (and later distance computations) is vectorized.
    train_fold = np.asarray(X_train[i], dtype=np.float64)
    test_fold = np.asarray(X_test[i], dtype=np.float64)

    fold_mean = train_fold.mean(axis=0)
    fold_std = train_fold.std(axis=0)
    # A constant column has zero spread; scale it by 1 so it maps to 0
    # instead of producing NaN/inf from a division by zero.
    fold_std = np.where(fold_std == 0, 1.0, fold_std)

    col_means.append(fold_mean)
    col_std.append(fold_std)

    X_train[i] = (train_fold - fold_mean) / fold_std
    X_test[i] = (test_fold - fold_mean) / fold_std

# Show a sample from the last pair (explicit index rather than the leaked
# loop variable).
print("Examples of normalized train dataset : \n", X_train[-1][0:2])
print("Examples of normalized test dataset : \n", X_test[-1][0:2])

Examples of normalized train dataset : 
 [[-0.6945397033416779 -0.31574343633846397 -0.3019402096500801
  0.02748658333036758 0.7415145737616039 -0.12740830475030657]
 [-0.27039044159434794 -0.31574343633846397 -0.3019402096500801
  0.7673680620037387 -0.3464528712782156 -0.5351148799512876]]
Examples of normalized test dataset : 
 [[0.5779080819003121 1.4473655613635226 -0.3019402096500801
  0.5291644053397205 -0.3464528712782156 1.9111245712545988]
 [0.40824837720138 -1.1972979351894573 -0.3019402096500801
  -0.8367170394699519 0.35295477196166836 0.6880048456516555]]


# kNN model
The distance between two data points is defined as the Euclidean distance (L2 norm).

In [None]:
class KNN:
  """Brute-force k-nearest-neighbours classifier using Euclidean distance.

  Usage: construct with ``k``, call ``fit`` with training samples and labels,
  then call ``predict`` with the samples to classify.
  """

  def __init__(self, k):
    """Store the number of neighbours that take part in each vote.

    Raises:
      ValueError: if ``k`` is smaller than 1 (no neighbour could vote).
    """
    if k < 1:
      raise ValueError("k must be a positive integer, got {!r}".format(k))
    self.k = k

  def fit(self, X, y):
    """Remember the training samples ``X`` and their labels ``y``.

    No computation happens here; all work is deferred to ``predict``.
    """
    self.X_train = X
    self.y_train = y

  # distance
  def distance(self, data1, data2):
    """Return the Euclidean (L2) distance between two feature vectors."""
    sub = data1 - data2
    dis = np.sum(np.square(sub)) ** 0.5
    return dis

  def predict(self, _X_test):
    """Classify every row of ``_X_test`` by majority vote of its k nearest
    training samples.

    Distances from one query row to all training rows are computed in a
    single vectorized step. Ties in distance are resolved in favour of the
    lower training index, and ties in the vote in favour of the label of the
    nearer neighbour.

    Returns:
      A list with one predicted label per row of ``_X_test``.
    """
    # Cast once up front: object-dtype inputs would otherwise force slow
    # element-by-element Python arithmetic.
    train = np.asarray(self.X_train, dtype=np.float64)
    queries = np.asarray(_X_test, dtype=np.float64)

    final_output = []

    for i in range(len(queries)):
      if i % 300 == 0:
        print("   Loading : ", i/len(_X_test))

      # Distance from this query to every training sample at once.
      dists = np.sqrt(np.square(train - queries[i]).sum(axis=1))

      # A stable sort keeps equal distances in training-index order.
      nearest = np.argsort(dists, kind="stable")[:self.k]

      # vote
      votes = [self.y_train[int(j)] for j in nearest]
      ans = Counter(votes).most_common(1)[0][0]
      final_output.append(ans)

    return final_output


# Predict satisfaction with 10 dataset pairs

In [None]:
# set k-NN model
window_size = 3  # k: number of neighbours that vote
clf = KNN(window_size)

num_of_test_input = fold_size
num_of_datapairs = fold_num
prediction_arr = []      # one list of predicted labels per train/test pair
exported_features = []   # raw (un-normalized) rows matching each prediction

for i in range(num_of_datapairs):
    print("Predict with train/test pair# : ", i)

    clf.fit(X_train[i], y_train[i])

    # predict
    prediction = clf.predict(X_test[i][0:num_of_test_input])
    prediction_arr.append(prediction)

    # Test split i was cut from X_dataset starting at row i * fold_size, so
    # these are the original feature rows the predictions belong to.
    fold_start = i * fold_size
    exported_features.append(X_dataset[fold_start:fold_start + len(prediction)])

    ground_truth = y_test[i][0:num_of_test_input]

    # prediction is a plain list; convert it so == is an explicit
    # element-wise comparison against the label array.
    hits = np.asarray(prediction, dtype=object) == ground_truth
    accuracy_score = np.sum(hits) / len(ground_truth)
    print("       Accuracy_score : ", accuracy_score)


# export prediction result .csv
# prediction_arr holds one list per pair, but DataFrame.insert needs exactly
# one value per row, so flatten both the features and the labels first.
df = pd.DataFrame(np.concatenate(exported_features))
df.insert(6, "class", np.concatenate(prediction_arr))
df.to_csv(r'./20174089.csv', index = False)

Predict with train/test pair# :  0
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.751
Predict with train/test pair# :  1
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7555
Predict with train/test pair# :  2
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7585
Predict with train/test pair# :  3
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.752
Predict with train/test pair# :  4
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7485
Predict with train/test pair# :  5
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.756
Predict with train/test pair# :  6
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.759
Predict with train/test pair# :  7
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7615
Predict with train/test pair