<a href="https://colab.research.google.com/github/jsdysw/knn-classifier/blob/master/k_nn_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import pandas as pd
import os
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read dataset
feature vector X = {f1, f2, f3, f4, f5, f6}

class vector C = {"satisfied", "unsatisfied"}

In [23]:
# Read satisfaction_data.csv from a path relative to the working directory.
directory_data = './'
filename_data = 'satisfaction_data.csv'

# header=None: the first CSV line is treated as data, not as column names.
df = pd.read_csv(
    os.path.join(directory_data, filename_data),
    header=None,
)

# DataFrame -> NumPy array; the label column makes the rows mixed-type.
dataset = df.to_numpy()

print("Examples of dataset : \n", dataset[:3])

Examples of dataset : 
 [[40 2 1 86872 25 9 'unsatisfied']
 [40 2 1 259323 54 10 'satisfied']
 [40 2 1 256813 43 14 'satisfied']]


# Data preprocessing (normalize)

In [30]:
# Columns 0-5 are the six features; column 6 is the class label.
tot_data = dataset[:, :6]
label = dataset[:, 6]

# Per-column mean and (population) standard deviation for z-score scaling.
col_mean = np.sum(tot_data, axis=0) / tot_data.shape[0]
col_std = tot_data.std(axis=0, dtype=np.float64)

normalized_tot_data = (tot_data - col_mean) / col_std

# Overwrite the raw feature columns in place; the label column is untouched.
dataset[:, :6] = normalized_tot_data

print("Examples of dataset : \n", dataset[:3])

Examples of dataset : 
 [[-0.2719121181249427 -0.3154478691949683 -0.30263175423913774
  -0.9674773980382068 -1.1209701180688763 -0.5313300832691843
  'unsatisfied']
 [-0.2719121181249427 -0.3154478691949683 -0.30263175423913774
  0.7378681063973553 1.1311590611140068 -0.123977698350118 'satisfied']
 [-0.2719121181249427 -0.3154478691949683 -0.30263175423913774
  0.7130470429292789 0.27690316556187883 1.5054318413261472 'satisfied']]


# Generate 10 different train/test dataset pairs randomly
Each pair is split so that train data : test data = 9 : 1.

In [48]:
# Separate the six feature columns from the label column.
X_dataset = dataset[:,:6]
y_dataset = dataset[:,6]

# One entry per train/test pair; index i in every list refers to the same split.
X_train = []
y_train = []
X_test = []
y_test = []

for i in range(10):
    # random_state=i makes every pair reproducible across kernel restarts while
    # still giving each of the 10 pairs a different shuffle. Without a seed the
    # splits (and every accuracy reported below) change on each run.
    # stratify keeps the class ratio the same in the train and test parts.
    xtrain, xtest, ytrain, ytest = train_test_split(
        X_dataset,
        y_dataset,
        test_size=0.1,
        stratify=y_dataset,
        random_state=i,
    )
    X_train.append(xtrain)
    y_train.append(ytrain)
    X_test.append(xtest)
    y_test.append(ytest)

# print("Examples of dataset : \n", X_test[0][0:3])

# df = pd.DataFrame(X_test[0])
# df.insert(6,"class" ,y_test[0])
# df.to_csv(r'./pair0_test.csv', index = False)

# kNN model
The distance between two data points is defined as the Euclidean distance (L2 norm).

In [26]:
class KNN:
    """Brute-force k-nearest-neighbours classifier using Euclidean (L2) distance.

    Usage: ``clf = KNN(k); clf.fit(X, y); labels = clf.predict(X_new)``.

    Ties are resolved deterministically:
      * equal distances -> the training sample with the lower index wins;
      * equal vote counts -> the label of the closest neighbour among the tied
        labels wins (``Counter.most_common`` keeps first-seen order).
    """

    def __init__(self, k):
        """Create a classifier that lets the ``k`` nearest samples vote.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer, got {!r}".format(k))
        self.k = k

    @staticmethod
    def _as_float_matrix(X):
        """Return ``X`` as a float64 array with one sample per row."""
        arr = np.asarray(X, dtype=np.float64)
        if arr.ndim == 1:
            # A 1-D input is a list of single-feature samples.
            arr = arr.reshape(-1, 1)
        return arr

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``.

        ``X`` is converted to float64 once here (object-dtype arrays, such as
        slices of a mixed-type dataset, are accepted). ``y`` is stored as given.

        Raises:
            ValueError: if ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        X = self._as_float_matrix(X)
        if len(X) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length, got {} and {}".format(len(X), len(y))
            )
        self.X_train = X
        self.y_train = y

    # distance
    def distance(self, data1, data2):
        """Return the Euclidean distance between two feature vectors."""
        sub = np.asarray(data1, dtype=np.float64) - np.asarray(data2, dtype=np.float64)
        return np.sqrt(np.sum(np.square(sub)))

    def predict(self, _X_test):
        """Predict a label for every row of ``_X_test``.

        Returns:
            list: one predicted label per input row, in input order.
        """
        queries = self._as_float_matrix(_X_test)
        n_queries = len(queries)
        final_output = []

        for i, query in enumerate(queries):
            # Coarse progress report, printed every 700 samples.
            if i % 700 == 0:
                print("   Loading : ", i / n_queries)

            # Distances from this query to every training sample in one
            # vectorised step instead of a Python loop over the training set.
            diffs = self.X_train - query
            dists = np.sqrt(np.sum(diffs * diffs, axis=1))

            # A stable sort keeps lower training indices first on equal
            # distances, matching a sort of (distance, index) pairs.
            nearest = np.argsort(dists, kind="stable")[: self.k]

            # vote
            votes = [self.y_train[j] for j in nearest]
            final_output.append(Counter(votes).most_common(1)[0][0])

        return final_output


# Predict satisfaction with 10 dataset pairs

In [27]:
# Configure the k-NN classifier; window_size is the number of voting neighbours.
window_size = 5
clf = KNN(window_size)

num_of_test_input = 2000
num_of_datapairs = 10

for i in range(num_of_datapairs):
    print("Predict with train/test pair# : ", i)

    # Train on the whole i-th training split.
    clf.fit(X_train[i], y_train[i])

    # Evaluate on at most the first `num_of_test_input` samples of the test split.
    test_inputs = X_test[i][:num_of_test_input]
    ground_truth = y_test[i][:num_of_test_input]
    prediction = clf.predict(test_inputs)

    # Fraction of predictions that match the true label.
    n_correct = np.sum(prediction == ground_truth)
    accuracy_score = n_correct / len(ground_truth)
    print("       Accuracy_score : ", accuracy_score)

Predict with train/test pair# :  0
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.751
Predict with train/test pair# :  1
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7555
Predict with train/test pair# :  2
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7585
Predict with train/test pair# :  3
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.752
Predict with train/test pair# :  4
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7485
Predict with train/test pair# :  5
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.756
Predict with train/test pair# :  6
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.759
Predict with train/test pair# :  7
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7615
Predict with train/test pair

In [28]:
# # set k-NN model
# # window_size = 5
# # clf = KNN(window_size)

# num_of_test_input = 2000
# num_of_datapairs = 10

# for i in range(num_of_datapairs):
#     accuracy = []
    
#     for ws in range(10):
#         window_size = 2*ws + 1
#         clf = KNN(window_size)
        
#         clf.fit(X_train[i][0:], y_train[i][0:])

#         # predict 
#         prediction = clf.predict(X_test[i][0:num_of_test_input])
        
#         ground_truth = y_test[i][0:num_of_test_input]
#         # print(prediction)
#         # print(ground_truth)
#         # prediction loss
#         accuracy_score = np.sum(prediction == ground_truth) / len(ground_truth) 
#         accuracy.append(accuracy_score)
#         # print("Accuracy_score : ", accuracy_score)
        
#     # plot graph
#     plt.figure(figsize=(10,8))
#     plt.title('accurracy with data pair#')
#     plt.xlabel('window size')
#     plt.ylabel('accurracy')
#     plt.plot(accuracy)

# For additional prediction test

In [49]:
import pandas as pd
import os
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

####################################################
# load train data
####################################################
directory_data = './'
filename_data = 'satisfaction_data.csv'
# Read the training CSV here rather than reusing a `df` left over from an
# earlier cell: this cell also loads the test file, so a leftover `df` could
# hold the test data when the cell is run a second time.
train_df = pd.read_csv(os.path.join(directory_data, filename_data), header=None)
train_dataset = train_df.to_numpy() # pandas dataframe -> numpy array

# split feature and label (columns 0-5 are features, column 6 is the label)
X_train_raw = train_dataset[:, :6].astype(np.float64)
y_train = train_dataset[:, 6]

# get mean and std of train data and normalize
col_mean = X_train_raw.mean(axis=0)
col_std = X_train_raw.std(axis=0)
# A constant column has std 0; divide by 1 instead so it maps to 0, not NaN.
col_std = np.where(col_std == 0, 1.0, col_std)

X_train = (X_train_raw - col_mean) / col_std

print("Shape of train dataset : ", train_dataset.shape)
print("Mean of train dataset : ", col_mean)
print("Std of train dataset : ", col_std)
print("Examples of normalized train dataset : \n", X_train[0:3])

####################################################
# load test data
####################################################
filename_data = 'test_satisfaction_data.csv'
test_df = pd.read_csv(os.path.join(directory_data, filename_data), header=None)
test_dataset = test_df.to_numpy() # pandas dataframe -> numpy array

print("Shape of test dataset : ", test_dataset.shape)

# split feature and label
X_test_raw = test_dataset[:, :6].astype(np.float64)
y_test = test_dataset[:, 6]

# Normalize the TEST features with the TRAIN statistics. The previous version
# re-normalized the training features here, so the model was never evaluated
# on the test set.
X_test = (X_test_raw - col_mean) / col_std
print("Examples of normalized dataset : \n", X_test[0:3])


# set k-NN model
window_size = 5
clf = KNN(window_size)
clf.fit(X_train, y_train)

# predict
prediction = clf.predict(X_test)

# prediction accuracy: fraction of predicted labels equal to the true label
ground_truth = y_test
accuracy_score = np.mean(np.asarray(prediction, dtype=object) == ground_truth)

print("Additional test accuracy score : ", accuracy_score)