<a href="https://colab.research.google.com/github/jsdysw/knn-classifier/blob/master/k_nn_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [1]:
import pandas as pd
import os
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Read dataset
feature vector X = {f1, f2, f3, f4, f5, f6}

class vector C = {"satisfied", "unsatisfied"}

In [18]:
# load satisfaction_data.csv from the local path
directory_data = './'
filename_data = 'satisfaction_data.csv'
df = pd.read_csv(os.path.join(directory_data, filename_data), header=None)
dataset = df.to_numpy() # pandas dataframe -> numpy array

print("Examples of dataset : \n", dataset[0:3])

Examples of dataset : 
 [[40 2 1 86872 25 9 'unsatisfied']
 [40 2 1 259323 54 10 'satisfied']
 [40 2 1 256813 43 14 'satisfied']]


# Generate 10 different train/test dataset pairs randomly
train data : test data = 9 : 1

In [19]:
X_dataset = dataset[:,:6]
y_dataset = dataset[:,6]

X_train = []
y_train = []
X_test = []
y_test = []

for i in range(10):
    xtrain, xtest, ytrain, ytest  = train_test_split(X_dataset, y_dataset, test_size=0.1, stratify=y_dataset)
    X_train.append(xtrain)
    y_train.append(ytrain)
    X_test.append(xtest)
    y_test.append(ytest)

# print("Examples of dataset : \n", X_test[0][0:3])

# df = pd.DataFrame(X_test[0])
# df.insert(6,"class" ,y_test[0])
# df.to_csv(r'./pair0_test.csv', index = False)

# Data preprocessing (normalize)

In [20]:
col_means = []
col_std = []

for i in range(10):
    col_means.append(X_train[i].sum(axis = 0) / len(X_train[i]))
    col_std.append(np.std(X_train[i], dtype=np.float64, axis = 0))
    
    normalized_train_data = (X_train[i] - col_means[i])/col_std[i]
    X_train[i] = normalized_train_data

    normalized_test_data = (X_test[i] - col_means[i])/col_std[i]
    X_test[i] = normalized_test_data

print("Examples of normalized train dataset : \n", X_train[i][0:2])
print("Examples of normalized test dataset : \n", X_test[i][0:2])

Examples of normalized train dataset : 
 [[-0.6945397033416779 -0.31574343633846397 -0.3019402096500801
  0.02748658333036758 0.7415145737616039 -0.12740830475030657]
 [-0.27039044159434794 -0.31574343633846397 -0.3019402096500801
  0.7673680620037387 -0.3464528712782156 -0.5351148799512876]]
Examples of normalized test dataset : 
 [[0.5779080819003121 1.4473655613635226 -0.3019402096500801
  0.5291644053397205 -0.3464528712782156 1.9111245712545988]
 [0.40824837720138 -1.1972979351894573 -0.3019402096500801
  -0.8367170394699519 0.35295477196166836 0.6880048456516555]]


# kNN model
distance between two data is defined as Euclidean(L2 norm)

In [31]:
class KNN:
  def __init__(self, k):
    self.k = k
  
  def fit(self, X, y):
    self.X_train = X
    self.y_train = y

  # distance
  def distance(self, data1, data2) :
    sub = data1 - data2
    dis = np.sum(np.square(sub)) ** 0.5
    # print("data1 : ", data1)
    # print("data2 : ", data2)
    # print("sub", sub)
    # print("dis", dis)
    return dis


  def predict(self, _X_test):
    final_output = []

    for i in range(len(_X_test)):
        if i % 700 == 0 :
          print("   Loading : ", i/len(_X_test))

        d = []
        votes = []

        for j in range(len(self.X_train)):
            # get distance with every data samples
            dist = self.distance(_X_test[i] , self.X_train[j])
            d.append([dist, j])
        
        d.sort()
        d = d[0:self.k]

        # vote
        for d, j in d:
            votes.append(self.y_train[j])  
        ans = Counter(votes).most_common(1)[0][0]
        final_output.append(ans)
      
    return final_output


# Predict satisfaction with 10 dataset pairs

In [None]:
# set k-NN model
window_size = 5
clf = KNN(window_size)

num_of_test_input = 2000
num_of_datapairs = 10

for i in range(num_of_datapairs):
    print("Predict with train/test pair# : ", i)

    clf.fit(X_train[i][0:], y_train[i][0:])
    
    # predict 
    prediction = clf.predict(X_test[i][0:num_of_test_input])
        
    ground_truth = y_test[i][0:num_of_test_input]
    # print(prediction)
    # print(ground_truth)
    # prediction loss
    accuracy_score = np.sum(prediction == ground_truth) / len(ground_truth) 
    print("       Accuracy_score : ", accuracy_score)

Predict with train/test pair# :  0
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.751
Predict with train/test pair# :  1
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7555
Predict with train/test pair# :  2
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7585
Predict with train/test pair# :  3
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.752
Predict with train/test pair# :  4
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7485
Predict with train/test pair# :  5
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.756
Predict with train/test pair# :  6
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.759
Predict with train/test pair# :  7
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.7615
Predict with train/test pair

In [None]:
# # set k-NN model
# # window_size = 5
# # clf = KNN(window_size)

# num_of_test_input = 2000
# num_of_datapairs = 10

# for i in range(num_of_datapairs):
#     accuracy = []
    
#     for ws in range(10):
#         window_size = 2*ws + 1
#         clf = KNN(window_size)
        
#         clf.fit(X_train[i][0:], y_train[i][0:])

#         # predict 
#         prediction = clf.predict(X_test[i][0:num_of_test_input])
        
#         ground_truth = y_test[i][0:num_of_test_input]
#         # print(prediction)
#         # print(ground_truth)
#         # prediction loss
#         accuracy_score = np.sum(prediction == ground_truth) / len(ground_truth) 
#         accuracy.append(accuracy_score)
#         # print("Accuracy_score : ", accuracy_score)
        
#     # plot graph
#     plt.figure(figsize=(10,8))
#     plt.title('accurracy with data pair#')
#     plt.xlabel('window size')
#     plt.ylabel('accurracy')
#     plt.plot(accuracy)

# Make another train/test dataset pair and export result into .csv

In [26]:
X_dataset = dataset[:,:6]
y_dataset = dataset[:,6]

X_train, X_test, y_train, y_test  = train_test_split(X_dataset, y_dataset, test_size=0.1, stratify=y_dataset)
X_test_original = X_test

print("Examples of dataset : \n", X_test[0:3])

# export test dataset .csv 
df = pd.DataFrame(X_test)
df.insert(6,"class" ,y_test)
df.to_csv(r'./test_dataset.csv', index = False)

# export train dataset .csv 
df = pd.DataFrame(X_train)
df.insert(6,"class" ,y_train)
df.to_csv(r'./train_dataset.csv', index = False)

Examples of dataset : 
 [[40 5 1 92178 61 7]
 [45 2 1 117186 33 13]
 [60 2 2 156980 21 9]]


In [29]:
# normalize
col_means = X_train.sum(axis = 0) / len(X_train)
col_std = np.std(X_train, dtype=np.float64, axis = 0)
    
normalized_train_data = (X_train - col_means)/col_std
X_train = normalized_train_data

normalized_test_data = (X_test- col_means)/col_std
X_test = normalized_test_data

print("Examples of normalized train dataset : \n", X_train[0:1])
print("Shape of train dataset : \n", X_train.shape)

print("Examples of normalized test dataset : \n", X_test[0:1])
print("Shape of test dataset : \n", X_test.shape)

# set k-NN model
window_size = 5
clf = KNN(window_size)
clf.fit(X_train[0:], y_train[0:])

num_of_test_input = 2000
    
# predict 
prediction = clf.predict(X_test[0:num_of_test_input])
ground_truth = y_test[0:num_of_test_input]
   
# prediction loss
accuracy_score = np.sum(prediction == ground_truth) / len(ground_truth) 
print("       Accuracy_score : ", accuracy_score)

Examples of normalized train dataset : 
 [[-2.3823678115222116 -0.3134079409904128 1.462509254732351
  0.18625436821391314 -1.7379921120433612 -1.3446493854559411]]
Shape of train dataset : 
 (18000, 6)
Examples of normalized test dataset : 
 [[-0.2687448154771361 2.326846164077026 -0.3071308760274172
  -0.9154160903661961 1.6750998592292792 -1.3446493854559411]]
Shape of test dataset : 
 (2000, 6)
   Loading :  0.0
   Loading :  0.35
   Loading :  0.7
       Accuracy_score :  0.759


In [30]:
# export preiction result .csv 
df = pd.DataFrame(X_test_original)
df.insert(6,"class" ,prediction)
df.to_csv(r'./20174089.csv', index = False)

# For additional prediction test

In [33]:
import pandas as pd
import os
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

####################################################
# load train data
####################################################
directory_data = './'
filename_data = 'satisfaction_data.csv'
df = pd.read_csv(os.path.join(directory_data, filename_data), header=None)
train_dataset = df.to_numpy() # pandas dataframe -> numpy array

# split feature and label
X_train = train_dataset[:,:6]
y_train = train_dataset[:,6]

# get mean and std of train data and normalize
col_mean = X_train.sum(axis = 0) / len(X_train)
col_std = np.std(X_train, dtype=np.float64, axis = 0)

# normalize
normalized_tot_data = (X_train - col_mean)/col_std
X_train = normalized_tot_data

print("Shape of train dataset : ", train_dataset.shape)
print("Mean of train dataset : ", col_mean)
print("Std of train dataset : ", col_std)
print("Examples of normalized train dataset : \n", train_dataset[0])


####################################################
# load test data
####################################################
filename_data = 'test_satisfaction_data.csv'
df = pd.read_csv(os.path.join(directory_data, filename_data), header=None)
test_dataset = df.to_numpy() # pandas dataframe -> numpy array

print("Shape of test dataset : ", test_dataset.shape)

# normalizae test dataset
# split feature and label
X_test = test_dataset[:,:6]
y_test = test_dataset[:,6]

normalized_tot_data = (X_test - col_mean)/col_std
X_test = normalized_tot_data
print("Examples of normalized dataset : \n", X_test[0])


# set k-NN model
window_size = 5
clf = KNN(window_size)
clf.fit(X_train, y_train)

# predict 
prediction = clf.predict(X_test)

# prediction loss
ground_truth = y_test
accuracy_score = np.sum(prediction == ground_truth) / len(ground_truth) 

print("Additional test accuracy score : ", accuracy_score)  

Shape of train dataset :  (20000, 7)
Mean of train dataset :  [43.22015 2.3586 1.1708 184706.98085 39.4344 10.30435]
Std of train dataset :  [1.18426131e+01 1.13679639e+00 5.64382282e-01 1.01123790e+05
 1.28767036e+01 2.45487700e+00]
Examples of normalized train dataset : 
 [40 2 1 86872 25 9 'unsatisfied']
Shape of test dataset :  (30, 7)
Examples of normalized dataset : 
 [-0.27191211812494154 -0.3154478691949687 -0.3026317542391366
 -0.9674773980382068 -1.1209701180688758 -0.531330083269184]
   Loading :  0.0
   Loading :  0.3333333333333333
   Loading :  0.6666666666666666
Additional test accuracy score :  0.7
