### Problem 4. K-nearest Neighbor Classification

### Part II. Consider the three-dimensional data set in train.csv

In [33]:
import numpy as np
from collections import Counter
import pandas as pd

In [34]:
# Read the labelled training points from train.csv (path is relative to the
# notebook's working directory) and preview the first rows.
train_data = pd.read_csv('./train.csv')
train_data.head()

Unnamed: 0,x,y,z,class
0,8.599291,9.729418,6.432371,1
1,6.592955,0.082556,1.969544,1
2,5.596471,9.815682,0.027295,1
3,2.743639,8.783177,4.041946,0
4,4.458362,5.750222,0.09907,0


In [35]:
# Feature matrix: select the coordinate columns by an explicit 'x'..'z' label
# range, the same range used for the test features. An open-ended slice
# (":'z'") would also pick up any column that happens to precede 'x'
# (e.g. a saved index or an ID column), which would distort the distances and
# break the shape match with the test matrix.
X_train = np.array(train_data.loc[:, 'x':'z'])
# Label vector: one class value per training row.
y_train = np.array(train_data['class'])

In [36]:
# Read the evaluation points from test.csv (path is relative to the notebook's
# working directory) and preview the first rows.
test_data = pd.read_csv('./test.csv')
test_data.head()

Unnamed: 0,ID,x,y,z,actual-class
0,1,8.074807,5.988044,3.844979,1
1,2,4.952249,5.823205,1.612045,0
2,3,4.773178,0.078757,4.209442,0
3,4,9.845919,2.055448,3.525702,1
4,5,1.612492,1.320515,8.200455,0


In [37]:
# Feature matrix: only the columns from 'x' through 'z'; the ID and label
# columns fall outside this label range and are left out.
X_test = test_data.loc[:, 'x':'z'].to_numpy()
# True labels, kept aside for the evaluation cells further down.
y_test = test_data['actual-class'].to_numpy()

In [38]:
def euclidean_dis(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    The operands are subtracted element-wise, so they are expected to be
    NumPy arrays (or scalars) of broadcast-compatible shape. Every squared
    difference is summed into one value before the square root is taken.
    """
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

In [39]:
class KNN:
    """Brute-force k-nearest-neighbour classifier based on Euclidean distance.

    Two voting schemes are available:

    * ``predict`` / ``predicted`` -- plain majority vote among the ``k``
      nearest training points.
    * ``weighted_predict`` / ``weighted_predicted`` -- each of the ``k``
      nearest training points votes with weight ``1 / d**2``, where ``d`` is
      its Euclidean distance to the query point.

    Distances are computed with the module-level ``euclidean_dis`` helper.
    """

    def __init__(self, k = 3):
        """Create a classifier that consults ``k`` neighbours (default 3)."""
        self.k = k

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``.

        Nothing is precomputed; the data is simply kept for prediction time.
        """
        self.X_train = X
        self.y_train = y

    def predicted(self, X):
        """Classify every sample in ``X`` by unweighted majority vote.

        Returns a NumPy array with one predicted label per sample.
        """
        predicted = [self.predict(x) for x in X]
        return np.array(predicted)

    def weighted_predicted(self, X):
        """Classify every sample in ``X`` by the 1/d^2-weighted vote.

        Returns a NumPy array with one predicted label per sample.
        """
        # Dispatch to weighted_predict: going through predict() here would
        # silently reproduce the unweighted labels.
        predicted = [self.weighted_predict(x) for x in X]
        return np.array(predicted)

    def _k_nearest(self, x):
        """Return ``(distances, labels)`` of the k training points closest to ``x``.

        Both sequences are ordered from nearest to farthest.
        """
        distance = np.array([euclidean_dis(x, x_train) for x_train in self.X_train])
        k_indices = np.argsort(distance)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        return distance[k_indices], k_nearest_labels

    def predict(self, x):
        """Classify the single sample ``x`` by majority vote of its k nearest neighbours."""
        _, k_nearest_labels = self._k_nearest(x)
        result = Counter(k_nearest_labels).most_common(1)
        return result[0][0]

    def weighted_predict(self, X):
        """Classify one sample by an inverse-squared-distance weighted vote.

        Parameters
        ----------
        X : array-like
            A single query point (the parameter name is kept as ``X`` for
            backward compatibility, but it is one sample, not a batch).

        Returns
        -------
        The label whose neighbours accumulate the largest total weight
        ``sum(1 / d**2)``. If totals tie, the label of the nearer neighbour
        wins because votes are tallied nearest-first.
        """
        k_distances, k_nearest_labels = self._k_nearest(X)
        squared = k_distances ** 2
        exact = squared == 0
        if exact.any():
            # A zero distance would mean an infinite weight; let the exact
            # matches decide the vote on their own, each counting equally.
            weights = exact.astype(float)
        else:
            weights = 1.0 / squared
        votes = {}
        for label, weight in zip(k_nearest_labels, weights):
            votes[label] = votes.get(label, 0.0) + weight
        # max() keeps the first maximal key, i.e. the nearest label on ties.
        return max(votes, key=votes.get)

#### (1) Classify the data points in test.csv according to their 3-nearest neighbors. Also give the probability estimates for the final decision.

In [40]:
# Part (1): 3-nearest-neighbour classifier with a plain majority vote.
# k is spelled out to match the task statement (it equals the class default).
clf = KNN(k=3)
clf.fit(X_train, y_train)
prediction = clf.predicted(X_test)
prediction

array([1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0],
      dtype=int64)

#### (2) Do the same for the Euclidean distance weighted 3-nearest neighbors (1/d^2). Does the predicted label for each point remain the same as that in question (1)?

In [41]:
# Part (2): a fresh 3-nearest-neighbour classifier, this time queried through
# weighted_predicted. k is spelled out (it equals the class default).
clf = KNN(k=3)
clf.fit(X_train, y_train)
weighted_prediction = clf.weighted_predicted(X_test)
weighted_prediction

array([1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0],
      dtype=int64)

According to the two predictions, we can conclude that the predicted label for each point remains the same as in question (1).

#### (3) In the test.csv, the true class labels are also provided. Construct the confusion matrix and calculate Accuracy, Precision, F-measure for questions (1) and (2). From your results, which method gives better performance?

In [42]:
from sklearn.metrics import confusion_matrix

#### Construct the confusion matrix and calculate Accuracy, Precision, F-measure for question (1)

In [43]:
# Confusion matrix of the true test labels (first argument) against the
# labels stored in `prediction` (second argument).
confusion_matrix(y_test, prediction)

array([[13,  1],
       [ 0,  6]], dtype=int64)

In [44]:
# Unpack the 2x2 confusion matrix row by row:
# first row -> (tn, fp), second row -> (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, prediction)

precision = tp / (tp + fp)                   # share of predicted positives that are correct
recall = tp / (tp + fn)                      # share of actual positives that were found
accuracy = (tp + tn) / (tp + fn + tn + fp)   # share of all points classified correctly
# F-measure: harmonic mean of precision and recall.
f_score = 2 * precision * recall / (precision + recall)

print("Accuracy is {} \nPrecision is {} \nF-measure is {} ".format(accuracy, precision, f_score))

Accuracy is 0.95 
Precision is 0.8571428571428571 
F-measure is 0.923076923076923 


In [45]:
from sklearn.metrics import classification_report
# Cross-check the hand-computed metrics with scikit-learn's text report for
# the labels stored in `prediction`.
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.86      1.00      0.92         6

    accuracy                           0.95        20
   macro avg       0.93      0.96      0.94        20
weighted avg       0.96      0.95      0.95        20



#### Construct the confusion matrix and calculate Accuracy, Precision, F-measure for question (2)

In [46]:
# Confusion matrix of the true test labels (first argument) against the
# labels stored in `weighted_prediction` (second argument).
confusion_matrix(y_test, weighted_prediction)

array([[13,  1],
       [ 0,  6]], dtype=int64)

In [47]:
# Unpack the 2x2 confusion matrix row by row:
# first row -> (tn, fp), second row -> (fn, tp).
(tn, fp), (fn, tp) = confusion_matrix(y_test, weighted_prediction)

precision = tp / (tp + fp)                   # share of predicted positives that are correct
recall = tp / (tp + fn)                      # share of actual positives that were found
accuracy = (tp + tn) / (tp + fn + tn + fp)   # share of all points classified correctly
# F-measure: harmonic mean of precision and recall.
f_score = 2 * precision * recall / (precision + recall)

print("Accuracy is {} \nPrecision is {} \nF-measure is {} ".format(accuracy, precision, f_score))

Accuracy is 0.95 
Precision is 0.8571428571428571 
F-measure is 0.923076923076923 


In [48]:
from sklearn.metrics import classification_report
# Cross-check the hand-computed metrics with scikit-learn's text report for
# the labels stored in `weighted_prediction`.
print(classification_report(y_test,weighted_prediction))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.86      1.00      0.92         6

    accuracy                           0.95        20
   macro avg       0.93      0.96      0.94        20
weighted avg       0.96      0.95      0.95        20

