# Classifier Implementation from Scratch: K-Nearest Neighbor

###### This notebook trains a K-Nearest Neighbor (KNN) classifier, implemented from scratch below, on the "Negative" emotion data for User A.

In [30]:
import pandas as pd 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,GridSearchCV,KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score,plot_roc_curve, recall_score
from math import sqrt
import plotly.express as px
from collections import Counter
import numpy as np

In [7]:
# Load the dataset for a single emotion.
# `emotion` is interpolated into both file names below, so changing it switches datasets.
emotion = "negative" 

# Read the space-delimited datapoints file (pandas infers the header from the first row).
df = pd.read_csv(f"grammatical_facial_expression/a_{emotion}_datapoints.txt",delimiter = " ",)
# Read the space-delimited targets file; header=None means every line is treated as data.
df_target = pd.read_csv(f"grammatical_facial_expression/a_{emotion}_targets.txt",delimiter = " ",header=None)

# Attach the targets to the datapoints frame as a new last column named 'target'.
# NOTE(review): df_target is a DataFrame, not a Series; this assignment presumably relies on the
# targets file having exactly one column and the same number of rows as df — confirm.
df['target'] = df_target

### Train/Test Split

In [8]:
# Separate the features from the labels (the actual train/test split happens in a later cell).
# X: every column except the first one and the last one.
# NOTE(review): the first column is skipped — presumably a non-feature column such as a timestamp; confirm against the data file.
X = df.iloc[:,1:-1]
# y: the last column, i.e. the 'target' column appended when the data was loaded.
y = df.iloc[:,-1]

# Rescale every feature with min-max scaling.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split, so the
# min/max of the future test rows influence the training features (data leakage). Consider
# fitting the scaler on the training split only and reusing it to transform the test split.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
# Hold out 20% of the scaled samples for testing; random_state=3 seeds the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y,test_size = 0.2, random_state=3)

### Develop KNN Classifier

In [199]:
class knn:
    '''K-Nearest Neighbor classifier implemented from scratch.

    A sample is classified by computing its Euclidean distance to every stored
    training sample, selecting the ``k`` closest ones and returning the most
    common label among them.

    Parameters
    ----------
    k : int, default 3
        Number of neighbors that vote on each prediction.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    '''
    def __init__(self, k=3):
        # With k < 1 no neighbor would vote and prediction would fail later
        # with an opaque IndexError, so reject it up front.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def euclidean_distance(self, row1, row2):
        '''Return the Euclidean distance between two equally sized feature vectors.'''
        # np.asarray lets plain Python sequences be used as well as arrays.
        return np.sqrt(np.sum((np.asarray(row1) - np.asarray(row2))**2))

    def fit(self, X, y):
        '''Store the training samples and their labels.

        KNN is a lazy learner: nothing is computed here, the data is only kept
        for the distance computations done at prediction time.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training feature rows.
        y : array-like of length n_samples
            Label of each training row, in the same order as ``X``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        '''
        # Convert to plain arrays so later lookups are purely positional,
        # independent of any pandas index carried by the inputs.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        '''Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to classify.

        Returns
        -------
        numpy.ndarray
            Predicted label for each row, in input order.
        '''
        # np.asarray guarantees row-wise iteration (iterating a DataFrame
        # directly would yield its column labels instead of its rows).
        return np.array([self.nearest_neighbor(x) for x in np.asarray(X)])

    def nearest_neighbor(self, x):
        '''Return the majority label among the ``k`` training samples closest to ``x``.'''
        # distance from x to every stored training row
        distances = [self.euclidean_distance(x, x_train) for x_train in self.X_train]

        # positions of the k smallest distances
        index = np.argsort(distances)[:self.k]

        # Look the labels up in the data stored by fit() rather than in a
        # global variable, so the classifier only depends on its own state.
        k_neighbor_labels = np.take(self.y_train, index)

        # Counter.most_common(1) yields [(label, count)] for the most frequent label
        label = Counter(k_neighbor_labels.tolist()).most_common(1)

        return label[0][0]

In [200]:
# Instantiate the scratch KNN with k=3, store the training split, then predict a label for every test row.
clf = knn(3)
clf.fit(X_train,y_train)
pred = clf.predict(X_test)

In [201]:
# Fraction of test samples whose predicted label equals the true label.
# Arguments are passed in scikit-learn's documented (y_true, y_pred) order; accuracy is
# symmetric so the value is unchanged, but asymmetric metrics such as precision or
# recall would silently be wrong with the arguments swapped.
accuracy_score(y_test, pred)

0.8577777777777778