# K Nearest Neighbor Classification

In [119]:
import pandas as pd
import numpy as np
from collections import Counter

### Downloading data
- Data can be found here: https://archive.ics.uci.edu/ml/datasets/QSAR+fish+toxicity
- the features are 6 molecular descriptors, the target is acute aquatic toxicity (that may kill fish)

In [120]:
columns=['X1','X2','X3','X4','X5','X6','Y']
data=pd.read_csv('qsar_fish_toxicity.csv',sep=';',names=columns)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,Y
0,3.26,0.829,1.676,0,1,1.453,3.77
1,2.189,0.58,0.863,0,0,1.348,3.115
2,2.125,0.638,0.831,0,0,1.348,3.531
3,3.027,0.331,1.472,1,0,1.807,3.51
4,2.094,0.827,0.86,0,0,1.886,5.39


### Dividing the data into training and test sets

In [121]:
import random
target='Y'
features=data.columns.drop(labels='Y')
i_test=random.sample(range(0,908),190)
i_train=[]
for i in data.index:
    if i not in i_test:
        i_train.append(i)
X_train=data.loc[i_train,features]
X_test=data.loc[i_test,features]
y_train=data.loc[i_train,target]
y_test=data.loc[i_test,target]

### Normalizing Features

In [122]:
mean=np.mean(X_train)
std=np.std(X_train)
X_train-=mean
X_test-=mean
X_train/=std
X_test/=std

### Making y categorical

In [123]:
y_train=np.round(y_train)
y_test=np.round(y_test)

### Inserting bais term and turning data in Numpy Arrays

In [124]:
X_train.insert(0,'bias',1)
X_test.insert(0,'bias',1)
X_train=np.array(X_train)
X_test=np.array(X_test)
y_train=np.array([y_train]).T
y_test=np.array([y_test]).T

### Euclidean Distance
$d(q,p)=\sqrt{\sum_{i=1}^{n}(q_{i}-p_{i})^{2}}$

### KNN, K=11

In [125]:
my_predict=[]
for test in X_test:
    eu_dist=[]
    for train in X_train:
        eu_dist.append(np.sqrt(np.sum(train-test)**2))
    idx=np.argsort(eu_dist)[:11]
    most_common=Counter(y_train[idx].ravel()).most_common(1)
    my_predict.append(most_common[0][0])
my_acc=np.sum(my_predict==y_test)/len(y_test)
my_acc

45.72105263157895

### sklearn reference

In [128]:
y=y_train.ravel()
from sklearn.neighbors import KNeighborsClassifier
KNN=KNeighborsClassifier(n_neighbors=11,metric='euclidean')
KNN.fit(X_train,y)
sk_predict=KNN.predict(X_test)

### Accuracy Comparision

In [129]:
my_acc=np.sum(my_predict==y_test)/len(y_test)
sk_acc=np.sum(sk_predict==y_test)/len(y_test)
print('my accuracy score:', np.round(my_acc,4))
print('sklearn accuracy score:', np.round(sk_acc,4))

my accuracy score: 45.7211
sklearn accuracy score: 42.1
