# $k$-Nearest-Neighbors classification with Scikit-learn

## 1. Imports

In [31]:
import numpy as np
import pandas as pd
import sklearn.metrics
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

from tools import utils, prop_metrics


## 2. Functions

## 3. Loading raw data

In [32]:
# Read both CSV files as numeric arrays; the first row of each is skipped as a header.
train_raw = np.loadtxt('data/df_train.csv', delimiter=',', skiprows=1)
test_raw = np.loadtxt('data/df_test.csv', delimiter=',', skiprows=1)

# The last column holds the class label, every column before it is a feature.
# Both halves are slices of the raw arrays, not copies.
train_features_raw, train_labels_raw = train_raw[:, :-1], train_raw[:, -1]
test_features_raw, test_labels_raw = test_raw[:, :-1], test_raw[:, -1]

## 4. k-NN on vanilla data

Build a baseline model on the data as given: all original features, standardized, with no dimensionality reduction.

### 4.1 Standardize data

In [33]:
# Standardize data using statistics of the training split only, so that no
# information from the test split enters the scaling.
mean = np.mean(train_features_raw, axis=0)
std = np.std(train_features_raw, axis=0)

# A feature that is constant in the training split has std == 0; dividing by it
# would fill the column with NaN/inf. Scale such columns by 1 instead, which
# leaves them centred at 0.
std = np.where(std == 0, 1.0, std)

train_features_standardized = (train_features_raw - mean) / std
test_features_standardized = (test_features_raw - mean) / std

### 4.2 Create validation sets

In [34]:
# Split the standardized training data into 5 folds, shuffled before splitting.
# Each element of `sets` is later unpacked as a (validation, training) pair
# whose last column holds the labels.
sets = utils.n_folder(
    5,
    train_features_standardized,
    labels=train_labels_raw,
    shuffle_before_split=True,
)

### 4.3 Train model on validation sets for different $k$ to select final hyperparameter

In [35]:
# Sweep the neighbour count from 1 to k and score each value by its mean
# accuracy (in percent) over the validation folds.
k = 30

records = []

for n_neighbors in range(1, k + 1):
    fold_accuracies = []
    # Each fold is a (validation, training) pair; the last column is the label.
    for val_fold, train_fold in sets:
        knn = KNeighborsClassifier(n_neighbors)
        knn.fit(train_fold[:, :-1], train_fold[:, -1])

        y_pred = knn.predict(val_fold[:, :-1])
        fold_accuracies.append(sklearn.metrics.accuracy_score(val_fold[:, -1], y_pred))

    mean_accuracy = sum(fold_accuracies) / len(fold_accuracies) * 100
    records.append((round(mean_accuracy, 2), n_neighbors))

print(sorted(records, reverse=True))

# Records are (accuracy, k) tuples and compare element-wise, so a tie in
# accuracy is resolved in favour of the larger k.
best_accuracy, best_k = max(records)
print('Max accuracy achieved with k = ' + str(best_k) + ', at ' + str(best_accuracy) + '%')

[(66.21, 4), (66.21, 1), (62.07, 3), (62.07, 2), (60.69, 8), (60.69, 5), (60.0, 6), (59.31, 7), (57.93, 10), (57.93, 9), (57.24, 12), (56.55, 14), (55.86, 17), (55.86, 11), (55.17, 16), (55.17, 13), (54.48, 18), (54.48, 15), (53.1, 20), (53.1, 19), (51.72, 21), (51.03, 26), (51.03, 25), (50.34, 24), (50.34, 23), (49.66, 28), (48.97, 27), (48.97, 22), (48.28, 30), (48.28, 29)]
Max accuracy achieved with k = 4, at 66.21%


### 4.4 Predict on unseen data

Set $k$ below to the value suggested by the validation results above, then evaluate the printed metrics.

In [36]:
# Final baseline model: fit on the full standardized training set and score
# on the held-out test set. In the recorded validation run k = 1 and k = 4
# tied for the best accuracy; k = 1 is used here.
k = 1
n_classes = 6

knn = KNeighborsClassifier(n_neighbors=k).fit(train_features_standardized, train_labels_raw)
y_pred = knn.predict(test_features_standardized)

prop_metrics.performance_report(test_labels_raw, y_pred, n_classes)


Confusion matrix for prediction:
 [[14.  2.  1.  0.  0.  0.]
 [ 2. 17.  2.  1.  0.  0.]
 [ 5.  0.  2.  0.  0.  0.]
 [ 0.  3.  0.  3.  0.  1.]
 [ 0.  1.  0.  0.  3.  0.]
 [ 0.  0.  0.  0.  0.  8.]]


Accuracy for prediction:
 0.7230769230769231


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.824 		|	 0.667 		|	 0.737

Class 2 |	 0.773 		|	 0.739 		|	 0.756

Class 3 |	 0.286 		|	 0.4 		|	 0.333

Class 5 |	 0.429 		|	 0.75 		|	 0.545

Class 6 |	 0.75 		|	 1.0 		|	 0.857

Class 7 |	 1.0 		|	 0.889 		|	 0.941


Weighted F1 score:
 0.734

Macro F1 score:
 0.695


## 5. k-NN with principal components

### 5.1 Decompose data

In [37]:
# Fit PCA on the standardized training features only, then project the test
# features onto the same components. PCA is imported explicitly at the top of
# the notebook rather than reached through `sklearn.decomposition`, which was
# never imported and was only available if another import loaded it.
pca = PCA()

pca_train_features = pca.fit_transform(train_features_standardized)
pca_test_features = pca.transform(test_features_standardized)

In [38]:
# Running total used by the (currently disabled) cumulative listing below.
cumulative_var = 0

# Disabled: print one "(component index, explained variance ratio)" pair per
# principal component.
#for num in range(len(pca.explained_variance_ratio_)):
 #   print('('+str(num+1)+','+str(round(pca.explained_variance_ratio_[num],3))+')')

# Disabled: print the cumulative explained variance ratio after each component.
#for num in range(len(pca.explained_variance_ratio_)):
 #   cumulative_var += pca.explained_variance_ratio_[num]
  #  print('(',num+1,',',round(cumulative_var,3),')')

### 5.2 5-component model for ensemble method

In [39]:
# Build 5 shuffled validation folds from the first five principal components
# of the training data.
sets = utils.n_folder(
    5,
    pca_train_features[:, :5],
    labels=train_labels_raw,
    shuffle_before_split=True,
)

### 5.3 Train model on validation sets for different $k$ to select final hyperparameter

In [40]:
# Sweep the neighbour count from 1 to k on the 5-component folds and score
# each value by its mean validation accuracy (in percent).
k = 30

records = []

for n_neighbors in range(1, k + 1):
    fold_accuracies = []
    # Each fold is a (validation, training) pair; the last column is the label.
    for val_fold, train_fold in sets:
        knn = KNeighborsClassifier(n_neighbors)
        knn.fit(train_fold[:, :-1], train_fold[:, -1])

        y_pred = knn.predict(val_fold[:, :-1])
        fold_accuracies.append(sklearn.metrics.accuracy_score(val_fold[:, -1], y_pred))

    mean_accuracy = sum(fold_accuracies) / len(fold_accuracies) * 100
    records.append((round(mean_accuracy, 2), n_neighbors))

print(sorted(records, reverse=True))

# (accuracy, k) tuples compare element-wise: accuracy ties go to the larger k.
best_accuracy, best_k = max(records)
print('Max accuracy achieved with k = ' + str(best_k) + ', at ' + str(best_accuracy) + '%')

[(67.59, 1), (66.21, 4), (64.83, 6), (64.14, 5), (64.14, 2), (63.45, 7), (63.45, 3), (62.76, 8), (61.38, 10), (61.38, 9), (58.62, 15), (58.62, 12), (57.24, 11), (56.55, 16), (56.55, 14), (56.55, 13), (54.48, 17), (53.79, 23), (53.79, 22), (53.79, 21), (53.1, 20), (53.1, 19), (52.41, 24), (51.72, 27), (51.72, 18), (51.03, 28), (51.03, 25), (50.34, 26), (49.66, 30), (48.97, 29)]
Max accuracy achieved with k = 1, at 67.59%


### 5.4 Predict on unseen data

In [41]:
# Final 5-component model: fit on the first five principal components of the
# training set and score on the same projection of the test set.
k = 1
n_classes = 6

knn = KNeighborsClassifier(n_neighbors=k).fit(pca_train_features[:, :5], train_labels_raw)
y_pred = knn.predict(pca_test_features[:, :5])

prop_metrics.performance_report(test_labels_raw, y_pred, n_classes)


Confusion matrix for prediction:
 [[15.  1.  1.  0.  0.  0.]
 [ 2. 17.  3.  1.  0.  0.]
 [ 4.  1.  1.  0.  0.  0.]
 [ 0.  3.  0.  3.  0.  1.]
 [ 0.  1.  0.  0.  3.  1.]
 [ 0.  0.  0.  0.  0.  7.]]


Accuracy for prediction:
 0.7076923076923077


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.882 		|	 0.714 		|	 0.789

Class 2 |	 0.739 		|	 0.739 		|	 0.739

Class 3 |	 0.167 		|	 0.2 		|	 0.182

Class 5 |	 0.429 		|	 0.75 		|	 0.545

Class 6 |	 0.6 		|	 1.0 		|	 0.75

Class 7 |	 1.0 		|	 0.778 		|	 0.875


Weighted F1 score:
 0.72

Macro F1 score:
 0.647


### 5.5 2-component model to compare with LD-components

In [42]:
# Build 5 shuffled validation folds from the first two principal components
# of the training data.
sets = utils.n_folder(
    5,
    pca_train_features[:, :2],
    labels=train_labels_raw,
    shuffle_before_split=True,
)

### 5.6 Train model on validation sets for different $k$ to select final hyperparameter

In [43]:
# Sweep the neighbour count from 1 to k on the 2-component folds and score
# each value by its mean validation accuracy (in percent).
k = 30

records = []

for n_neighbors in range(1, k + 1):
    fold_accuracies = []
    # Each fold is a (validation, training) pair; the last column is the label.
    for val_fold, train_fold in sets:
        knn = KNeighborsClassifier(n_neighbors)
        knn.fit(train_fold[:, :-1], train_fold[:, -1])

        y_pred = knn.predict(val_fold[:, :-1])
        fold_accuracies.append(sklearn.metrics.accuracy_score(val_fold[:, -1], y_pred))

    mean_accuracy = sum(fold_accuracies) / len(fold_accuracies) * 100
    records.append((round(mean_accuracy, 2), n_neighbors))

print(sorted(records, reverse=True))

# (accuracy, k) tuples compare element-wise: accuracy ties go to the larger k.
best_accuracy, best_k = max(records)
print('Max accuracy achieved with k = ' + str(best_k) + ', at ' + str(best_accuracy) + '%')

[(60.0, 9), (60.0, 8), (60.0, 7), (60.0, 4), (59.31, 17), (59.31, 10), (59.31, 6), (58.62, 19), (58.62, 15), (58.62, 11), (57.93, 14), (57.93, 13), (57.93, 12), (57.93, 5), (57.24, 22), (57.24, 21), (57.24, 18), (57.24, 16), (57.24, 3), (55.86, 23), (55.86, 20), (55.86, 2), (55.17, 30), (55.17, 26), (54.48, 29), (54.48, 28), (54.48, 27), (54.48, 25), (54.48, 24), (51.03, 1)]
Max accuracy achieved with k = 9, at 60.0%


### 5.7 Predict on unseen data

In [44]:
# Final 2-component PCA model. In the recorded validation run k = 4, 7, 8
# and 9 tied for the best accuracy; the smallest of them is used here.
k = 4
n_classes = 6

knn = KNeighborsClassifier(n_neighbors=k).fit(pca_train_features[:, :2], train_labels_raw)
y_pred = knn.predict(pca_test_features[:, :2])

prop_metrics.performance_report(test_labels_raw, y_pred, n_classes)


Confusion matrix for prediction:
 [[18.  4.  2.  1.  0.  0.]
 [ 3. 18.  3.  1.  1.  1.]
 [ 0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  2.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  1.  0.  8.]]


Accuracy for prediction:
 0.676923076923077


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.72 		|	 0.857 		|	 0.783

Class 2 |	 0.667 		|	 0.783 		|	 0.72

Class 3 |	 0 		|	 0.0 		|	 0

Class 5 |	 0.0 		|	 0.0 		|	 0

Class 6 |	 0.0 		|	 0.0 		|	 0

Class 7 |	 0.889 		|	 0.889 		|	 0.889


Weighted F1 score:
 0.631

Macro F1 score:
 0.399


## 6. k-NN with LD components

### 6.1 Decompose data

In [15]:
# Fit the discriminant projection on the standardized training data and its
# labels, then apply the same projection to both splits.
# NOTE(review): the projection is supervised and is fitted on the whole
# training set before the validation folds are drawn from its output, so the
# validation accuracies measured on those folds may be optimistic.
lda = LinearDiscriminantAnalysis(solver='eigen')
lda.fit(train_features_standardized, train_labels_raw)

lda_train_features = lda.transform(train_features_standardized)
lda_test_features = lda.transform(test_features_standardized)

In [16]:
# First two LD coordinates of every training sample, with the class label
# appended as a third column.
lda_plot_data = np.hstack((lda_train_features[:, :2], train_labels_raw[:, np.newaxis]))

In [17]:
# Map each numeric class label to a single-letter code.
classes = dict(zip((1, 2, 3, 5, 6, 7), 'abcdef'))

# Disabled: print "x y letter" for every training sample in `lda_plot_data`.
#for item in lda_plot_data:
 #   print(str(round(item[0],3)) + ' ' + str(round(item[1],3)) + ' ' + classes[item[2]])

In [18]:


# Running total used by the (currently disabled) cumulative listing below.
cumulative_var = 0

# Disabled: print one "(component index, explained variance ratio)" pair per
# discriminant component.
#for num in range(len(lda.explained_variance_ratio_)):
 #   print('(' + str(num+1) + ',' + str(round(lda.explained_variance_ratio_[num],3)) + ')')

# Disabled: print the cumulative explained variance ratio after each component.
#for num in range(len(lda.explained_variance_ratio_)):
 #   cumulative_var += lda.explained_variance_ratio_[num]
  #  print('(' + str(num+1) + ',' + str(round(cumulative_var,3)) +')')
    


### 6.2 Create validation sets

In [19]:
# Build 5 shuffled validation folds from the first two LD components of the
# training data.
sets = utils.n_folder(
    5,
    lda_train_features[:, :2],
    labels=train_labels_raw,
    shuffle_before_split=True,
)

### 6.3 Train model on validation sets for different $k$ to select final hyperparameter

In [20]:
# Sweep the neighbour count from 1 to k on the LD-component folds. For every
# value record the mean validation accuracy and the mean weighted F1 score,
# both in percent.
k = 30

records = []

for n_neighbors in range(1, k + 1):
    fold_accuracies = []
    fold_weighted_f1s = []
    # Each fold is a (validation, training) pair; the last column is the label.
    for val_fold, train_fold in sets:
        knn = KNeighborsClassifier(n_neighbors)
        knn.fit(train_fold[:, :-1], train_fold[:, -1])

        y_pred = knn.predict(val_fold[:, :-1])
        fold_accuracies.append(sklearn.metrics.accuracy_score(val_fold[:, -1], y_pred))
        fold_weighted_f1s.append(
            sklearn.metrics.f1_score(val_fold[:, -1], y_pred, average='weighted')
        )

    avg_accuracy = round(sum(fold_accuracies) / len(fold_accuracies) * 100, 2)
    avg_wf1 = round(sum(fold_weighted_f1s) / len(fold_weighted_f1s) * 100, 2)
    records.append([avg_accuracy, avg_wf1, n_neighbors])

print(sorted(records, reverse=True))

# Records are [accuracy, weighted F1, k] and compare element-wise, so accuracy
# ties are broken by weighted F1 and then by the larger k.
best_record = max(records)
print('Max accuracy achieved with k = ' + str(best_record[2]) + ', at ' + str(best_record[0]) + '%')

[[59.31, 57.77, 5], [59.31, 56.67, 8], [58.62, 56.65, 4], [57.93, 54.56, 9], [57.24, 54.93, 3], [56.55, 53.11, 11], [56.55, 52.84, 10], [56.55, 52.56, 13], [55.86, 52.47, 25], [55.86, 52.29, 27], [55.86, 52.06, 17], [55.86, 51.94, 12], [55.86, 51.87, 15], [55.86, 51.78, 2], [55.86, 51.74, 14], [55.17, 53.38, 6], [55.17, 53.2, 7], [55.17, 51.32, 23], [55.17, 50.79, 29], [54.48, 52.4, 1], [54.48, 51.27, 19], [54.48, 51.11, 21], [54.48, 51.09, 22], [54.48, 51.0, 20], [54.48, 50.72, 24], [54.48, 50.47, 28], [54.48, 50.43, 16], [53.79, 49.29, 30], [53.1, 49.71, 26], [53.1, 49.09, 18]]
Max accuracy achieved with k = 5, at 59.31%


### 6.4 Plot decision boundaries

In [21]:
# Set to True to regenerate the decision-boundary coordinate file.
create_data = False

if create_data:
    # Points at which the classifier is evaluated; the model below is trained
    # on two features, so this file must provide two columns.
    db_X = np.loadtxt('data/db_data.csv', delimiter=',')

    k = 4

    knn = KNeighborsClassifier(k)
    knn.fit(lda_train_features[:, :2], train_labels_raw)

    y_pred = knn.predict(db_X)

    # Translate the numeric predictions into the letter codes from `classes`.
    labels_interpreted = np.array([classes[i] for i in y_pred])
    outfile = np.column_stack((db_X, labels_interpreted))

    # One "x y letter" line per evaluated point.
    lines = [' '.join([str(row[0]), str(row[1]), str(row[2])]) for row in outfile]

    # The context manager closes the file even if the write raises.
    with open("db_coords.txt", "w") as f:
        f.write('\n'.join(lines))

### 6.5 Predict on unseen data

In [22]:
# Final 6-class model on the first two LD components.
# NOTE(review): the recorded validation run above reports k = 5 as best; k = 4
# is what this cell uses - confirm this is intended.
k = 4
n_classes = 6

knn = KNeighborsClassifier(n_neighbors=k).fit(lda_train_features[:, :2], train_labels_raw)
y_pred = knn.predict(lda_test_features[:, :2])

prop_metrics.performance_report(test_labels_raw, y_pred, n_classes)


Confusion matrix for prediction:
 [[17. 10.  2.  0.  1.  0.]
 [ 2. 10.  3.  1.  0.  0.]
 [ 2.  1.  0.  0.  0.  0.]
 [ 0.  2.  0.  3.  0.  1.]
 [ 0.  0.  0.  0.  2.  1.]
 [ 0.  0.  0.  0.  0.  7.]]


Accuracy for prediction:
 0.6


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.567 		|	 0.81 		|	 0.667

Class 2 |	 0.625 		|	 0.435 		|	 0.513

Class 3 |	 0.0 		|	 0.0 		|	 0

Class 5 |	 0.5 		|	 0.75 		|	 0.6

Class 6 |	 0.667 		|	 0.667 		|	 0.667

Class 7 |	 1.0 		|	 0.778 		|	 0.875


Weighted F1 score:
 0.586

Macro F1 score:
 0.554


In [23]:
# Weighted F1 of the most recent test-set predictions, computed with
# scikit-learn.
weighted_f1 = sklearn.metrics.f1_score(
    y_true=test_labels_raw,
    y_pred=y_pred,
    average='weighted',
)

print("Weighted F1 score\n",weighted_f1)

Weighted F1 score
 0.5856903353057199


In [24]:
# Micro-averaged F1 of the most recent test-set predictions, computed with
# scikit-learn.
micro_f1 = sklearn.metrics.f1_score(
    y_true=test_labels_raw,
    y_pred=y_pred,
    average='micro',
)

print("Micro F1 score\n",micro_f1)

Micro F1 score
 0.6


In [25]:
# Disabled: print the first two LD coordinates of every test sample followed
# by a two-letter code for its class. Note that enabling the first line would
# rebind `classes`, which the decision-boundary cell also reads.
#classes = {1: 'aa', 2: 'bb', 3: 'cc', 5: 'dd', 6: 'ee', 7: 'ff'}

#dat = np.column_stack((lda_test_features[:,:2],test_labels_raw))

#for row in dat:
 #   print(round(row[0],3),round(row[1],3),classes[row[2]])

## 7. Binary classification with LD-components

### 7.1 Reclassify labels into window/non-window

In [26]:
def _to_binary_labels(labels):
    """Return a copy of `labels` with values below 4 set to 1 and values above 4 set to 2.

    Both masks are taken from the untouched input, so the result does not
    depend on the order of the two assignments. A value equal to 4 is left
    unchanged, as in the in-place thresholding this helper replaces.
    """
    source = np.asarray(labels, dtype=float)
    binary = source.copy()
    binary[source < 4] = 1
    binary[source > 4] = 2
    return binary


# Derive the binary labels from the label column of the raw arrays instead of
# overwriting the label arrays in place. The in-place version was not
# idempotent: on a second run the 2s are themselves < 4, so every label
# collapsed to 1. It also wrote through to `train_raw` / `test_raw`, because
# the label arrays are slices of them.
train_labels_raw = _to_binary_labels(train_raw[:, -1])
test_labels_raw = _to_binary_labels(test_raw[:, -1])

### 7.2 Create validation sets

In [27]:
# Build 5 shuffled validation folds from the first two LD components, now
# paired with the binary labels.
sets = utils.n_folder(
    5,
    lda_train_features[:, :2],
    labels=train_labels_raw,
    shuffle_before_split=True,
)

### 7.3 Train model on validation sets for different $k$ to select final hyperparameter

In [28]:
# Sweep the neighbour count from 1 to k on the binary-label folds and score
# each value by its mean validation accuracy (in percent).
k = 30

records = []

for n_neighbors in range(1, k + 1):
    fold_accuracies = []
    # Each fold is a (validation, training) pair; the last column is the label.
    for val_fold, train_fold in sets:
        knn = KNeighborsClassifier(n_neighbors)
        knn.fit(train_fold[:, :-1], train_fold[:, -1])

        y_pred = knn.predict(val_fold[:, :-1])
        fold_accuracies.append(sklearn.metrics.accuracy_score(val_fold[:, -1], y_pred))

    mean_accuracy = sum(fold_accuracies) / len(fold_accuracies) * 100
    records.append((round(mean_accuracy, 2), n_neighbors))

print(sorted(records, reverse=True))

# (accuracy, k) tuples compare element-wise: accuracy ties go to the larger k.
best_accuracy, best_k = max(records)
print('Max accuracy achieved with k = ' + str(best_k) + ', at ' + str(best_accuracy) + '%')

[(95.86, 1), (94.48, 15), (93.79, 17), (93.79, 16), (93.79, 14), (93.79, 13), (93.1, 23), (93.1, 21), (93.1, 19), (93.1, 18), (93.1, 12), (93.1, 11), (93.1, 9), (93.1, 8), (92.41, 30), (92.41, 27), (92.41, 26), (92.41, 25), (92.41, 24), (92.41, 22), (92.41, 20), (92.41, 10), (92.41, 4), (92.41, 3), (91.72, 29), (91.72, 28), (91.72, 7), (91.72, 5), (91.03, 6), (91.03, 2)]
Max accuracy achieved with k = 1, at 95.86%


### 7.4 Predict on unseen data

In [29]:
# Final binary model on the first two LD components, using the k reported as
# best in the recorded validation run.
k = 1
n_classes = 2

knn = KNeighborsClassifier(n_neighbors=k).fit(lda_train_features[:, :2], train_labels_raw)
y_pred = knn.predict(lda_test_features[:, :2])

prop_metrics.performance_report(test_labels_raw, y_pred, n_classes)


Confusion matrix for prediction:
 [[44.  2.]
 [ 5. 14.]]


Accuracy for prediction:
 0.8923076923076924


Metrics for classes
_______________________________________________________________________________
Class	|	Precision	|	Recall		|	F1 Score
_______________________________________________________________________________

Class 1 |	 0.957 		|	 0.898 		|	 0.926

Class 2 |	 0.737 		|	 0.875 		|	 0.8


Weighted F1 score:
 0.895

Macro F1 score:
 0.863


In [30]:
# Weighted F1 of the binary test-set predictions, computed with scikit-learn.
weighted_f1 = sklearn.metrics.f1_score(
    y_true=test_labels_raw,
    y_pred=y_pred,
    average='weighted',
)

print("Weighted F1 score\n",weighted_f1)

Weighted F1 score
 0.8952226720647773


In [31]:
# Micro-averaged F1 of the binary test-set predictions, computed with
# scikit-learn.
micro_f1 = sklearn.metrics.f1_score(
    y_true=test_labels_raw,
    y_pred=y_pred,
    average='micro',
)

print("Micro F1 score\n",micro_f1)

Micro F1 score
 0.8923076923076924
