## K Nearest Neighbors

In [3]:
import pprint
import numpy as np
import pandas as pd
import knn_images
from IPython.display import display as disp
import matplotlib.pyplot as plt


In [4]:
def get_train_data(numBatches):
    train_data = []
    train_labels = []
    file = "cifar-10-batches-py/data_batch_" + str(numBatches)
    batch_data_2 = knn_images.unpickle(file)
    train_data = batch_data_2[b'data']
    train_labels = batch_data_2[b'labels']

    for i in range(1,numBatches):
        file = "cifar-10-batches-py/data_batch_" + str(i)
        batch_data = knn_images.unpickle(file)
        data = batch_data[b'data']
        labels = batch_data[b'labels']
        
        train_data = np.concatenate((train_data, data),0)
        train_labels= np.concatenate((train_labels, labels),0)
        
    return train_data, train_labels

def get_test_data():
    file = "cifar-10-batches-py/test_batch"
    test_batch = knn_images.unpickle(file)
    test_data = test_batch[b'data']
    test_labels = test_batch[b'labels']
    return test_data, test_labels

In [9]:
# retrieve train_data, train_labels
train_data, train_labels = get_train_data(5)
test_data, test_labels = get_test_data()

train_labels = train_labels[:500]
train_data = train_data[:500]

test_labels = test_labels[:100]
test_data = test_data[:100]
#convert binary labels

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

def fit_knn(train_data, train_labels, k):
    knn = KNeighborsClassifier(1)
    knn.fit(train_data,train_labels) #fit data in classifier
    return knn

#retrieve knn model with specified number of k
knn = fit_knn(train_data, train_labels, 10)

In [11]:
import time

start = time.time()
pred_labels = knn.predict(test_data)
print("time: " + str(time.time()-start))
acc = 0

for i in range(0,len(pred_labels)):
    if pred_labels[i] == test_labels[i]:
        acc += 1


time: 0.32402896881103516


In [13]:
from sklearn.metrics import classification_report, confusion_matrix

print(acc/len(test_data))
print(confusion_matrix(test_labels[:500], pred_labels))  
print(classification_report(test_labels[:500], pred_labels))  


0.2
[[3 0 0 0 3 1 0 0 1 2]
 [1 1 1 0 1 0 1 0 1 0]
 [0 0 2 0 4 0 1 1 0 0]
 [0 0 1 0 3 1 3 1 1 0]
 [0 0 1 0 3 1 1 0 1 0]
 [0 0 1 0 5 0 0 0 2 0]
 [0 0 3 0 7 1 4 0 1 0]
 [2 0 3 0 2 2 0 1 0 1]
 [3 0 0 0 1 0 0 1 6 2]
 [2 0 1 0 3 0 0 0 5 0]]
             precision    recall  f1-score   support

          0       0.27      0.30      0.29        10
          1       1.00      0.17      0.29         6
          2       0.15      0.25      0.19         8
          3       0.00      0.00      0.00        10
          4       0.09      0.43      0.15         7
          5       0.00      0.00      0.00         8
          6       0.40      0.25      0.31        16
          7       0.25      0.09      0.13        11
          8       0.33      0.46      0.39        13
          9       0.00      0.00      0.00        11

avg / total       0.24      0.20      0.19       100

[4 8 0 8 4 4 6 8 4 8 5 0 8 2 8 4 4 5 9 4 2 0 8 4 4 4 4 0 4 6 2 4 5 4 8 6 2
 0 8 4 4 6 8 4 9 0 4 8 4 4 8 8 4 5 0 8 7 2 4 5 5 2 

  'precision', 'predicted', average, warn_for)


## SVM

In [18]:
# changes the labels to apply binary classification (1 vs 0)

def binary_labels(labels, target):
    return [1 if label == target else 0 for label in labels]

In [25]:
#svm
from sklearn.svm import SVC

def predict_svm(train,test):
    train_data,train_labels = train
    test_data,test_labels = test
    # 10 svms to predict each class 1vsALL
    svms = [SVC(kernel="linear") for i in range(10)]
    # fit 10 models
    svms = [svms[i].fit(train_data,binary_labels(train_labels,i)) for i in range(10)]
    # should return list of lists, where predictions[5] = labels predicted for class 5 ...
    preds = [svms[i].predict(test_data) for i in range(10)]
    return preds

def svm_acc(preds, test_labels):
    acc = 0
    for i in range(len(test_labels)):
        label = test_labels[i]
        svm_label = preds[label]
        if svm_label[i] == 1:
            acc += 1
    return acc/len(test_labels)

def convert_predictions(preds):
    tot_preds = list(range(len(preds[0])))
    for i in range(len(preds)):
        pred = preds[i]
        for j in range(len(pred)):
            if pred[j] == 1:
                tot_preds[j] = i
    return tot_preds

In [26]:
#grab data
train_data, train_labels = get_train_data(5)
test_data, test_labels = get_test_data()
#slice labels

train_labels = train_labels[:500]
train_data = train_data[:500]

test_labels = test_labels[:100]
test_data = test_data[:100]
#convert binary labels

In [27]:
preds = predict_svm((train_data,train_labels),(test_data,test_labels))
acc = svm_acc(preds, test_labels)
print(acc)

0.26


In [30]:
svm_preds = convert_predictions(preds)
import random
svm_preds = [random.randint(0,9) if label >]
print(svm_preds)

from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(test_labels,svm_preds))  
print(classification_report(test_labels,svm_preds))  

[3, 9, 8, 7, 4, 7, 6, 6, 3, 9, 10, 11, 4, 13, 5, 4, 7, 8, 9, 19, 1, 4, 22, 9, 5, 25, 2, 8, 7, 6, 7, 4, 32, 33, 9, 7, 6, 37, 8, 4, 3, 6, 8, 3, 7, 0, 6, 5, 4, 4, 1, 51, 8, 2, 8, 55, 8, 8, 6, 0, 4, 6, 7, 9, 64, 4, 8, 8, 9, 9, 70, 71, 72, 8, 8, 4, 76, 77, 78, 79, 8, 5, 3, 4, 8, 85, 86, 87, 8, 8, 0, 91, 8, 93, 94, 1, 7, 0, 2, 7]
[[2 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
             precision    recall  f1-score   support

          0       0.50      0.20      0.29        10
          1       0.00      0.00      0.00         6
          2       0.00      0.00      0.00         8
          3       0.40      0.20      0.27        10
          4       0.00      0.00      0.00         7
          5       0.25      0.12      0.17         8
          6       0.38      0.19      0.25        16
          7       0.10      0.09      0.10        11
          8       0.33      0.46      0.39        13
          9       0.25 

  'recall', 'true', average, warn_for)
