## Load the data

In [361]:
import numpy as np

file_names = []
files = []

def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict


for i in range(1,6):
    file = f"data_batch_{i}"
    file_names.append(file)

file_names.append("test_batch")

for file in file_names:
    cifar10_dict = unpickle(file)
    files.append(cifar10_dict)

n_of_files = len(file_names)







## Inspect the list `files`
* In this list i have 6 dictionaries
* Below i get to see the keys from all the dictionaries

In [362]:
for i in range(n_of_files):
    print(f"Dictionary {file_names[i]}: \nKeys: {files[i].keys()}\nBatch label: {files[i][b'batch_label']}\nData size: {files[i][b'data'].shape} \n")

Dictionary data_batch_1: 
Keys: dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
Batch label: b'training batch 1 of 5'
Data size: (10000, 3072) 

Dictionary data_batch_2: 
Keys: dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
Batch label: b'training batch 2 of 5'
Data size: (10000, 3072) 

Dictionary data_batch_3: 
Keys: dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
Batch label: b'training batch 3 of 5'
Data size: (10000, 3072) 

Dictionary data_batch_4: 
Keys: dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
Batch label: b'training batch 4 of 5'
Data size: (10000, 3072) 

Dictionary data_batch_5: 
Keys: dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
Batch label: b'training batch 5 of 5'
Data size: (10000, 3072) 

Dictionary test_batch: 
Keys: dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
Batch label: b'testing batch 1 of 1'
Data size: (10000, 3072) 



## Since i have the contents of the cifar10, i am keeping only the data and the labels

In [363]:
for i in range(n_of_files):
    #use try except because if i try to run this cell alone, an error will occur, because i have already deleted these keys
    try:
        del files[i][b'batch_label']
        del files[i][b'filenames']
    except KeyError:
        print("These keys have already been deleted!\nRun all the cells again")
        pass
    print(f"Dictionary {file_names[i]}: \nKeys: {files[i].keys()}\n")

    

    

Dictionary data_batch_1: 
Keys: dict_keys([b'labels', b'data'])

Dictionary data_batch_2: 
Keys: dict_keys([b'labels', b'data'])

Dictionary data_batch_3: 
Keys: dict_keys([b'labels', b'data'])

Dictionary data_batch_4: 
Keys: dict_keys([b'labels', b'data'])

Dictionary data_batch_5: 
Keys: dict_keys([b'labels', b'data'])

Dictionary test_batch: 
Keys: dict_keys([b'labels', b'data'])



## Create the X_train, y_train, X_test, y_test

In [364]:
#create X_train, y_train, X-test, y_test
X_train = np.full((50000,3072),0,dtype=int)
X_test = np.full((10000,3072),0,dtype=int)
y_train = np.full((50000,),0,dtype=int)
y_test = np.full((10000,),0,dtype=int)

for i in range(n_of_files):
    if i != 5:
        #this is the X_train, y_train
        X_train[i*10000:(i+1)*10000,:] = files[i][b'data']
        y_train[i*10000:(i+1)*10000] = files[i][b'labels']
    else:
        #i have just finished X_train, y_train
        print(f"Shape X_train: {X_train.shape}\nShape y_train: {y_train.shape}")
        #this is the X_test, y_test
        X_test[:,:] = files[i][b'data']
        y_test[:] = files[i][b'labels']
        print(f"Shape X_test: {X_train.shape}\nShape y_test: {y_train.shape}\n")

#check if there is a nan value left
print(f"NaN inidces in X_train: {np.where(np.isnan(X_train))}")
print(f"NaN inidces in y_train: {np.where(np.isnan(y_train))}")
print(f"NaN inidces in X_test: {np.where(np.isnan(X_test))}")
print(f"NaN inidces in y_test: {np.where(np.isnan(y_test))}")


        







Shape X_train: (50000, 3072)
Shape y_train: (50000,)
Shape X_test: (50000, 3072)
Shape y_test: (50000,)

NaN inidces in X_train: (array([], dtype=int64), array([], dtype=int64))
NaN inidces in y_train: (array([], dtype=int64),)
NaN inidces in X_test: (array([], dtype=int64), array([], dtype=int64))
NaN inidces in y_test: (array([], dtype=int64),)


## Preprocessing the data

- `fit_transform` is used on `X_train` to calculate the mean and standard deviation of the training data (this is the "fit" part).
It then scales `X_train` using those calculated values (this is the "transform" part).

- The `transform` only applies the previously computed mean and standard deviation (from X_train) to scale X_test.
This step is crucial to ensure that X_test is scaled in the same way as X_train, maintaining consistency between datasets.
If you used `fit_transform` on X_test, it would calculate a new mean and standard deviation just for the test set, causing the training and test data to be on different scales, which would lead to unreliable results.

In [365]:
from sklearn.preprocessing import StandardScaler

#Feature scaling
sc_X = StandardScaler()

X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


In [366]:
# from sklearn.decomposition import PCA

# #!!!!!!!!!!!!!! KANO PCA KAIS TO TEST BATCH DATA ??????????? !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# pca = PCA(n_components=2000)
# X_train = pca.fit_transform(X_train)
# X_test = pca.transform(X_test)



## Implement KNN

* Each batch file has the `data` and the `labels`
* `Data` is a 10000(images) x 3072(rgb) matrix 
* `Labels` is a list with 10000 elements, each one corresponds to an image

* So the concept is this:
``
I am working in the 3072 dimensional space and i have seen all the training batches.
The new image will use the KNN to find the K nearest images, using a defined metric (Euklideian Distance,cosine,...)
Then the majority class (label) between the K nearest images will be the class (label) of the new image 
`` 



In [367]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Define the model: init 1-NN
# !!!!!!!!!!!!!!!!!!!!!! Find out what cosine metric extually does !!!!!!!!!!!!!!!!!!!!!!!!!!
classifier_1NN = KNeighborsClassifier(n_neighbors=1,weights="distance",metric="cosine")  #i noticed that with the cosine metric, the accuracy is higher than with the euclidean

# Train the model
# Only the training batches

classifier_1NN.fit(X_train,y_train)

# Predict the test set results
# predict the labels from the test batch data

y_pred_labels = classifier_1NN.predict(X_test)
# y_test_labels = files[n_of_files-1][b'labels']

# Evaluate the model using accuracy (y_pred_labels == y_test) / number of tests
# number of tests = 10k

print(f"Accuracy of 1-NN: {accuracy_score(y_test,y_pred_labels)}")

Accuracy of 1-NN: 0.4102


In [368]:
# Define the model: init 3-NN
classifier_3NN = KNeighborsClassifier(n_neighbors=3,weights="distance",metric="cosine")  #i noticed that with the cosine metric, the accuracy is higher than with the euclidean

# Train the model
# Only the training batches

classifier_3NN.fit(X_train,y_train)

# Predict the test set results
# predict the labels from the test batch data

y_pred_labels = classifier_3NN.predict(X_test)

# Evaluate the model using accuracy (y_pred_labels == y_test_labels) / number of tests
# number of tests = 10k

print(f"Accuracy of 1-NN: {accuracy_score(y_test_labels,y_pred_labels)}")

Accuracy of 1-NN: 0.4272


I have tried many different things:
* `KNeighborsClassifier(n_neighbors=k,weights="distance",metric="cosine")` ~ 0.35
* `KNeighborsClassifier(n_neighbors=k,metric="cosine")` ~ 0.345
* `KNeighborsClassifier(n_neighbors=k,metric="euclidean")` ~ 0.29
* used all the other metrics: `cityblock`, `haversine`, `l1`, `l2`, `manhattan`, `nan_euclidean` < 0.35.
* (`manhattan` = `minkowski` for p=1 and `euclidean` is `minkowski` for p = 2)

* Also i used `weights = "distance"` and i got a slightly better accuracy. This means that neighbors that are nearer to the query point will have a greater influence on the predicted class
The default value is `weights = "uniform"` (each neighbor contributes equally to the decision.)