In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics, neighbors, linear_model
from sklearn.model_selection import train_test_split 
%matplotlib inline

## Feature extraction
The first step is to extract features from the images, converting each image into a feature vector. The images all have the same size, 32x32 pixels. We reduce them to 8x8 pixels and use the vector of 64 pixel values as the features. The function `extract_features_subresolution` processes a given image and returns its feature vector. The list of all the images is read from the file `Data/MNIST_all.csv`. The features are stored in a matrix `X` and the target classes in a vector `y`.

In [None]:
from tqdm import tqdm
from PIL import Image, ImageFilter
import os
# Directory holding the image files; the filenames listed in the CSV are joined onto it
DATA_PATH = 'Data/'
# Default target size handed to `resize` before the pixel values are read out as features
DEFAULT_SUBRESOLUTION = (8,8)
def extract_features_subresolution(img, img_feature_size=DEFAULT_SUBRESOLUTION):
    """Convert an image into a flat list of pixel values at a reduced resolution.

    Parameters
    ----------
    img : PIL image
        The image to process; it is not modified (resize/filter return new images).
    img_feature_size : tuple
        Size passed to ``resize``; defaults to ``DEFAULT_SUBRESOLUTION``.

    Returns
    -------
    list
        The pixel values of the reduced, sharpened image, in the order
        ``getdata()`` yields them.
        NOTE(review): for a single-band image each entry is one number;
        presumably that is the case for the MNIST files -- confirm.
    """
    # Step 1: shrink the image using box resampling
    shrunk = img.resize(img_feature_size, Image.BOX)
    # Step 2: sharpen the shrunk image
    sharpened = shrunk.filter(ImageFilter.SHARPEN)
    # Step 3: the pixel values of the result are the features
    return list(sharpened.getdata())

# Seed for the shuffle below, so that Restart & Run All reproduces the same sample order
RANDOM_SEED = 42

# load the list of all the images with their class
# (the CSV has no header row: first column is the image path relative to DATA_PATH,
#  second column is the class label)
all_df = pd.read_csv(os.path.join(DATA_PATH, "MNIST_all.csv"),header=None,names=['filename','class'])
# shuffle the rows once, reproducibly, and renumber the index from 0
all_df = all_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
file_list = all_df['filename']

# target is the class
y = all_df['class']

# extract the features: one feature vector per image, in the shuffled order
data = []
for i_path in tqdm(file_list):
    # the context manager releases the file handle as soon as the features are extracted
    with Image.open(os.path.join(DATA_PATH,i_path)) as page_image:
        data.append(extract_features_subresolution(page_image,DEFAULT_SUBRESOLUTION))
X = np.array(data)

# Check the size
# Feature size should be the number of pixels in DEFAULT_SUBRESOLUTION (8*8 = 64)
print ("feature size",X.shape[1])
print ("number of samples",X.shape[0])



In [None]:
# Exercise cell: the lines marked "ADD" are placeholders to be completed,
# and the cell does not run until they are filled in.

# define the train/test split
X_train, X_test, y_train, y_test = train_test_split(## ADD PARAMETERS)

# Define a kNN classifier with k = 1
## ADD YOUR CODE HERE
    
# Train the classifier on the training set
## ADD YOUR CODE HERE
    
# Print the accuracy on the train and test sets
## ADD YOUR CODE HERE

## Training the classifier

In [None]:
from IPython.display import clear_output
# Exercise cell: the lines marked "ADD YOUR CODE HERE" are placeholders to be completed,
# and the cell does not run until they are filled in.

# Split train/dev/test
X_train, X_test, y_train, y_test = # ADD YOUR CODE HERE
# Create validation set so that train = 60% , validation = 20% and test =  20%
# (the validation part is carved out of the training part of the split above)
X_train_hyper, X_valid_hyper, y_train_hyper, y_valid_hyper = # ADD YOUR CODE HERE

# select a smaller sample to train
sub_size = 5000 # Change this value if you want
# take the first samples : this is OK because we shuffled the file list.
# ([:] is a full slice; the second slice keeps the first sub_size entries)
X_train_hyper_sub = X_train_hyper[:][:sub_size]  
y_train_hyper_sub = y_train_hyper[:][:sub_size]

#  list of k values to test
k_values = # ADD YOUR CODE HERE

# store the scores in a dataframe: one row per k, one column per data set
df_scores = pd.DataFrame(columns=['train','valid','test'],index=k_values)

# iterate on different values of k
for k in k_values:
    
    # create a kNN classifier with a given k
    clf = neighbors.KNeighborsClassifier(k,n_jobs=-1)
    
    # Train the classifier on training set
    # ADD YOUR CODE HERE
    
    # Compute the classification score on the different sets
    # NOTE: despite their names, `_train_set` receives the feature matrix (X_...) and
    # `_test_set` the matching labels (y_...) of the set called `_name`
    for _name,_train_set,_test_set in [('train',X_train_hyper_sub,y_train_hyper_sub),('valid',X_valid_hyper,y_valid_hyper),('test',X_test,y_test)]:
        _accuracy = # ADD YOUR CODE HERE
        # store the accuracy rounded to two decimals in row k, column `_name`
        df_scores.at[k,_name] = float("{:.2f}".format(_accuracy))
        # print the dataframe with score
        # (the previous output is cleared first, so only the latest table is shown)
        clear_output(wait=True)
        print(df_scores)

# plot the score for different values of k
_g = df_scores.plot()