# Assignment 3: Supervised learning algorithms
### ip222gs

## Exercise 2: One versus all MNIST

In [175]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix

In [176]:
# Load the training CSV (first row skipped as header) into a NumPy array
df_train = np.loadtxt(
    'mnist_train.csv',
    skiprows=1,
    delimiter=',',
)

In [177]:
# Load the held-out CSV (first row skipped as header) into a NumPy array;
# 'mnist_test.csv' serves as the validation set throughout this notebook
df_val = np.loadtxt(
    'mnist_test.csv',
    skiprows=1,
    delimiter=',',
)

In [179]:
# Keep only the first 10000 rows for training to reduce computation time
df_train_red = df_train[:10000]

In [180]:
# Column 0 holds the label; every remaining column is a feature
y, X = df_train_red[:, 0], df_train_red[:, 1:]
y_val, X_val = df_val[:, 0], df_val[:, 1:]

In [181]:
# Sanity check: display the distinct label values present in the training subset
np.unique(y)

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

In [182]:
# Report the shapes of the feature matrices and label vectors
print(
    f'Training set: X= {X.shape}, y= {y.shape}',
    f'Validation set: X= {X_val.shape}, y= {y_val.shape}',
    sep='\n',
)

Training set: X= (10000, 784), y= (10000,)
Validation set: X= (10000, 784), y= (10000,)


### Task 1. SVM fitting and hyperparameter tuning

In [183]:
# Grid-search C and gamma for an SVC with the default (Gaussian/RBF) kernel.
# Every candidate is scored by 2-fold cross-validated macro-averaged F1.
parameters = {
    'C': [1, 2],
    'gamma': [2e-07, 3e-07, 4e-07, 5e-07, 6e-07, 7e-07],
}
svc = SVC()
clf = GridSearchCV(svc, parameters, verbose=1, cv=2, scoring='f1_macro')

# y is already one-dimensional, so it can be passed as-is
clf.fit(X, y)

# best_score_ is the mean cross-validated value of the `scoring` metric
# (macro F1 here), so it is reported as such rather than as accuracy.
print(
    f'The most optimal value for Gaussian kernel hyperparameter C at {clf.best_params_["C"]} '
    f'and gamma at {clf.best_params_["gamma"]} '
    f'gives a cross-validated macro F1 score of {clf.best_score_}'
)

Fitting 2 folds for each of 12 candidates, totalling 24 fits
The most optimal value for Gaussian kernel hyperparameter C at 2 and gamma at 5e-07 gives an accuracy score of 0.9607404654443994


In [184]:
# Fit the multiclass SVC with the hyperparameters selected by the grid search
# above, then measure its accuracy on the validation data
svc = SVC(C=2, gamma=5e-07).fit(X, y)
svc_test_predict = svc.predict(X_val)
score = accuracy_score(y_val, svc_test_predict)

print('Model performance on a validation set:')
print(f'Testing score: {score:.2%}')

Model performance on a validation set:
Testing score: 96.94%


The resulting score of 96.94% is higher than the 95% required by the assignment. Further optimization would be needed to achieve even better results.

### Task 2. Implement one-vs-all SVM

In [185]:
# The one-vs-all implementation below is adapted from the following code:
# https://houxianxu.github.io/implementation/One-vs-All-LogisticRegression.html

In [186]:
# Train one binary SVC per class (one-vs-all scheme).
# np.unique returns the labels sorted, so classifiers[k] is the model for the
# k-th smallest label; for the labels 0..9 the list index equals the label.
class_labels = np.unique(y)
n_classes = len(class_labels)

classifiers = []

for label in class_labels:
    # Binary target: 1 for samples of the current class, 0 for all others
    y_binary = (y == label).astype(float)
    # probability=True enables predict_proba, which prediction() relies on.
    # random_state fixes the data shuffling used for the probability
    # estimates so that reruns reproduce the same scores.
    # A dedicated name is used so the multiclass `svc` fitted earlier is not
    # overwritten.
    binary_svc = SVC(C=2, gamma=5e-07, probability=True, random_state=0)
    binary_svc.fit(X, y_binary)
    classifiers.append(binary_svc)

In [187]:
# Function for predicting values with the one-vs-all ensemble

def prediction(X, models=None):
    """Predict a class index for every sample using one-vs-all classifiers.

    Each classifier is asked for the probability of its positive class
    (column 1 of ``predict_proba``); the index of the classifier with the
    highest probability is returned for each sample.

    Parameters
    ----------
    X : array-like
        Feature matrix passed unchanged to each classifier's ``predict_proba``.
    models : sequence, optional
        Binary classifiers exposing ``predict_proba``. Defaults to the
        notebook-level ``classifiers`` list.

    Returns
    -------
    numpy.ndarray
        For each sample, the position in ``models`` of the classifier with
        the highest positive-class probability. This equals the class label
        only when the classifiers are ordered by label 0..n-1.
    """
    if models is None:
        models = classifiers
    # One row of positive-class probabilities per classifier
    scores = [model.predict_proba(X)[:, 1] for model in models]
    return np.argmax(scores, axis=0)

In [188]:
# Score the one-vs-all ensemble on the validation data
y_pred = prediction(X_val)
score = accuracy_score(y_val, y_pred)

print('Model performance on a validation set using one vs all binarization scheme:')
print(f'Testing score: {score:.2%}')

Model performance on a validation set using one vs all binarization scheme:
Testing score: 97.13%


We were able to get a slightly higher score (97.13% vs. 96.94%) compared to the default one-vs-one binarization scheme.

### Confusion matrix comparison

In [189]:
# Confusion matrix of the one-vs-all predictions against the validation labels
print(
    'Confusion matrix for one vs all binarization scheme',
    confusion_matrix(y_val, y_pred),
    sep='\n',
)

Confusion matrix for one vs all binarization scheme
[[ 967    0    3    0    0    1    5    1    3    0]
 [   0 1124    2    3    0    1    4    0    1    0]
 [   5    0  998    7    1    0    2    8   10    1]
 [   0    0    7  978    0    6    0    6   10    3]
 [   1    0    3    0  953    0    7    2    3   13]
 [   3    0    0   10    0  865    8    1    4    1]
 [   6    3    0    0    5    3  937    0    4    0]
 [   0    9   16    2    3    0    0  989    0    9]
 [   2    0    2    7    2    3    2    5  950    1]
 [   5    3    3    6   13    4    2    9   12  952]]


In [190]:
# Confusion matrix of the multiclass SVC predictions against the validation labels
print(
    'Confusion matrix for one vs one binarization scheme',
    confusion_matrix(y_val, svc_test_predict),
    sep='\n',
)

Confusion matrix for one vs one binarization scheme
[[ 967    0    3    0    0    2    5    1    2    0]
 [   0 1124    3    2    0    1    3    0    2    0]
 [   5    0 1004    2    1    0    4   10    6    0]
 [   0    0   10  976    0    5    0    9    9    1]
 [   1    0    3    0  956    0    8    1    2   11]
 [   3    0    3   12    2  859    6    1    5    1]
 [   7    3    1    0    4    3  938    0    2    0]
 [   0    6   20    2    7    0    0  980    2   11]
 [   5    0    2   11    5    5    0    3  938    5]
 [   5    6    3    6   18    3    0    6   10  952]]


By studying the confusion matrices I cannot identify any apparent difference between the two methods in terms of misclassifications. Both methods deliver quite similar predictions, without any clear difference.