In [18]:
%reset -f

# Imports + Codebook configuration

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint
from sklearn.datasets import fetch_openml
from IPython.display import display, HTML
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import time
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import  RidgeClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from matplotlib import rcParams
from sklearn import svm


rcParams['figure.figsize'] = (10, 6)
rcParams['legend.fontsize'] = 16
rcParams['axes.labelsize'] = 16


# Loading and random printing

In [21]:
# Load MNIST data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

# Print the size of the data and labels
print('Data size: %d, %d' % (X.shape[0],X.shape[1]))
print('Labels size: %d' % (y.shape))

# Convert to numpy arrays
X = X.to_numpy()
y = y.to_numpy()

# # Show some random training samples
# num_random_samples = 4
# random_samples = np.random.randint(0, X.shape[0], num_random_samples)
# for random_sample_i in random_samples:
#     imi = X.iloc[random_sample_i,:].values.reshape(28,28)
#     fig, ax1 = plt.subplots(1,1)
#     figtitle = "training image #%d" % random_sample_i
#     ax1.imshow(imi, cmap=plt.get_cmap('gray'))
#     ax1.set_title(figtitle)
#     plt.show()
#     print('Label: %s' % (y[random_sample_i]))

Data size: 70000, 784
Labels size: 70000


# Split

In [22]:
X_train_and_val, X_test, y_train_and_val, y_test = train_test_split(X, y, test_size = 0.1, random_state=5)
X_train, X_val, y_train, y_val = train_test_split(X_train_and_val, y_train_and_val, test_size = 0.1, random_state=7)

print('# training samples: ', X_train.shape[0])
print('# validation samples: ', X_val.shape[0])
print('# test samples: ', X_test.shape[0])

# training samples:  56700
# validation samples:  6300
# test samples:  7000


# Scaling

In [23]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform on training data
X_val_scaled = scaler.transform(X_val)          # Only transform on validation data
X_test_scaled = scaler.transform(X_test)        # Only transform on test data



Scaling the data should help to converge the models faster.

# K-nn

In [6]:
k_neighbors = range(1, 10)

for k in k_neighbors: 
    # Define a knn classifier with the training data
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train_scaled, y_train)

    # Start timer to report the time
    start_time = time.time()

    # Evaluate the model with the test split and print results
    acc_knn_clf = knn_clf.score(X_val_scaled, y_val)
    print(f"k={k} accuracy={acc_knn_clf * 100:.2f}%, time={time.time() - start_time:.1f}s")

k=1 accuracy=94.92%, time=3.4s
k=2 accuracy=94.24%, time=3.5s
k=3 accuracy=94.90%, time=3.6s
k=4 accuracy=94.70%, time=3.5s
k=5 accuracy=94.75%, time=3.5s
k=6 accuracy=94.49%, time=3.5s
k=7 accuracy=94.43%, time=3.7s
k=8 accuracy=94.29%, time=3.8s
k=9 accuracy=94.21%, time=4.4s


Different neighbors does not improve the accuracy

# K-NN + PCA

In [7]:
dims = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600]

# For several values of the k parameter
for dim in dims:
    
    pca = PCA(n_components=dim)
    X_train_pca_scaled = pca.fit_transform(X_train_scaled)
    X_val_pca_scaled = pca.transform(X_val_scaled)
    
    # Start timer to report the time
    start_time = time.time()

    # Define a knn classifier with the training data
    knn_clf = KNeighborsClassifier(n_neighbors=1)
    knn_clf.fit(X_train_pca_scaled, y_train)

    # Evaluate the model with the test split and print results
    acc_knn_clf = knn_clf.score(X_val_pca_scaled, y_val)
    print(f"dims after PCA={dim}, accuracy={acc_knn_clf * 100:.2f}%, time={time.time() - start_time:.1f} s")

dims after PCA=10, accuracy=90.21%, time=0.4 s
dims after PCA=20, accuracy=95.02%, time=0.4 s
dims after PCA=30, accuracy=95.65%, time=0.5 s
dims after PCA=40, accuracy=95.94%, time=0.5 s
dims after PCA=50, accuracy=96.00%, time=0.6 s
dims after PCA=60, accuracy=96.02%, time=0.7 s
dims after PCA=70, accuracy=95.90%, time=0.8 s
dims after PCA=80, accuracy=96.05%, time=0.7 s
dims after PCA=90, accuracy=96.05%, time=0.8 s
dims after PCA=100, accuracy=96.10%, time=0.9 s
dims after PCA=200, accuracy=95.40%, time=1.4 s
dims after PCA=300, accuracy=95.38%, time=1.8 s
dims after PCA=400, accuracy=95.14%, time=2.6 s
dims after PCA=500, accuracy=94.95%, time=3.2 s
dims after PCA=600, accuracy=95.00%, time=3.4 s


20 dimensions explain the 95% of accuracy of a K-nn model

# Gridsearch for k with 20 dim

In [98]:
pca = PCA(n_components=20)
X_train_pca_scaled = pca.fit_transform(X_train_scaled)
X_val_pca_scaled = pca.transform(X_val_scaled)

# Set up a parameter grid to search for the best k
param_grid = {'n_neighbors': list(range(1, 21))}

# Perform grid search with cross-validation
knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_pca_scaled, y_train)

# Get the best number of neighbors
best_k = grid_search.best_params_['n_neighbors']
print(f"Optimal number of neighbors (k): {best_k}")

Optimal number of neighbors (k): 5


Optimal k=5

# K-nn with optimal k and optimal dimensions of PCA

In [96]:
pca = PCA(n_components=20)
X_train_pca_scaled = pca.fit_transform(X_train_scaled)
X_val_pca_scaled = pca.transform(X_val_scaled)

# Start timer to report the time
start_time = time.time()
# Define a knn classifier with the training data
knn_clf = KNeighborsClassifier(n_neighbors=best_k)
knn_clf.fit(X_train_pca_scaled, y_train)

# Evaluate the model with the test split and print results
acc_knn_clf = knn_clf.score(X_val_pca_scaled, y_val)
print(f"dims after PCA=20, accuracy={acc_knn_clf * 100:.2f}%, time={time.time() - start_time:.1f} s")


dims after PCA=90, accuracy=95.22%, time=0.5 s


# K-nn with all dimensions and optimal k

In [100]:
# Start timer to report the time
start_time = time.time()
# Define a knn classifier with the training data
knn_clf = KNeighborsClassifier(n_neighbors=best_k)
knn_clf.fit(X_train_scaled, y_train)

# Evaluate the model with the test split and print results
acc_knn_clf = knn_clf.score(X_val_scaled, y_val)
print(f"k-nn with k={best_k}, accuracy={acc_knn_clf * 100:.2f}%, time={time.time() - start_time:.1f} s")


k-nn with k=5, accuracy=94.75%, time=3.6 s


Poorer performance than without PCA

# Logistic Regression

In [53]:

# Start timer to report the time
start_time = time.time()

# Fit a logistic regression with scaled data
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train_scaled, y_train)
acc_lr = lr.score(X_val_scaled, y_val)
print(f'Accuracy, logistic classifier: {acc_lr * 100:.2f}, time={time.time() - start_time:.1f}s')

Accuracy, logistic regression: 0.9136507936507936, time=36.6s


Logistic regression performs poor than k-nn.

# Logistic Regression with 20 dim

In [103]:
pca = PCA(n_components=20)
X_train_pca_scaled = pca.fit_transform(X_train_scaled)
X_val_pca_scaled = pca.transform(X_val_scaled)

# Start timer to report the time
start_time = time.time()

# Fit a logistic regression with scaled data
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train_pca_scaled, y_train)
acc_lr = lr.score(X_val_pca_scaled, y_val)
print(f'Accuracy, logistic classifier with 20 components of PCA: {acc_lr * 100:.2f}, time={time.time() - start_time:.1f}s')


Accuracy, logistic classifier with 20 components of PCA: 87.13, time=4.4s


In Logistic Regression, PCA lowers significantly the accuracy.

# Ridge

In [101]:
# Fit a ridge classifier for different alphas
start_time = time.time()

alphas = [0.001, 0.01, 0.1, 1, 10, 100]

for alpha in alphas:
    rc = RidgeClassifier(alpha=0.001)
    rc.fit(X_train_scaled, y_train)
    acc_rc = rc.score(X_val, y_val)
    print(f'Accuracy, ridge classifier, alpha={alpha}: {acc_rc * 100:.2f}%, time={time.time() - start_time:.1f}s')

Accuracy, ridge classifier0.001: 77.06, time=2.6s
Accuracy, ridge classifier0.01: 77.06, time=5.1s
Accuracy, ridge classifier0.1: 77.06, time=7.5s
Accuracy, ridge classifier1: 77.06, time=9.9s
Accuracy, ridge classifier10: 77.06, time=12.4s
Accuracy, ridge classifier100: 77.06, time=15.9s


Ridge doesnt pass the threshold for cross-validation

# SVM with 20 and 30 PCA components

In [24]:
n_components = [20, 30]
for component in n_components:
    pca = PCA(component)
    X_train_pca_scaled = pca.fit_transform(X_train_scaled)
    X_val_pca_scaled = pca.transform(X_val_scaled)

    # Start timer to report the time
    start_time = time.time()

    # Fit a SVM classifier
    svm_n = svm.SVC(C=1.0,kernel='rbf')
    svm_n.fit(X_train_pca_scaled, y_train)
    acc_svm_n = svm_n.score(X_val_pca_scaled, y_val)
    print(f'Accuracy, SVM with {component} components of PCA: {acc_svm_n * 100:.2f}%, time={time.time() - start_time:.1f}s')


AttributeError: module 'sklearn.svm' has no attribute 'score'