In [19]:
from sklearn.datasets import fetch_openml

# Fetch the currently active OpenML version of the 784-feature MNIST dataset.
mnist = fetch_openml('mnist_784', version='active')

# The returned object is dict-like; list the fields it exposes first.
print(mnist.keys())

# Dimensions of the feature matrix and of the label vector.
for label, values in (("Data shape:", mnist.data), ("Target shape:", mnist.target)):
    print(label, values.shape)

# Only the first ten feature names are shown as a preview.
print("Feature names:", mnist.feature_names[:10], "...")

# Remaining metadata, printed in the same order as the fields are listed here.
for label, field in (
    ("Target names:", "target_names"),
    ("Description:", "DESCR"),
    ("Details:", "details"),
):
    print(label, getattr(mnist, field))

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])
Data shape: (70000, 784)
Target shape: (70000,)
Feature names: ['pixel1', 'pixel2', 'pixel3', 'pixel4', 'pixel5', 'pixel6', 'pixel7', 'pixel8', 'pixel9', 'pixel10'] ...
Target names: ['class']
Description: **Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges  
**Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown  
**Please cite**:  

The MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  

It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and

In [20]:
# shuffle, split, and scale the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the samples for testing; the fixed seed keeps the shuffle
# repeatable. Labels are cast to integers once, before splitting, so both
# label splits come out as ints.
X_train, X_test, y_train, y_test = train_test_split(
    mnist.data,
    mnist.target.astype(int),
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
# KNN Classifier on all features
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 5-nearest-neighbours fitted on the full scaled feature set.
knn_clf = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score the held-out split by comparing predictions against the true labels.
y_pred_knn = knn_clf.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)

print("KNN Classifier Accuracy:", knn_accuracy)

KNN Classifier Accuracy: 0.9457857142857143


In [22]:
# PCA for 2, 10, 50 components
import time
from sklearn.decomposition import PCA

for n_components in [2, 10, 50]:

    # Time only the projection step (fit on train, transform of both splits).
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start = time.perf_counter()
    # random_state pins the result when PCA selects a randomized SVD solver,
    # so the reported accuracies are repeatable across runs.
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    end = time.perf_counter()
    print(f"Time taken for {n_components} PCA components:", end - start)

    # Same 5-NN classifier as the full-feature baseline, now on the
    # reduced representation.
    knn_clf_pca = KNeighborsClassifier(n_neighbors=5)
    knn_clf_pca.fit(X_train_pca, y_train)
    y_pred_knn_pca = knn_clf_pca.predict(X_test_pca)

    knn_accuracy_pca = accuracy_score(y_test, y_pred_knn_pca)
    print(f"KNN Classifier Accuracy with {n_components} PCA components:", knn_accuracy_pca)

Time taken for 2 PCA components: 1.3388962745666504
KNN Classifier Accuracy with 2 PCA components: 0.32107142857142856
Time taken for 10 PCA components: 1.2925469875335693
KNN Classifier Accuracy with 10 PCA components: 0.9115
Time taken for 50 PCA components: 1.363429307937622
KNN Classifier Accuracy with 50 PCA components: 0.9588571428571429


In [None]:
# TSNE for 2, 10, 50 components
import time

import numpy as np
from sklearn.manifold import TSNE

# Exact t-SNE works on the full pairwise-distance matrix, so time and memory
# grow quadratically with the number of samples. Embedding every digit is not
# practical, so a fixed random subset is used. Raise these limits if the
# available time and memory allow.
TSNE_TRAIN_SAMPLES = 4000
TSNE_TEST_SAMPLES = 1000

# Seeded generator so the same subset is drawn on every run.
tsne_rng = np.random.default_rng(42)
train_idx = tsne_rng.choice(
    len(X_train), size=min(TSNE_TRAIN_SAMPLES, len(X_train)), replace=False
)
test_idx = tsne_rng.choice(
    len(X_test), size=min(TSNE_TEST_SAMPLES, len(X_test)), replace=False
)

# np.asarray allows positional indexing whether the inputs are arrays or
# pandas objects.
X_train_sub = np.asarray(X_train)[train_idx]
X_test_sub = np.asarray(X_test)[test_idx]
y_train_sub = np.asarray(y_train)[train_idx]
y_test_sub = np.asarray(y_test)[test_idx]

# t-SNE has no transform() for unseen data, and two separate fit_transform
# calls produce embeddings in unrelated coordinate systems. Train and test
# points are therefore embedded together and split apart afterwards.
# Note: the embedding sees the test features (never the labels), so this is a
# transductive evaluation.
X_combined = np.vstack([X_train_sub, X_test_sub])
n_train_sub = len(X_train_sub)

for n_components in [2, 10, 50]:
    start = time.perf_counter()
    # method='exact' is kept for every setting: the Barnes-Hut approximation
    # only supports fewer than 4 output dimensions.
    tsne = TSNE(
        n_components=n_components,
        perplexity=30,
        method='exact',
        random_state=42,
    )
    X_combined_tsne = tsne.fit_transform(X_combined)
    X_train_tsne = X_combined_tsne[:n_train_sub]
    X_test_tsne = X_combined_tsne[n_train_sub:]

    end = time.perf_counter()
    print(f"Time taken for {n_components} t-SNE components:", end - start)

    # 5-NN on the shared embedding: fit on the train rows, score the test rows.
    knn_clf_tsne = KNeighborsClassifier(n_neighbors=5)
    knn_clf_tsne.fit(X_train_tsne, y_train_sub)
    y_pred_knn_tsne = knn_clf_tsne.predict(X_test_tsne)

    knn_accuracy_tsne = accuracy_score(y_test_sub, y_pred_knn_tsne)
    print(f"KNN Classifier Accuracy with {n_components} t-SNE components:", knn_accuracy_tsne)

In [None]:
import umap

# Create a UMAP reducer using the constructor's default arguments.
# NOTE(review): no random_state is passed, unlike the seeded train/test split
# earlier in the notebook — confirm whether unseeded embeddings are intended.
reducer = umap.UMAP()
