# PCA+k-means on CIFAR10

In [None]:
import numpy as np
from torchvision import datasets
import torchvision.transforms as transforms
from sklearn.decomposition import PCA
import time

# Settings shared by both CIFAR-10 splits. The ToTensor transform is only
# applied when samples are fetched through the dataset's indexing interface;
# the `.data` attribute read below bypasses it and exposes the raw image array.
transform = transforms.ToTensor()
cifar_kwargs = dict(root='data', download=True, transform=transform)

# Download (if needed) and open the training and test splits.
train_set = datasets.CIFAR10(train=True, **cifar_kwargs)
test_set = datasets.CIFAR10(train=False, **cifar_kwargs)

# Raw images plus integer class labels as NumPy arrays.
# NOTE(review): `.data` is expected to be (N, 32, 32, 3) — consistent with the
# 3*32*32 flattening used for PCA later on; confirm against torchvision.
train_data, test_data = train_set.data, test_set.data
train_labels = np.array(train_set.targets)
test_labels = np.array(test_set.targets)

## Dimensionality reduction using PCA

In [None]:
# Learn a 50-component projection from the training images only. Each image is
# flattened to a 3*32*32 = 3072-dimensional row before being handed to PCA.
pca = PCA(n_components=50)
pca_train = pca.fit_transform(train_data.reshape(-1, 3*32*32))

# Project the test images with the projection fitted on the training set.
# Calling fit_transform here would refit PCA on the test data, yielding
# components (and centering) that differ from the ones the clustering model is
# trained on, which would make its predictions on pca_test meaningless.
pca_test = pca.transform(test_data.reshape(-1, 3*32*32))

## Clustering model instantiation and training

In [None]:
from sklearn.cluster import KMeans

# Configure a 10-cluster k-means estimator, then fit it in place on the
# PCA-reduced training features.
clustering_model = KMeans(n_clusters=10, max_iter=400, tol=1e-4)
clustering_model.fit(pca_train)

## Visualizing results and performance evaluation

In [None]:
# Assign every PCA-reduced test sample to one of the clusters learned by the
# k-means model fitted on the training features.
labels_pred = clustering_model.predict(pca_test)

In [None]:
import import_ipynb
import DataVisuals as dv


# Wrap each split for the plotting helpers used in the cells below.
# NOTE(review): DataVisuals is defined in the project's DataVisuals module; its
# constructor is assumed to take (images, true labels, cluster labels) —
# confirm against that module.
# Training split: cluster ids are those assigned by k-means during fitting.
view_results = dv.DataVisuals(train_data, train_labels, clustering_model.labels_)
# Test split: cluster ids come from predict() on the test features.
view_results_test = dv.DataVisuals(test_data, test_labels, labels_pred)

In [None]:
# Confusion matrix of the training set: true labels vs. the cluster ids
# assigned during k-means fitting.
# NOTE(review): how cluster ids are matched to classes is defined inside
# DataVisuals.cm — verify there.
view_results.cm()

In [None]:
# Confusion matrix of the testing set: true labels vs. the cluster ids
# predicted for the test features.
# NOTE(review): how cluster ids are matched to classes is defined inside
# DataVisuals.cm — verify there.
view_results_test.cm()

In [None]:
# Scatter plot of a subset of the training set, drawn from its PCA-reduced
# features.
# NOTE(review): the subset size and which components are plotted are decided
# inside DataVisuals.scat — verify there.
view_results.scat(pca_train)

In [None]:
# Scatter plot of a subset of the testing set, drawn from its PCA-reduced
# features.
# NOTE(review): the subset size and which components are plotted are decided
# inside DataVisuals.scat — verify there.
view_results_test.scat(pca_test)

In [None]:
# Clustering quality on the training set: compare the true labels with the
# cluster ids assigned during k-means fitting.
# NOTE(review): nmi/ari/acc are presumably normalized mutual information,
# adjusted Rand index and clustering accuracy — confirm in dv.Metrics.
metric = dv.Metrics(train_labels, clustering_model.labels_)
nmi, ari, acc = metric.nmi(), metric.ari(), metric.acc()
print(
    'NMI = {:.4f} \nARI = {:.4f} \nACC = {:.4f}'.format(nmi, ari, acc)
)

In [None]:
# Clustering quality on the testing set: compare the true labels with the
# cluster ids predicted for the test features.
# NOTE(review): nmi/ari/acc are presumably normalized mutual information,
# adjusted Rand index and clustering accuracy — confirm in dv.Metrics.
metric = dv.Metrics(test_labels, labels_pred)
nmi, ari, acc = metric.nmi(), metric.ari(), metric.acc()
print(
    'NMI = {:.4f} \nARI = {:.4f} \nACC = {:.4f}'.format(nmi, ari, acc)
)