In [1]:
# Dataset: CIFAR-10

# Combine PCA and KMeans to classify images

# KMeans is an unsupervised clustering method. -> Only use the test dataset

# Define the batch size for training and testing

from tqdm import tqdm

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# 1. Load all test images
    # Convert to numpy array
    # Flatten each image into a 1D vector
        # -> Obtain a feature matrix of shape (10000, 3072)

# 2. Normalize features to be zero-mean, unit standard deviation (refer to here)

# 3. Use PCA to reduce the feature dimension from 3072 to 5

# 4. Use KMeans to cluster the PCA-reduced features.
    # Use KMeans from scikit-learn
    # Use 10 clusters (k=10) and random_state=0

# 5. Use the following purity metric:
def purity_score(predicted_clusters, labels):
    """Compute the purity of a clustering against ground-truth labels.

    Each cluster is mapped to the label that occurs most often among its
    members; purity is the fraction of samples whose label equals the label
    assigned to their cluster.

    Args:
        predicted_clusters: 1D sequence of assigned cluster IDs, produced by
            KMeans. Elements may be plain Python values or scalar objects
            exposing ``.item()`` (e.g. NumPy scalars or 0-d tensors).
        labels: 1D sequence of ground-truth class labels, same length and
            same accepted element types as ``predicted_clusters``.

    Returns:
        float: purity in ``[0, 1]``. Returns ``0.0`` for empty input instead
        of dividing by zero.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    def _as_plain(value):
        # Unwrap scalar containers so values hash and compare by value.
        # Objects that hash by identity would otherwise each become their
        # own dictionary key and never group together.
        return value.item() if hasattr(value, "item") else value

    cluster_ids = [_as_plain(c) for c in predicted_clusters]
    true_labels = [_as_plain(l) for l in labels]

    if len(cluster_ids) != len(true_labels):
        raise ValueError(
            "predicted_clusters and labels must have the same length, got "
            f"{len(cluster_ids)} and {len(true_labels)}"
        )
    if not true_labels:
        return 0.0

    # cluster ID -> {label: number of members carrying that label}
    label_counts = {}
    for cluster, lbl in zip(cluster_ids, true_labels):
        per_cluster = label_counts.setdefault(cluster, {})
        per_cluster[lbl] = per_cluster.get(lbl, 0) + 1

    # The samples counted as correct in a cluster are exactly those carrying
    # its majority label, so the total is the sum of per-cluster maxima.
    # Ties between labels do not affect the count.
    correct = sum(max(freq.values()) for freq in label_counts.values())

    return correct / len(true_labels)


In [None]:

# Number of samples per batch, shared by the training and test loaders.
# The name was referenced below without being assigned in this notebook, which
# raises a NameError on a fresh kernel; define it here so the cell runs on its own.
batch_size = 128

# Define a transform to convert images to tensors and normalize them
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert PIL image to tensor
    transforms.Normalize((0.4914, 0.4822, 0.4465),   # Mean for each channel
                         (0.2470, 0.2435, 0.2616)),  # Std for each channel
])

# Load the CIFAR-10 training dataset with transformations applied
train_dataset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
# Load the CIFAR-10 test dataset with the same transformations
test_dataset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

# Create a data loader for the training set
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,  # Number of samples per batch
                          shuffle=True)           # Shuffle the data each epoch
# Create a data loader for the test set
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=batch_size,  # Same batch size as training
                         shuffle=False)          # No shuffling for test data