In [1]:
# Importing necessary packages
import torch
from torchvision import datasets, transforms
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans as skl_KMeans

from classes.KMeans import KMeans

### Loading and Pre-processing Data:

In [3]:
# Fetch both MNIST splits (root='./data', download=True); ToTensor is the only transform attached.
transform = transforms.Compose([transforms.ToTensor()])
mnist_options = dict(root='./data', transform=transform, download=True)
train_dataset = datasets.MNIST(train=True, **mnist_options)   # training split
test_dataset = datasets.MNIST(train=False, **mnist_options)   # test split

# Flatten every image into a single row and convert tensors to NumPy arrays.
# NOTE(review): the `.data` / `.targets` attributes are read directly instead of
# indexing the dataset, so `transform` is presumably not applied here -- confirm.
train_X = train_dataset.data.view(len(train_dataset.data), -1).numpy()
train_y = train_dataset.targets.numpy()

test_X = test_dataset.data.view(len(test_dataset.data), -1).numpy()
test_y = test_dataset.targets.numpy()


"""
Here we are going to extract 500 data points for each class:
"""
# Quota of samples to keep per class, and the number of classes.
num_data = 500
num_classes = 10

# Per-class tally of how many samples have been kept so far.
class_counts = np.zeros(num_classes, dtype=int)

# Kept samples and their labels, in the order they appear in the training set.
masked_images = []
masked_labels = []

for image, label in zip(train_X, train_y):
    # Skip samples whose class has already reached its quota.
    if class_counts[label] >= num_data:
        continue

    masked_images.append(image)
    masked_labels.append(label)
    class_counts[label] += 1

    # The tallies only change right after a sample is kept, so this is the
    # only place where the "every class is full" condition can become true.
    if (class_counts >= num_data).all():
        break

# Rebind the training arrays to the balanced subset.
train_X = np.array(masked_images)
train_y = np.array(masked_labels)


def normalize_data(data):
    """Min-max scale ``data`` feature-wise to the range [0, 1].

    The minimum and maximum are taken along axis 0 (i.e. per feature/column);
    every value is shifted by its feature's minimum and divided by that
    feature's range (max - min). Features that are constant have a range of 0;
    their divisor is replaced by 1, so they map to 0 instead of NaN/inf.

    Parameters
    ----------
    data : array_like
        Samples along axis 0. Integer and boolean inputs are promoted to
        float64 before any arithmetic; floating-point inputs keep their dtype.

    Returns
    -------
    numpy.ndarray
        New array with the same shape as ``data``; the input is not modified.

    Raises
    ------
    ValueError
        If ``data`` is empty along axis 0 (raised by the min/max reduction).
    """
    data = np.asarray(data)

    # Promote integer/bool data first: NumPy integer arithmetic wraps around
    # silently, so e.g. int8 values 127 - (-128) would yield -1 rather than 255
    # and corrupt both the range and the shifted data.
    if data.dtype.kind in "biu":
        data = data.astype(np.float64)

    min_ = np.min(data, axis=0)
    max_ = np.max(data, axis=0)

    range_values = max_ - min_
    # Guard against division by zero for constant features. np.where (rather
    # than masked assignment) also works when the reduction yields a scalar,
    # which is the case for 1-D input.
    range_values = np.where(range_values == 0, np.ones_like(range_values), range_values)

    return (data - min_) / range_values

# Min-max normalize the balanced training subset feature by feature (see normalize_data).
# This rebinds train_X, so the unscaled values are no longer reachable under that name.
train_X = normalize_data(train_X)

### KMeans clustering example:

In [11]:
# Cluster the normalized training subset with the project's KMeans and with
# scikit-learn's KMeans, then print one summary distance for each.
n_clusters = 10

# Project implementation, configured with cosine similarity.
# NOTE(review): no seed is passed here; if classes.KMeans initializes its
# centroids randomly this figure will vary between runs -- confirm whether the
# class accepts a seed.
kmeans = KMeans(n_clusters=n_clusters, max_iters=100, similarity="cosine")
kmeans.fit(train_X)
clus_dist = kmeans.internal_cluster_dist()

# scikit-learn reference run; random_state pins the centroid initialization so
# the reported value is repeatable across Restart & Run All.
kmeans_sk = skl_KMeans(n_clusters=n_clusters, random_state=0)
kmeans_sk.fit(train_X)
# inertia_ is the sum of squared distances of samples to their closest centre.
# NOTE(review): the project model uses similarity="cosine", so the two figures
# may not measure the same quantity -- verify internal_cluster_dist() before
# comparing them directly.
clus_dist_sk = kmeans_sk.inertia_

# Take k from n_clusters so the messages stay correct if the value above changes.
print("Internal cluster distance for k = %d is: %f" % (n_clusters, clus_dist))
print("Internal cluster distance for k = %d (sklearn) is: %f" % (n_clusters, clus_dist_sk), end="\n\n")

Converged




Internal cluster distance for k = 10 is: 195795.110037
Internal cluster distance for k = 10 (sklearn) is: 194619.342969

