In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
from scipy.stats import mode
from scipy.spatial.distance import cdist
from collections import Counter

In [3]:
# Read both CSV files up front.
train_data = pd.read_csv('mnist_train.csv')
test_data = pd.read_csv('mnist_test.csv')

# Column 0 is taken as the label; every remaining column is a feature.
# Features are divided by 255.0 (presumably raw 0-255 pixel intensities,
# which would map them into [0, 1] -- TODO confirm against the CSV contents).
y_train_full = train_data.iloc[:, 0].values
X_train_full = train_data.iloc[:, 1:].values / 255.0

y_test = test_data.iloc[:, 0].values
X_test = test_data.iloc[:, 1:].values / 255.0

# Pool the two files into one dataset before re-splitting it.
X_combined = np.concatenate([X_train_full, X_test], axis=0)
y_combined = np.concatenate([y_train_full, y_test])

# Stratified 80/20 split of the pooled data with a fixed seed.
# NOTE(review): X_remaining / y_remaining are not used anywhere in the
# visible notebook, and the "Test Data" line below reports the shape of the
# file-based X_test, not of this held-out split -- confirm this is intended.
split_options = dict(test_size=0.2, random_state=42, stratify=y_combined)
X_train, X_remaining, y_train, y_remaining = train_test_split(
    X_combined, y_combined, **split_options
)

print(f"Training Data Shape: {X_train.shape}, Training Labels Shape: {y_train.shape}")
print(f"Test Data Shape: {X_test.shape}, Test Labels Shape: {y_test.shape}")

Training Data Shape: (55998, 784), Training Labels Shape: (55998,)
Test Data Shape: (9999, 784), Test Labels Shape: (9999,)


In [None]:
class SoftKMeans:
    """Soft k-means clustering with exponential responsibilities.

    Each sample i receives a responsibility ``pi_ik`` for every cluster k
    that is proportional to ``exp(-beta * d_ik)``, where ``d_ik`` is the
    (un-squared) Euclidean distance from the sample to centroid k. Each
    centroid is then moved to the responsibility-weighted mean of all samples.

    NOTE(review): some formulations of soft k-means put the *squared*
    distance in the exponent; the un-squared form is kept here so existing
    results are not changed -- confirm which one is intended.

    Parameters
    ----------
    k : int
        Number of clusters.
    beta : float
        Stiffness of the assignment; larger values give harder assignments.
    max_iter : int
        Maximum number of update iterations performed by ``fit``.
    diff : float
        ``fit`` stops once the Frobenius norm of the centroid shift between
        two consecutive iterations drops below this value.

    Attributes
    ----------
    centroids : ndarray of shape (k, n_features) or None
        Current centroids; ``None`` until initialised.
    pi_ik : ndarray of shape (n_samples, k) or None
        Responsibilities from the most recent ``compute_distance`` call.
    """

    def __init__(self, k=10, beta=0.1, max_iter=100, diff=1e-4):
        self.k = k
        self.beta = beta
        self.centroids = None
        self.max_iter = max_iter
        self.diff = diff
        self.pi_ik = None

    def initialize_clusters(self, X):
        """Pick ``k`` distinct rows of ``X`` as the initial centroids.

        A private ``RandomState(42)`` is used instead of reseeding the global
        NumPy generator: it selects the same rows as the former
        ``np.random.seed(42)`` + ``np.random.choice`` pair, but no longer
        resets the random stream of the rest of the notebook.

        Raises
        ------
        ValueError
            Raised by NumPy when ``k`` exceeds the number of rows in ``X``.
        """
        X = np.asarray(X)
        rng = np.random.RandomState(42)
        indices = rng.choice(X.shape[0], self.k, replace=False)
        self.centroids = X[indices]

    def _responsibilities(self, X):
        """Return the (n_samples, k) responsibility matrix without storing it."""
        # cdist yields the (n, k) distance matrix directly; broadcasting
        # X[:, None] - centroids would materialise an (n, k, d) temporary.
        distances = cdist(
            np.asarray(X, dtype=float),
            np.asarray(self.centroids, dtype=float),
            metric="euclidean",
        )
        logits = -self.beta * distances
        # Shift so each row's largest logit is 0. The normalised result is
        # mathematically unchanged, but exp() can no longer underflow to 0
        # for every cluster at once (which used to produce 0/0 = NaN rows).
        logits -= logits.max(axis=1, keepdims=True)
        weights = np.exp(logits)
        return weights / weights.sum(axis=1, keepdims=True)

    def compute_distance(self, X):
        """Compute responsibilities for ``X``, cache them in ``pi_ik`` and return them.

        Each row of the returned (n_samples, k) array sums to 1.
        """
        self.pi_ik = self._responsibilities(X)
        return self.pi_ik

    def update_centroids(self, X):
        """Move each centroid to the ``pi_ik``-weighted mean of the rows of ``X``.

        ``pi_ik`` must hold the responsibilities of these same rows. A
        cluster whose total responsibility is exactly zero keeps its previous
        centroid instead of becoming NaN through a division by zero.
        """
        X = np.asarray(X)
        mass = self.pi_ik.sum(axis=0)
        weighted_sum = self.pi_ik.T @ X
        updated = np.array(self.centroids, dtype=float)
        has_mass = mass > 0
        updated[has_mass] = weighted_sum[has_mass] / mass[has_mass, np.newaxis]
        self.centroids = updated

    def fit(self, X):
        """Run soft k-means on ``X`` until convergence or ``max_iter`` iterations.

        Returns
        -------
        SoftKMeans
            The fitted estimator itself, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        self.initialize_clusters(X)

        for _ in range(self.max_iter):
            old_centroids = self.centroids.copy()

            self.compute_distance(X)
            self.update_centroids(X)

            # Default matrix norm is Frobenius: total centroid movement.
            if np.linalg.norm(self.centroids - old_centroids) < self.diff:
                break

        return self

    def predict(self, X):
        """Return, for each row of ``X``, the index of its most responsible cluster.

        The responsibilities are computed on the fly and are NOT written to
        ``pi_ik``, so predicting on new data does not overwrite the cached
        matrix left by ``fit`` / ``compute_distance``.
        """
        return np.argmax(self._responsibilities(X), axis=1)
        

In [5]:
def calculate_purity(y_true, y_pred, K):
    """Return the clustering purity of ``y_pred`` with respect to ``y_true``.

    For every cluster id in ``range(K)`` the number of members carrying the
    cluster's most frequent true label is counted; purity is the sum of those
    counts divided by the total number of samples.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels.
    y_pred : array-like of shape (n_samples,)
        Cluster index assigned to each sample.
    K : int
        Number of clusters; ids ``0 .. K-1`` are inspected.

    Returns
    -------
    float
        Purity in [0, 1]; ``0.0`` when ``y_true`` is empty.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.size == 0:
        # Nothing to score; avoids a division by zero below.
        return 0.0

    majority_total = 0
    for cluster_id in range(K):
        cluster_labels = y_true[y_pred == cluster_id]
        if cluster_labels.size > 0:
            # Only the size of the majority class matters, so count labels
            # directly. This replaces scipy.stats.mode(...).mode[0], which
            # fails on SciPy versions where mode() returns a scalar for
            # 1-D input.
            _, counts = np.unique(cluster_labels, return_counts=True)
            majority_total += int(counts.max())
    return majority_total / len(y_true)


In [6]:
from collections import Counter
import numpy as np
from scipy.spatial.distance import cdist

# Sweep the stiffness parameter beta and record, for each value:
#   * the soft objective  sum_i sum_k pi_ik * ||x_i - c_k||^2
#   * the purity of the hard (argmax) assignment
#   * the unweighted mean Gini impurity over the non-empty clusters
# The model is fitted on X_train and scored on X_combined.
results = {}

beta = [0.1, 1, 10]

for b in beta:
    soft_kmeans = SoftKMeans(k=10, beta=b, max_iter=100)
    soft_kmeans.fit(X_train)

    # Compute the responsibilities once and derive the hard assignment from
    # them. predict() is argmax over the same matrix, so calling both
    # predict() and compute_distance() did the expensive work twice.
    weights = soft_kmeans.compute_distance(X_combined)
    y_pred = np.argmax(weights, axis=1)

    # Compute distances
    distances = cdist(X_combined, soft_kmeans.centroids, metric='euclidean')
    objective_value = np.sum(weights * distances**2)

    # Assign cluster labels and accumulate the per-cluster statistics in a
    # single pass over the clusters instead of one Python-level pass per sample.
    cluster_labels = []        # majority true label per cluster, -1 if empty
    majority_total = 0         # samples that carry their cluster's majority label
    label_concentrations = []  # sum_c p_c^2 for every non-empty cluster
    for cluster_id in range(soft_kmeans.k):
        members = y_combined[y_pred == cluster_id]
        if members.size == 0:
            cluster_labels.append(-1)
            continue  # empty clusters contribute to neither metric
        label_values, label_counts = np.unique(members, return_counts=True)
        cluster_labels.append(label_values[np.argmax(label_counts)])
        majority_total += label_counts.max()
        label_concentrations.append(np.sum((label_counts / members.size) ** 2))

    purity = majority_total / len(y_pred)

    # 1 - mean(sum_c p_c^2) equals the mean of the per-cluster Gini impurities.
    gini_index = 1 - np.mean(label_concentrations)

    results[b] = (objective_value, purity, gini_index)

for b, (obj_val, pur, gini) in results.items():
    print(f"Beta: {b}")
    print(f"Objective Value: {obj_val}")
    print(f"Purity: {pur}")
    print(f"Gini Index: {gini}")
    print("-" * 30)


Beta: 0.1
Objective Value: 3692313.3908608477
Purity: 0.26686476756478755
Gini Index: 0.7371381105801285
------------------------------
Beta: 1
Objective Value: 3692313.390788205
Purity: 0.20949169976285037
Gini Index: 0.7910950348893235
------------------------------
Beta: 10
Objective Value: 2760141.185328836
Purity: 0.5890311151747193
Gini Index: 0.5071621009829562
------------------------------
