In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

In [27]:
class KMeans:
    
    def __init__(self,K,max_iter = 100,diff = 1e-4):
        self.k = K
        self.max_iter = max_iter
        self.diff = diff
        self.centroids = None

    def centroids_init(self,X):
        self.centroids = X[np.random.choice(X.shape[0],self.k,replace=False)]
    
    def compute_distances(self, X, centroids):
        return np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)

    def assign_cluster(self,X):
        distances = self.compute_distances(X,self.centroids)
        cluster = np.argmin(distances,axis=1)
        return cluster
    
    def update_centroids(self,X,labels):
        new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(self.k)])
        return new_centroids

    def compute_objective(self, X, labels):
        distances = np.linalg.norm(X - self.centroids[labels], axis=1)
        return np.sum(distances**2)
    
    def fit(self, X):
        self.centroids_init(X)
        for i in range(self.max_iter):
            cluster_assignments = self.assign_cluster(X)
            new_centroids = self.update_centroids(X, cluster_assignments)
            if np.all(np.abs(new_centroids - self.centroids) < self.diff):
                print(f"Converged after {i+1} iterations.")
                break
            
            self.centroids = new_centroids
        return cluster_assignments, self.centroids


In [15]:
train_data = pd.read_csv('mnist_train.csv')
y_train_full = train_data.iloc[:, 0].values
X_train_full = train_data.iloc[:, 1:].values
 
test_data = pd.read_csv('mnist_test.csv')
y_test = test_data.iloc[:, 0].values
X_test = test_data.iloc[:, 1:].values
 
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0
 
 
X_combined = np.vstack((X_train_full, X_test))
y_combined = np.hstack((y_train_full, y_test))
 
 
X_train, X_remaining, y_train, y_remaining = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42, stratify=y_combined
)
 
print(f"Training Data Shape: {X_train.shape}, Training Labels Shape: {y_train.shape}")
print(f"Test Data Shape: {X_test.shape}, Test Labels Shape: {y_test.shape}")

Training Data Shape: (55998, 784), Training Labels Shape: (55998,)
Test Data Shape: (9999, 784), Test Labels Shape: (9999,)


In [22]:
def evaluate_objective(X, y, K_values):
    for K in K_values:
        print(f"\nEvaluating for K = {K}")
        kmeans = KMeans(K=K)
        cluster_assignments, centroids = kmeans.fit(X)
        objective = kmeans.compute_objective(X, cluster_assignments)
        print(f"KMeans Objective Function Value: {objective:.4f}")
        

def compute_purity(y_true, cluster_assignments, K):
    cluster_to_label = {}
    for cluster in range(K):
        indices = np.where(cluster_assignments == cluster)[0]
        cluster_labels = y_true[indices]
        if len(cluster_labels) > 0:
            most_common_label = Counter(cluster_labels).most_common(1)[0][0]
            cluster_to_label[cluster] = most_common_label
    
    y_pred = np.array([cluster_to_label[c] for c in cluster_assignments])
    accuracy = np.mean(y_pred == y_true)
    return accuracy

def compute_gini_index(y_true, cluster_assignments, K):
    total_samples = len(y_true)
    gini_total = 0

    for cluster in range(K):
        indices = np.where(cluster_assignments == cluster)[0]
        cluster_labels = y_true[indices]
        if len(cluster_labels) == 0:
            continue
        
        label_counts = Counter(cluster_labels)
        cluster_size = len(cluster_labels)
        gini = 1 - sum((count / cluster_size) ** 2 for count in label_counts.values())
        gini_total += (cluster_size / total_samples) * gini
    
    return gini_total


In [18]:
K_values = [10, 5, 20]
evaluate_objective(X_train, y_train, K_values)


Evaluating for K = 10
Converged after 99 iterations.
KMeans Objective Function Value: 2211648.8621

Evaluating for K = 5
Converged after 27 iterations.
KMeans Objective Function Value: 2432322.2072

Evaluating for K = 20
Converged after 90 iterations.
KMeans Objective Function Value: 1978495.3097


In [23]:

for K in K_values:
    print(f"\nEvaluating Purity and Gini Index for K = {K}")
    kmeans = KMeans(K=K)
    cluster_assignments, centroids = kmeans.fit(X_train)

    purity = compute_purity(y_train, cluster_assignments, K)
    gini_index = compute_gini_index(y_train, cluster_assignments, K)

    print(f"Purity Score: {purity:.4f}")
    print(f"Gini Index: {gini_index:.4f}")


Evaluating Purity and Gini Index for K = 10
Converged after 99 iterations.
Purity Score: 0.5892
Gini Index: 0.5374

Evaluating Purity and Gini Index for K = 5
Converged after 27 iterations.
Purity Score: 0.4516
Gini Index: 0.6590

Evaluating Purity and Gini Index for K = 20
Purity Score: 0.7075
Gini Index: 0.3981


In [25]:
from sklearn.cluster import KMeans
for K in K_values:
    print(f"\nEvaluating Purity and Gini Index for K = {K}")

    kmeans = KMeans(n_clusters=K, random_state=42, n_init=10, max_iter=300)
    cluster_assignments = kmeans.fit_predict(X_train)

    purity = compute_purity(y_train, cluster_assignments, K)
    gini_index = compute_gini_index(y_train, cluster_assignments, K)

    print(f"Purity Score: {purity:.4f}")
    print(f"Gini Index: {gini_index:.4f}")


Evaluating Purity and Gini Index for K = 10
Purity Score: 0.5841
Gini Index: 0.5440

Evaluating Purity and Gini Index for K = 5
Purity Score: 0.4521
Gini Index: 0.6586

Evaluating Purity and Gini Index for K = 20
Purity Score: 0.7095
Gini Index: 0.4027
