In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_openml
from collections import Counter

In [13]:
class KMeansCustom:
    """Minimal k-means clustering on a dense 2-D NumPy array.

    Each round assigns every row to its nearest centroid (Euclidean
    distance) and then moves each centroid to the mean of its rows.

    Parameters
    ----------
    K : int
        Number of clusters.
    max_iter : int, default 100
        Maximum number of assign/update rounds performed by ``fit``.
    diff : float, default 1e-4
        Convergence tolerance: iteration stops once every centroid
        coordinate moves by less than this amount in one round.
    random_state : int or None, default None
        Seed for the initial centroid draw. ``None`` draws from NumPy's
        global random state, so ``np.random.seed`` still controls it.
    """

    def __init__(self, K, max_iter=100, diff=1e-4, random_state=None):
        self.k = K
        self.max_iter = max_iter
        self.diff = diff
        self.random_state = random_state
        self.centroids = None

    def centroids_init(self, X):
        """Pick ``k`` distinct rows of ``X`` as the starting centroids."""
        if self.random_state is None:
            rng = np.random  # global RNG
        else:
            rng = np.random.RandomState(self.random_state)
        self.centroids = X[rng.choice(X.shape[0], self.k, replace=False)]

    def compute_distances(self, X, centroids):
        """Return the (n_rows, n_centroids) matrix of Euclidean distances."""
        # Broadcasting materialises an (n_rows, n_centroids, n_features)
        # temporary before the norm reduces the last axis.
        return np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)

    def assign_cluster(self, X):
        """Return, for each row of ``X``, the index of its nearest centroid."""
        distances = self.compute_distances(X, self.centroids)
        cluster = np.argmin(distances, axis=1)
        return cluster

    def update_centroids(self, X, labels):
        """Return new centroids as the mean of the rows assigned to each cluster.

        A cluster with no assigned rows keeps its current centroid; taking
        the mean of an empty slice would yield NaN, which would poison all
        later distance computations and prevent convergence.

        Raises
        ------
        ValueError
            If a cluster is empty and no current centroids are available
            to fall back on.
        """
        rows = []
        for i in range(self.k):
            members = X[labels == i]
            if len(members) > 0:
                rows.append(members.mean(axis=0))
            elif self.centroids is None:
                raise ValueError(
                    f"cluster {i} is empty and there is no current centroid to keep"
                )
            else:
                rows.append(self.centroids[i])
        return np.array(rows)

    def compute_objective(self, X, labels):
        """Return the sum of squared distances from each row to ``centroids[labels]``."""
        distances = np.linalg.norm(X - self.centroids[labels], axis=1)
        return np.sum(distances**2)

    def fit(self, X):
        """Run k-means on ``X`` and return ``(labels, centroids)``.

        The returned labels are always the nearest-centroid assignment for
        the returned centroids, including when ``max_iter`` is exhausted
        or is 0.
        """
        self.centroids_init(X)
        for i in range(self.max_iter):
            cluster_assignments = self.assign_cluster(X)
            new_centroids = self.update_centroids(X, cluster_assignments)
            if np.all(np.abs(new_centroids - self.centroids) < self.diff):
                print(f"Converged after {i+1} iterations.")
                break

            self.centroids = new_centroids
        else:
            # No convergence break happened: the centroids were replaced after
            # the last assignment (or no assignment ran at all), so recompute
            # the labels against the centroids that are actually returned.
            cluster_assignments = self.assign_cluster(X)
        return cluster_assignments, self.centroids


In [14]:
def evaluate_objective(X, y, K_values):
    """Fit a KMeansCustom model for each K and print its objective value.

    Parameters
    ----------
    X : array
        Data passed to ``KMeansCustom.fit`` and ``compute_objective``.
    y : any
        Accepted for signature compatibility; not used by this function.
    K_values : iterable of int
        Cluster counts to evaluate, one fit per value.
    """
    for n_clusters in K_values:
        print(f"\nEvaluating for K = {n_clusters}")
        model = KMeansCustom(K=n_clusters)
        # fit returns (labels, centroids); only the labels are needed here.
        labels, _ = model.fit(X)
        sse = model.compute_objective(X, labels)
        print(f"KMeans Objective Function Value: {sse:.4f}")
        

def compute_purity(y_true, cluster_assignments, K):
    """Return the purity of a clustering with respect to the true labels.

    For each cluster index in ``range(K)`` the size of its most frequent
    true label is counted; purity is the sum of those counts divided by
    the number of samples.

    Parameters
    ----------
    y_true : array-like
        True label per sample. Lists, NumPy arrays and pandas Series are
        accepted; values are taken positionally.
    cluster_assignments : array-like
        Cluster index per sample, same length as ``y_true``.
    K : int
        Number of clusters; indices outside ``range(K)`` are ignored.

    Returns
    -------
    float
        Purity in [0, 1]; 0.0 when there are no samples.

    Raises
    ------
    ValueError
        If the two inputs have different lengths.
    """
    # Convert up front so boolean masks select by position; a pandas Series
    # would otherwise be indexed by label and a plain list would fail.
    y_true = np.asarray(y_true)
    cluster_assignments = np.asarray(cluster_assignments)
    if len(y_true) != len(cluster_assignments):
        raise ValueError(
            f"y_true has {len(y_true)} entries but cluster_assignments has "
            f"{len(cluster_assignments)}"
        )
    if len(y_true) == 0:
        return 0.0

    majority_total = 0
    for cluster in range(K):
        cluster_labels = y_true[cluster_assignments == cluster]
        if len(cluster_labels) > 0:
            # most_common(1) yields [(label, count)]; the count is all we need.
            majority_total += Counter(cluster_labels.tolist()).most_common(1)[0][1]
    return majority_total / len(y_true)

def compute_gini_index(y_true, cluster_assignments, K):
    """Return the size-weighted Gini impurity of a clustering.

    For each non-empty cluster index in ``range(K)`` the impurity is
    ``1 - sum(p_label ** 2)`` over the true labels in that cluster; the
    result is the sum of those impurities weighted by the cluster's share
    of all samples.

    Parameters
    ----------
    y_true : array-like
        True label per sample. Lists, NumPy arrays and pandas Series are
        accepted; values are taken positionally.
    cluster_assignments : array-like
        Cluster index per sample, same length as ``y_true``.
    K : int
        Number of clusters; indices outside ``range(K)`` are ignored.

    Returns
    -------
    float
        Weighted Gini impurity; 0.0 when there are no samples.

    Raises
    ------
    ValueError
        If the two inputs have different lengths.
    """
    # Convert up front so boolean masks select by position; a pandas Series
    # would otherwise be indexed by label and a plain list would fail.
    y_true = np.asarray(y_true)
    cluster_assignments = np.asarray(cluster_assignments)
    total_samples = len(y_true)
    if total_samples != len(cluster_assignments):
        raise ValueError(
            f"y_true has {total_samples} entries but cluster_assignments has "
            f"{len(cluster_assignments)}"
        )

    gini_total = 0.0
    for cluster in range(K):
        cluster_labels = y_true[cluster_assignments == cluster]
        cluster_size = len(cluster_labels)
        if cluster_size == 0:
            continue

        label_counts = Counter(cluster_labels.tolist())
        gini = 1 - sum((count / cluster_size) ** 2 for count in label_counts.values())
        gini_total += (cluster_size / total_samples) * gini

    return gini_total


In [4]:
# Fetch Fashion-MNIST (version 1) from OpenML.
fashion_mnist = fetch_openml('Fashion-MNIST', version=1)

# Cast features to float32 and divide by 255.0
# (presumably maps 8-bit pixel intensities into [0, 1] -- confirm against the dataset).
X = fashion_mnist.data.astype('float32') / 255.0
y = fashion_mnist.target.astype('int')

# Hold out 20% of the rows as a test set; stratify=y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training Data Shape: {X_train.shape}, Training Labels Shape: {y_train.shape}")
print(f"Test Data Shape: {X_test.shape}, Test Labels Shape: {y_test.shape}")


Training Data Shape: (56000, 784), Training Labels Shape: (56000,)
Test Data Shape: (14000, 784), Test Labels Shape: (14000,)


In [5]:
# KMeansCustom indexes X with NumPy-style expressions (X[:, np.newaxis], X[mask]),
# so hand it the underlying ndarray when the split produced a DataFrame.
if isinstance(X_train, pd.DataFrame):
    X_train = X_train.values

# Cluster counts to compare.
K_values = [5, 10, 20]
evaluate_objective(X_train, y_train, K_values)


Evaluating for K = 5
Converged after 40 iterations.
KMeans Objective Function Value: 2199268.1776

Evaluating for K = 10
Converged after 63 iterations.
KMeans Objective Function Value: 1793730.7334

Evaluating for K = 20
KMeans Objective Function Value: 1503278.0695


In [15]:

# Purity and Gini index of the custom k-means clustering for each K.
# The label array is the same for every K, so convert it once.
y_train_np = y_train.to_numpy()

for K in K_values:
    print(f"\nEvaluating Purity and Gini Index for K = {K}")

    kmeans = KMeansCustom(K)
    cluster_assignments, centroids = kmeans.fit(X_train)

    purity = compute_purity(y_train_np, cluster_assignments, K)
    gini_index = compute_gini_index(y_train_np, cluster_assignments, K)

    print(f"Purity Score: {purity:.4f}")
    print(f"Gini Index: {gini_index:.4f}")


Evaluating Purity and Gini Index for K = 5
Converged after 38 iterations.
Purity Score: 0.4111
Gini Index: 0.7060

Evaluating Purity and Gini Index for K = 10
Converged after 61 iterations.
Purity Score: 0.5749
Gini Index: 0.5577

Evaluating Purity and Gini Index for K = 20
Converged after 69 iterations.
Purity Score: 0.6573
Gini Index: 0.4449


In [9]:
from sklearn.cluster import KMeans

# Same purity / Gini evaluation as for the custom implementation, but using
# scikit-learn's KMeans as a reference.
# The label array is the same for every K, so convert it once.
y_train_np = y_train.to_numpy()

for K in K_values:
    print(f"\nEvaluating Purity and Gini Index for K = {K}")

    kmeans = KMeans(
        n_clusters=K,
        random_state=42,
        n_init=10,
        max_iter=300,
    )
    cluster_assignments = kmeans.fit_predict(X_train)

    purity = compute_purity(y_train_np, cluster_assignments, K)
    gini_index = compute_gini_index(y_train_np, cluster_assignments, K)

    print(f"Purity Score: {purity:.4f}")
    print(f"Gini Index: {gini_index:.4f}")


Evaluating Purity and Gini Index for K = 5
Purity Score: 0.4111
Gini Index: 0.7060

Evaluating Purity and Gini Index for K = 10
Purity Score: 0.5543
Gini Index: 0.5665

Evaluating Purity and Gini Index for K = 20
Purity Score: 0.6518
Gini Index: 0.4493
