In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

In [29]:
class KMeans:
    def __init__(self,K,max_iter = 100,diff = 1e-4):
        self.k = K
        self.max_iter = max_iter
        self.diff = diff
        self.centroids = None

    def centroids_init(self,X):
        self.centroids = X[np.random.choice(X.shape[0],self.k,replace=False)]
    
    def compute_distances(self, X, centroids):
        return np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)



    def assign_cluster(self,X):
        distances = self.compute_distances(X,self.centroids)
        cluster = np.argmin(distances,axis=1)
        return cluster
    
    def update_centroids(self,X,labels):
        new_centroids = np.array([X[labels == i].mean(axis=0) for i in range(self.k)])
        return new_centroids

    def compute_objective(self, X, labels):
        distances = np.linalg.norm(X - self.centroids[labels], axis=1)
        return np.sum(distances**2)
    
    def fit(self, X):
        self.centroids_init(X)
        for i in range(self.max_iter):
            cluster_assignments = self.assign_cluster(X)
            
            new_centroids = self.update_centroids(X, cluster_assignments)
            
            if np.all(np.abs(new_centroids - self.centroids) < self.diff):
                print(f"Converged after {i+1} iterations.")
                break
            
            self.centroids = new_centroids

        return cluster_assignments, self.centroids

    
    def predict(self,X):
        distances = self.compute_objective(X,self.centroids)
        labels = np.argmin(distances,axis=1)
        return labels


In [17]:
train_data = pd.read_csv('../Dataset/MNIST_CSV/mnist_train.csv')
y_train_full = train_data.iloc[:, 0].values
X_train_full = train_data.iloc[:, 1:].values
 
test_data = pd.read_csv('../Dataset/MNIST_CSV/mnist_test.csv')
y_test = test_data.iloc[:, 0].values
X_test = test_data.iloc[:, 1:].values
 
X_train_full = X_train_full / 255.0
X_test = X_test / 255.0
 
 
X_combined = np.vstack((X_train_full, X_test))
y_combined = np.hstack((y_train_full, y_test))
 
 
X_train, X_remaining, y_train, y_remaining = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42, stratify=y_combined
)
 
X_val, X_test, y_val, y_test = train_test_split(
    X_remaining, y_remaining, test_size=0.5, random_state=42, stratify=y_remaining
)
 
 
print(f"Training Data Shape: {X_train.shape}, Training Labels Shape: {y_train.shape}")
print(f"Validation Data Shape: {X_val.shape}, Validation Labels Shape: {y_val.shape}")
print(f"Test Data Shape: {X_test.shape}, Test Labels Shape: {y_test.shape}")

Training Data Shape: (55998, 784), Training Labels Shape: (55998,)
Validation Data Shape: (7000, 784), Validation Labels Shape: (7000,)
Test Data Shape: (7000, 784), Test Labels Shape: (7000,)


In [30]:
kmeans = KMeans(K=10)

cluster_assignments, centroids = kmeans.fit(X_train)

for i in range(10):
    plt.subplot(2, 5, i+1)
    plt.imshow(centroids[i].reshape(28, 28), cmap='gray')
    plt.title(f"Centroid {i}")
    plt.axis('off')

plt.show()


KeyboardInterrupt: 

In [6]:
from sklearn.datasets import load_files
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

newsgroups_train = load_files('../Dataset/20news-bydate/20news-bydate-train', encoding='ISO-8859-1')
newsgroups_test = load_files('../Dataset/20news-bydate/20news-bydate-test', encoding='ISO-8859-1')

X_train_raw, y_train = newsgroups_train.data, newsgroups_train.target
X_test_raw, y_test = newsgroups_test.data, newsgroups_test.target

print('Training Dataset Size:', len(X_train_raw))
print('Testing Dataset Size:', len(X_test_raw))
print('Number of Classes:', len(newsgroups_train.target_names))

vectorizer = TfidfVectorizer(max_features=5000) 
X_train = vectorizer.fit_transform(X_train_raw).toarray()
X_test = vectorizer.transform(X_test_raw).toarray()

print(f"Training Data Shape: {X_train.shape}, Training Labels Shape: {y_train.shape}")
print(f"Test Data Shape: {X_test.shape}, Test Labels Shape: {y_test.shape}")


Training Dataset Size: 11314
Testing Dataset Size: 7532
Number of Classes: 20
Training Data Shape: (11314, 5000), Training Labels Shape: (11314,)
Test Data Shape: (7532, 5000), Test Labels Shape: (7532,)


In [11]:
kmeans = KMeans(20,100,1e-5)

kmeans.fit(X_train)

print(f"Final Objective Function Value: {kmeans.objective_value}")
print(f"Final Centroids Shape: {kmeans.centroids.shape}")
print(f"Cluster Assignments (First 10 Data Points): {kmeans.final_labels[:10]}")


AttributeError: 'KMeans' object has no attribute 'compute_distances'