In [3]:
import numpy as np
from sklearn import datasets
import os
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal

def k_means(X, K, max_iters=100):

    # Randomly initialize centroids
    centroids = X[np.random.choice(len(X), K, replace=False)]
    for i in range(max_iters):
        # Assignment step
        distances = np.linalg.norm(X[:, np.newaxis] - centroids, axis=2)
        labels = np.argmin(distances, axis=1)
        
        # Update step
        new_centroids = np.array([X[labels == k].mean(axis=0) for k in range(K)])
        
        # Check for convergence
        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids
    return centroids, labels, i+1

def run_convergence_kmeans():
    n_samples_per_blob = 100
    n_features_list = [2, 3, 4, 5, 6, 7, 8]
    n_trials = 10
    cluster_std = 0.5
    K = 3  

    convergence_times = {n_features: [] for n_features in n_features_list}

    for n_features in n_features_list:

        centers = n_features 
        for trial in range(n_trials):
            # Generate data
            X, _ = datasets.make_blobs(n_samples=10000, n_features=trial+1, centers=3, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=None, return_centers=False)

            # Run K-means
            _, _, iterations = k_means(X, K)
            
            # Record convergence time
            convergence_times[n_features].append(iterations)
    
    return convergence_times
def analyze_results(convergence_times):
    for n_features, times in convergence_times.items():
        mean_iterations = np.mean(times)
        std_iterations = np.std(times)
        print(f"Dimensionality: {n_features}D")
        print(f"Average Iterations to Converge: {mean_iterations:.2f}")
        print(f"Standard Deviation: {std_iterations:.2f}\n")

print("K-means with random initialization:")
convergence_times = run_convergence_kmeans()
analyze_results(convergence_times)


K-means with random initialization:
Dimensionality: 2D
Average Iterations to Converge: 21.20
Standard Deviation: 20.77

Dimensionality: 3D
Average Iterations to Converge: 21.80
Standard Deviation: 17.85

Dimensionality: 4D
Average Iterations to Converge: 5.80
Standard Deviation: 8.21

Dimensionality: 5D
Average Iterations to Converge: 8.30
Standard Deviation: 12.58

Dimensionality: 6D
Average Iterations to Converge: 11.30
Standard Deviation: 14.75

Dimensionality: 7D
Average Iterations to Converge: 11.40
Standard Deviation: 16.18

Dimensionality: 8D
Average Iterations to Converge: 25.40
Standard Deviation: 29.19



  new_centroids = np.array([X[labels == k].mean(axis=0) for k in range(K)])
  ret = um.true_divide(


In [4]:
def kmeans_pp(X, n_clusters=3, max_iter=300, tol=1e-4, random_state=42):
    # Set random seed
    np.random.seed(random_state)
    n_samples, _ = X.shape
    centroids = []
    first_centroid_idx = np.random.randint(0, n_samples)
    centroids.append(X[first_centroid_idx])
    for _ in range(1, n_clusters):
        distances = np.array([
            min(np.linalg.norm(x - c) ** 2 for c in centroids) for x in X
        ])
        probabilities = distances / distances.sum()
        cumulative_probabilities = np.cumsum(probabilities)
        r = np.random.rand()
        next_centroid_idx = np.where(cumulative_probabilities >= r)[0][0]
        centroids.append(X[next_centroid_idx])
    centroids = np.array(centroids)
    for iteration in range(max_iter):
        clusters = np.argmin(
            np.linalg.norm(X[:, np.newaxis] - centroids, axis=2), axis=1
        )
        new_centroids = np.array([
            X[clusters == k].mean(axis=0) if len(X[clusters == k]) > 0 else centroids[k]
            for k in range(n_clusters)
        ])
        if np.all(np.linalg.norm(new_centroids - centroids, axis=1) < tol):
            break
        centroids = new_centroids
    return centroids, clusters, iteration + 1
def run_experiments_kmeanspp():
    n_samples_per_blob = 100
    n_features_list = [2, 3, 4, 5, 6, 7, 8]
    n_trials = 10
    K = 3  # Number of clusters

    convergence_times = {n_features: [] for n_features in n_features_list}

    for n_features in n_features_list:
        for trial in range(n_trials):
            # Generate data
            X, _ = datasets.make_blobs(
                n_samples=10000,
                n_features=n_features,
                centers=K,
                cluster_std=1.0,
                center_box=(-10.0, 10.0),
                shuffle=True,
                random_state=None,
                return_centers=False
            )

            # Run KMeans++ algorithm
            _, _, iterations = kmeans_pp(X, n_clusters=K)

            # Record convergence time
            convergence_times[n_features].append(iterations)

    return convergence_times

print("K-means++:")
convergence_times_pp = run_experiments_kmeanspp()
analyze_results(convergence_times_pp)


K-means++:
Dimensionality: 2D
Average Iterations to Converge: 3.80
Standard Deviation: 0.60

Dimensionality: 3D
Average Iterations to Converge: 16.00
Standard Deviation: 0.00

Dimensionality: 4D
Average Iterations to Converge: 2.00
Standard Deviation: 0.00

Dimensionality: 5D
Average Iterations to Converge: 2.00
Standard Deviation: 0.00

Dimensionality: 6D
Average Iterations to Converge: 2.00
Standard Deviation: 0.00

Dimensionality: 7D
Average Iterations to Converge: 2.00
Standard Deviation: 0.00

Dimensionality: 8D
Average Iterations to Converge: 2.00
Standard Deviation: 0.00



In [6]:
def gmm_clustering(X, K, max_iter=100, tol=1e-6):
    N, D = X.shape
    pi = np.ones(K) / K
    indices = np.random.choice(N, K, replace=False)
    mu = X[indices]
    sigma = np.array([np.eye(D) for _ in range(K)]) 
    gamma = np.zeros((N, K))
    log_likelihoods = []
    for iteration in range(max_iter):     
        for k in range(K):
            gamma[:, k] = pi[k] * multivariate_normal.pdf(X, mean=mu[k], cov=sigma[k])
        gamma /= gamma.sum(axis=1, keepdims=True)
        N_k = gamma.sum(axis=0)
        pi = N_k / N
        mu = (gamma.T @ X) / N_k[:, np.newaxis]
        for k in range(K):
            X_centered = X - mu[k]
            gamma_k = gamma[:, k][:, np.newaxis]
            sigma_k = (gamma_k * X_centered).T @ X_centered / N_k[k]
            sigma[k] = sigma_k + 1e-6 * np.eye(D)
        log_likelihood = np.sum(
            np.log(
                np.sum(
                    [pi[k] * multivariate_normal.pdf(X, mu[k], sigma[k]) for k in range(K)],
                    axis=0
                )
            )
        )
        log_likelihoods.append(log_likelihood)
        # Check for convergence
        if iteration > 0 and abs(log_likelihood - log_likelihoods[-2]) < tol:
            break
    # Assign labels based on the highest responsibility
    labels = np.argmax(gamma, axis=1)
    return labels, iteration + 1  # Return number of iteration

def run_experiments_gmm():
    n_samples_per_blob = 100
    n_features_list = [2, 3, 4, 5, 6, 7, 8]
    n_trials = 10
    K = 3  # Number of clusters

    convergence_times = {n_features: [] for n_features in n_features_list}

    for n_features in n_features_list:
        for trial in range(n_trials):
            # Generate data
            X, _ = datasets.make_blobs(
                n_samples=10000,
                n_features=n_features,
                centers=K,
                cluster_std=1.0,
                center_box=(-10.0, 10.0),
                shuffle=True,
                random_state=None,
                return_centers=False
            )

            # Run GMM
            _, iterations = gmm_clustering(X, K)

            # Record convergence time
            convergence_times[n_features].append(iterations)

    return convergence_times
print("GMM clustering:")
convergence_times_gmm = run_experiments_gmm()
analyze_results(convergence_times_gmm)

GMM clustering:
Dimensionality: 2D
Average Iterations to Converge: 30.80
Standard Deviation: 28.96

Dimensionality: 3D
Average Iterations to Converge: 40.90
Standard Deviation: 41.30

Dimensionality: 4D
Average Iterations to Converge: 25.20
Standard Deviation: 37.59

Dimensionality: 5D
Average Iterations to Converge: 35.90
Standard Deviation: 42.13

Dimensionality: 6D
Average Iterations to Converge: 40.40
Standard Deviation: 43.42

Dimensionality: 7D
Average Iterations to Converge: 46.10
Standard Deviation: 44.60

Dimensionality: 8D
Average Iterations to Converge: 42.20
Standard Deviation: 47.23

