<a href="https://colab.research.google.com/github/haydenkirkeide/CAP-5771-Assignment-2-Files/blob/main/run2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
import numpy as np
from scipy.spatial.distance import euclidean

import warnings
warnings.filterwarnings('ignore')

# load in the data: np.loadtxt parses the text file 'seeds.txt' (path relative
# to the current working directory) into a NumPy array of floats
# NOTE(review): the clustering code in this file uses every column of df as a
# feature - confirm the file does not carry a class-label column
df = np.loadtxt('seeds.txt')

# initial centroids: k randomly chosen data points
def init_cents(df, k):
    """Return ``k`` rows of ``df`` picked at random as starting centroids.

    Row indices are drawn without replacement from NumPy's global random
    state, so no row index is selected twice. The result is a copy, so
    changing it later does not modify ``df``.

    Args:
        df: 2-D array with one data point per row.
        k: number of centroids to draw.

    Returns:
        Array with ``k`` rows taken from ``df``.
    """
    n_points = df.shape[0]
    chosen_rows = np.random.choice(n_points, k, replace=False)
    starting_centroids = df[chosen_rows]
    return starting_centroids.copy()

# cluster assignment: nearest centroid by euclidean distance
def assign_clusters(df, centroids):
    """Label every point with the index of its closest centroid.

    Distances are computed for all point/centroid pairs in one broadcasted
    NumPy expression instead of a Python-level double loop.

    Args:
        df: 2-D array-like with one data point per row.
        centroids: 2-D array-like with one centroid per row, with the same
            number of columns as ``df``.

    Returns:
        1-D integer array with one entry per row of ``df`` holding the index
        of the nearest centroid. On a tie the lowest centroid index wins.

    Raises:
        ValueError: if ``centroids`` is empty.
    """
    if len(centroids) == 0:
        raise ValueError("at least one centroid is required")

    points = np.asarray(df, dtype=float)
    centers = np.asarray(centroids, dtype=float)

    # (n_points, 1, n_features) - (1, n_centroids, n_features) broadcasts to
    # one difference vector per point/centroid pair.
    diffs = points[:, np.newaxis, :] - centers[np.newaxis, :, :]

    # Squared distances order the centroids exactly as true distances do,
    # so the square root is not needed to find the nearest one.
    sq_distances = (diffs ** 2).sum(axis=2)
    return np.argmin(sq_distances, axis=1)

# update step: move each centroid to the mean of its members
def centroid_recalc(df, labels, k):
    """Compute a new centroid for each of the ``k`` clusters.

    A cluster's centroid is the column-wise mean of the rows of ``df`` whose
    label equals the cluster index. A cluster with no members is re-seeded
    with a row of ``df`` chosen at random from NumPy's global random state.

    Args:
        df: 2-D array with one data point per row.
        labels: array of cluster indices, one per row of ``df``.
        k: number of clusters.

    Returns:
        Array of shape ``(k, df.shape[1])`` holding the new centroids.
    """
    n_features = df.shape[1]
    updated = np.zeros((k, n_features))
    for cluster_id in range(k):
        members = df[labels == cluster_id]
        if len(members) == 0:
            # nothing to average: restart this centroid on a random data point
            fallback_row = np.random.choice(df.shape[0])
            updated[cluster_id] = df[fallback_row]
            continue
        updated[cluster_id] = members.mean(axis=0)
    return updated

# clustering quality: sum of squared errors (SSE)
def calc_sse(df, labels, centroids):
    """Return the sum of squared distances from points to their centroids.

    For every centroid index, the rows of ``df`` carrying that label are
    collected and the squared euclidean distance of each row to the centroid
    is added to the running total.

    Args:
        df: 2-D array with one data point per row.
        labels: array of cluster indices, one per row of ``df``.
        centroids: sequence of centroid vectors, indexed by cluster.

    Returns:
        The accumulated SSE; ``0`` when no row carries any centroid's label.
    """
    total = 0
    for cluster_id in range(len(centroids)):
        center = centroids[cluster_id]
        members = df[labels == cluster_id]
        for row in members:
            distance = euclidean(row, center)
            total += distance ** 2
    return total

# k means
def kmeans(df, k, max_iterations=100, tolerance=0.001):
    """Cluster the rows of ``df`` into ``k`` groups with k-means.

    Starting from randomly chosen centroids, each iteration assigns every
    point to a cluster, recomputes the centroids from that assignment, and
    measures the SSE of the assignment against the new centroids. The loop
    stops once the SSE changes by less than ``tolerance`` between two
    consecutive iterations, or after ``max_iterations`` iterations.

    Args:
        df: 2-D array with one data point per row.
        k: number of clusters.
        max_iterations: maximum number of iterations; must be at least 1.
        tolerance: SSE change below which the run counts as converged.

    Returns:
        Tuple ``(centroids, labels, sse, iterations)``. ``labels`` is the
        assignment made in the last iteration, i.e. against the centroids
        from before that iteration's update. ``iterations`` is the number of
        iterations performed (``max_iterations`` if convergence was not
        reached).

    Raises:
        ValueError: if ``max_iterations`` is less than 1.
    """
    if max_iterations < 1:
        # With zero passes there would be no labels or SSE to return; fail
        # with a clear message instead of an UnboundLocalError at the end.
        raise ValueError("max_iterations must be at least 1")

    centroids = init_cents(df, k)
    prev_sse = float('inf')

    for iteration in range(max_iterations):
        labels = assign_clusters(df, centroids)
        centroids = centroid_recalc(df, labels, k)
        sse = calc_sse(df, labels, centroids)

        # the first comparison is against infinity, so it can never stop here
        if abs(prev_sse - sse) < tolerance:
            return centroids, labels, sse, iteration + 1
        prev_sse = sse

    return centroids, labels, sse, max_iterations

# experiment setup: cluster counts to compare and random restarts per count
k_vals = [3, 5, 7]
n_runs = 10

# for every k, run k-means n_runs times and keep the mean of the final SSEs
results = {}
for k in k_vals:
    sse_values = []
    for run in range(n_runs):
        centroids, labels, sse, iterations = kmeans(df, k)
        sse_values += [sse]
    results[k] = avg_sse = np.mean(sse_values)

# results
# the heading takes the run count from n_runs so it cannot drift from the
# experiment setup if the number of restarts is changed
print(f"Average SSE for each k (over {n_runs} random initializations): \n--------------------------------------------------------")
for k, avg_sse in results.items():
    print(f"k = {k}: average SSE = {avg_sse}")

Average SSE for each k (over 10 random initializations): 
--------------------------------------------------------
k = 3: average SSE = 588.0503018858932
k = 5: average SSE = 407.8457492422916
k = 7: average SSE = 303.8364768349491
