# Imports 

In [None]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans 
# Put the parent of the current working directory at the front of the import
# path so the project-local packages imported further down can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from utils.data import load_breast_cancer_kagglehub, standardize_fit_transform
from utils.internal_metrics import silhouette_score , calinski_harabasz_index , davies_bouldin_index ,wcss
from utils.external_metrics import adjusted_rand_index , purity_score , normalized_mutual_info
from KMeans import KMeans
from Autoencoders.autoencoder import Autoencoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

# Dataset

In [None]:
# Load the feature matrix, the reference labels and the feature names through
# the project loader.
x_ae, y_ae, feature_names = load_breast_cancer_kagglehub()

# Standardize the features with scikit-learn's StandardScaler: fit it on the
# full feature matrix, then transform that same matrix.
scaler = StandardScaler().fit(x_ae)
x_scaled = scaler.transform(x_ae)

# Autoencoder Model

In [None]:
# Sweep over candidate bottleneck widths: train one autoencoder per width and
# keep the width whose last recorded training loss is lowest.
bottleneck_size = [2, 5, 10, 15, 20]
results_ae = []  # one {"bottleneck", "final_loss"} record per candidate
all_losses = []  # loss history returned by train() for each candidate, in sweep order

# input dimension (number of columns of the scaled feature matrix)
input_dim = x_scaled.shape[1]

# hidden layer sizes passed to every candidate autoencoder
hidden_layers = [64, 32]

best_loss = float('inf')
best_k = None

for b in bottleneck_size:
    model_ae = Autoencoder(input_dim, hidden_layers, bottleneck=b, activation='relu', lr=0.01, lamda=0.1)
    losses = model_ae.train(x_scaled, epochs=30, batch_size=16)

    final_loss = losses[-1]  # last entry of the history returned by train()

    # Keep the full history and a summary record so the candidates can be
    # compared or plotted afterwards; these lists were previously left empty.
    all_losses.append(losses)
    results_ae.append({"bottleneck": b, "final_loss": final_loss})

    # Strict '<' keeps the earliest candidate on ties.
    if final_loss < best_loss:
        best_loss = final_loss
        best_k = b

# Only best_k / best_loss identify the winner; model_ae is left bound to the
# last candidate trained, not necessarily the best one.
print("="*10)
print(f"Best bottleneck size: {best_k} with loss {best_loss:.4f}")
 

# K-Means Model

In [None]:
# Cluster the autoencoder output into two groups.
# The file imports the name KMeans twice (from sklearn.cluster and from the
# local KMeans module); the later, local import is the one bound here.
# NOTE(review): init="kmeans++" presumably matches the local class's option
# name (scikit-learn spells it "k-means++") — confirm against the local KMeans.
kmeans_model = KMeans(n_clusters=2, init="kmeans++", random_state=42)

# Train a new autoencoder with the selected bottleneck width. This is a fresh
# model trained for 50 epochs, not the instance trained (for 30 epochs) during
# the bottleneck sweep.
best_ae_model = Autoencoder(input_dim, hidden_layers, bottleneck=best_k,activation='relu', lr=0.01, lamda=0.1)  
loss = best_ae_model.train(x_scaled, epochs=50, batch_size=16)

# NOTE(review): predict() is assumed to return the reconstruction of its input
# (same feature space as x_scaled), as the variable name suggests — confirm.
x_reconstructed = best_ae_model.predict(x_scaled)

# Fit K-Means on the autoencoder output and keep one cluster id per sample.
labels_ae = kmeans_model.fit_predict(x_reconstructed)



# Internal Analysis

In [None]:
# Internal validity metrics: each one is computed from the clustered data and
# the cluster assignments only (no reference labels involved).
ss_ae = silhouette_score(x_reconstructed, labels_ae)
dbi_ae = davies_bouldin_index(x_reconstructed, labels_ae)
ch_ae = calinski_harabasz_index(x_reconstructed, labels_ae)

# Report the three scores, one per line, in the order they were computed.
for report_line in (
    f"Silhouette Score after AE reconstruction with bottleneck {best_k}: {ss_ae:.4f}",
    f"Davies-Bouldin Index after AE reconstruction with bottleneck {best_k}: {dbi_ae:.4f}",
    f"Calinski-Harabasz Index after AE reconstruction with bottleneck {best_k}: {ch_ae:.4f}",
):
    print(report_line)


# External Analysis

In [None]:
# External validity metrics: compare the cluster assignments against the
# reference labels loaded with the dataset.
ari_ae = adjusted_rand_index(y_ae, labels_ae)
purity_ae = purity_score(y_ae, labels_ae)
nmi_ae = normalized_mutual_info(y_ae, labels_ae)

external_report = (
    f"Adjusted Rand Index after AE reconstruction with bottleneck {best_k}: {ari_ae:.4f}",
    f"Purity Score after AE reconstruction with bottleneck {best_k}: {purity_ae:.4f}",
    f"Normalized Mutual Information after AE reconstruction with bottleneck {best_k}: {nmi_ae:.4f}",
)
print("\n".join(external_report))

# Cross-tabulate reference labels (rows) against cluster ids (columns).
confusion_matrix_ae = confusion_matrix(y_ae, labels_ae)
print("Confusion Matrix after AE reconstruction with bottleneck {}: \n{}".format(best_k, confusion_matrix_ae))

# Draw the matrix as an annotated heatmap on an explicit Axes.
# NOTE(review): the 'M'/'B' tick labels assume a particular ordering of the
# reference labels and that cluster ids line up with the classes; K-Means
# cluster ids are arbitrary — confirm before reading the plot as M-vs-B.
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(
    confusion_matrix_ae,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=['M', 'B'],
    yticklabels=['M', 'B'],
    ax=ax_cm,
)
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("True")
plt.show()

# Training Loss Curve

In [None]:

# Plot the loss history returned by training the autoencoder that uses the
# selected bottleneck width.
fig_loss, ax_loss = plt.subplots(figsize=(8, 5))
ax_loss.plot(loss, label=f'Bottleneck Size: {best_k}')
ax_loss.set_title('Autoencoder Training Loss Curve')
ax_loss.set_xlabel('Epochs')
ax_loss.set_ylabel('Loss')
ax_loss.legend()
ax_loss.grid()
plt.show()

# Cluster Visualization

In [None]:
# Scatter the first two columns of the autoencoder output, colored by the
# K-Means cluster id assigned to each sample.
plt.figure(figsize=(6, 5))
plt.scatter(x_reconstructed[:, 0], x_reconstructed[:, 1], c=labels_ae, cmap="viridis", s=30)
# The axes are columns 0 and 1 of x_reconstructed, so label them as such; the
# previous "M"/"B" labels named the diagnosis classes, which are not what the
# axes show.
plt.xlabel("Reconstructed feature 0")
plt.ylabel("Reconstructed feature 1")
plt.title("KMeans Clusters After Autoencoder")
plt.grid(True)
plt.show()