# Imports

In [None]:
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans 
# Put the parent of the current working directory at the front of the import
# path so the project-local packages imported below can be resolved.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from utils.data import load_breast_cancer_kagglehub, standardize_fit_transform
from utils.internal_metrics import silhouette_score , calinski_harabasz_index , davies_bouldin_index ,wcss
from utils.external_metrics import adjusted_rand_index , purity_score , normalized_mutual_info
from GMM import GMM
from Autoencoders.autoencoder import Autoencoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

# Dataset

In [None]:
# Load the data set: feature matrix, labels and feature names.
x_ae, y_ae, feature_names = load_breast_cancer_kagglehub()

# Standardise the features; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(x_ae)
x_scaled = scaler.transform(x_ae)

# Autoencoder Model

In [None]:
# Sweep over candidate bottleneck widths, train one autoencoder per width and
# keep the model whose final-epoch training loss is lowest.
bottleneck_size = [2, 5, 10, 15, 20]
results_ae = []   # one (bottleneck, model output, final loss) tuple per candidate
all_losses = []   # final-epoch loss per candidate, in the order of bottleneck_size

# input dimension
input_dim = x_scaled.shape[1]

# hidden layers
hidden_layers = [64, 32]

best_loss = float('inf')
best_k = None
best_model = None
loss = None   # loss history of the selected model (plotted in a later cell)
Z = None      # output of the selected model on x_scaled (clustered in a later cell)

for b in bottleneck_size:
    model_ae = Autoencoder(input_dim, hidden_layers, bottleneck=b, activation='relu', lr=0.01, lamda=0.1)
    losses = model_ae.train(x_scaled, epochs=30, batch_size=16)
    final_loss = losses[-1]  # final epoch loss

    # Store the model output for this candidate
    x_reconstructed = model_ae.predict(x_scaled)
    results_ae.append((b, x_reconstructed, final_loss))
    all_losses.append(final_loss)

    if final_loss < best_loss:
        # Keep the trained winner itself rather than retraining a fresh model
        # afterwards: a retrained model is not guaranteed to reach the same
        # loss, so the reported best_loss would no longer describe the model
        # that produces Z and the plotted loss curve.
        best_loss = final_loss
        best_k = b
        best_model = model_ae
        loss = losses
        Z = x_reconstructed

if best_model is None:
    # Happens only when no final loss compared below +inf (e.g. all NaN).
    raise RuntimeError(
        "No autoencoder produced a finite final loss; cannot select a bottleneck size."
    )

print("="*10)
print(f"Best bottleneck size: {best_k} with loss {best_loss:.4f}")


# GMM Model



In [None]:
# Fit a two-component GMM on Z for every covariance structure and collect
# information criteria, internal metrics and external metrics per structure.
cov_types = ['full', 'tied', 'diag', 'spherical']
results = {}

for cov in cov_types:
    # Test GMM with k=2 clusters
    gmm = GMM(n_components=2, covariance_type=cov, max_iter=200, random_state=42)
    gmm.fit(Z)

    # Model-selection quantities read straight from the fitted model
    bic = gmm.bic(Z)
    aic = gmm.aic(Z)
    loglik = gmm.log_likelihoods_

    # Predict
    y_pred = gmm.predict(Z)

    # Internal metrics (computed from Z and the predicted labels only)
    sil = silhouette_score(Z, y_pred)
    dbi = davies_bouldin_index(Z, y_pred)
    ch = calinski_harabasz_index(Z, y_pred)
    wcss_val = wcss(Z, y_pred, gmm.means_)

    # External metrics (predicted labels compared against y_ae)
    ari = adjusted_rand_index(y_ae, y_pred)
    nmi = normalized_mutual_info(y_ae, y_pred)
    pur = purity_score(y_ae, y_pred)

    # Store everything; the fitted model is kept so later cells can reuse it
    results[cov] = {
        'BIC': bic,
        'AIC': aic,
        'loglik': loglik,
        'silhouette': sil,
        'davies_bouldin': dbi,
        'calinski_harabasz': ch,
        'wcss': wcss_val,
        'ARI': ari,
        'NMI': nmi,
        'Purity': pur,
        'model': gmm,
    }

In [None]:
# Report every stored metric for each covariance type; the fitted model
# object itself is skipped.
separator = "=" * 30
for cov in cov_types:
    print(f"Covariance Type: {cov}")
    scores = results[cov]
    for name in scores:
        if name == 'model':
            continue
        print(f"  {name}: {scores[name]}")

    print(separator)

# Training Curve Loss

In [None]:
# Training-loss curve of the selected autoencoder, one point per entry of `loss`.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(loss, label=f'Bottleneck Size: {best_k}')
ax.set_title('Autoencoder Training Loss Curve')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
ax.grid()
plt.show()

# Cluster Visualization

In [None]:
# Scatter the first two columns of Z, coloured by GMM cluster assignment.
# The model is selected explicitly instead of relying on whatever `y_pred`
# was left over from the last iteration of the fitting loop. The last entry
# of cov_types is the same model that leftover variable referred to; change
# plot_cov to visualise a different covariance type.
plot_cov = cov_types[-1]
cluster_labels = results[plot_cov]['model'].predict(Z)

plt.figure(figsize=(6, 5))
plt.scatter(Z[:, 0], Z[:, 1], c=cluster_labels, cmap="viridis", s=30)
# The axes are columns of the autoencoder output, not the diagnosis classes.
plt.xlabel("Autoencoder output (column 0)")
plt.ylabel("Autoencoder output (column 1)")
plt.title(f"GMM Clusters After Autoencoder (covariance: {plot_cov})")
plt.grid(True)
plt.show()