# Imports

In [None]:
import json
import numpy as np
import pandas as pd
import struct
import random
import matplotlib.pyplot as plt
import umap
import seaborn as sns
import csv
import hdbscan
import networkx as nx
import pickle
from sklearn.metrics import silhouette_score, silhouette_samples, confusion_matrix, accuracy_score, davies_bouldin_score, adjusted_rand_score
from sklearn.metrics import pairwise_distances as sklearn_pairwise_distances
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy.stats import mode, linregress, norm
from array import array
from os.path import join
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding, MDS,trustworthiness
from sklearn.utils import resample
from scipy.stats import multivariate_normal
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# MNIST Data Loader Class

In [28]:
# MNIST Data Loader Class

class MnistDataloader(object):
    """Load IDX-format image and label files (the MNIST file format).

    The constructor only stores the four file paths; files are read when
    ``load_data`` (or ``read_images_labels``) is called.
    """

    def __init__(self, training_images_filepath, training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Read one image file and its matching label file.

        Parameters:
        - images_filepath: path to an IDX image file (magic number 2051)
        - labels_filepath: path to an IDX label file (magic number 2049)

        Returns:
        - images: list with one ``(rows, cols)`` uint8 array per image, where
          ``rows`` and ``cols`` come from the image-file header
        - labels: 1-D uint8 NumPy array of the label bytes

        Raises:
        - ValueError: if a magic number does not match, or the image file
          holds fewer pixel bytes than its header announces
        """
        with open(labels_filepath, 'rb') as file:
            # Header: big-endian magic number and item count
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            # .copy() makes the array writable (frombuffer returns a read-only view)
            labels = np.frombuffer(file.read(), dtype=np.uint8).copy()

        with open(images_filepath, 'rb') as file:
            # Header: big-endian magic number, image count, rows, cols
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            pixels = np.frombuffer(file.read(), dtype=np.uint8)

        expected = size * rows * cols
        if pixels.size < expected:
            raise ValueError('Image file is truncated, expected {} pixel bytes, got {}'.format(expected, pixels.size))

        # Use the dimensions from the header instead of assuming 28x28, so
        # any IDX image file loads correctly; trailing bytes are ignored.
        images = list(pixels[:expected].reshape(size, rows, cols).copy())

        return images, labels

    def load_data(self):
        """Return ``(x_train, y_train), (x_test, y_test)`` read from the stored paths."""
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)

        return (x_train, y_train), (x_test, y_test)

## Verify Reading Dataset via MNISTDataloader class

In [None]:
# Verify Reading Dataset via MnistDataloader class
%matplotlib inline

# Locate the four MNIST IDX files: each file sits in a folder that carries
# the same name as the file itself.
input_path = 'C:/Users/Lorenzo/OneDrive/Documents/DTU/Python/2024 Fall/MSc Thesis'
(training_images_filepath,
 training_labels_filepath,
 test_images_filepath,
 test_labels_filepath) = (
    join(input_path, f'{stem}/{stem}')
    for stem in ('train-images-idx3-ubyte', 'train-labels-idx1-ubyte',
                 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
)

# Helper function to show a list of images with their relating titles
def show_images(images, title_texts):
    """Draw the images in a 5-column grid on one new 30x20 figure.

    Parameters:
    - images: sequence of images accepted by ``plt.imshow``
    - title_texts: titles paired positionally with ``images``; an empty
      string leaves that subplot untitled

    Images and titles are paired with ``zip``, so surplus entries in the
    longer sequence are ignored.
    """
    cols = 5
    # Ceiling division gives exactly the rows needed (at least one); the
    # previous int(n / cols) + 1 left an empty row when n was a multiple of 5.
    rows = max(1, -(-len(images) // cols))
    plt.figure(figsize=(30, 20))
    for index, (image, title_text) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, index)
        plt.imshow(image, cmap=plt.cm.gray)
        if title_text != '':
            plt.title(title_text, fontsize=15)

# Load MNIST dataset
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

# Show some random training and test images
images_2_show = []
titles_2_show = []
# random.randrange(len(...)) draws a valid 0-based index. The previous
# random.randint(1, 60000) is inclusive at both ends, so it could return
# 60000 (IndexError) and could never return index 0.
for i in range(0, 10):
    r = random.randrange(len(x_train))
    images_2_show.append(x_train[r])
    titles_2_show.append('training image [' + str(r) + '] = ' + str(y_train[r]))

for i in range(0, 5):
    r = random.randrange(len(x_test))
    images_2_show.append(x_test[r])
    titles_2_show.append('test image [' + str(r) + '] = ' + str(y_test[r]))

show_images(images_2_show, titles_2_show)

In [30]:
# Stack each split's list of images into a single NumPy array
x_train_array, x_test_array = (np.array(split) for split in (x_train, x_test))

- Option 1) Normalization by standardization (zero mean and unit variance). Recommended for dimensionality-reduction (DR) techniques and clustering.

In [None]:
from sklearn.preprocessing import StandardScaler

# One row per image: collapse every trailing axis into a single feature axis
x_train_flattened1 = x_train_array.reshape(len(x_train_array), -1)
x_test_flattened1 = x_test_array.reshape(len(x_test_array), -1)

# Learn the scaling statistics on the training split only, then apply the
# same transform to the test split
scaler = StandardScaler()
x_train_standardized1 = scaler.fit_transform(x_train_flattened1)
x_test_standardized1 = scaler.transform(x_test_flattened1)

print(
    "Data standardized: Mean =", x_train_standardized1.mean(),
    "Std Dev =", x_train_standardized1.std(),
)

- Option 2) Normalization by scaling: flatten the images and rescale the pixel values to [0, 1]. This is more commonly used for neural networks.

In [5]:
# One row per image, all pixels laid out along a single axis.
# .copy() keeps the result independent of the source image arrays.
x_train_flattened = x_train_array.reshape(len(x_train_array), -1).copy()
x_test_flattened = x_test_array.reshape(len(x_test_array), -1).copy()

In [6]:
# Dividing by 255 maps the pixel intensities onto the [0, 1] range.
# This is a common preprocessing step for image data ahead of clustering
# and dimensionality reduction.
x_train_normalized, x_test_normalized = (
    flattened / 255.0 for flattened in (x_train_flattened, x_test_flattened)
)

In [None]:
# Per-feature statistics of x_train_normalized (informational)
train_mean = x_train_normalized.mean(axis=0)  # Mean for each feature
train_std = x_train_normalized.std(axis=0)    # Standard deviation for each feature

# Per-feature statistics of x_test_normalized (informational)
test_mean = x_test_normalized.mean(axis=0)  # Mean for each feature
test_std = x_test_normalized.std(axis=0)    # Standard deviation for each feature

# Print results
print("Train Data - Mean (per feature):")
print(train_mean)
print("Train Data - Standard Deviation (per feature):")
print(train_std)

print("\nTest Data - Mean (per feature):")
print(test_mean)
print("Test Data - Standard Deviation (per feature):")
print(test_std)

# Verify the normalization that was actually applied. The data was divided
# by 255, i.e. scaled to [0, 1] rather than standardized, so the previous
# zero-mean / unit-variance test always reported failure. Check the value
# range instead.
if x_train_normalized.min() >= 0.0 and x_train_normalized.max() <= 1.0:
    print("\nx_train_normalized is properly normalized (all values within [0, 1]).")
else:
    print("\nx_train_normalized is NOT properly normalized.")

if x_test_normalized.min() >= 0.0 and x_test_normalized.max() <= 1.0:
    print("x_test_normalized is properly normalized (all values within [0, 1]).")
else:
    print("x_test_normalized is NOT properly normalized.")


----------------

# Literature Review Analysis

## PCA Clustering

In [None]:
# Step 2: Apply PCA
# Fit on the training split, then project the test split with the same model
pca = PCA(n_components=0.95)
x_train_pca = pca.fit_transform(x_train_normalized)
x_test_pca = pca.transform(x_test_normalized)

n_features_before = x_train_normalized.shape[1]
n_features_after = x_train_pca.shape[1]
print(f"Original number of features: {n_features_before}")
print(f"Reduced number of features: {n_features_after}")

In [25]:
# Predict the test labels once with a 1-nearest-neighbour classifier trained
# on the PCA features. Both metrics below use the same predictions, so the
# fit and prediction no longer run twice.
y_test_pred_pca = KNeighborsClassifier(n_neighbors=1).fit(x_train_pca, y_train).predict(x_test_pca)

# ARI between the true test labels and the 1-NN predictions
ari_pca_c2 = adjusted_rand_score(y_test, y_test_pred_pca)

# Silhouette Score of the test embedding, grouped by the 1-NN predictions
silhouette_pca_c2 = silhouette_score(x_test_pca, y_test_pred_pca)

In [None]:
# ARI first, silhouette second, one value per line
print(ari_pca_c2, silhouette_pca_c2, sep="\n")

In [None]:
# SVM Accuracy
# Train and evaluate on the PCA features computed above. The previously
# referenced x_train_pca_c2 / x_test_pca_c2 are not defined anywhere in the
# visible notebook, while the neighbouring "_c2" metrics use x_train_pca and
# x_test_pca.
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_pca, y_train)
y_test_pred_pca_c2 = svm_clf.predict(x_test_pca)
svm_accuracy_pca = accuracy_score(y_test, y_test_pred_pca_c2)

After applying PCA, the 784-dimensional data was reduced to 154 dimensions while preserving 95% of the variance. This means that 154 new features (principal components) retain almost all of the information (variance) in the original 784 features, with minimal information loss.

In [10]:
# Persist the PCA-projected training data for later cells
np.save(file="pca_embeddings.npy", arr=x_train_pca)

In [22]:
# Read the stored PCA embeddings back from disk
pca_embeddings = np.load(file="pca_embeddings.npy")

In [None]:
# Step 3: K-Means Clustering
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(x_train_pca)
train_cluster_labels = kmeans.labels_
test_cluster_labels = kmeans.predict(x_test_pca)

# Step 4: Cluster Evaluation
# Map every cluster to the digit that occurs most often among its training
# members; -1 marks a cluster that received no samples.
cluster_label_mapping = {}

for cluster in range(10):
    indices = np.where(train_cluster_labels == cluster)
    if len(indices[0]) > 0:
        # np.bincount(...).argmax() is the most frequent digit (smallest one
        # on ties). It replaces scipy's mode(...).mode[0], which raises on
        # SciPy >= 1.11, where `mode` is a scalar for 1-D input.
        most_common_label = int(np.bincount(np.array(y_train)[indices]).argmax())
        cluster_label_mapping[cluster] = most_common_label
    else:
        cluster_label_mapping[cluster] = -1

print("\nCluster to Label Mapping:")
for cluster, label in cluster_label_mapping.items():
    print(f"Cluster {cluster}: Label {label}")

- **Cluster 3** and **Cluster 7** both correspond to Label 0 (digit zero), indicating that the clustering algorithm created two different clusters that mostly contain images of 0.
- Similarly, **Cluster 2** and **Cluster 9** both correspond to Label 1. This overlap suggests that the digits might not be completely separated into unique clusters by K-Means.
- This overlap may arise because of variation in handwriting styles or because some digits (like 0, 1, or 8) have distinct shapes depending on how they are written.

In [12]:
# Persist the K-Means assignments obtained on the PCA-reduced training data
np.save("kmeans_labels_pca.npy", train_cluster_labels)

# Write the cluster -> digit mapping as a two-column CSV, header row first
with open("cluster_to_label_mapping_pca.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows([("Cluster", "Label"), *cluster_label_mapping.items()])

In [None]:
# Step 5: Predict Test Labels
# Translate each test sample's cluster id into that cluster's digit
test_predicted_labels = list(map(cluster_label_mapping.__getitem__, test_cluster_labels))

# Step 6: Evaluate Performance
accuracy = accuracy_score(y_test, test_predicted_labels)
print("\nClustering-based classification accuracy on test set: " + f"{accuracy:.4f}")

After applying PCA, clustering the data without supervision, and mapping each cluster to its most common label, about 59.5% of the test images are classified correctly.

*Possible options for improving*

- **Dimensionality Reduction Tuning** try retaining 90-99% variance and observe if it affects the clustering accuracy.
- **Alternative Clustering Methods** check other clustering algorithms (e.g., Gaussian Mixture Models, Agglomerative Clustering)
- **Feature Engineering** try extracting additional features (e.g., edges, corners) that might be more informative for clustering.

In [44]:
# Reload the stored PCA results: K-Means labels and embeddings

train_cluster_labels = np.load("kmeans_labels_pca.npy")
x_train_pca = np.load("pca_embeddings.npy")

##### Plots & visualizations

In [6]:
# Visualize Clusters
def plot_cluster_images(cluster_number, num_images=5):
    """Show a random sample of training images assigned to one cluster.

    Parameters:
    - cluster_number: cluster id to look up in ``train_cluster_labels``
    - num_images: maximum number of images to draw (also the number of
      subplot columns)

    Prints a message and returns without plotting when the cluster is empty.
    """
    members = np.where(train_cluster_labels == cluster_number)[0]
    if len(members) == 0:
        print(f"No images found for cluster {cluster_number}")
        return

    # Draw without replacement, capped at the cluster size
    n_shown = min(num_images, len(members))
    picks = np.random.choice(members, size=n_shown, replace=False)

    plt.figure(figsize=(10, 2))
    for position, sample_idx in enumerate(picks, start=1):
        plt.subplot(1, num_images, position)
        plt.imshow(x_train_array[sample_idx], cmap='gray')
        plt.axis('off')
        plt.title(f"Label: {y_train[sample_idx]}")
    plt.suptitle(f"Images from Cluster {cluster_number}")
    plt.show()

# for cluster in range(10):
#     plot_cluster_images(cluster_number=cluster, num_images=5)

In [None]:
# Cumulative explained variance over the fitted principal components
explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

plt.figure(figsize=(10, 6))
plt.plot(
    range(1, len(explained_variance_ratio) + 1),
    explained_variance_ratio,
    marker='o',
    linestyle='--',
)
plt.title('Explained Variance by Principal Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
# Horizontal reference line at the 95% level, with its caption
plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(20, 0.85, '95% Variance Threshold', color='red', fontsize=12)
plt.grid()
plt.show()

In [None]:
# # Reduce to 2 components for visualization purposes
# pca_2d = PCA(n_components=2)
# x_train_pca_2d = pca_2d.fit_transform(x_train_normalized)

# Scatter the first two principal components, coloured by K-Means cluster
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=x_train_pca[:, 0],
    y=x_train_pca[:, 1],
    hue=train_cluster_labels,
    palette='tab10',
    s=10,
    legend='full',
)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("2D Scatter Plot of PCA-reduced Data with Cluster Labels")
plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Translate each training sample's cluster id into that cluster's digit
y_train_predicted_labels = list(map(cluster_label_mapping.__getitem__, train_cluster_labels))

# Confusion matrix over the ten digit classes (rows: true, columns: predicted)
conf_matrix = confusion_matrix(y_train, y_train_predicted_labels, labels=range(10))

# Heat map of the confusion matrix with integer cell annotations
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=range(10),
    yticklabels=range(10),
)
plt.title("Confusion Matrix of Cluster-Label Mapping")
plt.xlabel("Predicted Cluster Label")
plt.ylabel("True Label")
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error

# Map the PCA scores of the training set back to pixel space and compare
# with the data PCA was fitted on. The previous argument, x_reduced, is the
# Isomap embedding of the downsampled subset, which matches neither the PCA
# component count nor the rows of x_train_normalized.
reconstructed_data = pca.inverse_transform(x_train_pca)
reconstruction_error = mean_squared_error(x_train_normalized, reconstructed_data)
print(f"Reconstruction Error: {reconstruction_error}")

In [18]:
# Export the PCA + K-Means confusion matrix (rows: true label) as CSV
conf_matrix_pca_df = pd.DataFrame(data=conf_matrix, index=range(10), columns=range(10))
conf_matrix_pca_df.to_csv("confusion_matrix_pca.csv", index_label="True Label")

In [None]:
# Measure how well the PCA-reduced representation keeps the digit classes
# separable, using a supervised classifier as the probe.
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the PCA-reduced training data
clf = LogisticRegression(max_iter=1000, random_state=42).fit(x_train_pca, y_train)

# Score the classifier on the PCA-reduced test data
y_test_pred = clf.predict(x_test_pca)
pca_accuracy = accuracy_score(y_test, y_test_pred)
print("Classification accuracy with PCA features: " + f"{pca_accuracy:.4f}")

- Logistic Regression (or any classifier) can utilize the PCA-reduced data to classify the digits.

- This approach evaluates the quality of PCA embeddings for a supervised task.

In [None]:
# Silhouette of the PCA embedding grouped by the true digit labels
silhouette_pca = silhouette_score(x_train_pca, y_train)
# Agreement between the true digits and the K-Means cluster assignments
ari_pca = adjusted_rand_score(y_train, train_cluster_labels)

print("Silhouette Score: " + f"{silhouette_pca:.4f}")
print("Adjusted Rand Index: " + f"{ari_pca:.4f}")


## t-SNE

In [49]:
# t-SNE to reduce to 2D
# The iteration count is left at its default of 1000. The explicit `n_iter`
# keyword was renamed to `max_iter` in newer scikit-learn and later removed,
# so passing it would tie this cell to old versions.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
x_train_tsne = tsne.fit_transform(x_train_normalized)

In [20]:
# Persist the 2-D t-SNE training embedding for later cells
np.save(file="tsne_embeddings.npy", arr=x_train_tsne)

In [35]:
# Read the stored t-SNE embedding back from disk
x_train_tsne = np.load("tsne_embeddings.npy")

In [None]:
# Scatter the two t-SNE components, coloured by the true digit

plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=x_train_tsne[:, 0],
    y=x_train_tsne[:, 1],
    hue=y_train,
    palette='tab10',
    s=10,
    legend='full',
)
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.title("t-SNE - 2D Scatter Plot of MNIST Data (True Labels)")
plt.legend(title="True Label", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Cluster the 2-D t-SNE embedding into ten groups
kmeans_tsne = KMeans(n_clusters=10, random_state=42).fit(x_train_tsne)

# Cluster id assigned to every training sample
train_cluster_labels_tsne = kmeans_tsne.labels_

In [23]:
# Persist the K-Means assignments obtained on the t-SNE embedding
np.save(file="kmeans_labels_tsne.npy", arr=train_cluster_labels_tsne)

In [None]:
# Create a mapping from cluster labels to actual digit labels
cluster_label_mapping_tsne = {}

for cluster in range(10):
    # Get indices of samples in the current cluster
    indices = np.where(train_cluster_labels_tsne == cluster)[0]

    # Find the most common actual label among these samples
    if len(indices) > 0:
        # np.bincount(...).argmax() is the most frequent digit (smallest one
        # on ties). It replaces scipy's mode(...).mode[0], which raises on
        # SciPy >= 1.11, where `mode` is a scalar for 1-D input.
        most_common_label = int(np.bincount(np.array(y_train)[indices]).argmax())
        cluster_label_mapping_tsne[cluster] = most_common_label
    else:
        cluster_label_mapping_tsne[cluster] = -1  # Assign -1 if the cluster is empty

print("Cluster to Label Mapping for t-SNE + K-Means:")
for cluster, label in cluster_label_mapping_tsne.items():
    print(f"Cluster {cluster}: Label {label}")

In [None]:
# Translate each training sample's t-SNE cluster id into that cluster's digit
train_predicted_labels_tsne = list(
    map(cluster_label_mapping_tsne.__getitem__, train_cluster_labels_tsne)
)

# Share of training samples whose mapped digit equals the true digit
accuracy_tsne = accuracy_score(y_train, train_predicted_labels_tsne)
print("Clustering-based classification accuracy on training set (t-SNE + K-Means): " + f"{accuracy_tsne:.4f}")

In [None]:
# Write the t-SNE cluster -> digit mapping as a two-column CSV, header first
with open("cluster_to_label_mapping_tsne.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows([("Cluster", "Label"), *cluster_label_mapping_tsne.items()])

print("Cluster-to-label mappings saved.")

In [None]:
# Confusion matrix over the ten digit classes (rows: true, columns: predicted)
conf_matrix_tsne = confusion_matrix(y_train, train_predicted_labels_tsne, labels=range(10))

# Export the matrix as CSV before plotting it
conf_matrix_tsne_df = pd.DataFrame(data=conf_matrix_tsne, index=range(10), columns=range(10))
conf_matrix_tsne_df.to_csv("confusion_matrix_tsne.csv", index_label="True Label")

# Heat map of the confusion matrix with integer cell annotations
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix_tsne,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=range(10),
    yticklabels=range(10),
)
plt.title("Confusion Matrix of Cluster-Label Mapping (t-SNE + K-Means)")
plt.xlabel("Predicted Cluster Label")
plt.ylabel("True Label")
plt.show()

In [None]:
# Imported here so the cell also runs in a session that skipped the PCA cells
from sklearn.linear_model import LogisticRegression

# Load t-SNE embeddings (already saved as .npy file); kept so that
# x_train_tsne still refers to the stored training embedding after this cell
x_train_tsne = np.load("tsne_embeddings.npy")

# The previous version embedded the test set with a second, independent
# t-SNE fit, so the test coordinates had no relation to the training
# coordinates and the reported accuracy was meaningless. Train and test
# samples are now embedded together in one fit, then split apart again.
n_train_samples = len(x_train_normalized)
tsne_joint = TSNE(n_components=2, random_state=42, perplexity=30)
joint_embedding_tsne = tsne_joint.fit_transform(
    np.vstack([x_train_normalized, x_test_normalized])
)
x_train_tsne_joint = joint_embedding_tsne[:n_train_samples]
# Name kept for compatibility; these are the embedded test *features*
y_test_tsne = joint_embedding_tsne[n_train_samples:]

# Train logistic regression on the training part of the joint embedding
clf_tsne = LogisticRegression(max_iter=1000, random_state=42)
clf_tsne.fit(x_train_tsne_joint, y_train)

# Predict and evaluate accuracy on the test part of the same embedding
y_test_pred_tsne = clf_tsne.predict(y_test_tsne)
tsne_accuracy = accuracy_score(y_test, y_test_pred_tsne)

print(f"Classification accuracy with t-SNE features: {tsne_accuracy:.4f}")

In [None]:
# Silhouette of the t-SNE embedding grouped by the K-Means clusters
silhouette_tsne = silhouette_score(x_train_tsne, train_cluster_labels_tsne)

# Agreement between the true digits and the K-Means cluster assignments
ari_tsne = adjusted_rand_score(y_train, train_cluster_labels_tsne)

print("Silhouette Score (t-SNE): " + f"{silhouette_tsne:.4f}")
print("Adjusted Rand Index (t-SNE): " + f"{ari_tsne:.4f}")


## ISOMAP

*Computational cost:* Isomap can be slow on large datasets, so consider experimenting with a subset of the data if computation time is an issue. **The data was flattened and normalized, and a 35% per-class downsample was used.**

With the full set, the algorithm ran for more than 2 hours without finishing.

In [None]:
# # 10.000 images instead of the full dataset. It should keep the subset representative without overwhelming computational resources.
# subset_size = 10000

# # Randomly sample indices for the subset
# subset_indices = np.random.choice(len(x_train_normalized), subset_size, replace=False)

# # Create subset for Isomap
# x_train_subset = x_train_normalized[subset_indices]
# y_train_subset = np.array(y_train)[subset_indices]

In [19]:
# Step 1: Downsample the Dataset Consistently
def downsample_mnist_consistent(x_data, y_labels, sample_fraction=0.35, random_state=None):
    """
    Pick a per-class random subset of sample indices.

    For every distinct label, ``int(count * sample_fraction)`` indices are
    drawn without replacement. The same indices can then be used to slice
    the data in every representation of the dataset.

    Parameters:
    - x_data: unused; kept so existing calls keep working
    - y_labels: label of every sample (any array-like)
    - sample_fraction: fraction of each class to keep
    - random_state: None (draw from NumPy's global random state, as before),
      an int seed, or a ``np.random.RandomState``; set it for a reproducible
      subset

    Returns:
    - 1-D array of selected indices, grouped by label in sorted label order
    """
    # Accept plain lists too: the comparison below needs an array
    y_labels = np.asarray(y_labels)

    # One generator for all classes, so a seed does not restart per class
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    sampled_indices = []
    for label in np.unique(y_labels):
        # Positions of all samples carrying the current label
        label_indices = np.where(y_labels == label)[0]
        sampled_indices_label = resample(
            label_indices,
            n_samples=int(len(label_indices) * sample_fraction),
            replace=False,
            random_state=rng,
        )
        sampled_indices.extend(sampled_indices_label)
    return np.array(sampled_indices)

# Draw the per-class 35% subset once; the indices are reused for every view
sampled_indices = downsample_mnist_consistent(
    x_train_normalized, y_train, sample_fraction=0.35
)

# Step 2: Use the Sampled Indices to Extract Points from Both Spaces
# Rows and labels of the selected samples in the original pixel space
x_sampled, y_sampled = x_train_normalized[sampled_indices], y_train[sampled_indices]

In [None]:
# Persist the selection itself ...
np.save(file="sampled_indices.npy", arr=sampled_indices)

# ... and the data and labels it picks out
np.save(file="x_sampled.npy", arr=x_sampled)
np.save(file="y_sampled.npy", arr=y_sampled)

print("Downsampling saved successfully!")

In [None]:
# Bare expression: displays the normalized training matrix as the cell output
x_train_normalized

In [31]:
# Reload the stored selection together with its data and labels
sampled_indices = np.load(file="sampled_indices.npy")
x_sampled = np.load(file="x_sampled.npy")
y_sampled = np.load(file="y_sampled.npy")

In [20]:
# Embed the downsampled data with Isomap (50 output dimensions).
# n_neighbors was reduced to 5 for faster computation.
isomap = Isomap(n_neighbors=5, n_components=50)
x_reduced = isomap.fit_transform(x_sampled)

In [None]:
# Persist the Isomap embedding of the evenly downsampled data
np.save(file="isomap_embedding_subsets_evenly_downsampled.npy", arr=x_reduced)
print("Isomap embeddings saved.")

In [12]:
# Reload the stored Isomap embedding and its K-Means labels
x_reduced = np.load("isomap_embedding_subsets_evenly_downsampled.npy")
train_cluster_labels_isomap_downsampled = np.load("kmeans_labels_isomap_evenly_downsampled.npy")

In [None]:
# Cluster the Isomap embedding into ten groups
kmeans_isomap_downsampled = KMeans(n_clusters=10, random_state=42).fit(x_reduced)

# Cluster id assigned to every sample of the subset
train_cluster_labels_isomap_downsampled = kmeans_isomap_downsampled.labels_

# Persist the assignments
np.save(
    file="kmeans_labels_isomap_evenly_downsampled.npy",
    arr=train_cluster_labels_isomap_downsampled,
)
print("K-Means cluster labels saved.")

In [None]:
from collections import Counter

# Sanity check: the stored subset labels must be the labels of the sampled
# training rows
assert np.array_equal(y_sampled, y_train[sampled_indices]), "Mismatch between y_sampled and sampled y_train"

# Majority vote per K-Means cluster; -1 marks a cluster without any samples
cluster_label_mapping_isomap_subset = {}

for cluster in range(10):
    indices = np.where(train_cluster_labels_isomap_downsampled == cluster)[0]

    if len(indices) == 0:
        cluster_label_mapping_isomap_subset[cluster] = -1
        continue

    # most_common(1) yields a single (label, count) pair
    most_common_label = Counter(y_sampled[indices]).most_common(1)[0][0]
    cluster_label_mapping_isomap_subset[cluster] = most_common_label

# Report the resulting mapping
print("Cluster to Label Mapping for Isomap + K-Means:")
for cluster, label in cluster_label_mapping_isomap_subset.items():
    print(f"Cluster {cluster}: Label {label}")

In [None]:
# Write the Isomap cluster -> digit mapping as a two-column CSV, header first
with open("cluster_to_label_mapping_isomap_downsampled.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows([("Cluster", "Label"), *cluster_label_mapping_isomap_subset.items()])

print("Cluster-to-label mapping saved.")

In [None]:
# Translate each sample's Isomap cluster id into that cluster's digit
train_predicted_labels_isomap_downsampled = list(
    map(cluster_label_mapping_isomap_subset.__getitem__, train_cluster_labels_isomap_downsampled)
)

# Share of subset samples whose mapped digit equals the true digit
accuracy_isomap_subset = accuracy_score(y_sampled, train_predicted_labels_isomap_downsampled)
print("Clustering-based classification accuracy on training subset (Isomap + K-Means): " + f"{accuracy_isomap_subset:.4f}")

In [None]:
# Scatter the first two Isomap components, coloured by the true digit

plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=x_reduced[:, 0],
    y=x_reduced[:, 1],
    hue=y_sampled,
    palette='tab10',
    s=10,
    legend='full',
)
plt.xlabel("Isomap Component 1")
plt.ylabel("Isomap Component 2")
plt.title("Isomap - 2D Scatter Plot of MNIST Data")
plt.legend(title="True Label", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Generate and plot the confusion matrix.
# Uses the labels and predictions of the evenly downsampled subset. The
# previously referenced y_train_subset and
# train_predicted_labels_isomap_subset only appear in commented-out code and
# are never defined.
conf_matrix_isomap_subset = confusion_matrix(y_sampled, train_predicted_labels_isomap_downsampled, labels=range(10))

plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix_isomap_subset, annot=True, fmt="d", cmap="Blues", xticklabels=range(10), yticklabels=range(10))
plt.xlabel("Predicted Cluster Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix of Cluster-Label Mapping (Isomap + K-Means on Subset)")
plt.show()

# Save the confusion matrix for future reference
conf_matrix_isomap_df = pd.DataFrame(conf_matrix_isomap_subset, index=range(10), columns=range(10))
conf_matrix_isomap_df.to_csv("confusion_matrix_isomap_subset.csv", index_label="True Label")

print("Confusion matrix saved.")

## HDBSCAN

In [38]:
# Read the stored PCA projections; HDBSCAN clusters this representation
x_train_pca = np.load("pca_embeddings.npy")

In [None]:
# Cluster the PCA projections with HDBSCAN
hdbscan_clusterer = hdbscan.HDBSCAN(
    min_cluster_size=30,
    min_samples=10,
    metric='euclidean',
)
train_cluster_labels_hdbscan = hdbscan_clusterer.fit_predict(x_train_pca)

# Persist the assignments for later cells
np.save(file="hdbscan_labels.npy", arr=train_cluster_labels_hdbscan)
print("HDBSCAN cluster labels saved.")

In [55]:
# Read the stored HDBSCAN assignments back from disk
train_cluster_labels_hdbscan = np.load("hdbscan_labels.npy")

In [None]:
# Members per cluster; the noise label (-1) is dropped before counting
cluster_sizes = np.bincount(
    train_cluster_labels_hdbscan[train_cluster_labels_hdbscan != -1]
)

plt.figure(figsize=(8, 6))
plt.bar(range(len(cluster_sizes)), cluster_sizes, color='blue', alpha=0.7)
plt.xlabel('Cluster ID')
plt.ylabel('Number of Points')
plt.title('Cluster Sizes (Excluding Noise)')
plt.show()

In [None]:
# Reload both inputs of the plot: PCA projections and HDBSCAN assignments
x_train_pca = np.load(file='pca_embeddings.npy')
train_cluster_labels_hdbscan = np.load(file='hdbscan_labels.npy')

# Scatter the first two principal components, coloured by HDBSCAN label
plt.figure(figsize=(10, 8))
scatter = sns.scatterplot(
    x=x_train_pca[:, 0],
    y=x_train_pca[:, 1],
    hue=train_cluster_labels_hdbscan,
    palette='tab20',  # swap for another palette if needed
    s=10,
    legend='full',
)

# Axis labels, title and an outside legend
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("HDBSCAN Clustering on PCA-Reduced Data")
plt.legend(title="Cluster Labels", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Fraction of samples that HDBSCAN labelled as noise (-1)
noise_ratio = (train_cluster_labels_hdbscan == -1).sum() / len(train_cluster_labels_hdbscan)
print("Noise Ratio: " + f"{noise_ratio:.2%}")

In [None]:
# Filter out noise points (noise is labeled as -1 by HDBSCAN)
valid_indices = train_cluster_labels_hdbscan != -1
filtered_labels = train_cluster_labels_hdbscan[valid_indices]
filtered_y_train = np.array(y_train)[valid_indices]

# Create a mapping from clusters to actual digit labels
cluster_label_mapping_hdbscan = {}

# np.unique only yields clusters that have members, so every cluster gets a
# majority label and the former empty-cluster branch was unreachable.
for cluster in np.unique(filtered_labels):
    indices = np.where(filtered_labels == cluster)[0]
    # np.bincount(...).argmax() is the most frequent digit (smallest one on
    # ties). It replaces scipy's mode(...).mode[0], which raises on
    # SciPy >= 1.11, where `mode` is a scalar for 1-D input.
    most_common_label = int(np.bincount(filtered_y_train[indices]).argmax())
    cluster_label_mapping_hdbscan[cluster] = most_common_label

print("Cluster to Label Mapping for HDBSCAN:")
for cluster, label in cluster_label_mapping_hdbscan.items():
    print(f"Cluster {cluster}: Label {label}")

In [None]:
# Write the HDBSCAN cluster -> digit mapping as a two-column CSV, header first
with open("cluster_to_label_mapping_hdbscan.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows([("Cluster", "Label"), *cluster_label_mapping_hdbscan.items()])

print("Cluster-to-label mapping for HDBSCAN saved.")

In [61]:
# Mapped digit for every sample; clusters without a mapping (noise) get -1
train_predicted_labels_hdbscan = [
    cluster_label_mapping_hdbscan.get(cluster_id, -1)
    for cluster_id in train_cluster_labels_hdbscan
]
# Keep only the samples that HDBSCAN did not label as noise
valid_predicted_labels = [
    predicted
    for predicted, cluster_id in zip(train_predicted_labels_hdbscan, train_cluster_labels_hdbscan)
    if cluster_id != -1
]
valid_true_labels = [
    truth
    for truth, cluster_id in zip(y_train, train_cluster_labels_hdbscan)
    if cluster_id != -1
]

In [None]:
# Share of non-noise samples whose mapped digit equals the true digit
accuracy_hdbscan = accuracy_score(valid_true_labels, valid_predicted_labels)
print("Clustering-based classification accuracy on valid points (HDBSCAN): " + f"{accuracy_hdbscan:.4f}")

In [None]:
# Confusion matrix of the non-noise samples over the ten digit classes
conf_matrix_hdbscan = confusion_matrix(valid_true_labels, valid_predicted_labels, labels=range(10))

# Heat map of the confusion matrix with integer cell annotations
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix_hdbscan,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=range(10),
    yticklabels=range(10),
)
plt.title("Confusion Matrix of Cluster-Label Mapping (HDBSCAN)")
plt.xlabel("Predicted Cluster Label")
plt.ylabel("True Label")
plt.show()

# Export the matrix (rows: true label) as CSV
conf_matrix_hdbscan_df = pd.DataFrame(data=conf_matrix_hdbscan, index=range(10), columns=range(10))
conf_matrix_hdbscan_df.to_csv("confusion_matrix_hdbscan.csv", index_label="True Label")

print("Confusion matrix for HDBSCAN saved.")

## LLE

In [None]:
# Bare expression: displays the downsampled data matrix as the cell output
x_sampled

In [None]:
# Step 3: Apply Locally Linear Embedding (LLE)
n_components = 50  # dimensionality of the embedding
n_neighbors = 10   # neighbourhood size; tune for the data at hand
lle = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=n_components)

print("Running LLE...")
# Embed the downsampled data
x_train_lle = lle.fit_transform(x_sampled)

In [None]:
# Persist the LLE embedding for later cells
np.save(file="lle_embeddings.npy", arr=x_train_lle)
print("LLE embeddings saved.")

In [None]:
# Cluster the LLE embedding into ten groups
kmeans_lle = KMeans(n_clusters=10, random_state=42).fit(x_train_lle)

# Cluster id assigned to every sample of the subset
train_cluster_labels_lle = kmeans_lle.labels_

# Persist the assignments for later cells
np.save(file="kmeans_labels_lle.npy", arr=train_cluster_labels_lle)
print("K-Means cluster labels for LLE saved.")

In [None]:
# Majority vote per K-Means cluster; -1 marks a cluster without any samples
cluster_label_mapping_lle = {}

for cluster in range(10):
    indices = np.where(train_cluster_labels_lle == cluster)[0]

    if len(indices) == 0:
        cluster_label_mapping_lle[cluster] = -1
        continue

    # most_common(1) yields a single (label, count) pair
    most_common_label = Counter(y_sampled[indices]).most_common(1)[0][0]
    cluster_label_mapping_lle[cluster] = most_common_label

# Write the mapping as a two-column CSV, header first
with open("cluster_to_label_mapping_lle.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows([("Cluster", "Label"), *cluster_label_mapping_lle.items()])
print("Cluster-to-label mapping for LLE saved.")

In [None]:
# Step 6: Predict Labels and Calculate Accuracy
# Translate each sample's LLE cluster id into that cluster's digit
train_predicted_labels_lle = list(
    map(cluster_label_mapping_lle.__getitem__, train_cluster_labels_lle)
)
accuracy_lle = accuracy_score(y_sampled, train_predicted_labels_lle)
print("Clustering-based classification accuracy on training set (LLE + K-Means): " + f"{accuracy_lle:.4f}")

In [None]:
# 2-component PCA of the LLE embedding (computed here, not used by the plot below)
pca_2d = PCA(n_components=2)
x_train_lle_2d = pca_2d.fit_transform(x_train_lle)

# Scatter the second and third LLE components (columns 1 and 2, zero-indexed),
# coloured by K-Means cluster
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=x_train_lle[:, 1],
    y=x_train_lle[:, 2],
    hue=train_cluster_labels_lle,
    palette='tab10',
    s=10,
    legend='full',
)
plt.xlabel("LLE Component 2")
plt.ylabel("LLE Component 3")
plt.title("2D Scatter Plot of LLE-reduced Data with Cluster Labels (Components 2 and 3)")
plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Predict labels using the cluster-to-label mapping
train_predicted_labels_lle = [cluster_label_mapping_lle[cluster] for cluster in train_cluster_labels_lle]

# Generate confusion matrix.
# LLE was fitted on x_sampled, so the matching true labels are y_sampled.
# The previously referenced y_train_subset only appears in commented-out
# code and is never defined.
conf_matrix_lle = confusion_matrix(y_sampled, train_predicted_labels_lle, labels=range(10))

# Plot the confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix_lle, annot=True, fmt="d", cmap="Blues", xticklabels=range(10), yticklabels=range(10))
plt.xlabel("Predicted Cluster Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix of Cluster-Label Mapping (LLE + K-Means)")
plt.show()

# Save the confusion matrix as a CSV
conf_matrix_lle_df = pd.DataFrame(conf_matrix_lle, index=range(10), columns=range(10))
conf_matrix_lle_df.to_csv("confusion_matrix_lle.csv", index_label="True Label")

print("Confusion matrix for LLE saved.")

## UMAP

In [None]:
# Step 1: Apply UMAP for Dimensionality Reduction
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)

# Embed the full normalized training set
x_train_umap = reducer.fit_transform(x_train_normalized)

In [None]:
# Step 2: Visualize the UMAP Results
# Scatter the two UMAP components, coloured by the true digit
plt.figure(figsize=(10, 8))
plt.scatter(
    x_train_umap[:, 0],
    x_train_umap[:, 1],
    c=y_train,
    cmap="tab10",
    s=5,
    alpha=0.8,
)
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.title("UMAP Projection of MNIST Dataset")
plt.colorbar(label="MNIST Labels")
plt.show()

In [14]:
# Persist the 2-D UMAP training embedding
np.save(file='x_train_umap.npy', arr=x_train_umap)

------

## Visual comparison between methods

In [None]:
# Reload the stored selection ...
sampled_indices = np.load(file="sampled_indices.npy")

# ... and the data and labels it picks out
x_sampled = np.load(file="x_sampled.npy")
y_sampled = np.load(file="y_sampled.npy")

print("Downsampling loaded successfully!")

In [47]:
# Reload every stored embedding with its cluster labels, method by method

# PCA
x_train_pca = np.load("pca_embeddings.npy")
train_cluster_labels_pca = np.load("kmeans_labels_pca.npy")

# t-SNE
x_train_tsne = np.load("tsne_embeddings.npy")
train_cluster_labels_tsne = np.load("kmeans_labels_tsne.npy")

# Isomap (evenly downsampled subset)
x_reduced = np.load("isomap_embedding_subsets_evenly_downsampled.npy")
train_cluster_labels_isomap_downsampled = np.load("kmeans_labels_isomap_evenly_downsampled.npy")

# HDBSCAN (labels only)
train_cluster_labels_hdbscan = np.load("hdbscan_labels.npy")

# LLE
x_train_lle = np.load("lle_embeddings.npy")
train_cluster_labels_lle = np.load("kmeans_labels_lle.npy")

In [29]:
def calculate_trustworthiness(original_data, reduced_data, n_neighbors=5):
    """
    Compute the trustworthiness of an embedding and print it.

    Validates that both inputs have the same number of rows, then delegates
    to ``trustworthiness``.

    Parameters:
    - original_data: Dataset in the original space (n_samples, n_features)
    - reduced_data: Embedding of the same samples (n_samples, n_components)
    - n_neighbors: Neighbourhood size forwarded to ``trustworthiness``

    Returns:
    - trust: Trustworthiness score

    Raises:
    - ValueError: if the two inputs have a different number of rows
    """
    n_original = original_data.shape[0]
    n_reduced = reduced_data.shape[0]
    if n_original != n_reduced:
        message = (f"Shape mismatch: original_data has {n_original} rows, "
                   f"but reduced_data has {n_reduced} rows.")
        raise ValueError(message)

    score = trustworthiness(original_data, reduced_data, n_neighbors=n_neighbors)
    print(f"Trustworthiness: {score:.4f}")
    return score

def calculate_neighborhood_preservation(original_data, reduced_data):
    """
    Calculate Neighborhood Preservation by correlating pairwise distances.

    Each unordered pair of distinct samples contributes exactly once. The
    zero self-distances on the diagonal of the full distance matrix are left
    out, because they are identical in both spaces and would bias the
    correlation.

    Parameters:
    - original_data: High-dimensional original dataset (n_samples, n_features)
    - reduced_data: Reduced-dimensional embeddings (n_samples, n_components)

    Returns:
    - neighborhood_preservation: Pearson correlation between the pairwise
      Euclidean distances in the two spaces

    Raises:
    - ValueError: if the two inputs have a different number of rows
    """
    # Local import: the file's import cell only brings in cdist.
    from scipy.spatial.distance import pdist

    n_original = np.shape(original_data)[0]
    n_reduced = np.shape(reduced_data)[0]
    if n_original != n_reduced:
        raise ValueError(f"Shape mismatch: original_data has {n_original} rows, "
                         f"but reduced_data has {n_reduced} rows.")

    # pdist returns the condensed upper triangle, so it needs about half the
    # memory of the full square matrix and skips the diagonal.
    original_distances = pdist(original_data)
    reduced_distances = pdist(reduced_data)

    neighborhood_preservation = np.corrcoef(original_distances, reduced_distances)[0, 1]
    print(f"Neighborhood Preservation: {neighborhood_preservation:.4f}")
    return neighborhood_preservation

def calculate_silhouette_score(reduced_data, cluster_labels):
    """
    Compute and print the silhouette score of a clustering in the reduced space.

    Parameters:
    - reduced_data: Embedding the clustering was computed on
    - cluster_labels: One cluster label per row of ``reduced_data``

    Returns:
    - silhouette: Value returned by ``silhouette_score``
    """
    score = silhouette_score(reduced_data, cluster_labels)
    print("Silhouette Score: {:.4f}".format(score))
    return score

In [30]:
# PCA: rows of the embedding and labels that belong to the downsampled subset
x_sampled_pca, cluster_labels_pca_sampled = (
    x_train_pca[sampled_indices],
    train_cluster_labels_pca[sampled_indices],
)

# t-SNE: same row selection
x_sampled_tsne, cluster_labels_tsne_sampled = (
    x_train_tsne[sampled_indices],
    train_cluster_labels_tsne[sampled_indices],
)

# Isomap and LLE were already computed on the downsampled dataset, so they
# are used as loaded.
x_sampled_isomap, cluster_labels_isomap_sampled = x_reduced, train_cluster_labels_isomap_downsampled
x_sampled_lle, cluster_labels_lle_sampled = x_train_lle, train_cluster_labels_lle


In [None]:
# Sanity check: print the shape of x_sampled and of every embedding so that
# row-count mismatches are visible before the metrics are computed.
# NOTE(review): techniques_downsampled is assigned in the following cell, so
# that cell has to be executed before this one. Consider reordering.
print(f"x_sampled shape: {x_sampled.shape}")
for name, (reduced_data, _) in techniques_downsampled.items():
    print(f"{name} reduced_data shape: {reduced_data.shape}")

In [None]:
# Techniques dictionary with aligned embeddings and cluster labels
# (each entry is an (embedding, cluster_labels) pair for the downsampled subset).
techniques_downsampled = {
    "PCA": (x_sampled_pca, cluster_labels_pca_sampled),
    "t-SNE": (x_sampled_tsne, cluster_labels_tsne_sampled),
    "Isomap": (x_sampled_isomap, cluster_labels_isomap_sampled),
    "LLE": (x_sampled_lle, cluster_labels_lle_sampled),
}

# One metrics dictionary per technique
results = {name: {} for name in techniques_downsampled}

# Calculate metrics for each technique.
# The loop must iterate the dictionary defined above: `results` is keyed by
# its names and x_sampled only lines up with the downsampled embeddings.
for name, (reduced_data, cluster_labels) in techniques_downsampled.items():
    print(f"\nEvaluating metrics for {name}...")

    # Trustworthiness
    results[name]["Trustworthiness"] = calculate_trustworthiness(x_sampled, reduced_data)

    # Neighborhood Preservation
    results[name]["Neighborhood Preservation"] = calculate_neighborhood_preservation(x_sampled, reduced_data)

    # Silhouette Score
    results[name]["Silhouette Score"] = calculate_silhouette_score(reduced_data, cluster_labels)

In [None]:
# Embedding Comparison
def plot_embeddings(embeddings, labels, title):
    """Scatter the first two embedding columns, coloured by cluster label.

    Labels equal to -1 (noise) are converted to NaN before being handed to
    seaborn as the hue variable.
    """
    hue_values = np.array(labels, dtype=float)
    hue_values[hue_values == -1] = np.nan

    plt.figure(figsize=(10, 7))
    sns.scatterplot(
        x=embeddings[:, 0],
        y=embeddings[:, 1],
        hue=hue_values,
        palette='tab10',
        s=50,
        legend='full',
    )
    plt.title(title)
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    # Place the legend outside the axes on the right.
    plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


for embedding, cluster_ids, caption in (
    (x_train_pca, train_cluster_labels_pca, 'PCA Embeddings'),
    (x_train_tsne, train_cluster_labels_tsne, 'T-SNE Embeddings'),
    (x_reduced, train_cluster_labels_isomap_downsampled, 'Isomap Embeddings'),
    (x_train_lle, train_cluster_labels_lle, 'LLE Embeddings'),
):
    plot_embeddings(embedding, cluster_ids, caption)

# Silhouette Scores and Plots
def plot_silhouette(embeddings, labels, title):
    """Draw a per-cluster silhouette plot for a clustering of ``embeddings``.

    Parameters:
    - embeddings: array-like (n_samples, n_components) the clustering lives in
    - labels: array-like of cluster labels; -1 marks noise and is excluded
    - title: technique name inserted into the plot title

    The average score is the mean of the per-sample silhouette values, so the
    pairwise-distance computation runs once instead of twice.
    """
    # Accept plain lists as well as arrays: the boolean mask needs arrays.
    labels = np.asarray(labels)
    embeddings = np.asarray(embeddings)

    # Exclude noise points for silhouette calculation
    valid_indices = labels != -1
    valid_embeddings = embeddings[valid_indices]
    valid_labels = labels[valid_indices]

    sample_silhouette_values = silhouette_samples(valid_embeddings, valid_labels)
    silhouette_avg = np.mean(sample_silhouette_values)

    plt.figure(figsize=(10, 7))
    y_lower = 10
    for i in np.unique(valid_labels):
        # Sorted values give each cluster its usual "knife" outline.
        ith_cluster_silhouette_values = np.sort(sample_silhouette_values[valid_labels == i])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        plt.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values)
        plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Leave a 10-row gap before the next cluster's band.
        y_lower = y_upper + 10

    plt.title(f'Silhouette plot for {title} (avg score: {silhouette_avg:.2f})')
    plt.xlabel("Silhouette coefficient values")
    plt.ylabel("Cluster label")
    plt.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.tight_layout()
    plt.show()

plot_silhouette(x_train_pca, train_cluster_labels_pca, 'PCA')
plot_silhouette(x_train_tsne, train_cluster_labels_tsne, 'T-SNE')
plot_silhouette(x_reduced, train_cluster_labels_isomap_downsampled, 'Isomap')
plot_silhouette(x_train_lle, train_cluster_labels_lle, 'LLE')

In [None]:
# Display results in a table.
# `results` maps technique -> {metric: value}, so build one row per technique
# (orient="index") and expose the technique name as a regular column.
# Selecting columns by metric name directly from the dict-of-dicts would match
# none of its keys and leave the table empty.
metric_columns = ["Trustworthiness", "Neighborhood Preservation", "Silhouette Score"]
results_df = (
    pd.DataFrame.from_dict(results, orient="index")
    .reindex(columns=metric_columns)
    .rename_axis("Technique")
    .reset_index()
)

display(results_df)
# tools.display_dataframe_to_user(name="Dimensionality Reduction Metrics Comparison", dataframe=results_df)

In [None]:
def plot_pairwise_distances(embeddings, title, sample_size=1000):
    """Plot a heatmap of pairwise distances between (a sample of) embedding rows.

    Parameters:
    - embeddings: array (n_samples, n_components)
    - title: technique name inserted into the plot title
    - sample_size: maximum number of rows to use; larger inputs are randomly
      subsampled without replacement to keep the distance matrix small
    """
    # Sample data to avoid memory issues
    if len(embeddings) > sample_size:
        indices = np.random.choice(len(embeddings), sample_size, replace=False)
        embeddings_sampled = embeddings[indices]
    else:
        embeddings_sampled = embeddings

    # The file imports sklearn's pairwise_distances under this alias; the
    # unaliased name is not brought into scope by the import cell.
    distances = sklearn_pairwise_distances(embeddings_sampled)
    plt.figure(figsize=(10, 7))
    sns.heatmap(distances, cmap='viridis', cbar_kws={'label': 'Pairwise Distance'})
    plt.title(f'Pairwise Distance Heatmap for {title}')
    plt.xlabel('Sample Points')
    plt.ylabel('Sample Points')
    plt.tight_layout()
    plt.show()

plot_pairwise_distances(x_train_pca, 'PCA')
plot_pairwise_distances(x_train_tsne, 'T-SNE')
plot_pairwise_distances(x_reduced, 'Isomap')
plot_pairwise_distances(x_train_lle, 'LLE')

In [None]:
# Load embeddings and cluster labels
x_train_pca = np.load('pca_embeddings.npy')
train_cluster_labels_pca = np.load('kmeans_labels_pca.npy')

x_train_tsne = np.load('tsne_embeddings.npy')
# train_cluster_labels_tsne = np.load('kmeans_labels_tsne.npy')

x_reduced_isomap = np.load('isomap_embedding_subsets_evenly_downsampled.npy')
train_cluster_labels_isomap = np.load('kmeans_labels_isomap_evenly_downsampled.npy')

x_train_lle = np.load('lle_embeddings.npy')
train_cluster_labels_lle = np.load('kmeans_labels_lle.npy')

x_train_umap = np.load('x_train_umap.npy')

# Define the embeddings and labels: method name -> (embedding, hue values)
methods = {
    'PCA': (x_train_pca, train_cluster_labels_pca),
    'Isomap': (x_reduced_isomap, train_cluster_labels_isomap),
    'LLE': (x_train_lle, train_cluster_labels_lle),
    # NOTE(review): the 'MDS' entry reuses the LLE embedding and labels, so this
    # panel duplicates the LLE panel. Point it at the MDS results or drop it.
    'MDS': (x_train_lle, train_cluster_labels_lle),
    # NOTE(review): t-SNE and UMAP are coloured by y_train while the entries
    # above use stored K-Means labels; confirm the mix is intended.
    't-SNE': (x_train_tsne, y_train),
    'UMAP': (x_train_umap, y_train)
}

# Create a grid of subplots (one row per method)
fig, axes = plt.subplots(len(methods), 1, figsize=(10, 18))  # Adjust for clearer layout
fig.tight_layout(pad=6.0)

# Define the label names (digits 0-9)
label_names = [f"Digit {i}" for i in range(10)]

for i, (ax, (method, (embedding, labels))) in enumerate(zip(axes, methods.items())):
    scatter = sns.scatterplot(
        x=embedding[:, 0], 
        y=embedding[:, 1], 
        hue=labels, 
        palette='tab10', 
        s=15,  # Increased for better visibility
        ax=ax,
        legend=(i == 0)  # Add legend only to the first plot
    )
    ax.set_title(f'{method} Embeddings', fontsize=14, pad=10, loc='center')

    # Hide x and y axis ticks and labels
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    
    # Set equal aspect ratio for symmetry
    ax.set_aspect('equal')

    # Customize the legend for the first plot
    if i == 0:
        # Relabel the legend entries with the "Digit i" names defined above.
        handles, _ = scatter.get_legend_handles_labels()
        ax.legend(handles, label_names, title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
    else:
        # NOTE(review): these panels were drawn with legend=False, so this
        # creates a legend only to remove it again; probably unnecessary.
        ax.legend().remove()

# Add a super title for all plots
# plt.suptitle('Dimensionality Reduction Methods on MNIST', y=1, fontsize=14)
plt.show()

------

# UMAP Experiments

## n_neighbors Trials

**Code Behavior:**
- *UMAP and KMeans Runs:* It performs 35 runs, applying UMAP for dimensionality reduction and KMeans for clustering.
- *Centroid Calculation:* After each run, it calculates the centroids of the KMeans clusters and stores them.
- *Mean and Standard Deviation Calculation:* Once all runs are complete, it calculates the mean and standard deviation of the centroids across the runs.
- *Save Results:* The UMAP projections and KMeans centroids are saved as .npy files.

### n_neighbors = 5, n_runs = 35, n_clusters = 10 (for KMeans)

In [2]:
# Load the saved artefacts of the n_neighbors = 5 analysis (35 runs)

# Per-run projections and centroids, plus the across-run centroid statistics
umap_projections_5 = np.load('umap_projections_neighbors_5.npy')
kmeans_centroids_5 = np.load("kmeans_centroids_neighbors_5.npy")
centroid_mean_5_35 = np.load('centroid_mean_5_35.npy')
centroid_std_5_35 = np.load('centroid_std_5_35.npy')

# Per-trial table
df_results_v2 = pd.read_csv('result_table_neighbors_v2_5_35.csv')

# Distance matrices between centroids (raw and normalized)
mean_distance_matrix_5_35 = np.load('mean_distance_matrix_neighbors_5_35.npy')
normalized_mean_distance_matrix_5_35 = np.load('normalized_mean_distance_matrix_5_35.npy')
distance_matrix_std_5_35 = np.load("distance_matrix_std_5_35.npy")
normalized_distance_matrix_std_5_35 = np.load('normalized_distance_matrix_std_5_35.npy')

# Minimum spanning trees
# NOTE(review): the mst_*.npy files are written further down by passing
# networkx graphs to np.save; confirm np.load returns something usable here.
mst_std_5_35 = np.load('mst_std_5_35.npy')
mst_5_35 = np.load('mst_5_35.npy')

In [None]:
### NO NEED TO RE RUN ###

# Experiment settings
n_runs = 35       # number of UMAP + KMeans repetitions
n_clusters = 10   # number of clusters (for KMeans)
n_neighbors = 5   # UMAP n_neighbors value under test

# Per-run outputs
umap_projections = []
kmeans_centroids_list = []


def calculate_centroids(kmeans, x_umap):
    """Return the mean position of the points assigned to each cluster.

    Row ``i`` of the result is the mean of the rows of ``x_umap`` whose
    ``kmeans.labels_`` entry equals ``i``. Reads the module-level
    ``n_clusters``.
    """
    return np.array([
        np.mean(x_umap[kmeans.labels_ == cluster_id], axis=0)
        for cluster_id in range(n_clusters)
    ])


# NOTE(review): cluster index i is assigned independently by KMeans in every
# run; confirm the indices are comparable across runs before averaging them.
for run in range(n_runs):
    # random_state=None lets every UMAP fit differ from the previous one
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1, n_components=2, random_state=None)
    x_train_umap = umap_model.fit_transform(x_train_flattened)

    # Cluster the 2-D projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_train_umap)

    # Keep this run's projection and centroids
    centroids = calculate_centroids(kmeans, x_train_umap)
    umap_projections.append(x_train_umap)
    kmeans_centroids_list.append(centroids)

# Stack the per-run centroids: shape (n_runs, n_clusters, 2)
kmeans_centroids = np.array(kmeans_centroids_list)

# Across-run mean and standard deviation of every centroid coordinate
centroid_mean = kmeans_centroids.mean(axis=0)
centroid_std = kmeans_centroids.std(axis=0)

# Save the UMAP projections and KMeans centroids
np.save(f'umap_projections_neighbors_{n_neighbors}.npy', np.array(umap_projections))
np.save(f'kmeans_centroids_neighbors_{n_neighbors}.npy', kmeans_centroids)

In [None]:
### NO NEED TO RE RUN ###

# Persist the across-run centroid statistics; file names carry n_neighbors.
for stat_name, stat_values in (('mean', centroid_mean), ('std', centroid_std)):
    np.save(f'centroid_{stat_name}_{n_neighbors}_35.npy', np.array(stat_values))

------------------------

Standard deviation calculation

In [None]:
### NO NEED TO RE RUN ###
# Standard deviation of each cluster's centroid coordinates over all runs.
# kmeans_centroids_5 is indexed as [run, cluster, coordinate].
cluster_ids = range(10)

# dtype=float keeps the results in float64, as the zero-initialised arrays did.
std_dev_x = np.array(
    [np.std(kmeans_centroids_5[:, cluster_id, 0]) for cluster_id in cluster_ids],
    dtype=float,
)
std_dev_y = np.array(
    [np.std(kmeans_centroids_5[:, cluster_id, 1]) for cluster_id in cluster_ids],
    dtype=float,
)

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:
### NO NEED TO RE RUN ###
# Rows of the result table: [trial, cluster, centroid, inside-2-std flag]
data_v2 = []

for trial in range(35):
    for cluster in range(10):
        # Centroid of this cluster in this trial
        centroid_coord = kmeans_centroids_5[trial, cluster]

        # Half-width of the +/- 2 standard deviation band around the mean
        mean_x, mean_y = centroid_mean_5_35[cluster]
        half_width_x = 2 * std_dev_x[cluster]
        half_width_y = 2 * std_dev_y[cluster]

        # The centroid counts as "inside" only if both coordinates are in band
        x_in_band = mean_x - half_width_x <= centroid_coord[0] <= mean_x + half_width_x
        y_in_band = mean_y - half_width_y <= centroid_coord[1] <= mean_y + half_width_y
        inside_2_std = x_in_band and y_in_band

        # Trials are numbered from 1 in the table
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Create a DataFrame from the list of data
result_columns = ['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std']
df_results_v2 = pd.DataFrame(data_v2, columns=result_columns)

In [None]:
### NO NEED TO RE RUN ###
# For every trial, reduce its 'Inside 2 std' flags to a single boolean:
# True only if every cluster of that trial is inside the 2-std band.
# The result is a Series indexed by 'Trial'.
trials_all_true = df_results_v2.groupby('Trial')['Inside 2 std'].all()

In [None]:
### NO NEED TO RE RUN ###
# Keep the trial numbers whose aggregated flag is True, i.e. trials where
# all clusters were inside the 2-std band.
trials_with_all_true = trials_all_true[trials_all_true].index.tolist()

In [None]:
### NO NEED TO RE RUN ###
# Print the trial numbers in which every cluster was inside the 2-std band
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
### NO NEED TO RE RUN ###
# Complement of the previous selection: trial numbers where at least one
# cluster fell outside the 2-std band.
trials_with_some_false = trials_all_true[~trials_all_true].index.tolist()

# Output the list of trials where some clusters were False
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:
### NO NEED TO RE RUN ###
# Save the result table to a CSV file (without the row index).
# NOTE(review): 'Centroid Coord' holds NumPy arrays, which a CSV can only
# store as text; reading the file back yields strings in that column.
df_results_v2.to_csv(f'result_table_neighbors_v2_{5}_35.csv', index=False)

-----------------

Outlier removal process

In [19]:
# Tabulate the mean centroids: one row per cluster with its mean x / y
# position and the cluster id (0-9) as an explicit column.
centroid_mean_neighbors_5_df = pd.DataFrame(
    centroid_mean_5_35, columns=['x_mean', 'y_mean']
).assign(Cluster=np.arange(10))

In [20]:
def _parse_centroid_coord(value):
    """Return a centroid as a NumPy array, accepting its CSV text form too."""
    if isinstance(value, str):
        # A CSV round trip turns an array into text such as "[ 1.23 -4.56]".
        tokens = value.strip().strip('[]').replace(',', ' ').split()
        return np.array(tokens, dtype=float)
    return np.asarray(value)


# Normalise 'Centroid Coord' so it holds numeric [x, y] pairs whether
# df_results_v2 was built in memory or reloaded with pd.read_csv. Without
# this, the x/y split below fails on the string values read from the CSV.
df_results_v2['Centroid Coord'] = pd.Series(
    [_parse_centroid_coord(value) for value in df_results_v2['Centroid Coord']],
    index=df_results_v2.index,
    dtype=object,
)

# Extract x and y coordinates
df_results_v2[['x', 'y']] = pd.DataFrame(df_results_v2['Centroid Coord'].tolist(), index=df_results_v2.index)

# Merge the mean centroids dataframe with the results dataframe on 'Cluster'
# (left join: every trial/cluster row is kept and gains x_mean / y_mean).
df_merged = pd.merge(df_results_v2, centroid_mean_neighbors_5_df, on='Cluster', how='left')

In [None]:
# Plot how each cluster's centroid moves across runs: first every
# X-coordinate figure, then every Y-coordinate figure.
n_runs = 35
n_clusters = 10
n_neighbors = 5

run_numbers = range(1, n_runs + 1)

for coord_index, coord_name, line_color in ((0, 'X', 'b'), (1, 'Y', 'g')):
    for cluster in range(n_clusters):
        plt.figure(figsize=(10, 6))
        plt.plot(
            run_numbers,
            kmeans_centroids_5[:, cluster, coord_index],
            marker='o',
            linestyle='-',
            color=line_color,
        )
        plt.title(f'Cluster {cluster} {coord_name}-coordinate Change Across Runs')
        plt.xlabel('Run')
        plt.ylabel(f'{coord_name} Centroid Coordinate')
        plt.grid(True)
        plt.show()

In [None]:
# Calculate Euclidean distance from each centroid to its cluster's mean
df_merged['Distance_to_Mean'] = np.sqrt((df_merged['x'] - df_merged['x_mean'])**2 + (df_merged['y'] - df_merged['y_mean'])**2)

# Apply an outlier threshold (e.g., 90th percentile of the distance per cluster)
def filter_outliers(df):
    """Keep the rows of ``df`` whose 'Distance_to_Mean' is at or below the 90th percentile of that column."""
    threshold = np.percentile(df['Distance_to_Mean'], 90)
    return df[df['Distance_to_Mean'] <= threshold]

# Apply the filtering function for each cluster.
# The threshold is computed separately inside every 'Cluster' group.
df_no_outliers = df_merged.groupby('Cluster').apply(filter_outliers).reset_index(drop=True)

# Drop the helper columns that were only needed to compute the distance
df_no_outliers_cleaned = df_no_outliers.drop(columns=['x', 'y', 'x_mean', 'y_mean'])

# Compare the table size before and after filtering
print(f"Original DataFrame size: {df_merged.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned.shape}")

# Display the final dataframe to the user (notebook cell output)
df_no_outliers_cleaned

In [26]:
# One group of surviving rows per cluster
clusters_grouped = df_no_outliers_cleaned.groupby('Cluster')

# Map every cluster id to an array built from its 'Centroid Coord' entries
# (assumed to be [x, y] pairs).
clusters_centroids = {
    cluster: np.array(group['Centroid Coord'].tolist())
    for cluster, group in clusters_grouped
}

In [None]:
# Count the centroids kept for each cluster and report the count as we go.
cluster_sizes = {}
for cluster, centroids in clusters_centroids.items():
    size = len(centroids)
    cluster_sizes[cluster] = size
    print(f"Cluster {cluster} has {size} centroids considered.")

--------

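Check whether it is expected that every cluster keeps the same number of centroids after the outliers are filtered out. Because the threshold is a percentile computed separately for each cluster, every cluster retains roughly the same fraction of its centroids by construction, so equal counts are expected whenever all clusters start with the same number of runs. Other factors that could contribute:
- The distance distributions are likely very similar
- Uniform data structure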

In [None]:
# One histogram per cluster of the centroid-to-mean distances
for cluster, group in clusters_grouped:
    distances = group['Distance_to_Mean']

    plt.figure(figsize=(10, 5))
    plt.hist(distances, bins=10, edgecolor='black')
    plt.title(f'Cluster {cluster}: Distance to Mean Distribution')
    plt.xlabel('Distance to Mean')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
# Percentile threshold per cluster check
def check_percentiles(df):
    """Return the 90th percentile of ``df['Distance_to_Mean']``."""
    return np.percentile(df['Distance_to_Mean'], 90)

# Report the threshold of every cluster
for cluster, group in clusters_grouped:
    threshold = check_percentiles(group)
    print(f"Cluster {cluster}: 90th percentile threshold = {threshold}")

In [None]:
# Repeat the filtering with a stricter 70th-percentile threshold and report
# how many centroids each cluster keeps.
for cluster, group in clusters_grouped:
    distances = group['Distance_to_Mean']

    # Per-cluster threshold
    threshold = np.percentile(distances, 70)

    # Rows at or below the threshold survive
    filtered_group = group[distances <= threshold]

    print(f"Cluster {cluster}: Original size = {len(group)}, Filtered size = {len(filtered_group)}")

--------

#### Distance matrix n=5

##### Distance Mean matrix
Element $d_{ij}$ is the distance between the centers of cluster $i$ and cluster $j$.

In [None]:
# Centroid-to-centroid Euclidean distance matrix of every run
distance_matrices_5_35 = []
for run_centroids in kmeans_centroids_5:
    distance_matrix_5_35 = cdist(run_centroids, run_centroids, metric='euclidean')
    distance_matrices_5_35.append(distance_matrix_5_35)

# Stack into one array (35 runs, 10x10 distance matrices)
distance_matrices_5_35 = np.array(distance_matrices_5_35)

# Average the matrices element-wise over the runs
mean_distance_matrix_5_35 = distance_matrices_5_35.mean(axis=0)

# Min-max normalize the mean distance matrix
mean_min = np.min(mean_distance_matrix_5_35)
mean_max = np.max(mean_distance_matrix_5_35)
normalized_mean_distance_matrix = (mean_distance_matrix_5_35 - mean_min) / (mean_max - mean_min)

# Heatmap of the normalized mean distance matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the per-run matrices and their mean
for file_name, matrix in (
    ('distance_matrices_neighbors_5_all_runs.npy', distance_matrices_5_35),
    ('mean_distance_matrix_neighbors_5_35.npy', mean_distance_matrix_5_35),
):
    np.save(file_name, matrix)

# Mean distance matrix
print(f"Mean distance matrix across all runs:\n{mean_distance_matrix_5_35}")

**MST Analysis**

In [10]:
# Reload the mean centroid-distance matrix saved by the previous section.
mean_distance_matrix_5_35 = np.load('mean_distance_matrix_neighbors_5_35.npy')
# mean_distance_matrix_5_35=np.round(mean_distance_matrix_5_35,3)

In [None]:
# Min-max normalize the mean distance matrix and store the result
lowest = np.min(mean_distance_matrix_5_35)
highest = np.max(mean_distance_matrix_5_35)
normalized_mean_distance_matrix_5_35 = (mean_distance_matrix_5_35 - lowest) / (highest - lowest)
np.save('normalized_mean_distance_matrix_5_35.npy', normalized_mean_distance_matrix_5_35)

In [None]:
# Create a weighted graph from the normalized mean distance matrix
# (weights rounded to three decimals).
G_5_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_5_35, 3))

# np.save stores arrays, not networkx graphs, so persist the graph as its
# weighted adjacency matrix; nx.from_numpy_array(np.load(...)) rebuilds it.
np.save('G_5_35.npy', nx.to_numpy_array(G_5_35))

# Draw the graph
pos = nx.spring_layout(G_5_35, seed=42)  # positions for all nodes
nx.draw(G_5_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_5_35, 'weight')
nx.draw_networkx_edge_labels(G_5_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_5_35 = nx.minimum_spanning_tree(G_5_35)

# np.save stores arrays, not networkx graphs, so persist the tree as its
# weighted adjacency matrix; nx.from_numpy_array(np.load(...)) rebuilds it.
np.save('mst_5_35.npy', nx.to_numpy_array(mst_5_35))

# Define positions for all nodes
pos = nx.spring_layout(mst_5_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_5_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_5_35, 'weight')
nx.draw_networkx_edge_labels(mst_5_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST - n_neighbors=5")
plt.show()

##### Distance Std. dev. Matrix

In [None]:
# Pairwise Euclidean distances between the clusters' centroid standard deviations
distance_matrix_std_5_35 = cdist(centroid_std_5_35, centroid_std_5_35, metric='euclidean')

# Min-max normalize the distance matrix
std_min = np.min(distance_matrix_std_5_35)
std_max = np.max(distance_matrix_std_5_35)
normalized_distance_matrix_std_5_35 = (distance_matrix_std_5_35 - std_min) / (std_max - std_min)

# Heatmap of the normalized matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    normalized_distance_matrix_std_5_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=5)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.tight_layout()
plt.show()

# Save both the raw and the normalized matrix for later analysis
for file_name, matrix in (
    ("distance_matrix_std_5_35.npy", distance_matrix_std_5_35),
    ("normalized_distance_matrix_std_5_35.npy", normalized_distance_matrix_std_5_35),
):
    np.save(file_name, matrix)


In [None]:
# Create a weighted graph from the normalized std-deviation distance matrix
# (weights rounded to three decimals).
G_std_5_35 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_5_35, 3))

# np.save stores arrays, not networkx graphs, so persist the graph as its
# weighted adjacency matrix; nx.from_numpy_array(np.load(...)) rebuilds it.
np.save('G_std_5_35.npy', nx.to_numpy_array(G_std_5_35))

# Draw the graph
pos = nx.spring_layout(G_std_5_35, seed=42)  # positions for all nodes
nx.draw(G_std_5_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_std_5_35, 'weight')
nx.draw_networkx_edge_labels(G_std_5_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_std_5_35 = nx.minimum_spanning_tree(G_std_5_35)

# np.save stores arrays, not networkx graphs, so persist the tree as its
# weighted adjacency matrix; nx.from_numpy_array(np.load(...)) rebuilds it.
np.save('mst_std_5_35.npy', nx.to_numpy_array(mst_std_5_35))

# Define positions for all nodes
pos_std_5_35 = nx.spring_layout(mst_std_5_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_std_5_35, pos_std_5_35, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels_std_5_35 = nx.get_edge_attributes(mst_std_5_35, 'weight')
nx.draw_networkx_edge_labels(mst_std_5_35, pos_std_5_35, edge_labels=edge_labels_std_5_35, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - n_neighbors=5")
plt.show()

MSTs for the lower and upper limits

In [None]:
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    ``matrix`` is rounded to three decimals and read as a weighted adjacency
    matrix; the tree's edges are drawn in ``color`` and labelled with their
    weights, and ``title`` becomes the axes title.
    """
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed so the layout is the same on every call
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Edge labels (distances)
    weights = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weights, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)


# Three panels: mean, lower-limit and upper-limit matrices
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

plot_mst(normalized_mean_distance_matrix_5_35, "MST - Mean Distances", axes[0], color='red')
plot_mst(norm_lower_limit_intconf_matrix_5_35, "MST - Lower Limit", axes[1], color='blue')
plot_mst(norm_upper_limit_intconf_matrix_5_35, "MST - Upper Limit", axes[2], color='green')

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

In [None]:
# Create a weighted graph from the normalized std-deviation distance matrix
# (weights rounded to three decimals).
G_std_5_35 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_5_35, 3))

# np.save stores arrays, not networkx graphs, so persist the graph as its
# weighted adjacency matrix; nx.from_numpy_array(np.load(...)) rebuilds it.
np.save('G_std_5_35.npy', nx.to_numpy_array(G_std_5_35))

# Draw the graph
pos = nx.spring_layout(G_std_5_35, seed=42)  # positions for all nodes
nx.draw(G_std_5_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_std_5_35, 'weight')
nx.draw_networkx_edge_labels(G_std_5_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_std_5_35 = nx.minimum_spanning_tree(G_std_5_35)

# np.save stores arrays, not networkx graphs, so persist the tree as its
# weighted adjacency matrix; nx.from_numpy_array(np.load(...)) rebuilds it.
np.save('mst_std_5_35.npy', nx.to_numpy_array(mst_std_5_35))

# Define positions for all nodes
pos_std_5_35 = nx.spring_layout(mst_std_5_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_std_5_35, pos_std_5_35, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels_std_5_35 = nx.get_edge_attributes(mst_std_5_35, 'weight')
nx.draw_networkx_edge_labels(mst_std_5_35, pos_std_5_35, edge_labels=edge_labels_std_5_35, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - n_neighbors=5")
plt.show()

In [None]:
def plot_heatmap(matrix, title, xlabel, ylabel, figsize=(10, 8), cmap="viridis", annot=True):
    """
    Draw one matrix as a seaborn heatmap in a new figure and show it.

    Args:
        matrix (ndarray): 2D matrix to render.
        title (str): Figure title.
        xlabel (str): x-axis label.
        ylabel (str): y-axis label.
        figsize (tuple): Figure size (default: (10, 8)).
        cmap (str): Color map name (default: "viridis").
        annot (bool): Write each cell's value inside it (default: True).
    """
    heatmap_options = dict(annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5)

    plt.figure(figsize=figsize)
    sns.heatmap(matrix, **heatmap_options)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.show()


# Render both heatmaps with the same axis labels and figure size
for heatmap_matrix, heatmap_title in (
    (
        normalized_distance_matrix_std_5_35,
        "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=5)",
    ),
    (
        normalized_mean_distance_matrix_5_35,
        "Normalized Mean Distance Matrix (k=10, n_neighbors=5)",
    ),
):
    plot_heatmap(
        heatmap_matrix,
        title=heatmap_title,
        xlabel="Cluster",
        ylabel="Cluster",
        figsize=(8, 6),
    )

In [None]:
# Define a function to create heatmaps
def plot_heatmaps_side_by_side(matrices, titles, figsize=(16, 8), cmap="viridis", annot=True):
    """
    Plots multiple heatmaps side by side for given matrices and titles.

    Matrices and titles are paired in order; if the two lists differ in
    length, the surplus entries of the longer one are ignored.

    Args:
        matrices (list): List of 2D matrices to plot as heatmaps.
        titles (list): List of titles corresponding to each matrix.
        figsize (tuple): Size of the entire figure (default: (16, 8)).
        cmap (str): Color map to use for all heatmaps (default: "viridis").
        annot (bool): Whether to annotate cells with their values (default: True).

    Raises:
        ValueError: If ``matrices`` is empty.
    """
    n = len(matrices)  # Number of heatmaps
    if n == 0:
        raise ValueError("matrices must contain at least one matrix")

    # squeeze=False always returns a 2D array of axes, so a single heatmap
    # (n == 1) is indexable just like several of them.
    fig, axes = plt.subplots(1, n, figsize=figsize, squeeze=False)

    for i, (ax, matrix, title) in enumerate(zip(axes[0], matrices, titles)):
        sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Cluster")
        ax.set_ylabel("Cluster" if i == 0 else "")  # Only label y-axis for the first plot

    plt.tight_layout()
    plt.show()

# Show the std-deviation heatmap (left) next to the mean heatmap (right)
plot_heatmaps_side_by_side(
    [normalized_distance_matrix_std_5_35, normalized_mean_distance_matrix_5_35],
    [
        "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=5)",
        "Normalized Mean Distance Matrix (k=10, n_neighbors=5)",
    ],
)

- Mean Matrix --> spatial distances --> **position** of clusters in UMAP.
- Std. Deviation --> variability differences --> focuses on **stability**.

The standard deviation heatmap complements the mean matrix by showing not just where clusters are located in the embedding but also how consistently they are represented across multiple runs. Together, they provide a complete picture of spatial relationships and stability in your UMAP analysis.

**Why Look for Low Values in Both Graphs?**
- Spatial Proximity (Mean Matrix):

    Clusters that are spatially close (low values in the mean matrix) may share relationships in the original data, such as being part of the same neighborhood or having similar data features.
- Similar Variability (Std Deviation Matrix):

    Clusters with low variability differences (low values in the std deviation matrix) are consistently represented across multiple UMAP runs. This indicates that their behavior is robust and not affected by randomness or parameter sensitivity.

*Clusters that are spatially close and consistently represented likely:*

- Belong to the same data manifold.
- Represent well-defined groups in your dataset.
- Are stable and reliable in your UMAP projection.

**What Do Low-Low Clusters Tell Us?**
- Stable Relationships:

    If two clusters have low values in both graphs, their relationship is stable and meaningful across runs.
    They might share similar structures or characteristics in the data.
- Robustness:

    Clusters that show low variability are less sensitive to UMAP parameters, making them more reliable for downstream tasks (e.g., classification, clustering).
- Potential for Merging:

    If two clusters have consistently low distances in both graphs, they might represent subclusters of the same group. This could indicate that they can be merged into a single cluster, depending on your analysis goals.

In [None]:
# Cut-off below which a normalized value counts as "low"
threshold = 0.65

# Boolean mask of cells that are low in BOTH matrices; the diagonal
# (a cluster compared with itself) is excluded.
_low_low_mask = (normalized_mean_distance_matrix_5_35 < threshold) & (
    normalized_distance_matrix_std_5_35 < threshold
)
np.fill_diagonal(_low_low_mask, False)

# np.argwhere lists the selected cells row by row, so the ordered pairs come
# out in the same sequence as a nested i/j scan. Both (i, j) and (j, i) are
# reported when both cells qualify.
low_low_pairs = [
    (
        int(i),
        int(j),
        normalized_mean_distance_matrix_5_35[i, j],
        normalized_distance_matrix_std_5_35[i, j],
    )
    for i, j in np.argwhere(_low_low_mask)
]

# Display the results
for cluster_i, cluster_j, mean_value, std_value in low_low_pairs:
    print(f"Clusters {cluster_i} and {cluster_j}: Mean Distance = {mean_value:.2f}, Std Distance = {std_value:.2f}")

0.65 can seem like a high value, since it lies in the upper-mid range of the normalized scale.

Depending on the goal of the analysis we can think of it as:
- If the aim is to identify the strongest relationships between clusters, a lower threshold would make more sense.
- If we want to explore the broader connections, then it is fine.

In [None]:
# Hand-picked cluster pairs as (cluster_a, cluster_b, mean value, std value).
# This overrides any low_low_pairs computed earlier; replace with your own pairs.
low_low_pairs = [
    (0, 9, 0.64, 0.51),
    (1, 7, 0.65, 0.30),
    (1, 8, 0.62, 0.50),
]

# Stored UMAP projections and the matching KMeans labels (replace with your actual data)
kmeans_labels = np.load("kmeans_labels_list_5_35.npy")  # Shape: (n_runs, n_samples)
umap_projections = np.load("umap_projections_neighbors_5.npy")

# Scatter plot of two clusters from one UMAP run
def plot_clusters(umap_projection, labels, cluster_pair, run_idx):
    """
    Scatter the points of two clusters of one projection in a new figure.

    Args:
        umap_projection (ndarray): 2D coordinates, one row per sample.
        labels (ndarray): Cluster label of each row of ``umap_projection``.
        cluster_pair (tuple): The two cluster labels ``(cluster_a, cluster_b)``.
        run_idx (int): Run number, used only in the figure title.
    """
    cluster_a, cluster_b = cluster_pair

    # Select the member points first (first cluster in blue, second in orange)
    layers = [
        (cluster_id, color, umap_projection[labels == cluster_id])
        for cluster_id, color in ((cluster_a, "blue"), (cluster_b, "orange"))
    ]

    plt.figure(figsize=(8, 6))
    for cluster_id, color, members in layers:
        plt.scatter(members[:, 0], members[:, 1], color=color, label=f"Cluster {cluster_id}", alpha=0.6)

    plt.title(f"Run {run_idx}: Cluster {cluster_a} vs. Cluster {cluster_b}")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Plot every listed pair and print simple per-run statistics for it
for cluster_pair in low_low_pairs:
    cluster_a, cluster_b = cluster_pair[0], cluster_pair[1]
    print(f"Analyzing Cluster Pair: {cluster_a} and {cluster_b}")

    # Only one run is inspected here: the first one
    run_idx = 0
    run_projection = umap_projections[run_idx]
    run_labels = kmeans_labels[run_idx]
    plot_clusters(run_projection, run_labels, (cluster_a, cluster_b), run_idx)

    # Member points of each cluster in this run
    members_a = run_projection[run_labels == cluster_a]
    members_b = run_projection[run_labels == cluster_b]

    # Euclidean distance between the two cluster centroids
    distances_a_to_b = np.linalg.norm(members_a.mean(axis=0) - members_b.mean(axis=0))
    print(f"Mean Centroid Distance (Run {run_idx}): {distances_a_to_b:.2f}")

    # Per-dimension spread of each cluster
    cluster_a_std = np.std(members_a, axis=0)
    cluster_b_std = np.std(members_b, axis=0)
    print(f"Cluster {cluster_a} Std Dev: {cluster_a_std}")
    print(f"Cluster {cluster_b} Std Dev: {cluster_b_std}")
    print("\n")

**Cluster 1 and Cluster 8** have a moderate spatial relationship with visible overlap in the UMAP space. Their differing variability patterns suggest distinct structures, but the overlap points might represent shared features or transitions between the clusters.
The large spatial separation between their centroids suggests they represent distinct structures or classes in the data.

**Cluster 0 and Cluster 9**: Cluster 9 appears more compact and stable, while Cluster 0 is larger and more variable.
Their distinct regions in the UMAP space and differing standard deviations reinforce their meaningful separation.
Insights from Variability:

The variability of Cluster 0 could indicate sensitivity to UMAP parameters or noise in the data.

In [None]:
def examine_mnist_overlap(umap_projection, kmeans_labels, mnist_labels, cluster_pair, run_idx):
    """
    Report and plot the samples shared by two KMeans clusters of one run.

    The "overlap" is the set of sample indices that carry both cluster labels.
    Its size and the distribution of the original labels of those samples are
    printed, then both clusters (and the overlap, if any) are scattered.

    NOTE(review): if ``kmeans_labels`` holds exactly one label per sample, the
    two index sets are disjoint whenever cluster_a != cluster_b, so the overlap
    is empty. Confirm whether a spatial notion of overlap was intended.

    Args:
        umap_projection (ndarray): 2D coordinates, one row per sample.
        kmeans_labels (ndarray): Cluster label of each sample for this run.
        mnist_labels (array-like): Original label of each sample.
        cluster_pair (tuple): The two cluster labels ``(cluster_a, cluster_b)``.
        run_idx (int): Run number, used only in printed text and the title.
    """
    cluster_a, cluster_b = cluster_pair

    # Sample indices belonging to each cluster
    points_a_indices = (kmeans_labels == cluster_a).nonzero()[0]
    points_b_indices = (kmeans_labels == cluster_b).nonzero()[0]

    # Indices present in both clusters, and how their original labels are distributed
    overlap_indices = np.intersect1d(points_a_indices, points_b_indices)
    overlap_label_counts = pd.Series(np.asarray(mnist_labels)[overlap_indices]).value_counts()

    # Text summary
    print(f"Overlap between Cluster {cluster_a} and Cluster {cluster_b} (Run {run_idx}):")
    print(f"Number of overlapping points: {len(overlap_indices)}")
    print(f"Original label distribution of overlapping points:\n{overlap_label_counts}")

    # Scatter both clusters, then highlight the overlap on top
    plt.figure(figsize=(8, 6))
    for member_indices, color, cluster_id in (
        (points_a_indices, "blue", cluster_a),
        (points_b_indices, "orange", cluster_b),
    ):
        plt.scatter(
            umap_projection[member_indices, 0],
            umap_projection[member_indices, 1],
            color=color,
            label=f"Cluster {cluster_id}",
            alpha=0.5,
        )
    if len(overlap_indices) > 0:
        plt.scatter(umap_projection[overlap_indices, 0], umap_projection[overlap_indices, 1], color="red", label="Overlap", alpha=0.7)

    plt.title(f"Cluster Overlap: Cluster {cluster_a} vs. Cluster {cluster_b} (Run {run_idx})")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Example usage:
# Load your MNIST data
dataloader = MnistDataloader(
    training_images_filepath="train-images.idx3-ubyte",
    training_labels_filepath="train-labels.idx1-ubyte",
    test_images_filepath="t10k-images.idx3-ubyte",
    test_labels_filepath="t10k-labels.idx1-ubyte"
)

# Load data
(x_train, y_train), (x_test, y_test) = dataloader.load_data()

# Flatten the training images for UMAP (if needed for alignment with projections)
x_train_flattened = np.array([np.array(img).flatten() for img in x_train])

# Example variables (replace these with your actual data)
run_idx = 0  # Analyze the first UMAP run
cluster_pair = (1, 8)  # Compare Cluster 1 and Cluster 8
umap_projection = umap_projections[run_idx]  # UMAP projection for the given run

# Keep the labels of the selected run under their own name. Rebinding
# `kmeans_labels` itself would replace the (n_runs, n_samples) array with a
# single run, so re-running this cell (or the pair-analysis cell) would then
# index into the wrong thing.
kmeans_labels_run = kmeans_labels[run_idx]  # KMeans labels for the given run

# Examine overlap
examine_mnist_overlap(umap_projection, kmeans_labels_run, y_train, cluster_pair, run_idx)

Confidence interval

In [None]:
# Settings: two-sided confidence level, its normal critical value, number of runs
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)
n_runs = 35

# Standard error of the mean, then the half-width of the interval
sem_matrix_5_35 = distance_matrix_std_5_35 / np.sqrt(n_runs)
margin_of_error_matrix_5_35 = z_score * sem_matrix_5_35

# Interval bounds around the mean distances; the lower bound is floored at 0
lower_limit_intconf_matrix_5_35 = np.maximum(
    mean_distance_matrix_5_35 - margin_of_error_matrix_5_35, 0
)
upper_limit_intconf_matrix_5_35 = mean_distance_matrix_5_35 + margin_of_error_matrix_5_35

# Print the mean matrix and both bounds
for _heading, _shown_matrix in (
    ("Mean Distance Matrix:\n", mean_distance_matrix_5_35),
    ("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_5_35),
    ("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_5_35),
):
    print(_heading, _shown_matrix)

# Store both bounds on disk for later cells
for _target_file, _stored_matrix in (
    ('lower_limit_intconf_matrix_5_35.npy', lower_limit_intconf_matrix_5_35),
    ('upper_limit_intconf_matrix_5_35.npy', upper_limit_intconf_matrix_5_35),
):
    np.save(_target_file, _stored_matrix)

In [9]:
def normalize_matrix(matrix):
    """
    Min-max scale a matrix so its smallest value maps to 0 and its largest to 1.

    Args:
        matrix (array-like): Values to rescale.

    Returns:
        ndarray: ``(matrix - min) / (max - min)``. When every entry is equal
        (max == min) an all-zero array of the same shape is returned instead
        of dividing by zero.
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest

    # A constant matrix has no spread; 0/0 would otherwise yield NaN everywhere.
    if value_range == 0:
        return np.zeros(matrix.shape)

    return (matrix - lowest) / value_range

# Min-max scale the lower and the upper confidence bound independently
norm_lower_limit_intconf_matrix_5_35, norm_upper_limit_intconf_matrix_5_35 = map(
    normalize_matrix,
    (lower_limit_intconf_matrix_5_35, upper_limit_intconf_matrix_5_35),
)

In [None]:
# One row of three heatmaps: lower bound, mean, upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis caption) per panel, left to right
_ci_panels = (
    (norm_lower_limit_intconf_matrix_5_35, "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=5)", ""),
    (normalized_mean_distance_matrix_5_35, "Normalized Mean Dist. Matrix (k=10, n_neighbors=5)", "Cluster"),
    (norm_upper_limit_intconf_matrix_5_35, "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=5)", ""),
)

for _panel_ax, (_panel_matrix, _panel_title, _panel_ylabel) in zip(axes, _ci_panels):
    sns.heatmap(_panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=_panel_ax)
    _panel_ax.set_title(_panel_title)
    _panel_ax.set_xlabel("Cluster")
    _panel_ax.set_ylabel(_panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Clusters shown on the x-axis: 1..9 (cluster 0 is the reference and is skipped)
clusters = np.arange(1, 10)

# (mean, lower bound, upper bound) matrices for each n_neighbors setting
_ci_matrices_by_neighbors = {
    5: (mean_distance_matrix_5_35, lower_limit_intconf_matrix_5_35, upper_limit_intconf_matrix_5_35),
    10: (mean_distance_matrix_10_35, lower_limit_intconf_matrix_10_35, upper_limit_intconf_matrix_10_35),
    20: (mean_distance_matrix_20_35, lower_limit_intconf_matrix_20_35, upper_limit_intconf_matrix_20_35),
    30: (mean_distance_matrix_30_35, lower_limit_intconf_matrix_30_35, upper_limit_intconf_matrix_30_35),
    50: (mean_distance_matrix_50_35, lower_limit_intconf_matrix_50_35, upper_limit_intconf_matrix_50_35),
    100: (mean_distance_matrix_100_35, lower_limit_intconf_matrix_100_35, upper_limit_intconf_matrix_100_35),
}

# Row 0, columns 1..: distances from cluster 0 to every other cluster
data = {
    setting: {
        "mean": mean_matrix[0, 1:],
        "lower": lower_matrix[0, 1:],
        "upper": upper_matrix[0, 1:],
    }
    for setting, (mean_matrix, lower_matrix, upper_matrix) in _ci_matrices_by_neighbors.items()
}

# Define colors for each n_neighbors
colors = {5: "orange", 10: "blue", 20: "yellow", 30: "grey", 50: "green", 100: "red"}

# Plotting
fig, ax = plt.subplots(figsize=(16, 8))

width = 0.15  # Bar width
x = np.arange(len(clusters))  # X positions for clusters
n_series = len(data)

# The loop variable is deliberately NOT called `n_neighbors`, so the global of
# that name (used elsewhere to build file names) is not overwritten.
for idx, (neighbors_setting, values) in enumerate(data.items()):
    # Offsets are symmetric around 0, so each group of bars is centred on its tick
    x_positions = x + (idx - (n_series - 1) / 2) * width

    # Bars show the mean distance; error bars span lower..upper bound
    ax.bar(
        x_positions,
        values["mean"],
        yerr=[
            values["mean"] - values["lower"],  # Lower error
            values["upper"] - values["mean"]   # Upper error
        ],
        width=width,
        color=colors[neighbors_setting],
        alpha=0.7,
        label=f"n={neighbors_setting}",
        capsize=5
    )

# Add labels, title, and legend
ax.set_xlabel("Clusters", fontsize=14)
ax.set_ylabel("Distance", fontsize=14)
ax.set_title("Confidence Intervals of Distances from Cluster 0 to Other Clusters", fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels([f"{i}" for i in clusters], fontsize=12)
ax.legend(title="n_neighbors", fontsize=10)
ax.grid(axis="y", linestyle="--", alpha=0.7)

plt.tight_layout()
plt.show()



In [None]:
from matplotlib.backends.backend_pdf import PdfPages

# Define the clusters
clusters = np.arange(10)  # Include all clusters (0 to 9)

# Loop-invariant settings, defined once instead of once per page
colors = {5: "orange", 10: "blue", 20: "yellow", 30: "grey", 50: "green", 100: "red"}
width = 0.15  # Bar width

# (mean, lower bound, upper bound) matrices for each n_neighbors setting
_ci_matrices_by_neighbors = {
    5: (mean_distance_matrix_5_35, lower_limit_intconf_matrix_5_35, upper_limit_intconf_matrix_5_35),
    10: (mean_distance_matrix_10_35, lower_limit_intconf_matrix_10_35, upper_limit_intconf_matrix_10_35),
    20: (mean_distance_matrix_20_35, lower_limit_intconf_matrix_20_35, upper_limit_intconf_matrix_20_35),
    30: (mean_distance_matrix_30_35, lower_limit_intconf_matrix_30_35, upper_limit_intconf_matrix_30_35),
    50: (mean_distance_matrix_50_35, lower_limit_intconf_matrix_50_35, upper_limit_intconf_matrix_50_35),
    100: (mean_distance_matrix_100_35, lower_limit_intconf_matrix_100_35, upper_limit_intconf_matrix_100_35),
}

# Start a PDF document to save the plots (one page per base cluster)
with PdfPages('confidence_intervals_clusters.pdf') as pdf:
    for base_cluster in clusters:
        # Boolean selector for every cluster except the base cluster
        other_clusters = clusters != base_cluster

        # Distances from the base cluster to all other clusters, per setting
        data = {
            setting: {
                "mean": mean_matrix[base_cluster, other_clusters],
                "lower": lower_matrix[base_cluster, other_clusters],
                "upper": upper_matrix[base_cluster, other_clusters],
            }
            for setting, (mean_matrix, lower_matrix, upper_matrix) in _ci_matrices_by_neighbors.items()
        }

        # Plotting
        fig, ax = plt.subplots(figsize=(12, 6))

        x = np.arange(len(clusters) - 1)  # X positions for clusters
        n_series = len(data)

        # The loop variable is deliberately NOT called `n_neighbors`, so the
        # global of that name is not overwritten by this cell.
        for idx, (neighbors_setting, values) in enumerate(data.items()):
            # Offsets are symmetric around 0, so each group is centred on its tick
            x_positions = x + (idx - (n_series - 1) / 2) * width

            # Bars show the mean distance; error bars span lower..upper bound
            ax.bar(
                x_positions,
                values["mean"],
                yerr=[
                    values["mean"] - values["lower"],  # Lower error
                    values["upper"] - values["mean"]   # Upper error
                ],
                width=width,
                color=colors[neighbors_setting],
                alpha=0.7,
                label=f"n={neighbors_setting}",
                capsize=5
            )

        # Add labels, title, and legend
        ax.set_xlabel("Clusters", fontsize=12)
        ax.set_ylabel("Distance", fontsize=12)
        ax.set_title(f"Confidence Intervals of Distances from Cluster {base_cluster} to Other Clusters", fontsize=14)
        ax.set_xticks(x)
        ax.set_xticklabels([f"{i}" for i in clusters[other_clusters]], fontsize=10)
        ax.legend(title="n_neighbors", fontsize=10)
        ax.grid(axis="y", linestyle="--", alpha=0.7)

        plt.tight_layout()

        # Save the figure to the PDF and free it
        pdf.savefig(fig)
        plt.close(fig)

print("All graphs have been saved to 'confidence_intervals_clusters.pdf'.")

-------------

#### Intra class evaluation

Option A) with a fixed radius

In [None]:
# Define parameters
n_clusters = 10
radius = 0.5  # fixed radius around each centroid, in projection coordinates

# Lists to store max distances, neighbor counts, and KMeans labels for each cluster in each run
max_distances_5_35 = []
neighbor_counts_5_35 = []
kmeans_labels_list_5_35 = []

# Calculate intra-cluster metrics for each run and each cluster
for run_idx, x_umap in enumerate(umap_projections_5):
    # Re-run KMeans (fixed seed) to get labels for this projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_5_35 = kmeans.labels_

    # Store the labels for this run
    kmeans_labels_list_5_35.append(kmeans_labels_5_35)

    run_max_distances_5_35 = []
    run_neighbor_counts_5_35 = []

    # For each cluster, calculate max intra-cluster distance and neighbor count around centroid
    for cluster_idx in range(n_clusters):
        # Get all points in the current cluster
        cluster_points = x_umap[kmeans_labels_5_35 == cluster_idx]

        # Largest pairwise distance inside the cluster; a cluster with fewer
        # than two points has no pair, so the pairwise matrix is not built.
        if len(cluster_points) > 1:
            max_distance_5_35 = np.max(cdist(cluster_points, cluster_points, metric='euclidean'))
        else:
            max_distance_5_35 = 0
        run_max_distances_5_35.append(max_distance_5_35)

        # Number of cluster points within `radius` of the cluster centroid
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_5_35 = np.sum(distances_to_centroid <= radius)

        run_neighbor_counts_5_35.append(neighbors_within_radius_5_35)

    # Append results for this run
    max_distances_5_35.append(run_max_distances_5_35)
    neighbor_counts_5_35.append(run_neighbor_counts_5_35)

# Convert lists to numpy arrays for easier analysis if needed
max_distances_5_35 = np.array(max_distances_5_35)  # Shape: (n_runs, n_clusters)
neighbor_counts_5_35 = np.array(neighbor_counts_5_35)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_5_35 = np.array(kmeans_labels_list_5_35)  # Shape: (n_runs, n_samples)

# Save the max distances, neighbor counts, and KMeans labels arrays.
# The labels file name previously contained a stray space before ".npy",
# so cells loading 'kmeans_labels_list_5_35.npy' could not find this output.
np.save('max_intra_cluster_distances_5_35.npy', max_distances_5_35)
np.save('neighbor_counts_within_radius_5_35.npy', neighbor_counts_5_35)
np.save('kmeans_labels_list_5_35.npy', kmeans_labels_list_5_35)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_5_35)
print("\nNeighbor counts within radius for each run and each cluster:\n", neighbor_counts_5_35)
print("\nKMeans labels saved successfully.")

Option B) with radius --> half of the min. distance between two clusters.

It is calculated from the distance matrix of a specific run. So, it can be found in one of the 35 runs, but it won't necessarily appear in the mean distance matrix, because there the values are averaged across all runs and thus will likely not match specific values from an individual run's matrix.

In [4]:
# Reload the dynamic-radius results written by the dynamic-radius cell
max_distances_5_35_d = np.load('max_intra_cluster_distances_dynamic_5_35.npy')
neighbor_counts_5_35_d = np.load('neighbor_counts_within_dynamic_radius_5_35.npy')
kmeans_labels_list_5_35_d = np.load('kmeans_labels_list_5_35.npy')

In [None]:
# # Calculate the minimum distance between cluster centroids across all runs
# all_min_distances = []

# for run_centroids in kmeans_centroids_5:
#     # Compute pairwise distances between centroids
#     pairwise_distances = cdist(run_centroids, run_centroids, metric='euclidean')
#     # Get the minimum non-zero distance
#     min_distance = np.min(pairwise_distances[np.nonzero(pairwise_distances)])
#     all_min_distances.append(min_distance)

# # Use half the smallest minimum distance as the radius
# dynamic_radius = min(all_min_distances) / 2
# print(f"Dynamic radius for neighbor count: {dynamic_radius}")

In [None]:
# Running record of the closest pair of centroids seen in any run
overall_min_distance = float('inf')
min_distance_clusters = None
min_distance_run_idx = None

for run_idx, run_centroids in enumerate(kmeans_centroids_5):
    # Centroid-to-centroid Euclidean distances for this run
    pairwise_distances_5 = cdist(run_centroids, run_centroids, metric='euclidean')

    # Self-distances are 0; push them to +inf so they can never be the minimum
    np.fill_diagonal(pairwise_distances_5, np.inf)

    # Flat position and value of the smallest remaining distance
    closest_flat_index = np.argmin(pairwise_distances_5)
    min_distance = pairwise_distances_5.flat[closest_flat_index]

    # Nothing to record unless this run beats the best distance so far
    if not min_distance < overall_min_distance:
        continue

    overall_min_distance = min_distance
    # Translate the flat position back into the (row, column) cluster pair
    cluster_indices = np.unravel_index(closest_flat_index, pairwise_distances_5.shape)
    min_distance_clusters = cluster_indices
    min_distance_run_idx = run_idx

# The dynamic radius is half of the smallest centroid distance over all runs
dynamic_radius_5_35 = overall_min_distance / 2
print(f"Dynamic radius: {dynamic_radius_5_35}")
print(f"Minimum distance: {overall_min_distance}")
print(f"Clusters contributing to minimum distance: {min_distance_clusters}")
print(f"Run index: {min_distance_run_idx}")

# Save dynamic radius
np.save('dynamic_radius_results_5_35.npy', dynamic_radius_5_35)

In [None]:
# Number of KMeans clusters and the radius computed in the dynamic-radius cell
n_clusters = 10
radius = dynamic_radius_5_35

# Per-run accumulators: max intra-cluster distance, neighbor count, labels
max_distances_5_35_d = []
neighbor_counts_5_35_d = []
kmeans_labels_list_5_35_d = []

# Walk over every stored projection and measure each of its clusters
for run_idx, x_umap in enumerate(umap_projections_5):
    # Cluster this projection again (fixed seed) to obtain its labels
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_5_35_d = kmeans.labels_
    kmeans_labels_list_5_35_d.append(kmeans_labels_5_35_d)

    run_max_distances_5_35_d = []
    run_neighbor_counts_5_35_d = []

    for cluster_idx in range(n_clusters):
        # Points assigned to this cluster
        cluster_points = x_umap[kmeans_labels_5_35_d == cluster_idx]

        # Largest point-to-point distance inside the cluster (0 when there
        # is no pair of points to compare)
        if len(cluster_points) > 1:
            max_distance_5_35_d = cdist(cluster_points, cluster_points, metric='euclidean').max()
        else:
            max_distance_5_35_d = 0
        run_max_distances_5_35_d.append(max_distance_5_35_d)

        # How many cluster points lie within `radius` of the cluster centroid
        centroid = cluster_points.mean(axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_5_35_d = (distances_to_centroid <= radius).sum()
        run_neighbor_counts_5_35_d.append(neighbors_within_radius_5_35_d)

    # Keep this run's rows
    max_distances_5_35_d.append(run_max_distances_5_35_d)
    neighbor_counts_5_35_d.append(run_neighbor_counts_5_35_d)

# Turn the nested lists into arrays
max_distances_5_35_d = np.array(max_distances_5_35_d)  # Shape: (n_runs, n_clusters)
neighbor_counts_5_35_d = np.array(neighbor_counts_5_35_d)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_5_35_d = np.array(kmeans_labels_list_5_35_d)  # Shape: (n_runs, n_samples)

# Write all three arrays to disk
for _output_file, _output_array in (
    ('max_intra_cluster_distances_dynamic_5_35.npy', max_distances_5_35_d),
    ('neighbor_counts_within_dynamic_radius_5_35.npy', neighbor_counts_5_35_d),
    ('kmeans_labels_list_5_35.npy', kmeans_labels_list_5_35_d),
):
    np.save(_output_file, _output_array)

# Report the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_5_35_d)
print("\nNeighbor counts within dynamic radius for each run and each cluster:\n", neighbor_counts_5_35_d)
print("\nKMeans labels saved successfully.")

In [None]:
# One small line chart per cluster: neighbor count against run number
plt.figure(figsize=(16, 12))

# Runs are numbered from 1 on the x-axis
_run_numbers = range(1, neighbor_counts_5_35_d.shape[0] + 1)

# Each column of the array holds one cluster's counts across the runs
for cluster_idx, _cluster_counts in enumerate(neighbor_counts_5_35_d.T):
    plt.subplot(2, 5, cluster_idx + 1)  # 2 x 5 grid, i.e. room for 10 clusters
    plt.plot(_run_numbers, _cluster_counts, marker='o')
    plt.title(f'Cluster {cluster_idx}')
    plt.xlabel('Run')
    plt.ylabel('Neighbor Count')
    plt.xticks(_run_numbers[::5])  # tick on every 5th run to keep the axis readable
    plt.grid(True)

# Tighten the layout, then add the figure-wide title above the grid
plt.tight_layout()
plt.suptitle('Neighbor Counts per Cluster Across Runs', y=1.02, fontsize=16)
plt.show()

In [None]:
from scipy.stats import linregress
# Load neighbor counts (if not already loaded)
neighbor_counts_5_35_d = np.load('neighbor_counts_within_dynamic_radius_5_35.npy')  # Shape: (n_runs, n_clusters)

# Calculate mean and max values across clusters for each run
mean_neighbors = np.mean(neighbor_counts_5_35_d, axis=1)  # Shape: (n_runs,)
max_neighbors = np.max(neighbor_counts_5_35_d, axis=1)    # Shape: (n_runs,)

# Fit a straight line (least squares) through each series; runs are numbered from 1
runs = np.arange(1, len(mean_neighbors) + 1)
mean_fit = linregress(runs, mean_neighbors)
max_fit = linregress(runs, max_neighbors)
mean_slope, mean_intercept = mean_fit.slope, mean_fit.intercept
max_slope, max_intercept = max_fit.slope, max_fit.intercept

# Calculate trend line values
mean_trend = mean_slope * runs + mean_intercept
max_trend = max_slope * runs + max_intercept

# Plot the results
plt.figure(figsize=(12, 6))

# Mean neighbor counts
plt.plot(runs, mean_neighbors, label='Mean Neighbor Count', marker='o', color='blue')

# Max neighbor counts
plt.plot(runs, max_neighbors, label='Max Neighbor Count', marker='s', color='orange')

# Trend lines: dashed, in the color of the series they belong to. Drawing
# both in the same color made the two legend entries indistinguishable.
plt.plot(runs, mean_trend, linestyle='--', color='blue', label='Mean Trend Line')
plt.plot(runs, max_trend, linestyle='--', color='orange', label='Max Trend Line')

# Add labels, legend, and title
plt.title('n=5 Neighbor Counts Across Runs (Mean vs. Max with Trend Lines)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.xticks(range(1, len(mean_neighbors) + 1, 5))  # Show every 5th run for readability
plt.legend()
plt.grid(True)
plt.tight_layout()

# Save the plot to a file (before show(), while the figure is still current)
plt.savefig('neighbor_counts_plot_n_5_35.png', dpi=300)

# Show the plot
plt.show()

In [None]:
from matplotlib.patches import Circle

# Stored projections, centroids and labels of the n_neighbors=5 analysis
umap_projections = np.load('umap_projections_neighbors_5.npy')
kmeans_centroids = np.load('kmeans_centroids_neighbors_5.npy')
kmeans_labels = np.load('kmeans_labels_list_5_35.npy')

# Radius of the circles drawn around the centroids
radius = dynamic_radius_5_35

# Pick the run to draw (here: index 3) and take its slice of every array
run_idx = 3
x_umap, centroids, labels = (
    umap_projections[run_idx],
    kmeans_centroids[run_idx],
    kmeans_labels[run_idx],
)

# One scatter layer, one dashed circle and one centroid marker per cluster
plt.figure(figsize=(12, 8))
for cluster_idx in range(n_clusters):
    # Points of this cluster
    in_cluster = labels == cluster_idx
    cluster_points = x_umap[in_cluster]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster_idx}', alpha=0.7)

    # Dashed outline of the dynamic radius around the stored centroid
    centroid = centroids[cluster_idx]
    plt.gca().add_artist(
        Circle(centroid, radius, color='black', fill=False, linestyle='--', alpha=0.8)
    )

    # The centroid itself
    plt.scatter(centroid[0], centroid[1], color='red', s=100, edgecolor='black', label=f'Centroid {cluster_idx}')

# Titles, legend outside the axes, equal scaling so the circles stay round
plt.title(f"Cluster Visualization with Dynamic Radius (Run {run_idx})")
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.axis('equal')
plt.tight_layout()
plt.show()


-------

### n_neighbors= 10, n_runs = 35, n_clusters = 10 (for KMeans)

In [8]:
# Reload the stored artefacts of the n_neighbors=10 analysis
umap_projections_10 = np.load('umap_projections_neighbors_10.npy')
centroid_mean_10_35 = np.load('centroid_mean_10_35.npy')
centroid_std_10_35 = np.load('centroid_std_10_35.npy')
kmeans_centroids_10 = np.load("kmeans_centroids_neighbors_10.npy")
df_results_v2 = pd.read_csv('result_table_neighbors_10_35.csv')
mean_distance_matrix_10_35 = np.load('mean_distance_matrix_neighbors_10_35.npy')
distance_matrix_std_10_35 = np.load("distance_matrix_std_10_35.npy")
normalized_distance_matrix_std_10_35 = np.load('normalized_distance_matrix_std_10_35.npy')
normalized_mean_distance_matrix_10_35 = np.load('normalized_mean_distance_matrix_10_35.npy')
# NOTE(review): presumably these two files hold minimum spanning trees;
# confirm how they were written and that plain np.load can read them back.
mst_std_10_35 = np.load('mst_std_10_35.npy')
mst_10_35 = np.load('mst_10_35.npy')

In [None]:
# Experiment settings
n_neighbors = 10  # UMAP neighborhood size
n_clusters = 10   # number of KMeans clusters
n_runs = 35       # how many times UMAP + KMeans are repeated

# Per-run results are appended to these lists
kmeans_centroids_list_10_35 = []  # one centroid array per run
umap_projections_10_35 = []       # one projection per run

# Define a helper function to calculate the centroid of each cluster
def calculate_centroids(kmeans, x_umap):
    """
    Compute the mean position of the points assigned to each cluster.

    The number of clusters is read from the fitted model itself
    (``kmeans.n_clusters``) rather than from a module-level variable, so the
    result always matches the model that produced the labels.

    Args:
        kmeans: Fitted clustering model exposing ``n_clusters`` and ``labels_``
            (one label per row of ``x_umap``).
        x_umap (ndarray): Point coordinates, one row per sample.

    Returns:
        ndarray: One row per cluster label ``0 .. n_clusters - 1`` holding the
        mean of that cluster's points.
    """
    labels = kmeans.labels_
    return np.array(
        [np.mean(x_umap[labels == cluster_id], axis=0) for cluster_id in range(kmeans.n_clusters)]
    )

# Run UMAP and KMeans multiple times
# Each iteration fits a fresh UMAP embedding of x_train_flattened, clusters it
# with KMeans and records both the embedding and the per-cluster centroids.
for run in range(n_runs):
    # Apply UMAP with the same parameters for each run
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1, n_components=2, random_state=None)  # random_state=None leaves UMAP unseeded so runs are allowed to differ
    x_train_umap = umap_model.fit_transform(x_train_flattened)
    
    # Apply KMeans clustering on the UMAP projection
    # KMeans itself is seeded (random_state=42); only the embedding varies.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_train_umap)

    # Calculate centroids for this run
    centroids = calculate_centroids(kmeans, x_train_umap)
    
    # Store the UMAP projections and KMeans models
    # (what is actually stored is the embedding and the centroid array)
    umap_projections_10_35.append(x_train_umap)
    kmeans_centroids_list_10_35.append(centroids)

# Now we calculate the mean and standard deviation of the centroids across all runs
kmeans_centroids_10_35 = np.array(kmeans_centroids_list_10_35)  # Shape: (n_runs, n_clusters, 2)

# Calculate mean and std deviation for centroids' coordinates
# NOTE(review): these statistics pair cluster index i of one run with cluster
# index i of every other run. KMeans label indices are presumably not tied to
# the same region across different embeddings - confirm the labels are aligned
# before interpreting the per-index mean/std.
centroid_mean = np.mean(kmeans_centroids_10_35, axis=0)
centroid_std = np.std(kmeans_centroids_10_35, axis=0)

# Save the UMAP projections and KMeans centroids using NumPy
np.save(f'umap_projections_neighbors_{n_neighbors}.npy', np.array(umap_projections_10_35))
np.save(f'kmeans_centroids_neighbors_{n_neighbors}.npy', np.array(kmeans_centroids_list_10_35))

In [None]:
# Read the stacked UMAP embeddings back from disk and dump them for a quick
# visual sanity check.
umap_projections_10 = np.load('umap_projections_neighbors_10.npy')
print(umap_projections_10)

In [14]:
# Per-run cluster centroids written by the multi-run UMAP/KMeans cell.
kmeans_centroids_10 = np.load('kmeans_centroids_neighbors_10.npy')

In [41]:
# Persist the across-run centroid statistics; the file name encodes the
# statistic, the UMAP n_neighbors value and the number of runs (35).
for _stat_name, _stat_values in (('mean', centroid_mean), ('std', centroid_std)):
    np.save(f'centroid_{_stat_name}_{n_neighbors}_35.npy', np.array(_stat_values))

In [15]:
# Across-run mean and standard deviation of each cluster centroid.
centroid_mean_10_35 = np.load('centroid_mean_10_35.npy')
centroid_std_10_35 = np.load('centroid_std_10_35.npy')

In [None]:
# Spread of each cluster's centroid across runs, one value per axis.
std_dev_x_10, std_dev_y_10 = np.zeros(10), np.zeros(10)

for i in range(10):
    # kmeans_centroids_10[:, i] holds cluster i's centroid in every run;
    # transposing splits it into its x row and its y row.
    xs_over_runs, ys_over_runs = kmeans_centroids_10[:, i].T
    std_dev_x_10[i] = xs_over_runs.std()
    std_dev_y_10[i] = ys_over_runs.std()

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x_10)
print("Standard deviation of y coordinates per cluster:", std_dev_y_10)

In [46]:
# Build one row per (trial, cluster) stating whether that run's centroid lies
# within mean +/- 2 standard deviations, on both axes, of the cluster's
# centroid positions across runs.
data_v2_n10 = []

# Iterate over the loaded centroid array itself so the loop bounds always match
# the data (35 runs x 10 clusters for the saved n_neighbors=10 analysis).
for trial, run_centroids in enumerate(kmeans_centroids_10):
    for cluster, centroid_coord in enumerate(run_centroids):
        # The interval is centred on the across-run MEAN of the centroid; the
        # across-run standard deviation only sets its half-width.
        mean_x, mean_y = centroid_mean_10_35[cluster]
        std_x, std_y = centroid_std_10_35[cluster]

        lower_bound_x, upper_bound_x = mean_x - 2 * std_x, mean_x + 2 * std_x
        lower_bound_y, upper_bound_y = mean_y - 2 * std_y, mean_y + 2 * std_y

        # Check if the centroid is inside the 2 std range on both axes
        inside_2_std_v2_n10 = bool(
            (lower_bound_x <= centroid_coord[0] <= upper_bound_x)
            and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)
        )

        # Trials are reported 1-based
        data_v2_n10.append([trial + 1, cluster, centroid_coord, inside_2_std_v2_n10])

# Create a DataFrame from the list of data
df_results_v2_n10 = pd.DataFrame(data_v2_n10, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [47]:
# For every trial, True only when each of its clusters passed the 2-std check.
trials_all_true_v2_n10 = (
    df_results_v2_n10['Inside 2 std']
    .groupby(df_results_v2_n10['Trial'])
    .all()
)

In [48]:
# Keep the trial numbers whose per-trial flag is True.
trials_with_all_true_v2_n10 = (
    trials_all_true_v2_n10.loc[trials_all_true_v2_n10].index.tolist()
)

In [None]:
# Output the list of trials
# (trial numbers in which every cluster centroid was inside the 2-std range)
print("Trials where all clusters were True:", trials_with_all_true_v2_n10)

In [None]:
# Trials in which at least one cluster centroid fell outside the 2-std range:
# negate the per-trial "all True" flags and keep the matching trial numbers.
trials_with_some_false_v2_n10 = trials_all_true_v2_n10[~trials_all_true_v2_n10].index.tolist()

# Print the list computed just above (not a same-named variable from another
# analysis).
print("Trials where some clusters were False:", trials_with_some_false_v2_n10)

In [51]:
# Save the result table to a CSV file
# The f-string resolves to 'result_table_neighbors_v2_10_35.csv'; the row index is not written.
# NOTE(review): the load cell at the top of this section reads
# 'result_table_neighbors_10_35.csv' (no "v2") - confirm which file is intended.
df_results_v2_n10.to_csv(f'result_table_neighbors_v2_{10}_35.csv', index=False)

-------------

Outlier removal process

In [53]:
# Tabulate the mean centroid of each cluster: columns 'x_mean', 'y_mean' and a
# trailing 'Cluster' id column (0..9) used as the merge key later on.
centroid_mean_neighbors_10_df = pd.DataFrame(
    centroid_mean_10_35, columns=['x_mean', 'y_mean']
).assign(Cluster=np.arange(10))

In [54]:
# Split 'Centroid Coord' into numeric 'x' and 'y' columns.
# df_results_v2 is loaded with pd.read_csv in this section, and a CSV round
# trip stores each coordinate pair as text (e.g. "[ 1.23 -4.56]"). Text values
# are therefore parsed back to floats; array-like values are used directly.
def _parse_centroid_coord(value):
    """Return one centroid coordinate pair as a 1-D float array."""
    if isinstance(value, str):
        tokens = value.strip().strip('[]').replace(',', ' ').split()
        return np.array(tokens, dtype=float)
    return np.asarray(value, dtype=float)

_centroid_xy = np.vstack([_parse_centroid_coord(v) for v in df_results_v2['Centroid Coord']])
df_results_v2[['x', 'y']] = pd.DataFrame(_centroid_xy, index=df_results_v2.index)

# Merge the mean centroids dataframe with the results dataframe on 'Cluster'
# (left join: every result row keeps its place and gains x_mean / y_mean).
df_merged = pd.merge(df_results_v2, centroid_mean_neighbors_10_df, on='Cluster', how='left')

In [None]:
# One line chart per cluster and axis showing how the centroid coordinate moves
# from run to run: all X charts are drawn first, then all Y charts.
for _coord_idx, _axis_name, _line_color in ((0, 'X', 'b'), (1, 'Y', 'g')):
    for cluster in range(n_clusters):
        run_numbers = range(1, n_runs + 1)
        coordinate_per_run = kmeans_centroids_10[:, cluster, _coord_idx]

        plt.figure(figsize=(10, 6))
        plt.plot(run_numbers, coordinate_per_run, marker='o', linestyle='-', color=_line_color)
        plt.title(f'Cluster {cluster} {_axis_name}-coordinate Change Across Runs')
        plt.xlabel('Run')
        plt.ylabel(f'{_axis_name} Centroid Coordinate')
        plt.grid(True)
        plt.show()

In [None]:
# Calculate Euclidean distance from each centroid to its cluster's mean
df_merged['Distance_to_Mean'] = np.sqrt((df_merged['x'] - df_merged['x_mean'])**2 + (df_merged['y'] - df_merged['y_mean'])**2)

# Apply an outlier threshold (by default the 90th percentile of the distance per cluster)
def filter_outliers(df, percentile=90):
    """Keep the rows whose 'Distance_to_Mean' is at or below a percentile cut.

    Args:
        df: DataFrame with a 'Distance_to_Mean' column (one cluster's rows when
            used through ``groupby(...).apply``).
        percentile: Percentile of 'Distance_to_Mean' used as the cut-off.

    Returns:
        The subset of ``df`` not exceeding the cut-off.
    """
    threshold = np.percentile(df['Distance_to_Mean'], percentile)
    return df[df['Distance_to_Mean'] <= threshold]

# Apply the filtering function for each cluster
df_no_outliers = df_merged.groupby('Cluster').apply(filter_outliers).reset_index(drop=True)

# Drop the helper columns; 'Distance_to_Mean' is kept for the checks below
df_no_outliers_cleaned_10_35 = df_no_outliers.drop(columns=['x', 'y', 'x_mean', 'y_mean'])

# Check the size of the resulting dataframe (report the frame built above)
print(f"Original DataFrame size: {df_merged.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned_10_35.shape}")

# Display the final dataframe
df_no_outliers_cleaned_10_35

In [60]:
# Group the outlier-free table by cluster id.
clusters_grouped_10_35 = df_no_outliers_cleaned_10_35.groupby('Cluster')

# Map each cluster id to an array built from its retained 'Centroid Coord' entries.
# NOTE(review): this assumes 'Centroid Coord' holds [x, y] pairs; if the table
# was read from CSV the entries may be text - confirm before using the arrays.
clusters_centroids_10_35 = {
    cluster_id: np.array(cluster_rows['Centroid Coord'].tolist())
    for cluster_id, cluster_rows in clusters_grouped_10_35
}

In [None]:
# Number of centroids retained per cluster after outlier filtering.
cluster_sizes_10_35 = dict(
    zip(clusters_centroids_10_35.keys(), map(len, clusters_centroids_10_35.values()))
)

# Report the counts, one line per cluster
for cluster in cluster_sizes_10_35:
    size = cluster_sizes_10_35[cluster]
    print(f"Cluster {cluster} has {size} centroids considered.")

--------

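Check whether it is reasonable for all clusters to keep the same number of centroids after the outliers are filtered out. Possible explanations:
- The distance distributions are likely very similar across clusters
- The data has a uniform structure (every cluster starts with the same number of centroids, one per run, so the same percentile cut removes the same number from each)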

In [None]:
# Histogram of the distance-to-mean values, one figure per cluster.
# Uses this section's grouping (clusters_grouped_10_35, built from the
# outlier-filtered n_neighbors=10 table) rather than a grouping from another analysis.
for cluster, group in clusters_grouped_10_35:
    plt.figure(figsize=(10, 5))
    plt.hist(group['Distance_to_Mean'], bins=10, edgecolor='black')
    plt.title(f'Cluster {cluster}: Distance to Mean Distribution')
    plt.xlabel('Distance to Mean')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
# Percentile threshold per cluster check
def check_percentiles(df):
    """Return the 90th percentile of the 'Distance_to_Mean' column of ``df``."""
    threshold = np.percentile(df['Distance_to_Mean'], 90)
    return threshold

# Apply the function to each cluster of this section's grouping
# (clusters_grouped_10_35, i.e. the already-filtered table) and print the result
for cluster, group in clusters_grouped_10_35:
    threshold = check_percentiles(group)
    print(f"Cluster {cluster}: 90th percentile threshold = {threshold}")

In [None]:
# For each cluster of this section's grouping (clusters_grouped_10_35),
# compare the group size before and after a stricter 70th-percentile cut.
for cluster, group in clusters_grouped_10_35:
    # Calculate the 70th percentile threshold for the current cluster
    threshold = np.percentile(group['Distance_to_Mean'], 70)

    # Keep the centroids at or below that threshold
    filtered_group = group[group['Distance_to_Mean'] <= threshold]

    # Print the size of the group before and after filtering
    print(f"Cluster {cluster}: Original size = {len(group)}, Filtered size = {len(filtered_group)}")

------

#### Distance matrix n=10

##### Distance Mean Matrix

**Distance matrix**: element $d_{ij}$ is the distance between the centers of cluster $i$ and cluster $j$.

In [None]:
# Store distance matrices for each run
distance_matrices_10_35 = []

# Iterate over all runs and calculate the distance matrix for each run
for run_centroids in kmeans_centroids_10:
    # Pairwise Euclidean distance between this run's centroids
    distance_matrix_10_35 = cdist(run_centroids, run_centroids, metric='euclidean')
    distance_matrices_10_35.append(distance_matrix_10_35)

# Stack into one array: (runs, clusters, clusters)
distance_matrices_10_35 = np.array(distance_matrices_10_35)

# Calculate the mean distance matrix across all runs
mean_distance_matrix_10_35 = np.mean(distance_matrices_10_35, axis=0)

# Min-max normalize the mean distance matrix. Both ends of the range come from
# this matrix itself (not from the matrix of another n_neighbors analysis).
_mean_dist_min = np.min(mean_distance_matrix_10_35)
_mean_dist_max = np.max(mean_distance_matrix_10_35)
normalized_mean_distance_matrix_10_35 = (mean_distance_matrix_10_35 - _mean_dist_min) / (_mean_dist_max - _mean_dist_min)

# Plot of the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_10_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=10)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
np.save('distance_matrices_neighbors_10_all_runs.npy', distance_matrices_10_35)
np.save('mean_distance_matrix_neighbors_10_35.npy', mean_distance_matrix_10_35)

# Mean distance matrix
print(f"Mean distance matrix across all runs:\n{mean_distance_matrix_10_35}")

In [52]:
# Reload the saved mean distance matrix and rescale it to the [0, 1] range
# (min-max normalization over all entries).
mean_distance_matrix_10_35 = np.load('mean_distance_matrix_neighbors_10_35.npy')

normalized_mean_distance_matrix_10_35 = (
    mean_distance_matrix_10_35 - mean_distance_matrix_10_35.min()
) / (
    mean_distance_matrix_10_35.max() - mean_distance_matrix_10_35.min()
)

In [None]:
# Create a graph from the distance matrix
# (built from the normalized mean distances rounded to 3 decimals)
G_10_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_10_35,3))
# NOTE(review): np.save is handed a networkx Graph, not an array - confirm the
# file can be read back as intended (pickle or an adjacency array may be safer).
np.save('G_10_35.npy',G_10_35)

# Draw the graph
pos = nx.spring_layout(G_10_35, seed=42)  # positions for all nodes; fixed seed keeps the layout repeatable
nx.draw(G_10_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_10_35, 'weight')
nx.draw_networkx_edge_labels(G_10_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Minimum spanning tree

# Compute the minimum spanning tree of the graph
mst_10_35 = nx.minimum_spanning_tree(G_10_35)
# NOTE(review): np.save is handed a networkx Graph, not an array - confirm the
# file can be read back by the np.load call used elsewhere in this section.
np.save('mst_10_35.npy',mst_10_35)

# Define positions for all nodes
# (seeded so the drawing is repeatable)
pos = nx.spring_layout(mst_10_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_10_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_10_35, 'weight')
nx.draw_networkx_edge_labels(mst_10_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST - n_neighbors=10")
plt.show()

##### Distance Std. dev. Matrix

In [None]:
# Calculate the pairwise distance matrix for the standard deviations
# Entry (i, j) is the Euclidean distance between the (std_x, std_y) vectors of
# clusters i and j.
# NOTE(review): this is a distance between standard-deviation vectors, not the
# run-to-run standard deviation of the centroid distances - confirm that this
# is the quantity wanted wherever this matrix is reused.
distance_matrix_std_10_35 = cdist(centroid_std_10_35, centroid_std_10_35, metric='euclidean')

# Normalize the distance matrix
# (min-max scaling over all entries)
normalized_distance_matrix_std_10_35 = (distance_matrix_std_10_35 - np.min(distance_matrix_std_10_35)) / (np.max(distance_matrix_std_10_35) - np.min(distance_matrix_std_10_35))

# Visualize the normalized distance matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(normalized_distance_matrix_std_10_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=10)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.tight_layout()
plt.show()

# Save the distance matrix for later analysis
# (both the raw and the normalized version)
np.save("distance_matrix_std_10_35.npy", distance_matrix_std_10_35)
np.save("normalized_distance_matrix_std_10_35.npy", normalized_distance_matrix_std_10_35)


In [None]:
# Create a graph from the distance matrix
# (built from the normalized std-vector distances rounded to 3 decimals)
G_std_10_35 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_10_35,3))
# NOTE(review): np.save is handed a networkx Graph, not an array - confirm the
# file can be read back as intended.
np.save('G_std_10_35.npy', G_std_10_35)

# Draw the graph
pos = nx.spring_layout(G_std_10_35, seed=42)  # positions for all nodes; fixed seed keeps the layout repeatable
nx.draw(G_std_10_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_std_10_35, 'weight')
nx.draw_networkx_edge_labels(G_std_10_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_std_10_35 = nx.minimum_spanning_tree(G_std_10_35)
# NOTE(review): np.save is handed a networkx Graph, not an array - confirm the
# file can be read back by the np.load call used elsewhere in this section.
np.save('mst_std_10_35.npy', mst_std_10_35)

# Define positions for all nodes
# (seeded so the drawing is repeatable)
pos_std_10_35 = nx.spring_layout(mst_std_10_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_std_10_35, pos_std_10_35, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels_std_10_35 = nx.get_edge_attributes(mst_std_10_35, 'weight')
nx.draw_networkx_edge_labels(mst_std_10_35, pos_std_10_35, edge_labels=edge_labels_std_10_35, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - n_neighbors=10")
plt.show()

In [None]:
# Helper that renders the MST of a (normalized) distance matrix on one axes
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree derived from ``matrix`` on ``ax``.

    Args:
        matrix: Square distance matrix; it is rounded to 3 decimals before the
            graph is built, so edge labels show the rounded values.
        title: Title set on ``ax``.
        ax: Matplotlib axes that receives the drawing.
        color: Edge colour of the tree (default: 'red').
    """
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Seeded layout so repeated calls place the nodes identically
    layout = nx.spring_layout(tree, seed=42)

    node_and_edge_style = dict(
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
    )
    nx.draw(tree, layout, ax=ax, **node_and_edge_style)

    # Annotate every tree edge with its weight
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Set up the figure with three subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot MSTs for mean, lower, and upper matrices
# The two norm_*_intconf matrices are produced by the confidence-interval cells
# further down in this section, so those cells must have been run first.
plot_mst(normalized_mean_distance_matrix_10_35, "MST - Mean Distances", axes[0], color='red')
plot_mst(norm_lower_limit_intconf_matrix_10_35, "MST - Lower Limit", axes[1], color='blue')
plot_mst(norm_upper_limit_intconf_matrix_10_35, "MST - Upper Limit", axes[2], color='green')

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

In [None]:
def plot_heatmap(matrix, title, xlabel, ylabel, figsize=(10, 8), cmap="viridis", annot=True):
    """
    Render one annotated heatmap in its own figure and show it.

    Args:
        matrix (ndarray): 2D matrix to display.
        title (str): Figure title.
        xlabel (str): Text for the x-axis.
        ylabel (str): Text for the y-axis.
        figsize (tuple): Figure size in inches (default: (10, 8)).
        cmap (str): Colour map name (default: "viridis").
        annot (bool): Write each cell's value, two decimals, into the cell
            (default: True).
    """
    _, heat_ax = plt.subplots(figsize=figsize)
    sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=heat_ax)
    heat_ax.set_title(title)
    heat_ax.set_xlabel(xlabel)
    heat_ax.set_ylabel(ylabel)
    plt.tight_layout()
    plt.show()

# Draw the std-deviation heatmap first, then the mean-distance heatmap, each
# in its own 8x6 figure with identical axis labels.
for _heatmap_matrix, _heatmap_title in (
    (
        normalized_distance_matrix_std_10_35,
        "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=10)",
    ),
    (
        normalized_mean_distance_matrix_10_35,
        "Normalized Mean Distance Matrix (k=10, n_neighbors=10)",
    ),
):
    plot_heatmap(
        _heatmap_matrix,
        title=_heatmap_title,
        xlabel="Cluster",
        ylabel="Cluster",
        figsize=(8, 6),
    )

In [None]:
# Define a function to create heatmaps
def plot_heatmaps_side_by_side(matrices, titles, figsize=(16, 8), cmap="viridis", annot=True):
    """
    Plots multiple heatmaps side by side for given matrices and titles.

    Args:
        matrices (list): List of 2D matrices to plot as heatmaps.
        titles (list): List of titles corresponding to each matrix.
        figsize (tuple): Size of the entire figure (default: (16, 8)).
        cmap (str): Color map to use for all heatmaps (default: "viridis").
        annot (bool): Whether to annotate cells with their values (default: True).
    """
    n = len(matrices)  # Number of heatmaps

    # squeeze=False always returns a 2-D array of axes, so a single matrix
    # (n == 1) is handled the same way as several; flatten it to one row.
    fig, axes = plt.subplots(1, n, figsize=figsize, squeeze=False)
    axes = axes.ravel()

    for i, (matrix, title) in enumerate(zip(matrices, titles)):
        sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=axes[i])
        axes[i].set_title(title)
        axes[i].set_xlabel("Cluster")
        axes[i].set_ylabel("Cluster" if i == 0 else "")  # Only label y-axis for the first plot

    plt.tight_layout()
    plt.show()

# Call the function with the two heatmaps.
# Both matrices belong to the n_neighbors=10 analysis, so the titles say 10.
plot_heatmaps_side_by_side(
    matrices=[
        normalized_distance_matrix_std_10_35,
        normalized_mean_distance_matrix_10_35
    ],
    titles=[
        "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=10)",
        "Normalized Mean Distance Matrix (k=10, n_neighbors=10)"
    ]
)

In [None]:
# Cut-off below which a normalized value counts as "low"
threshold = 0.65

# Collect every ordered off-diagonal pair (i, j) whose normalized mean distance
# AND normalized std distance are both below the cut-off. Pairs are visited
# row by row, so (i, j) and (j, i) can both appear.
_n_rows, _n_cols = normalized_mean_distance_matrix_10_35.shape
low_low_pairs = [
    (i, j, normalized_mean_distance_matrix_10_35[i, j], normalized_distance_matrix_std_10_35[i, j])
    for i in range(_n_rows)
    for j in range(_n_cols)
    if i != j
    and normalized_mean_distance_matrix_10_35[i, j] < threshold
    and normalized_distance_matrix_std_10_35[i, j] < threshold
]

# Report each qualifying pair
for _first, _second, _mean_dist, _std_dist in low_low_pairs:
    print(f"Clusters {_first} and {_second}: Mean Distance = {_mean_dist:.2f}, Std Distance = {_std_dist:.2f}")

0.65 can seem like a high value, since it is on the upper-mid range.

Depending on the goal of the analysis we can think of it as:
- If the aim is to identify the strongest relationships between clusters, a lower threshold would make more sense.
- If we want to explore the broader connections, then it is fine.

In [None]:
# Example: Replace with your cluster pairs from low_low_pairs
# NOTE: this assignment overwrites any low_low_pairs computed earlier with
# three hard-coded (cluster_a, cluster_b, mean_value, std_value) tuples.
low_low_pairs = [(0, 9, 0.64, 0.18), (0, 6, 0.64, 0.23),(7, 9, 0.62, 0.26)]  # Example cluster pairs

# UMAP projections and cluster labels (replace with your actual data)
umap_projections = np.load("umap_projections_neighbors_10.npy")
# NOTE(review): presumably written from kmeans_labels_list_10_35 in the
# intra-class evaluation cell - confirm the file exists before running this.
kmeans_labels = np.load("kmeans_labels_list_10_35.npy")  # Shape: (n_runs, n_samples)

# Scatter plot of two clusters from a single run
def plot_clusters(umap_projection, labels, cluster_pair, run_idx):
    """Show the points of two clusters of one UMAP run in a single figure.

    Args:
        umap_projection: 2-D embedding of one run (first two columns are plotted).
        labels: Cluster label of every row of ``umap_projection``.
        cluster_pair: Two cluster ids ``(cluster_a, cluster_b)``.
        run_idx: Run number, used only in the figure title.
    """
    cluster_a, cluster_b = cluster_pair

    plt.figure(figsize=(8, 6))
    # First cluster in blue, second in orange, drawn in that order
    for cluster_id, point_color in ((cluster_a, "blue"), (cluster_b, "orange")):
        members = umap_projection[labels == cluster_id]
        plt.scatter(members[:, 0], members[:, 1], color=point_color, label=f"Cluster {cluster_id}", alpha=0.6)

    plt.title(f"Run {run_idx}: Cluster {cluster_a} vs. Cluster {cluster_b}")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# For every listed pair: plot both clusters in one run, then print the distance
# between their centroids and each cluster's per-axis standard deviation.
for cluster_pair in low_low_pairs:
    cluster_a, cluster_b = cluster_pair[0], cluster_pair[1]
    print(f"Analyzing Cluster Pair: {cluster_a} and {cluster_b}")

    # Only the first run is inspected
    run_idx = 0
    run_projection = umap_projections[run_idx]
    run_labels = kmeans_labels[run_idx]
    plot_clusters(run_projection, run_labels, (cluster_a, cluster_b), run_idx)

    # Points of each cluster in this run
    members_a = run_projection[run_labels == cluster_a]
    members_b = run_projection[run_labels == cluster_b]

    # Distance between the two cluster centroids
    distances_a_to_b = np.linalg.norm(members_a.mean(axis=0) - members_b.mean(axis=0))
    print(f"Mean Centroid Distance (Run {run_idx}): {distances_a_to_b:.2f}")

    # Per-axis spread of each cluster
    cluster_a_std = np.std(members_a, axis=0)
    cluster_b_std = np.std(members_b, axis=0)
    print(f"Cluster {cluster_a} Std Dev: {cluster_a_std}")
    print(f"Cluster {cluster_b} Std Dev: {cluster_b_std}")
    print("\n")

**Cluster 1 and Cluster 8** have a moderate spatial relationship with visible overlap in the UMAP space. Their differing variability patterns suggest distinct structures, but the overlap points might represent shared features or transitions between the clusters.
The large spatial separation between their centroids suggests they represent distinct structures or classes in the data.

**Cluster 0 and Cluster 9**: Cluster 9 appears more compact and stable, while Cluster 0 is larger and more variable.
Their distinct regions in the UMAP space and differing standard deviations reinforce their meaningful separation.

Insights from variability: the variability of Cluster 0 could indicate sensitivity to UMAP parameters or noise in the data.

In [None]:
def examine_mnist_overlap(umap_projection, kmeans_labels, mnist_labels, cluster_pair, run_idx):
    """Report and plot the points shared by two KMeans clusters of one run.

    "Overlap" here means sample indices that carry both cluster labels. Since
    ``kmeans_labels`` holds a single label per sample, that index intersection
    is empty whenever the two cluster ids differ.

    Args:
        umap_projection: 2-D embedding of one run.
        kmeans_labels: Cluster label of every sample in that run.
        mnist_labels: Original label of every sample.
        cluster_pair: Two cluster ids ``(cluster_a, cluster_b)``.
        run_idx: Run number, used only in printed text and the title.
    """
    cluster_a, cluster_b = cluster_pair

    # Sample indices belonging to each cluster
    idx_a = np.where(kmeans_labels == cluster_a)[0]
    idx_b = np.where(kmeans_labels == cluster_b)[0]

    # Indices present in both clusters, and the original labels of those samples
    shared_idx = np.intersect1d(idx_a, idx_b)
    shared_label_counts = pd.Series(np.array(mnist_labels)[shared_idx]).value_counts()

    # Text summary
    print(f"Overlap between Cluster {cluster_a} and Cluster {cluster_b} (Run {run_idx}):")
    print(f"Number of overlapping points: {len(shared_idx)}")
    print(f"Original label distribution of overlapping points:\n{shared_label_counts}")

    # Scatter both clusters, then highlight the shared points if there are any
    layers = [
        (idx_a, "blue", f"Cluster {cluster_a}", 0.5),
        (idx_b, "orange", f"Cluster {cluster_b}", 0.5),
    ]
    if len(shared_idx) > 0:
        layers.append((shared_idx, "red", "Overlap", 0.7))

    plt.figure(figsize=(8, 6))
    for indices, point_color, legend_text, opacity in layers:
        plt.scatter(umap_projection[indices, 0], umap_projection[indices, 1], color=point_color, label=legend_text, alpha=opacity)
    plt.title(f"Cluster Overlap: Cluster {cluster_a} vs. Cluster {cluster_b} (Run {run_idx})")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Example usage:
# Load your MNIST data
# (the four paths are relative to the notebook's working directory)
dataloader = MnistDataloader(
    training_images_filepath="train-images.idx3-ubyte",
    training_labels_filepath="train-labels.idx1-ubyte",
    test_images_filepath="t10k-images.idx3-ubyte",
    test_labels_filepath="t10k-labels.idx1-ubyte"
)

# Load data
(x_train, y_train), (x_test, y_test) = dataloader.load_data()

# Flatten the training images for UMAP (if needed for alignment with projections)
x_train_flattened = np.array([np.array(img).flatten() for img in x_train])

# Example variables (replace these with your actual data)
run_idx = 0  # Analyze the first UMAP run
cluster_pair = (1, 8)  # Compare Cluster 1 and Cluster 8
umap_projection = umap_projections[run_idx]  # UMAP projection for the given run
# NOTE(review): this rebinds the notebook-level kmeans_labels (all runs) to the
# labels of a single run, so re-running this cell or any cell that indexes
# kmeans_labels[run_idx] afterwards no longer sees the per-run array.
kmeans_labels = kmeans_labels[run_idx]  # KMeans labels for the given run

# Examine overlap
examine_mnist_overlap(umap_projection, kmeans_labels, y_train, cluster_pair, run_idx)

Confidence interval

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Step 1: Standard Error of the Mean (SEM) of each centroid-to-centroid distance.
# The SEM needs the run-to-run spread of the distances themselves, so the
# per-run distance matrices are rebuilt from the saved centroids and their
# sample standard deviation (ddof=1) is taken across runs. The distance between
# the centroids' std vectors (distance_matrix_std_10_35) is a different
# quantity and is not used here.
per_run_distance_matrices_10_35 = np.array(
    [cdist(run_centroids, run_centroids, metric='euclidean') for run_centroids in kmeans_centroids_10]
)
n_runs = per_run_distance_matrices_10_35.shape[0]  # Number of runs (35 for the saved analysis)
distance_std_across_runs_10_35 = np.std(per_run_distance_matrices_10_35, axis=0, ddof=1)
sem_matrix_10_35 = distance_std_across_runs_10_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_10_35 = z_score * sem_matrix_10_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_10_35 = mean_distance_matrix_10_35 - margin_of_error_matrix_10_35
upper_limit_intconf_matrix_10_35 = mean_distance_matrix_10_35 + margin_of_error_matrix_10_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_10_35 = np.maximum(lower_limit_intconf_matrix_10_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_10_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_10_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_10_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_10_35.npy', lower_limit_intconf_matrix_10_35)
np.save('upper_limit_intconf_matrix_10_35.npy', upper_limit_intconf_matrix_10_35)

In [21]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its values span [0, 1].

    Args:
        matrix: Numeric array-like to rescale.

    Returns:
        ``(matrix - min) / (max - min)``. When every entry is identical the
        range is zero; an all-zero array of the same shape is returned instead
        of dividing by zero and producing NaNs.
    """
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: nothing to scale, avoid 0/0.
        return np.zeros_like(matrix, dtype=float)
    return (matrix - lowest) / value_range

# Rescale both confidence-limit matrices to [0, 1] (each with its own min/max).
norm_lower_limit_intconf_matrix_10_35, norm_upper_limit_intconf_matrix_10_35 = (
    normalize_matrix(limit_matrix)
    for limit_matrix in (lower_limit_intconf_matrix_10_35, upper_limit_intconf_matrix_10_35)
)

In [None]:
# One row of three heatmaps: lower limit, mean, upper limit.
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right.
heatmap_panels = (
    (norm_lower_limit_intconf_matrix_10_35,
     "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=10)", ""),
    (normalized_mean_distance_matrix_10_35,
     "Normalized Mean Dist. Matrix (k=10, n_neighbors=10)", "Cluster"),
    (norm_upper_limit_intconf_matrix_10_35,
     "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=10)", ""),
)

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

#### Intra class evaluation

In [None]:
# Define parameters
n_clusters = 10
radius = 0.5  # fixed radius around each centroid used for the neighbor count

# Per-run results: max intra-cluster distance, neighbor count and KMeans labels
max_distances_10_35 = []
neighbor_counts_10_35 = []
kmeans_labels_list_10_35 = []

# Calculate intra-cluster metrics for each run and each cluster
for run_idx, x_umap in enumerate(umap_projections_10):
    # Re-run KMeans to get labels for this projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_10_35 = kmeans.labels_
    kmeans_labels_list_10_35.append(kmeans_labels_10_35)

    run_max_distances_10_35 = []
    run_neighbor_counts_10_35 = []

    for cluster_idx in range(n_clusters):
        # All points assigned to the current cluster
        cluster_points = x_umap[kmeans_labels_10_35 == cluster_idx]

        # Largest pairwise distance inside the cluster (0 when there is no pair)
        if len(cluster_points) > 1:
            intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')
            max_distance_10_35 = np.max(intra_distances)
        else:
            max_distance_10_35 = 0
        run_max_distances_10_35.append(max_distance_10_35)

        # Number of cluster points lying within `radius` of the cluster centroid
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_10_35 = np.sum(distances_to_centroid <= radius)
        run_neighbor_counts_10_35.append(neighbors_within_radius_10_35)

    # Append results for this run
    max_distances_10_35.append(run_max_distances_10_35)
    neighbor_counts_10_35.append(run_neighbor_counts_10_35)

# Convert lists to numpy arrays for easier analysis
max_distances_10_35 = np.array(max_distances_10_35)  # Shape: (n_runs, n_clusters)
neighbor_counts_10_35 = np.array(neighbor_counts_10_35)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_10_35 = np.array(kmeans_labels_list_10_35)  # Shape: (n_runs, n_samples)

# Save the arrays. The labels filename previously contained a stray space
# before ".npy", which did not match the name other cells load.
np.save('max_intra_cluster_distances_10_35.npy', max_distances_10_35)
np.save('neighbor_counts_within_radius_10_35.npy', neighbor_counts_10_35)
np.save('kmeans_labels_list_10_35.npy', kmeans_labels_list_10_35)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_10_35)
print("\nNeighbor counts within radius for each run and each cluster:\n", neighbor_counts_10_35)
print("\nKMeans labels saved successfully.")

In [48]:
# Reload the dynamic-radius metrics and KMeans labels saved to disk.
max_distances_10_35_d = np.load('max_intra_cluster_distances_dynamic_10_35.npy')
neighbor_counts_10_35_d = np.load('neighbor_counts_within_dynamic_radius_10_35.npy')
kmeans_labels_list_10_35_d = np.load('kmeans_labels_list_10_35.npy')

In [None]:
# Smallest centroid-to-centroid distance seen in any run, plus where it occurred.
overall_min_distance_10 = float('inf')
min_distance_clusters_10 = None
min_distance_run_idx_10 = None

for run_idx, run_centroids in enumerate(kmeans_centroids_10):
    # Centroid-to-centroid distances for this run
    pairwise_distances_10 = cdist(run_centroids, run_centroids, metric='euclidean')

    # Mask the diagonal so a centroid is never compared with itself
    np.fill_diagonal(pairwise_distances_10, np.inf)

    # Closest pair of distinct centroids in this run
    closest_pair = np.unravel_index(np.argmin(pairwise_distances_10), pairwise_distances_10.shape)
    min_distance = pairwise_distances_10[closest_pair]

    if min_distance < overall_min_distance_10:
        cluster_indices = closest_pair
        overall_min_distance_10 = min_distance
        min_distance_clusters_10 = cluster_indices
        min_distance_run_idx_10 = run_idx

# The dynamic radius is half the smallest centroid separation.
dynamic_radius_10_35 = overall_min_distance_10 / 2
print(f"Dynamic radius: {dynamic_radius_10_35}")
print(f"Minimum distance: {overall_min_distance_10}")
print(f"Clusters contributing to minimum distance: {min_distance_clusters_10}")
print(f"Run index: {min_distance_run_idx_10}")

# Save dynamic radius
np.save('dynamic_radius_results_10_35.npy', dynamic_radius_10_35)


In [None]:
# Same intra-cluster metrics as the fixed-radius analysis, but the neighbor
# count uses the dynamic radius computed from the centroid separations.
n_clusters = 10
radius = dynamic_radius_10_35

# Accumulators, one entry per run
max_distances_10_35_d = []
neighbor_counts_10_35_d = []
kmeans_labels_list_10_35_d = []

for run_idx, x_umap in enumerate(umap_projections_10):
    # Cluster this run's projection and keep its labels
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_10_35_d = kmeans.labels_
    kmeans_labels_list_10_35_d.append(kmeans_labels_10_35_d)

    run_max_distances_10_35_d = []
    run_neighbor_counts_10_35_d = []

    for cluster_idx in range(n_clusters):
        in_cluster = kmeans_labels_10_35_d == cluster_idx
        cluster_points = x_umap[in_cluster]

        # Cluster diameter: largest pairwise distance (0 when fewer than two points)
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')
        if len(cluster_points) > 1:
            max_distance_10_35_d = intra_distances.max()
        else:
            max_distance_10_35_d = 0
        run_max_distances_10_35_d.append(max_distance_10_35_d)

        # Points of the cluster lying within the dynamic radius of its centroid
        centroid = cluster_points.mean(axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_10_35_d = np.sum(distances_to_centroid <= radius)
        run_neighbor_counts_10_35_d.append(neighbors_within_radius_10_35_d)

    max_distances_10_35_d.append(run_max_distances_10_35_d)
    neighbor_counts_10_35_d.append(run_neighbor_counts_10_35_d)

# Stack the per-run lists into arrays
max_distances_10_35_d = np.array(max_distances_10_35_d)  # Shape: (n_runs, n_clusters)
neighbor_counts_10_35_d = np.array(neighbor_counts_10_35_d)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_10_35_d = np.array(kmeans_labels_list_10_35_d)  # Shape: (n_runs, n_samples)

# Persist the results
np.save('max_intra_cluster_distances_dynamic_10_35.npy', max_distances_10_35_d)
np.save('neighbor_counts_within_dynamic_radius_10_35.npy', neighbor_counts_10_35_d)
np.save('kmeans_labels_list_10_35.npy', kmeans_labels_list_10_35_d)

# Report
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_10_35_d)
print("\nNeighbor counts within dynamic radius for each run and each cluster:\n", neighbor_counts_10_35_d)
print("\nKMeans labels saved successfully.")

In [None]:
# One small line plot per cluster: neighbor count as a function of the run.
total_runs = neighbor_counts_10_35_d.shape[0]
total_clusters = neighbor_counts_10_35_d.shape[1]
run_numbers = range(1, total_runs + 1)

plt.figure(figsize=(16, 12))

for cluster_idx in range(total_clusters):
    # 2 x 5 grid, i.e. laid out for 10 clusters
    plt.subplot(2, 5, cluster_idx + 1)
    plt.plot(run_numbers, neighbor_counts_10_35_d[:, cluster_idx], marker='o')
    plt.title(f'Cluster {cluster_idx}')
    plt.xlabel('Run')
    plt.ylabel('Neighbor Count')
    plt.xticks(run_numbers[::5])  # tick every 5th run to keep the axis readable
    plt.grid(True)

# Tighten the layout, then add the figure-level title above the grid
plt.tight_layout()
plt.suptitle('N=10 Neighbor Counts per Cluster Across Runs', y=1.02, fontsize=16)
plt.show()

In [None]:
# Mean and max neighbor count over the clusters, for each run
mean_neighbors_10 = np.mean(neighbor_counts_10_35_d, axis=1)  # Shape: (n_runs,)
max_neighbors_10 = np.max(neighbor_counts_10_35_d, axis=1)    # Shape: (n_runs,)

# Least-squares trend lines over the run number
runs = np.arange(1, len(mean_neighbors_10) + 1)
mean_slope, mean_intercept, _, _, _ = linregress(runs, mean_neighbors_10)
max_slope, max_intercept, _, _, _ = linregress(runs, max_neighbors_10)

mean_trend_10 = mean_slope * runs + mean_intercept
max_trend_10 = max_slope * runs + max_intercept

# Plot the results
plt.figure(figsize=(12, 6))

# Observed series
plt.plot(runs, mean_neighbors_10, label='Mean Neighbor Count', marker='o', color='blue')
plt.plot(runs, max_neighbors_10, label='Max Neighbor Count', marker='s', color='orange')

# Trend lines: each one reuses its series' color (dashed). They were both
# green before, which made the two legend entries indistinguishable.
plt.plot(runs, mean_trend_10, linestyle='--', color='blue', label='Mean Trend Line')
plt.plot(runs, max_trend_10, linestyle='--', color='orange', label='Max Trend Line')

# Labels, legend, and title
plt.title('N=10 Neighbor Counts Across Runs (Mean vs. Max with Trend Lines)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.xticks(range(1, len(mean_neighbors_10) + 1, 5))  # Show every 5th run for readability
plt.legend()
plt.grid(True)
plt.tight_layout()

# Save before show(): the figure may be cleared once it has been displayed
plt.savefig('neighbor_counts_plot_n_10_35.png', dpi=300)

plt.show()

-----------

### n_neighbours= 20, n_runs = 35, n_clusters = 10 (for KMeans)

In [23]:
# Reload every saved artefact of the n_neighbors=20 analysis.
# Projections and per-run centroids
umap_projections_20 = np.load('umap_projections_neighbors_20.npy')
kmeans_centroids_20 = np.load("kmeans_centroids_neighbors_20.npy")

# Centroid statistics across runs
centroid_mean_20_35 = np.load('centroid_mean_20_35.npy')
centroid_std_20_35 = np.load('centroid_std_20_35.npy')

# NOTE(review): the table written later in this section is named
# 'result_table_neighbors_v2_20_35.csv'; confirm this file name is intended.
df_results_v2 = pd.read_csv('result_table_neighbors_20_35.csv')

# Distance matrices (raw and normalized)
mean_distance_matrix_20_35 = np.load('mean_distance_matrix_neighbors_20_35.npy')
distance_matrix_std_20_35 = np.load("distance_matrix_std_20_35.npy")
normalized_distance_matrix_std_20_35 = np.load('normalized_distance_matrix_std_20_35.npy')
normalized_mean_distance_matrix_20_35 = np.load('normalized_mean_distance_matrix_20_35.npy')

# NOTE(review): these two files are written with np.save from networkx
# objects further down; verify what np.load actually returns for them.
mst_std_20_35 = np.load("mst_std_20_35.npy")
mst_20_35 = np.load("mst_20_35.npy")

In [None]:
# Experiment configuration for this section
n_neighbors = 20  # UMAP neighborhood size
n_clusters = 10   # Number of clusters (for KMeans)
n_runs = 35       # Number of UMAP + KMeans repetitions

# Per-run outputs, filled by the loop below
kmeans_centroids_list_20_35 = []  # cluster centroids of each run
umap_projections_20_35 = []       # UMAP projection of each run

# Define a helper function to calculate the centroid of each cluster
def calculate_centroids(kmeans, x_umap):
    """Return the mean position of the points assigned to each cluster.

    Args:
        kmeans: Fitted clustering model exposing ``labels_`` and ``n_clusters``.
        x_umap: Array of points that were clustered; rows align with ``labels_``.

    Returns:
        Array with one row per cluster (index = cluster label) holding the
        mean of that cluster's points.
    """
    labels = kmeans.labels_
    # Take the cluster count from the model itself rather than from the
    # notebook-level ``n_clusters``, which other cells reassign.
    return np.array([
        np.mean(x_umap[labels == cluster_label], axis=0)
        for cluster_label in range(kmeans.n_clusters)
    ])

# Repeat UMAP + KMeans n_runs times and collect the projection and centroids of each run.
# NOTE(review): the statistics below treat cluster index i as the same cluster
# in every run; KMeans label order is presumably not aligned across
# independent UMAP projections. Confirm or align labels before aggregating.
for run in range(n_runs):
    # random_state=None leaves UMAP unseeded, so every run can differ
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1, n_components=2, random_state=None)
    x_train_umap = umap_model.fit_transform(x_train_flattened)
    umap_projections_20_35.append(x_train_umap)

    # Cluster the 2-D projection (fit returns the fitted estimator)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(x_train_umap)

    # Centroids recomputed from the label assignment
    centroids = calculate_centroids(kmeans, x_train_umap)
    kmeans_centroids_list_20_35.append(centroids)

# Stack the centroids of all runs into one array
kmeans_centroids_20_35 = np.array(kmeans_centroids_list_20_35)

# Mean and standard deviation of each centroid coordinate over the runs
centroid_mean_20_35 = kmeans_centroids_20_35.mean(axis=0)
centroid_std_20_35 = kmeans_centroids_20_35.std(axis=0)

# Persist the projections and the per-run centroids
np.save(f'umap_projections_neighbors_{n_neighbors}.npy', np.array(umap_projections_20_35))
np.save(f'kmeans_centroids_neighbors_{n_neighbors}.npy', np.array(kmeans_centroids_list_20_35))

In [None]:
# Read the saved UMAP projections back from disk and display them.
umap_projections_20_35 = np.load('umap_projections_neighbors_20.npy')
print(umap_projections_20_35)

In [71]:
### NO NEED TO RE RUN ###

# Persist the across-run centroid mean and std; the file name embeds the
# current value of n_neighbors.
for stat_name, stat_values in (('mean', centroid_mean_20_35), ('std', centroid_std_20_35)):
    np.save(f'centroid_{stat_name}_{n_neighbors}_35.npy', np.array(stat_values))

In [19]:
# Reload the across-run centroid mean and standard deviation.
centroid_mean_20_35 = np.load('centroid_mean_20_35.npy')
centroid_std_20_35 = np.load('centroid_std_20_35.npy')

In [20]:
# Per-run KMeans centroids saved by the UMAP/KMeans loop.
kmeans_centroids_20_35 = np.load("kmeans_centroids_neighbors_20.npy")

------------------------

Standard deviation calculation

In [None]:
### NO NEED TO RE RUN ###
# Spread of each cluster's centroid over the runs, separately for x and y.
std_dev_x_20_35 = np.zeros(10)
std_dev_y_20_35 = np.zeros(10)

for i in range(std_dev_x_20_35.size):
    # Coordinates of centroid i in every run (first axis = run)
    cluster_x_coords, cluster_y_coords = (
        kmeans_centroids_20_35[:, i, 0],
        kmeans_centroids_20_35[:, i, 1],
    )
    std_dev_x_20_35[i] = np.std(cluster_x_coords)
    std_dev_y_20_35[i] = np.std(cluster_y_coords)

# Report
print("Standard deviation of x coordinates per cluster:", std_dev_x_20_35)
print("Standard deviation of y coordinates per cluster:", std_dev_y_20_35)

In [75]:
### NO NEED TO RE RUN ###
# One row per (trial, cluster): the centroid and whether it lies within
# mean +/- 2 std on both axes.
data_20_35 = []

for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_20_35[trial, cluster]
        mean_x, mean_y = centroid_mean_20_35[cluster]

        # Two-standard-deviation band around the mean, per axis
        lower_bound_x = mean_x - 2 * std_dev_x_20_35[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x_20_35[cluster]
        lower_bound_y = mean_y - 2 * std_dev_y_20_35[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y_20_35[cluster]

        x_in_band = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        y_in_band = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = x_in_band and y_in_band

        # Trials are reported 1-based
        data_20_35.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Assemble the table
df_results_20_35 = pd.DataFrame(
    data_20_35,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)

In [76]:
### NO NEED TO RE RUN ###
# Per trial: True only when every cluster of that trial is inside the 2-std band.
inside_flags_by_trial = df_results_20_35.groupby('Trial')['Inside 2 std']
trials_all_true_20_35 = inside_flags_by_trial.all()

In [77]:
### NO NEED TO RE RUN ###
# Trial numbers whose clusters are all inside the band.
trials_with_all_true_20_35 = trials_all_true_20_35.index[trials_all_true_20_35].tolist()

In [None]:
### NO NEED TO RE RUN ###
# Show which trials passed the check for every cluster.
print("Trials where all clusters were True:", trials_with_all_true_20_35)

In [None]:
### NO NEED TO RE RUN ###
# Trial numbers with at least one cluster outside the band.
has_outside_cluster = ~trials_all_true_20_35
trials_with_some_false_20_35 = trials_all_true_20_35.index[has_outside_cluster].tolist()

print("Trials where some clusters were False:", trials_with_some_false_20_35)

In [80]:
### NO NEED TO RE RUN ###
# Write the results table to CSV (without the index column).
df_results_20_35.to_csv('result_table_neighbors_v2_20_35.csv', index=False)

-----------------

Removal outliers process

In [21]:
# Reload the per-trial results table.
# NOTE(review): 'Centroid Coord' presumably comes back as text after the CSV
# round trip; verify before treating it as coordinate pairs.
df_results_20_35 = pd.read_csv('result_table_neighbors_v2_20_35.csv')

In [83]:
# Table of mean centroids: one row per cluster with x_mean, y_mean and the cluster id.
centroid_mean_20_35_df = pd.DataFrame(
    centroid_mean_20_35, columns=['x_mean', 'y_mean']
).assign(Cluster=np.arange(10))

In [99]:
def _parse_centroid(value):
    """Return one centroid as a numeric array.

    Accepts either an array-like (table still in memory) or the text form
    such as "[1.2 3.4]" / "[1.2, 3.4]" that the column holds once the table
    has been written to and re-read from CSV.
    """
    if isinstance(value, str):
        return np.array(value.strip('[]').replace(',', ' ').split(), dtype=float)
    return np.asarray(value)


# Extract x and y coordinates. Parsing first keeps this working for a table
# reloaded from CSV, where calling .tolist() on strings would not yield pairs.
centroid_xy = np.vstack(df_results_20_35['Centroid Coord'].map(_parse_centroid).tolist())
df_results_20_35[['x', 'y']] = pd.DataFrame(centroid_xy, index=df_results_20_35.index)

# Merge the mean centroids dataframe with the results dataframe on 'Cluster'
df_merged_20_35 = pd.merge(df_results_20_35, centroid_mean_20_35_df, on='Cluster', how='left')

In [None]:
# Plot how each cluster's centroid coordinates change over the runs
n_runs = 35
n_clusters = 10
# This section analyses n_neighbors=20; the previous value 5 was a copy-paste
# leftover that would corrupt file names built from n_neighbors in other cells.
n_neighbors = 20

run_numbers = range(1, n_runs + 1)

# All X-coordinate figures first, then all Y-coordinate figures
for coord_idx, coord_name, line_color in ((0, 'X', 'b'), (1, 'Y', 'g')):
    for cluster in range(n_clusters):
        plt.figure(figsize=(10, 6))
        plt.plot(run_numbers, kmeans_centroids_20_35[:, cluster, coord_idx],
                 marker='o', linestyle='-', color=line_color)
        plt.title(f'Cluster {cluster} {coord_name}-coordinate Change Across Runs')
        plt.xlabel('Run')
        plt.ylabel(f'{coord_name} Centroid Coordinate')
        plt.grid(True)
        plt.show()

In [None]:
# Euclidean distance of every per-run centroid from its cluster's mean centroid.
offset_x = df_merged_20_35['x'] - df_merged_20_35['x_mean']
offset_y = df_merged_20_35['y'] - df_merged_20_35['y_mean']
df_merged_20_35['Distance_to_Mean'] = np.sqrt(offset_x**2 + offset_y**2)

# Apply an outlier threshold (by default the 90th percentile of the distances)
def filter_outliers(df, percentile=90):
    """Keep the rows whose ``Distance_to_Mean`` is at or below a percentile cut.

    Args:
        df: DataFrame with a ``Distance_to_Mean`` column.
        percentile: Percentile (0-100) of ``Distance_to_Mean`` used as the
            cut-off. Defaults to 90, the value that used to be hard-coded.

    Returns:
        The subset of ``df`` with ``Distance_to_Mean <= threshold``. An empty
        frame is returned unchanged, since no percentile can be computed for it.
    """
    if df.empty:
        return df
    threshold = np.percentile(df['Distance_to_Mean'], percentile)
    return df[df['Distance_to_Mean'] <= threshold]

# Filter each cluster independently, then flatten the grouped result again.
filtered_by_cluster = df_merged_20_35.groupby('Cluster').apply(filter_outliers)
df_no_outliers_20_35 = filtered_by_cluster.reset_index(drop=True)

# Drop the helper coordinate columns; the distance column is kept.
helper_columns = ['x', 'y', 'x_mean', 'y_mean']
df_no_outliers_cleaned_20_35 = df_no_outliers_20_35.drop(columns=helper_columns)

# Compare the sizes before and after filtering.
print(f"Original DataFrame size: {df_merged_20_35.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned_20_35.shape}")

# Display the final dataframe
df_no_outliers_cleaned_20_35

In [103]:
# Group the dataframe by 'Cluster'
clusters_grouped_20_35 = df_no_outliers_cleaned_20_35.groupby('Cluster')


def _centroid_to_array(value):
    """Return one 'Centroid Coord' entry as a numeric array.

    Handles both in-memory array-likes and the text form ("[x y]" or
    "[x, y]") the column holds after the table has been reloaded from CSV.
    """
    if isinstance(value, str):
        return np.array(value.strip('[]').replace(',', ' ').split(), dtype=float)
    return np.asarray(value)


# Map each cluster to an array with one row per retained centroid.
# Parsing each entry keeps the arrays numeric even when the column contains
# text, which a plain np.array(column.tolist()) would leave as strings.
clusters_centroids_20_35 = {}
for cluster, group in clusters_grouped_20_35:
    centroids_array = np.array([_centroid_to_array(coord) for coord in group['Centroid Coord']])
    clusters_centroids_20_35[cluster] = centroids_array

In [None]:
# Count the centroids kept for each cluster and report the counts.
cluster_sizes_20_35 = {}
for cluster, centroids in clusters_centroids_20_35.items():
    cluster_sizes_20_35[cluster] = len(centroids)

for cluster in cluster_sizes_20_35:
    size = cluster_sizes_20_35[cluster]
    print(f"Cluster {cluster} has {size} centroids considered.")

--------

Check whether it is fine for all clusters to have the same number of centroids after filtering out outliers. This must be due to:
- The Distance Distributions are Likely Very Similar
- Uniform Data structure

In [None]:
# One histogram per cluster of the centroid distances to the cluster mean.
for cluster, group in clusters_grouped_20_35:
    distances = group['Distance_to_Mean']

    plt.figure(figsize=(10, 5))
    plt.hist(distances, bins=10, edgecolor='black')
    plt.title(f'Cluster {cluster}: Distance to Mean Distribution')
    plt.xlabel('Distance to Mean')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
# Percentile threshold per cluster check
def check_percentiles(df):
    """Return the 90th percentile of the ``Distance_to_Mean`` values in ``df``."""
    return np.percentile(df['Distance_to_Mean'], 90)

# Print the 90th-percentile distance threshold of every cluster.
for cluster, group in clusters_grouped_20_35:
    threshold = check_percentiles(group)
    print(f"Cluster {cluster}: 90th percentile threshold = {threshold}")

In [None]:
# Compare each cluster's size before and after a stricter 70th-percentile cut.
for cluster, group in clusters_grouped_20_35:
    distances = group['Distance_to_Mean']

    # Cut-off for this cluster
    threshold = np.percentile(distances, 70)

    # Rows at or below the cut-off
    filtered_group = group[distances <= threshold]

    print(f"Cluster {cluster}: Original size = {len(group)}, Filtered size = {len(filtered_group)}")

--------

#### Mean Distance matrix n=20

##### Distance Mean matrix

**Distance matrix**: element $d_{ij}$ is the distance between the centers of cluster $i$ and cluster $j$.

In [None]:
# Centroid-to-centroid Euclidean distance matrix of every run, stacked along
# the first axis (35 runs, 10x10 distance matrices).
distance_matrices_20_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in kmeans_centroids_20_35
])

# Element-wise average over the runs
mean_distance_matrix_20_35 = np.mean(distance_matrices_20_35, axis=0)

# Min-max normalization of the mean matrix
mean_matrix_min = np.min(mean_distance_matrix_20_35)
mean_matrix_max = np.max(mean_distance_matrix_20_35)
normalized_mean_distance_matrix_20_35 = (
    (mean_distance_matrix_20_35 - mean_matrix_min) / (mean_matrix_max - mean_matrix_min)
)

# Heatmap of the normalized mean distance matrix
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_20_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=20)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_20_all_runs.npy', distance_matrices_20_35)
np.save('mean_distance_matrix_neighbors_20_35.npy', mean_distance_matrix_20_35)

# Report the (unnormalized) mean matrix
print(f"Mean distance matrix across all runs:\n{mean_distance_matrix_20_35}")

In [45]:
# Optional: reload (and round) the saved mean matrix before normalizing.
# mean_distance_matrix_20_35= np.load(f'mean_distance_matrix_neighbors_20_35.npy')
# mean_distance_matrix_20_35=np.round(mean_distance_matrix_20_35,3)

# Min-max normalize the mean distance matrix and store the result.
mean_matrix_floor = np.min(mean_distance_matrix_20_35)
mean_matrix_span = np.max(mean_distance_matrix_20_35) - np.min(mean_distance_matrix_20_35)
normalized_mean_distance_matrix_20_35 = (mean_distance_matrix_20_35 - mean_matrix_floor) / mean_matrix_span
np.save('normalized_mean_distance_matrix_20_35.npy', normalized_mean_distance_matrix_20_35)

In [None]:
# Build a weighted graph whose edge weights are the normalized mean distances
# (rounded to 3 decimals so the edge labels stay readable).
G_20_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_20_35, 3))

# Persist the graph as its weighted adjacency matrix. Passing the Graph object
# itself to np.save coerces it to an array by iterating it (node labels), so
# the edge weights would not be stored. Rebuild with nx.from_numpy_array.
np.save('G_20_35.npy', nx.to_numpy_array(G_20_35))

# Draw the graph
pos = nx.spring_layout(G_20_35, seed=42)  # positions for all nodes
nx.draw(G_20_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_20_35, 'weight')
nx.draw_networkx_edge_labels(G_20_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Minimum spanning tree of the mean-distance graph
mst_20_35 = nx.minimum_spanning_tree(G_20_35)

# Persist the tree as its weighted adjacency matrix. np.save on the Graph
# object would only store what iterating the graph yields (node labels), not
# the edges. Rebuild with nx.from_numpy_array after loading.
np.save("mst_20_35.npy", nx.to_numpy_array(mst_20_35))

# Define positions for all nodes
pos = nx.spring_layout(mst_20_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_20_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_20_35, 'weight')
nx.draw_networkx_edge_labels(mst_20_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST - n_neighbors=20")
plt.show()

**Distance Std. dev. Matrix**

##### Distance Std. dev. Matrix

In [None]:
# Pairwise Euclidean distance between the clusters' centroid-std vectors.
distance_matrix_std_20_35 = cdist(centroid_std_20_35, centroid_std_20_35, metric='euclidean')

# Min-max normalization
std_matrix_min = np.min(distance_matrix_std_20_35)
std_matrix_max = np.max(distance_matrix_std_20_35)
normalized_distance_matrix_std_20_35 = (
    (distance_matrix_std_20_35 - std_matrix_min) / (std_matrix_max - std_matrix_min)
)

# Heatmap of the normalized matrix
plt.figure(figsize=(8, 6))
sns.heatmap(normalized_distance_matrix_std_20_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=20)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.tight_layout()
plt.show()

# Persist both the raw and the normalized matrix for later analysis
for file_name, matrix in (
    ("distance_matrix_std_20_35.npy", distance_matrix_std_20_35),
    ("normalized_distance_matrix_std_20_35.npy", normalized_distance_matrix_std_20_35),
):
    np.save(file_name, matrix)


In [None]:
# Build a weighted graph from the normalized std-distance matrix
# (weights rounded to 3 decimals for readable edge labels).
G_std_20_35 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_20_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save on the Graph
# object would coerce it by iteration (node labels only) and lose the edge
# weights. Rebuild with nx.from_numpy_array after loading.
np.save('G_std_20_35.npy', nx.to_numpy_array(G_std_20_35))

# Draw the graph
pos = nx.spring_layout(G_std_20_35, seed=42)  # positions for all nodes
nx.draw(G_std_20_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_std_20_35, 'weight')
nx.draw_networkx_edge_labels(G_std_20_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the std-distance graph
mst_std_20_35 = nx.minimum_spanning_tree(G_std_20_35)

# Persist the MST itself: the previous code saved G_std_20_35 (the full
# graph) under the MST file name. It is stored as a weighted adjacency matrix
# because np.save on a Graph object does not keep the edges. Rebuild with
# nx.from_numpy_array after loading.
np.save('mst_std_20_35.npy', nx.to_numpy_array(mst_std_20_35))

# Define positions for all nodes
pos_std_20_35 = nx.spring_layout(mst_std_20_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_std_20_35, pos_std_20_35, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels_std_20_35 = nx.get_edge_attributes(mst_std_20_35, 'weight')
nx.draw_networkx_edge_labels(mst_std_20_35, pos_std_20_35, edge_labels=edge_labels_std_20_35, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - n_neighbors=20")
plt.show()

In [None]:
# Define a function to create heatmaps
def plot_heatmaps_side_by_side(matrices, titles, figsize=(16, 8), cmap="viridis", annot=True):
    """
    Plots multiple heatmaps side by side for given matrices and titles.

    Args:
        matrices (list): List of 2D matrices to plot as heatmaps.
        titles (list): List of titles corresponding to each matrix.
        figsize (tuple): Size of the entire figure (default: (16, 8)).
        cmap (str): Color map to use for all heatmaps (default: "viridis").
        annot (bool): Whether to annotate cells with their values (default: True).

    Raises:
        ValueError: If ``matrices`` is empty.
    """
    n = len(matrices)  # Number of heatmaps
    if n == 0:
        raise ValueError("matrices must contain at least one matrix")

    # squeeze=False always yields a 2-D array of Axes, so a single matrix
    # works too (plt.subplots would otherwise return a bare Axes object).
    fig, axes_grid = plt.subplots(1, n, figsize=figsize, squeeze=False)
    axes = axes_grid[0]

    for i, (matrix, title) in enumerate(zip(matrices, titles)):
        sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=axes[i])
        axes[i].set_title(title)
        axes[i].set_xlabel("Cluster")
        axes[i].set_ylabel("Cluster" if i == 0 else "")  # Only label y-axis for the first plot

    plt.tight_layout()
    plt.show()

# Show the std-distance and mean-distance heatmaps next to each other.
comparison_panels = [
    (normalized_distance_matrix_std_20_35,
     "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=20)"),
    (normalized_mean_distance_matrix_20_35,
     "Normalized Mean Distance Matrix (k=10, n_neighbors=20)"),
]
plot_heatmaps_side_by_side(
    matrices=[panel_matrix for panel_matrix, _ in comparison_panels],
    titles=[panel_title for _, panel_title in comparison_panels],
)

In [None]:
# Define a threshold for "low" values
threshold = 0.6

# Identify pairs of clusters with low values in both matrices.
# Both matrices are pairwise-distance matrices and therefore symmetric, so
# only the upper triangle (j > i) is scanned; scanning the full matrix
# reported every pair twice, as (i, j) and (j, i).
low_low_pairs = []
n_rows, n_cols = normalized_mean_distance_matrix_20_35.shape
for i in range(n_rows):
    for j in range(i + 1, n_cols):
        mean_value = normalized_mean_distance_matrix_20_35[i, j]
        std_value = normalized_distance_matrix_std_20_35[i, j]
        if mean_value < threshold and std_value < threshold:
            low_low_pairs.append((i, j, mean_value, std_value))

# Display the results
for cluster_i, cluster_j, mean_value, std_value in low_low_pairs:
    print(f"Clusters {cluster_i} and {cluster_j}: Mean Distance = {mean_value:.2f}, Std Distance = {std_value:.2f}")

0.6 is the lowest threshold so far.

Depending on the goal of the analysis we can think of it as:
- If the aim is to identify the strongest relationships between clusters, a lower threshold would make more sense.
- If we want to explore the broader connections, then it is fine.

In [None]:
# Hard-coded example pairs as (cluster_a, cluster_b, mean distance, std distance);
# replace with the pairs found by the threshold search.
low_low_pairs = [
    (0, 4, 0.58, 0.57),
    (3, 8, 0.58, 0.59),
    (6, 8, 0.60, 0.35),
    (7, 8, 0.60, 0.33),
    (7, 9, 0.58, 0.37),
]

# Saved projections and KMeans labels for all runs (replace with your actual data)
umap_projections = np.load("umap_projections_neighbors_20.npy")
kmeans_labels = np.load("kmeans_labels_list_20_35.npy")  # Shape: (n_runs, n_samples)

# Function to plot clusters
def plot_clusters(umap_projection, labels, cluster_pair, run_idx):
    """Scatter-plot the points of two clusters of one projection.

    Args:
        umap_projection: Array of points; columns 0 and 1 are plotted.
        labels: Cluster label of every point, aligned with ``umap_projection``.
        cluster_pair: The two cluster labels to draw (first blue, second orange).
        run_idx: Run number, used only in the plot title.
    """
    cluster_a, cluster_b = cluster_pair

    plt.figure(figsize=(8, 6))
    for cluster_id, point_color in ((cluster_a, "blue"), (cluster_b, "orange")):
        members = umap_projection[labels == cluster_id]
        plt.scatter(members[:, 0], members[:, 1], color=point_color,
                    label=f"Cluster {cluster_id}", alpha=0.6)

    plt.title(f"Run {run_idx}: Cluster {cluster_a} vs. Cluster {cluster_b}")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Analyze each cluster pair on one UMAP run (the first run is used for simplicity)
for cluster_pair in low_low_pairs:
    cluster_a, cluster_b = cluster_pair[:2]
    print(f"Analyzing Cluster Pair: {cluster_a} and {cluster_b}")

    run_idx = 0  # Use the first run for visualization
    run_projection = umap_projections[run_idx]
    run_labels = kmeans_labels[run_idx]
    plot_clusters(run_projection, run_labels, (cluster_a, cluster_b), run_idx)

    # Points of each cluster in this run
    points_in_a = run_projection[run_labels == cluster_a]
    points_in_b = run_projection[run_labels == cluster_b]

    # Euclidean distance between the two cluster centroids of this run
    distances_a_to_b = np.linalg.norm(points_in_a.mean(axis=0) - points_in_b.mean(axis=0))
    print(f"Mean Centroid Distance (Run {run_idx}): {distances_a_to_b:.2f}")

    # Per-dimension spread of each cluster
    cluster_a_std = np.std(points_in_a, axis=0)
    cluster_b_std = np.std(points_in_b, axis=0)
    print(f"Cluster {cluster_a} Std Dev: {cluster_a_std}")
    print(f"Cluster {cluster_b} Std Dev: {cluster_b_std}")
    print("\n")

**Cluster 1 and Cluster 8** have a moderate spatial relationship with visible overlap in the UMAP space. Their differing variability patterns suggest distinct structures, but the overlap points might represent shared features or transitions between the clusters.
The large spatial separation between their centroids suggests they represent distinct structures or classes in the data.

**Cluster 0 and Cluster 9**: Cluster 9 appears more compact and stable, while Cluster 0 is larger and more variable.
Their distinct regions in the UMAP space and differing standard deviations reinforce their meaningful separation.
Insights from Variability:

The variability of Cluster 0 could indicate sensitivity to UMAP parameters or noise in the data.

Confidence interval

In [None]:
# Parameters
confidence_level = 0.95
n_runs = 35  # Number of runs

# Two-sided critical value of the standard normal distribution
z_score = norm.ppf((1 + confidence_level) / 2)

# Standard error of the mean, then the margin of error around the mean distances.
# NOTE(review): if distance_matrix_std_20_35 was built like its n_neighbors=30
# counterpart (cdist between per-cluster centroid std vectors), it is not the
# across-run standard deviation of the pairwise distances - confirm that it is
# the dispersion estimate intended for this SEM.
sem_matrix_20_35 = distance_matrix_std_20_35 / np.sqrt(n_runs)
margin_of_error_matrix_20_35 = z_score * sem_matrix_20_35

# Confidence bounds; negative lower bounds are floored at zero
upper_limit_intconf_matrix_20_35 = mean_distance_matrix_20_35 + margin_of_error_matrix_20_35
lower_limit_intconf_matrix_20_35 = np.maximum(
    mean_distance_matrix_20_35 - margin_of_error_matrix_20_35, 0
)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_20_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_20_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_20_35)

# Save the bounds for future use
np.save('lower_limit_intconf_matrix_20_35.npy', lower_limit_intconf_matrix_20_35)
np.save('upper_limit_intconf_matrix_20_35.npy', upper_limit_intconf_matrix_20_35)

In [25]:
def normalize_matrix(matrix):
    """Min-max scale `matrix` so that its values span the [0, 1] range.

    Args:
        matrix: array-like of numbers.

    Returns:
        A float ndarray with the same shape as `matrix`. When every entry has
        the same value there is no range to scale by, so an all-zero array is
        returned instead of dividing by zero (which produced NaN).
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Each bound matrix is rescaled independently with its own min and max
norm_lower_limit_intconf_matrix_20_35, norm_upper_limit_intconf_matrix_20_35 = (
    normalize_matrix(lower_limit_intconf_matrix_20_35),
    normalize_matrix(upper_limit_intconf_matrix_20_35),
)

In [None]:
# One row of three heatmaps: mean (left), lower bound (middle), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (axes index, matrix, title, y-axis label); only the left-most panel gets a y label
heatmap_panels = [
    (1, norm_lower_limit_intconf_matrix_20_35,
     "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=20)", ""),
    (0, normalized_mean_distance_matrix_20_35,
     "Normalized Mean Dist. Matrix (k=10, n_neighbors=20)", "Cluster"),
    (2, norm_upper_limit_intconf_matrix_20_35,
     "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=20)", ""),
]

for panel_idx, panel_matrix, panel_title, panel_ylabel in heatmap_panels:
    panel_ax = axes[panel_idx]
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of a distance matrix on a given axis
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree derived from `matrix` on `ax`.

    Args:
        matrix: square matrix of distances; it is rounded to 3 decimals before
            the graph is built.
        title: title set on `ax`.
        ax: matplotlib axis that receives the drawing.
        color: edge colour of the tree (default 'red').
    """
    rounded_matrix = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_matrix))

    # Fixed seed so the node layout is reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Three side-by-side MSTs: mean, lower-bound and upper-bound distance matrices
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

mst_panels = (
    (normalized_mean_distance_matrix_20_35, "MST - Mean Distances", 'red'),
    (norm_lower_limit_intconf_matrix_20_35, "MST - Lower Limit", 'blue'),
    (norm_upper_limit_intconf_matrix_20_35, "MST - Upper Limit", 'green'),
)
for panel_ax, (panel_matrix, panel_title, edge_colour) in zip(axes, mst_panels):
    plot_mst(panel_matrix, panel_title, panel_ax, color=edge_colour)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

#### Intra class evaluation

In [None]:
# Define parameters
n_clusters = 10
radius = 0.5  # fixed radius around each centroid used for the neighbor count

# Per-run results: max intra-cluster distance, neighbor count, and KMeans labels
max_distances_20_35 = []
neighbor_counts_20_35 = []
kmeans_labels_list_20_35 = []

# Calculate intra-cluster metrics for each run and each cluster
for run_idx, x_umap in enumerate(umap_projections_20_35):
    # Re-run KMeans to get labels for each projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_20_35 = kmeans.labels_

    # Store the labels for this run
    kmeans_labels_list_20_35.append(kmeans_labels_20_35)

    run_max_distances_20_35 = []
    run_neighbor_counts_20_35 = []

    for cluster_idx in range(n_clusters):
        # Get all points in the current cluster
        cluster_points = x_umap[kmeans_labels_20_35 == cluster_idx]

        # Largest pairwise distance inside the cluster (0 when it has fewer than 2 points)
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')
        max_distance_20_35 = np.max(intra_distances) if len(cluster_points) > 1 else 0
        run_max_distances_20_35.append(max_distance_20_35)

        # Number of cluster points lying within `radius` of the cluster centroid
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_20_35 = np.sum(distances_to_centroid <= radius)
        run_neighbor_counts_20_35.append(neighbors_within_radius_20_35)

    # Append results for this run
    max_distances_20_35.append(run_max_distances_20_35)
    neighbor_counts_20_35.append(run_neighbor_counts_20_35)

# Convert lists to numpy arrays for easier analysis if needed
max_distances_20_35 = np.array(max_distances_20_35)  # Shape: (n_runs, n_clusters)
neighbor_counts_20_35 = np.array(neighbor_counts_20_35)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_20_35 = np.array(kmeans_labels_list_20_35)  # Shape: (n_runs, n_samples)

# Save under the n_neighbors=20 names. The previous '*_5_35' file names (one of
# them containing a stray space) would have overwritten the n_neighbors=5 results.
np.save('max_intra_cluster_distances_20_35.npy', max_distances_20_35)
np.save('neighbor_counts_within_radius_20_35.npy', neighbor_counts_20_35)
np.save('kmeans_labels_list_20_35.npy', kmeans_labels_list_20_35)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_20_35)
print("\nNeighbor counts within radius for each run and each cluster:\n", neighbor_counts_20_35)
print("\nKMeans labels saved successfully.")

In [50]:
# Reload the dynamic-radius metrics and the KMeans labels saved for n_neighbors=20
max_distances_20_35_d = np.load('max_intra_cluster_distances_dynamic_20_35.npy')
neighbor_counts_20_35_d = np.load('neighbor_counts_within_dynamic_radius_20_35.npy')
kmeans_labels_list_20_35_d = np.load('kmeans_labels_list_20_35.npy')

In [None]:
# Smallest centroid-to-centroid distance seen in any run, and where it occurred
overall_min_distance_20 = float('inf')
min_distance_clusters_20 = None
min_distance_run_idx_20 = None

for run_idx, run_centroids in enumerate(kmeans_centroids_20_35):
    # Pairwise distances between this run's centroids
    pairwise_distances_20 = cdist(run_centroids, run_centroids, metric='euclidean')

    # Self-distances on the diagonal are zero; mask them so they cannot win
    np.fill_diagonal(pairwise_distances_20, np.inf)

    # Position and value of the closest pair of centroids in this run
    closest_flat_index = np.argmin(pairwise_distances_20)
    min_distance = pairwise_distances_20.flat[closest_flat_index]

    if min_distance < overall_min_distance_20:
        # New overall minimum: remember the distance, the cluster pair and the run
        cluster_indices = np.unravel_index(closest_flat_index, pairwise_distances_20.shape)
        overall_min_distance_20 = min_distance
        min_distance_clusters_20 = cluster_indices
        min_distance_run_idx_20 = run_idx

# The dynamic radius is half of the smallest inter-centroid distance
dynamic_radius_20_35 = overall_min_distance_20 / 2
print(f"Dynamic radius: {dynamic_radius_20_35}")
print(f"Minimum distance: {overall_min_distance_20}")
print(f"Clusters contributing to minimum distance: {min_distance_clusters_20}")
print(f"Run index: {min_distance_run_idx_20}")

# Save dynamic radius
np.save('dynamic_radius_results_20_35.npy', dynamic_radius_20_35)


In [None]:
# Define parameters
n_clusters = 10

# Radius used for the neighbor count: the dynamic radius calculated previously
radius = dynamic_radius_20_35

# Per-run results: max intra-cluster distance, neighbor count, and KMeans labels
max_distances_20_35_d = []
neighbor_counts_20_35_d = []
kmeans_labels_list_20_35_d = []

for run_idx, x_umap in enumerate(umap_projections_20_35):
    # Cluster this run's projection again to obtain its labels
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_20_35_d = kmeans.labels_
    kmeans_labels_list_20_35_d.append(kmeans_labels_20_35_d)

    run_max_distances_20_35_d = []
    run_neighbor_counts_20_35_d = []

    for cluster_idx in range(n_clusters):
        # Points assigned to the current cluster
        cluster_points = x_umap[kmeans_labels_20_35_d == cluster_idx]

        # Largest pairwise distance inside the cluster (0 when it has fewer than 2 points)
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')
        if len(cluster_points) > 1:
            max_distance_20_35_d = np.max(intra_distances)
        else:
            max_distance_20_35_d = 0
        run_max_distances_20_35_d.append(max_distance_20_35_d)

        # Count the cluster points lying within the dynamic radius of the centroid
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_20_35_d = np.sum(distances_to_centroid <= radius)
        run_neighbor_counts_20_35_d.append(neighbors_within_radius_20_35_d)

    # Collect this run's per-cluster results
    max_distances_20_35_d.append(run_max_distances_20_35_d)
    neighbor_counts_20_35_d.append(run_neighbor_counts_20_35_d)

# Turn the nested lists into arrays
max_distances_20_35_d = np.array(max_distances_20_35_d)  # Shape: (n_runs, n_clusters)
neighbor_counts_20_35_d = np.array(neighbor_counts_20_35_d)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_20_35_d = np.array(kmeans_labels_list_20_35_d)  # Shape: (n_runs, n_samples)

# Persist the three arrays
np.save('max_intra_cluster_distances_dynamic_20_35.npy', max_distances_20_35_d)
np.save('neighbor_counts_within_dynamic_radius_20_35.npy', neighbor_counts_20_35_d)
np.save('kmeans_labels_list_20_35.npy', kmeans_labels_list_20_35_d)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_20_35_d)
print("\nNeighbor counts within dynamic radius for each run and each cluster:\n", neighbor_counts_20_35_d)
print("\nKMeans labels saved successfully.")

In [None]:
# One panel per cluster showing its neighbor count across all runs
plt.figure(figsize=(16, 12))

run_numbers = range(1, neighbor_counts_20_35_d.shape[0] + 1)
run_ticks = range(1, neighbor_counts_20_35_d.shape[0] + 1, 5)  # every 5th run, for readability

for cluster_idx in range(neighbor_counts_20_35_d.shape[1]):
    # 2 x 5 grid, i.e. room for 10 clusters
    plt.subplot(2, 5, cluster_idx + 1)
    plt.plot(run_numbers, neighbor_counts_20_35_d[:, cluster_idx], marker='o')
    plt.title(f'Cluster {cluster_idx}')
    plt.xlabel('Run')
    plt.ylabel('Neighbor Count')
    plt.xticks(run_ticks)
    plt.grid(True)

# Tighten the layout, then place the global title slightly above the panels
plt.tight_layout()
plt.suptitle('N=20 Neighbor Counts per Cluster Across Runs', y=1.02, fontsize=16)
plt.show()

In [None]:
# Calculate mean and max values across clusters for each run
mean_neighbors_20 = np.mean(neighbor_counts_20_35_d, axis=1)  # Shape: (n_runs,)
max_neighbors_20 = np.max(neighbor_counts_20_35_d, axis=1)    # Shape: (n_runs,)

# Least-squares trend lines for mean and max over the run number
runs = np.arange(1, len(mean_neighbors_20) + 1)
mean_slope, mean_intercept, _, _, _ = linregress(runs, mean_neighbors_20)
max_slope, max_intercept, _, _, _ = linregress(runs, max_neighbors_20)

# Calculate trend line values
mean_trend_20 = mean_slope * runs + mean_intercept
max_trend_20 = max_slope * runs + max_intercept

# Plot the results
plt.figure(figsize=(12, 6))

# Mean and max neighbor counts
plt.plot(runs, mean_neighbors_20, label='Mean Neighbor Count', marker='o', color='blue')
plt.plot(runs, max_neighbors_20, label='Max Neighbor Count', marker='s', color='orange')

# Trend lines: dashed, in the colour of the series they belong to. Both used to
# be green, which made them indistinguishable in the plot and in the legend.
plt.plot(runs, mean_trend_20, linestyle='--', color='blue', label='Mean Trend Line')
plt.plot(runs, max_trend_20, linestyle='--', color='orange', label='Max Trend Line')

# Add labels, legend, and title
plt.title('N=20 Neighbor Counts Across Runs (Mean vs. Max with Trend Lines)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.xticks(range(1, len(mean_neighbors_20) + 1, 5))  # Show every 5th run for readability
plt.legend()
plt.grid(True)
plt.tight_layout()

# Save the plot to a file (before plt.show(), while the figure is still current)
plt.savefig('neighbor_counts_plot_n_20_35.png', dpi=300)

# Show the plot
plt.show()

-------------

### n_neighbors = 30, n_runs = 35, n_clusters = 10 (for KMeans)

In [27]:
# Reload every saved artefact of the n_neighbors=30 analysis

# Projections, centroids and their per-cluster statistics
umap_projections_30 = np.load('umap_projections_neighbors_30.npy')
kmeans_centroids_30 = np.load("kmeans_centroids_neighbors_30.npy")
centroid_mean_30_35 = np.load('centroid_mean_30_35.npy')
centroid_std_30_35 = np.load('centroid_std_30_35.npy')

# Result table of the 2-std check
df_results_v2 = pd.read_csv('result_table_neighbors_30_35.csv')

# Distance matrices (raw and normalized)
mean_distance_matrix_30_35 = np.load('mean_distance_matrix_neighbors_30_35.npy')
distance_matrix_std_30_35 = np.load("distance_matrix_std_30_35.npy")
normalized_distance_matrix_std_30_35 = np.load('normalized_distance_matrix_std_30_35.npy')
normalized_mean_distance_matrix_30_35 = np.load('normalized_mean_distance_matrix_neighbors_30_35.npy')

# Saved minimum-spanning-tree files
mst_std_30_35 = np.load('mst_std_30_35.npy')
mst_30_35 = np.load('mst_30_35.npy')

In [None]:
### NO NEED TO RE RUN ###

# Number of UMAP/KMeans repetitions, number of KMeans clusters, UMAP n_neighbors
n_runs, n_clusters, n_neighbors = 35, 10, 30

# Filled in by the run loop: one UMAP projection and one centroid array per run
umap_projections_30_35 = []
kmeans_centroids_list_30_35 = []

# Define a helper function to calculate the centroid of each cluster
def calculate_centroids(kmeans, x_umap):
    """Return the mean position of the points assigned to each cluster.

    The number of clusters is read from the estimator itself
    (``kmeans.n_clusters``) rather than from the notebook-level ``n_clusters``
    variable, so the result cannot drift if that global is changed between
    fitting and calling this helper.

    Args:
        kmeans: fitted clustering estimator exposing ``labels_`` and
            ``n_clusters`` (as sklearn's KMeans does).
        x_umap: array of points whose rows line up with ``kmeans.labels_``.

    Returns:
        ndarray with one row per cluster id ``0 .. n_clusters - 1``, each row
        being the mean of that cluster's points.
    """
    labels = kmeans.labels_
    return np.array(
        [np.mean(x_umap[labels == i], axis=0) for i in range(kmeans.n_clusters)]
    )

# Repeat UMAP + KMeans n_runs times with identical parameters
for run in range(n_runs):
    # random_state=None: no fixed seed is passed to UMAP
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1, n_components=2, random_state=None)
    x_train_umap = umap_model.fit_transform(x_train_flattened)

    # Cluster the 2-D projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_train_umap)

    # Per-cluster centroids of this run
    centroids = calculate_centroids(kmeans, x_train_umap)

    # Keep the projection and the centroids of this run
    umap_projections_30_35.append(x_train_umap)
    kmeans_centroids_list_30_35.append(centroids)

# Stack the per-run centroids: shape (n_runs, n_clusters, 2)
kmeans_centroids = np.array(kmeans_centroids_list_30_35)

# Mean and standard deviation of every centroid coordinate across the runs
centroid_mean = kmeans_centroids.mean(axis=0)
centroid_std = kmeans_centroids.std(axis=0)

# Save the UMAP projections and KMeans centroids
np.save(f'umap_projections_neighbors_{n_neighbors}.npy', np.array(umap_projections_30_35))
np.save(f'kmeans_centroids_neighbors_{n_neighbors}.npy', np.array(kmeans_centroids_list_30_35))

In [None]:
# Reload the saved UMAP projections and print them for inspection
umap_projections_30_35 = np.load('umap_projections_neighbors_30.npy')
print(umap_projections_30_35)

In [88]:
### NO NEED TO RE RUN ###

# Persist the across-run centroid mean and standard deviation
for stat_name, stat_values in (('mean', centroid_mean), ('std', centroid_std)):
    np.save(f'centroid_{stat_name}_{n_neighbors}_35.npy', np.array(stat_values))

In [None]:
# Reload the saved centroid mean / standard deviation for n_neighbors=30
centroid_mean_30_35 = np.load('centroid_mean_30_35.npy')
centroid_std_30_35 = np.load('centroid_std_30_35.npy')

In [None]:
# Load the saved centroids data
kmeans_centroids_30_35 = np.load("kmeans_centroids_neighbors_30.npy")

------------------------

Standard deviation calculation

In [None]:
### NO NEED TO RE RUN ###
# Standard deviation of each cluster's centroid x and y coordinate over all runs.
# kmeans_centroids_30_35 is indexed as [run, cluster, coordinate].
std_dev_x_30_35 = np.array([np.std(kmeans_centroids_30_35[:, i, 0]) for i in range(10)])
std_dev_y_30_35 = np.array([np.std(kmeans_centroids_30_35[:, i, 1]) for i in range(10)])

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_30_35)
print("Standard deviation of y coordinates per cluster:", std_dev_y_30_35)

In [None]:
### NO NEED TO RE RUN ###
# Rows of the result table: one per (trial, cluster)
data_30_35 = []

for trial in range(35):
    for cluster in range(10):
        # Centroid of this cluster in this trial
        centroid_coord = kmeans_centroids_30_35[trial, cluster]

        # Box of +/- 2 standard deviations around the cluster's mean centroid
        mean_x, mean_y = centroid_mean_30_35[cluster]
        two_std_x = 2 * std_dev_x_30_35[cluster]
        two_std_y = 2 * std_dev_y_30_35[cluster]
        x_inside = mean_x - two_std_x <= centroid_coord[0] <= mean_x + two_std_x
        y_inside = mean_y - two_std_y <= centroid_coord[1] <= mean_y + two_std_y

        # The centroid is "inside" only when both coordinates are inside
        inside_2_std = x_inside and y_inside

        # Trials are numbered from 1 in the table
        data_30_35.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Create a DataFrame from the collected rows
df_results_30_35 = pd.DataFrame(
    data_30_35,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)

In [None]:
### NO NEED TO RE RUN ###
# Per trial: True only when every cluster of that trial has 'Inside 2 std' == True
trials_all_true_30_35 = (
    df_results_30_35['Inside 2 std']
    .groupby(df_results_30_35['Trial'])
    .all()
)

In [None]:
### NO NEED TO RE RUN ###
# Trial numbers for which all clusters were inside the 2-std range
trials_with_all_true_30_35 = (
    trials_all_true_30_35.loc[trials_all_true_30_35].index.tolist()
)

In [None]:
### NO NEED TO RE RUN ###
# Report the trials in which every cluster passed the check
print(
    "Trials where all clusters were True:",
    trials_with_all_true_30_35,
)

In [None]:
### NO NEED TO RE RUN ###
# Trial numbers for which at least one cluster fell outside the 2-std range
some_false_mask = ~trials_all_true_30_35
trials_with_some_false_30_35 = trials_all_true_30_35.loc[some_false_mask].index.tolist()

# Report them
print("Trials where some clusters were False:", trials_with_some_false_30_35)

In [None]:
### NO NEED TO RE RUN ###
# Write the result table to CSV without the DataFrame index
df_results_30_35.to_csv('result_table_neighbors_30_35.csv', index=False)

-----------------

Removal outliers process

In [None]:
# Reload the saved result table; 'Centroid Coord' comes back as text and is parsed in a later cell
df_results_30_35 = pd.read_csv('result_table_neighbors_30_35.csv')

In [None]:
# Mean centroid per cluster as a DataFrame with columns 'x_mean', 'y_mean', 'Cluster'
centroid_mean_neighbors_30_35_df = pd.DataFrame(
    centroid_mean_30_35, columns=['x_mean', 'y_mean']
).assign(Cluster=np.arange(10))

In [None]:
# `ast` is not among the notebook's top-level imports, so import it here;
# without it the literal_eval call below raises NameError.
import ast

# Step 1: Add commas between numbers in 'Centroid Coord' entries if they are missing,
# so that the text becomes a valid Python list literal
df_results_30_35['Centroid Coord'] = df_results_30_35['Centroid Coord'].str.replace(
    r'(\-?\d+\.\d+)\s+(\-?\d+\.\d+)', r'\1, \2', regex=True
)

# Step 2: Convert 'Centroid Coord' from string to list
# (literal_eval only evaluates literals, not arbitrary expressions)
df_results_30_35['Centroid Coord'] = df_results_30_35['Centroid Coord'].apply(ast.literal_eval)

# Step 3: Collect the rows whose 'Centroid Coord' is not a list of length 2
invalid_rows = df_results_30_35[df_results_30_35['Centroid Coord'].apply(lambda x: not (isinstance(x, list) and len(x) == 2))]


In [None]:
# Split each [x, y] entry of 'Centroid Coord' into separate 'x' and 'y' columns
coordinate_columns = pd.DataFrame(
    df_results_30_35['Centroid Coord'].tolist(),
    index=df_results_30_35.index,
)
df_results_30_35[['x', 'y']] = coordinate_columns

# Attach every cluster's mean centroid to its rows (left join on 'Cluster')
df_merged_30_35 = df_results_30_35.merge(
    centroid_mean_neighbors_30_35_df, on='Cluster', how='left'
)

In [None]:
# Centroid coordinate of every cluster across runs: all X plots first, then all Y plots
n_runs = 35
n_clusters = 10
n_neighbors = 30

run_numbers = range(1, n_runs + 1)

# (index into the coordinate axis, axis name used in the labels, line colour)
for coord_idx, coord_name, line_colour in ((0, 'X', 'b'), (1, 'Y', 'g')):
    for cluster in range(n_clusters):
        plt.figure(figsize=(10, 6))
        plt.plot(
            run_numbers,
            kmeans_centroids_30_35[:, cluster, coord_idx],
            marker='o',
            linestyle='-',
            color=line_colour,
        )
        plt.title(f'Cluster {cluster} {coord_name}-coordinate Change Across Runs')
        plt.xlabel('Run')
        plt.ylabel(f'{coord_name} Centroid Coordinate')
        plt.grid(True)
        plt.show()

In [None]:
# Euclidean distance from each run's centroid to its cluster's mean centroid
offset_x = df_merged_30_35['x'] - df_merged_30_35['x_mean']
offset_y = df_merged_30_35['y'] - df_merged_30_35['y_mean']
df_merged_30_35['Distance_to_Mean'] = np.sqrt(offset_x**2 + offset_y**2)

# Outlier filter based on a percentile of the distance (applied per cluster by the caller)
def filter_outliers(df, percentile=90):
    """Keep the rows whose 'Distance_to_Mean' does not exceed a percentile cut-off.

    Args:
        df: DataFrame with a numeric 'Distance_to_Mean' column.
        percentile: percentile (0-100) of 'Distance_to_Mean' used as the
            cut-off. Defaults to 90, the value that used to be hard-coded.

    Returns:
        The rows of `df` whose 'Distance_to_Mean' is less than or equal to the
        cut-off, with their original index.
    """
    threshold = np.percentile(df['Distance_to_Mean'], percentile)
    return df[df['Distance_to_Mean'] <= threshold]

# Filter every cluster separately, then flatten the result back to a plain index
df_no_outliers = (
    df_merged_30_35
    .groupby('Cluster')
    .apply(filter_outliers)
    .reset_index(drop=True)
)

# Drop the helper columns; only the distance is needed from here on
helper_columns = ['x', 'y', 'x_mean', 'y_mean']
df_no_outliers_cleaned_30_35 = df_no_outliers.drop(columns=helper_columns)

# Compare the table sizes before and after the filtering
print(f"Original DataFrame size: {df_merged_30_35.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned_30_35.shape}")

# Display the final dataframe to the user
df_no_outliers_cleaned_30_35

In [None]:
# Group the filtered table by cluster
clusters_grouped_30_35 = df_no_outliers_cleaned_30_35.groupby('Cluster')

# Map each cluster to a NumPy array built from its 'Centroid Coord' entries
# (assumed to be [x, y] pairs, as in the original comment)
clusters_centroids_30_35 = {
    cluster: np.array(group['Centroid Coord'].tolist())
    for cluster, group in clusters_grouped_30_35
}

In [None]:
# Number of centroids kept per cluster after the outlier filtering
cluster_sizes = dict(
    zip(clusters_centroids_30_35, map(len, clusters_centroids_30_35.values()))
)

# Print the size of each cluster
for cluster, size in cluster_sizes.items():
    print(f"Cluster {cluster} has {size} centroids considered.")

--------

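Check whether it is fine for all clusters to have the same number of centroids after filtering out outliers. Note that a per-cluster percentile cut keeps the same fraction of each cluster's centroids by construction, so equal counts are expected whenever every cluster starts with the same number of centroids. Other factors that could contribute: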
- The Distance Distributions are Likely Very Similar
- Uniform Data structure

In [None]:
# One histogram per cluster of the distances between its centroids and their mean
for cluster, group in clusters_grouped_30_35:
    distances_to_mean = group['Distance_to_Mean']

    plt.figure(figsize=(10, 5))
    plt.hist(distances_to_mean, bins=10, edgecolor='black')
    plt.title(f'Cluster {cluster}: Distance to Mean Distribution')
    plt.xlabel('Distance to Mean')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
# Percentile threshold per cluster check
def check_percentiles(df, percentile=90):
    """Return a percentile of the 'Distance_to_Mean' column of `df`.

    Args:
        df: DataFrame with a numeric 'Distance_to_Mean' column.
        percentile: percentile (0-100) to compute. Defaults to 90, the value
            that used to be hard-coded.

    Returns:
        The requested percentile as computed by ``np.percentile``.
    """
    return np.percentile(df['Distance_to_Mean'], percentile)

# Print the 90th percentile threshold of every cluster group
for cluster, group in clusters_grouped_30_35:
    threshold = check_percentiles(group)
    print(f"Cluster {cluster}: 90th percentile threshold = {threshold}")

In [None]:
# Compare every cluster's size before and after a 70th-percentile distance cut
for cluster, group in clusters_grouped_30_35:
    cluster_distances = group['Distance_to_Mean']

    # Cut-off for this cluster
    threshold = np.percentile(cluster_distances, 70)

    # Rows at or below the cut-off
    filtered_group = group.loc[cluster_distances <= threshold]

    print(f"Cluster {cluster}: Original size = {len(group)}, Filtered size = {len(filtered_group)}")

--------

#### Mean Distance matrix n= 30

##### Distance Mean matrix

**Distance matrix**: element d_{ij} is the distance between the centers of cluster i and cluster j.

In [None]:
# Pairwise Euclidean distances between the centroids of every run,
# stacked into one array with a distance matrix per run
distance_matrices_30_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in kmeans_centroids_30_35
])

# The original loop left the last run's matrix bound to this name; keep that
distance_matrix_30_35 = distance_matrices_30_35[-1]

# Mean distance matrix across all runs
mean_distance_matrix_30_35 = distance_matrices_30_35.mean(axis=0)

# Min-max normalization of the mean distance matrix
mean_matrix_min = np.min(mean_distance_matrix_30_35)
mean_matrix_max = np.max(mean_distance_matrix_30_35)
normalized_mean_distance_matrix_30_35 = (mean_distance_matrix_30_35 - mean_matrix_min) / (mean_matrix_max - mean_matrix_min)

# Heatmap of the normalized mean distance matrix
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_30_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=30)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the per-run matrices, the mean matrix and its normalized version
np.save('distance_matrices_neighbors_30_all_runs.npy', distance_matrices_30_35)
np.save('mean_distance_matrix_neighbors_30_35.npy', mean_distance_matrix_30_35)
np.save('normalized_mean_distance_matrix_neighbors_30_35.npy', normalized_mean_distance_matrix_30_35)

# Mean distance matrix
print(f"Mean distance matrix across all runs:\n{mean_distance_matrix_30_35}")

In [None]:
# Reload the saved mean distance matrix (rounding to 3 decimals is left disabled)
mean_distance_matrix_30_35 = np.load('mean_distance_matrix_neighbors_30_35.npy')
# mean_distance_matrix_30_35=np.round(mean_distance_matrix_30_35,3)

# Min-max normalization of the mean distance matrix
matrix_low = np.min(mean_distance_matrix_30_35)
matrix_high = np.max(mean_distance_matrix_30_35)
normalized_mean_distance_matrix_30_35 = (mean_distance_matrix_30_35 - matrix_low) / (matrix_high - matrix_low)

In [None]:
# Create a graph from the normalized mean distance matrix (rounded to 3 decimals)
G_30_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_30_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save expects
# array-like data; passing the Graph object itself does not store its edges
# and weights in a reloadable form. Rebuild the graph after loading with
# nx.from_numpy_array(np.load('G_30_35.npy')).
np.save('G_30_35.npy', nx.to_numpy_array(G_30_35))

# Draw the graph with a reproducible layout
pos = nx.spring_layout(G_30_35, seed=42)  # positions for all nodes
nx.draw(G_30_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_30_35, 'weight')
nx.draw_networkx_edge_labels(G_30_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_30_35 = nx.minimum_spanning_tree(G_30_35)

# Persist the tree as its weighted adjacency matrix. np.save expects
# array-like data; passing the Graph object itself does not store its edges
# and weights in a reloadable form. Rebuild the tree after loading with
# nx.from_numpy_array(np.load('mst_30_35.npy')).
np.save('mst_30_35.npy', nx.to_numpy_array(mst_30_35))

# Define positions for all nodes (fixed seed for a reproducible layout)
pos = nx.spring_layout(mst_30_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_30_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_30_35, 'weight')
nx.draw_networkx_edge_labels(mst_30_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST - n_neighbors=30")
plt.show()

**Distance Std. dev. Matrix**

-------------

##### Distance Std. dev. Matrix

In [None]:
# Pairwise Euclidean distance between the clusters' centroid standard deviation vectors
distance_matrix_std_30_35 = cdist(centroid_std_30_35, centroid_std_30_35, metric='euclidean')

# Min-max normalization of that matrix
std_matrix_min = np.min(distance_matrix_std_30_35)
std_matrix_max = np.max(distance_matrix_std_30_35)
normalized_distance_matrix_std_30_35 = (distance_matrix_std_30_35 - std_matrix_min) / (std_matrix_max - std_matrix_min)

# Heatmap of the normalized matrix
plt.figure(figsize=(8, 6))
sns.heatmap(normalized_distance_matrix_std_30_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=30)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.tight_layout()
plt.show()

# Save both versions for later analysis
np.save("distance_matrix_std_30_35.npy", distance_matrix_std_30_35)
np.save("normalized_distance_matrix_std_30_35.npy", normalized_distance_matrix_std_30_35)


In [None]:
# Create a graph from the normalized std-deviation distance matrix (rounded to 3 decimals)
G_std_30_35 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_30_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save expects
# array-like data; passing the Graph object itself does not store its edges
# and weights in a reloadable form. Rebuild the graph after loading with
# nx.from_numpy_array(np.load('G_std_30_35.npy')).
np.save('G_std_30_35.npy', nx.to_numpy_array(G_std_30_35))

# Draw the graph with a reproducible layout
pos = nx.spring_layout(G_std_30_35, seed=42)  # positions for all nodes
nx.draw(G_std_30_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_std_30_35, 'weight')
nx.draw_networkx_edge_labels(G_std_30_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_std_30_35 = nx.minimum_spanning_tree(G_std_30_35)

# Persist the tree as its weighted adjacency matrix. np.save expects
# array-like data; passing the Graph object itself does not store its edges
# and weights in a reloadable form. Rebuild the tree after loading with
# nx.from_numpy_array(np.load('mst_std_30_35.npy')).
np.save('mst_std_30_35.npy', nx.to_numpy_array(mst_std_30_35))

# Define positions for all nodes (fixed seed for a reproducible layout)
pos_std_30_35 = nx.spring_layout(mst_std_30_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_std_30_35, pos_std_30_35, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels_std_30_35 = nx.get_edge_attributes(mst_std_30_35, 'weight')
nx.draw_networkx_edge_labels(mst_std_30_35, pos_std_30_35, edge_labels=edge_labels_std_30_35, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - n_neighbors=30")
plt.show()

In [None]:
# Define a function to create heatmaps
def plot_heatmaps_side_by_side(matrices, titles, figsize=(16, 8), cmap="viridis", annot=True):
    """
    Plots multiple heatmaps side by side for given matrices and titles.

    Works for a single matrix as well: the axes are requested with
    ``squeeze=False`` so they can always be indexed, whereas a squeezed
    1 x 1 grid is a bare Axes object and indexing it fails.

    Args:
        matrices (list): List of 2D matrices to plot as heatmaps.
        titles (list): List of titles corresponding to each matrix.
        figsize (tuple): Size of the entire figure (default: (16, 8)).
        cmap (str): Color map to use for all heatmaps (default: "viridis").
        annot (bool): Whether to annotate cells with their values (default: True).
    """
    n = len(matrices)  # Number of heatmaps
    # squeeze=False keeps `axes` a 2-D array of shape (1, n) even when n == 1
    fig, axes = plt.subplots(1, n, figsize=figsize, squeeze=False)

    for i, (matrix, title) in enumerate(zip(matrices, titles)):
        ax = axes[0, i]
        sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Cluster")
        ax.set_ylabel("Cluster" if i == 0 else "")  # Only label y-axis for the first plot

    plt.tight_layout()
    plt.show()

# Show the std-deviation and the mean distance matrix (n_neighbors=30) next to each other
plot_heatmaps_side_by_side(
    [normalized_distance_matrix_std_30_35, normalized_mean_distance_matrix_30_35],
    [
        "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=30)",
        "Normalized Mean Distance Matrix (k=10, n_neighbors=30)",
    ],
)

In [None]:
# Cut-off below which a normalized value counts as "low"
threshold = 0.55

# Collect every ordered off-diagonal pair (i, j) whose mean distance AND
# std distance are both below the threshold. Both (i, j) and (j, i) are kept.
low_low_pairs = [
    (i, j, normalized_mean_distance_matrix_30_35[i, j], normalized_distance_matrix_std_30_35[i, j])
    for i in range(normalized_mean_distance_matrix_30_35.shape[0])
    for j in range(normalized_mean_distance_matrix_30_35.shape[1])
    if i != j
    and normalized_mean_distance_matrix_30_35[i, j] < threshold
    and normalized_distance_matrix_std_30_35[i, j] < threshold
]

# Report one line per qualifying pair
for first_cluster, second_cluster, mean_value, std_value in low_low_pairs:
    print(f"Clusters {first_cluster} and {second_cluster}: Mean Distance = {mean_value:.2f}, Std Distance = {std_value:.2f}")

**0.55 is the lowest threshold used so far.**

Depending on the goal of the analysis we can think of it as:
- If the aim is to identify the strongest relationships between clusters, a lower threshold would make more sense.
- If we want to explore broader connections between clusters, then the current threshold of 0.55 is fine.

In [None]:
# Cluster pairs to inspect, given as (cluster_a, cluster_b, mean distance, std distance).
# Example values: replace with your cluster pairs from low_low_pairs.
low_low_pairs = [
    (7, 8, 0.53, 0.11),
    (7, 9, 0.55, 0.31),
]

# Stored UMAP projections and KMeans cluster labels (replace with your actual data)
projections_file = "umap_projections_neighbors_30.npy"
labels_file = "kmeans_labels_list_30_35.npy"
umap_projections = np.load(projections_file)
kmeans_labels = np.load(labels_file)  # Shape: (n_runs, n_samples)

# Function to plot clusters
def plot_clusters(umap_projection, labels, cluster_pair, run_idx):
    """Scatter-plot the points of two clusters of one UMAP projection.

    Args:
        umap_projection: Array of projected points; columns 0 and 1 are plotted.
        labels: Cluster label of every point, used to build boolean masks.
        cluster_pair: Pair (cluster_a, cluster_b) of cluster labels to compare.
        run_idx: Run number, only used in the figure title.
    """
    cluster_a, cluster_b = cluster_pair

    # Select the members of both clusters before any drawing happens
    selections = [
        (cluster_id, colour, umap_projection[labels == cluster_id])
        for cluster_id, colour in ((cluster_a, "blue"), (cluster_b, "orange"))
    ]

    plt.figure(figsize=(8, 6))
    for cluster_id, colour, members in selections:
        plt.scatter(members[:, 0], members[:, 1], color=colour, label=f"Cluster {cluster_id}", alpha=0.6)
    plt.title(f"Run {run_idx}: Cluster {cluster_a} vs. Cluster {cluster_b}")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Analyze each cluster pair
for cluster_pair in low_low_pairs:
    cluster_a, cluster_b = cluster_pair[0], cluster_pair[1]
    print(f"Analyzing Cluster Pair: {cluster_a} and {cluster_b}")

    # For simplicity, only one UMAP run (the first) is inspected
    run_idx = 0
    run_projection = umap_projections[run_idx]
    run_labels = kmeans_labels[run_idx]
    plot_clusters(run_projection, run_labels, (cluster_a, cluster_b), run_idx)

    # Points belonging to each of the two clusters in this run
    points_a = run_projection[run_labels == cluster_a]
    points_b = run_projection[run_labels == cluster_b]

    # Euclidean distance between the two cluster centroids
    distances_a_to_b = np.linalg.norm(points_a.mean(axis=0) - points_b.mean(axis=0))
    print(f"Mean Centroid Distance (Run {run_idx}): {distances_a_to_b:.2f}")

    # Per-dimension spread of each cluster
    cluster_a_std = np.std(points_a, axis=0)
    cluster_b_std = np.std(points_b, axis=0)
    print(f"Cluster {cluster_a} Std Dev: {cluster_a_std}")
    print(f"Cluster {cluster_b} Std Dev: {cluster_b_std}")
    print("\n")

**Cluster 1 and Cluster 8** have a moderate spatial relationship with visible overlap in the UMAP space. Their differing variability patterns suggest distinct structures, but the overlap points might represent shared features or transitions between the clusters.
The large spatial separation between their centroids suggests they represent distinct structures or classes in the data.

**Cluster 0 and Cluster 9**: Cluster 9 appears more compact and stable, while Cluster 0 is larger and more variable.
Their distinct regions in the UMAP space and differing standard deviations reinforce their meaningful separation.
Insights from Variability:

The variability of Cluster 0 could indicate sensitivity to UMAP parameters or noise in the data.

Confidence interval

In [None]:
# Parameters
n_runs = 35  # Number of runs
confidence_level = 0.95
# Two-sided critical value of the standard normal distribution
z_score = norm.ppf((1 + confidence_level) / 2)

# Standard error of the mean and the resulting margin of error.
# NOTE(review): this treats distance_matrix_std_30_35 as the per-pair standard
# deviation of the distances across runs - confirm that is what the matrix holds.
sem_matrix_30_35 = distance_matrix_std_30_35 / np.sqrt(n_runs)
margin_of_error_matrix_30_35 = z_score * sem_matrix_30_35

# Confidence bounds around the mean distances; the lower bound is floored at 0
lower_limit_intconf_matrix_30_35 = np.maximum(mean_distance_matrix_30_35 - margin_of_error_matrix_30_35, 0)
upper_limit_intconf_matrix_30_35 = mean_distance_matrix_30_35 + margin_of_error_matrix_30_35

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_30_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_30_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_30_35)

# Save both bound matrices for future use
for bound_file, bound_matrix in (
    ('lower_limit_intconf_matrix_30_35.npy', lower_limit_intconf_matrix_30_35),
    ('upper_limit_intconf_matrix_30_35.npy', upper_limit_intconf_matrix_30_35),
):
    np.save(bound_file, bound_matrix)

In [29]:
def normalize_matrix(matrix):
    """Min-max scale *matrix* so that its values span the range [0, 1].

    Args:
        matrix: Array-like of numbers (e.g. a NumPy array or nested lists).

    Returns:
        A float NumPy array of the same shape. The smallest input value maps
        to 0 and the largest to 1. A constant matrix (max == min) maps to all
        zeros instead of NaN from a division by zero.

    Raises:
        ValueError: If *matrix* is empty (raised by np.min).
    """
    values = np.asarray(matrix, dtype=float)
    lowest = np.min(values)
    span = np.max(values) - lowest
    if span == 0:
        # All entries are equal: there is no spread to scale by
        return np.zeros_like(values)
    return (values - lowest) / span

# Rescale the lower and the upper confidence bound, each with its own min and max
norm_lower_limit_intconf_matrix_30_35, norm_upper_limit_intconf_matrix_30_35 = (
    normalize_matrix(bound_matrix)
    for bound_matrix in (lower_limit_intconf_matrix_30_35, upper_limit_intconf_matrix_30_35)
)

In [None]:
# One row with three heatmaps: mean, lower bound and upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel; only the first panel labels its y-axis
heatmap_panels = (
    (normalized_mean_distance_matrix_30_35, "Normalized Mean Dist. Matrix (k=10, n_neighbors=30)", "Cluster"),
    (norm_lower_limit_intconf_matrix_30_35, "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=30)", ""),
    (norm_upper_limit_intconf_matrix_30_35, "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=30)", ""),
)

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on the given axes.

    Args:
        matrix: Square distance matrix; entries are rounded to 3 decimals and
            used as edge weights.
        title: Title set on *ax*.
        ax: Matplotlib axes to draw on.
        color: Edge color of the tree (default: 'red').
    """
    # Graph from the rounded distances, reduced to its minimum spanning tree
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed keeps the node layout reproducible
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (distance)
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# One row with three panels: MST of the mean, lower-limit and upper-limit matrices
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

mst_panels = (
    (normalized_mean_distance_matrix_30_35, "MST - Mean Distances", 'red'),
    (norm_lower_limit_intconf_matrix_30_35, "MST - Lower Limit", 'blue'),
    (norm_upper_limit_intconf_matrix_30_35, "MST - Upper Limit", 'green'),
)
for panel_ax, (panel_matrix, panel_title, panel_color) in zip(axes, mst_panels):
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

#### Intra class evaluation

In [None]:
# Define parameters
n_clusters = 10
radius = 0.5  # Fixed radius around each centroid used for the neighbor count

# Lists to store max distances, neighbor counts, and KMeans labels for each cluster in each run
max_distances_30_35 = []
neighbor_counts_30_35 = []
kmeans_labels_list_30_35 = []

# Calculate intra-cluster metrics for each run and each cluster
for run_idx, x_umap in enumerate(umap_projections_30_35):
    # Re-run KMeans to get labels for each projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_30_35 = kmeans.labels_

    # Store the labels computed for THIS run (n_neighbors=30), not labels
    # left over from another analysis
    kmeans_labels_list_30_35.append(kmeans_labels_30_35)

    run_max_distances_30_35 = []
    run_neighbor_counts_30_35 = []

    # For each cluster, calculate max intra-cluster distance and neighbor count around centroid
    for cluster_idx in range(n_clusters):
        # Get all points in the current cluster
        cluster_points = x_umap[kmeans_labels_30_35 == cluster_idx]

        # Calculate pairwise distances within the cluster
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')

        # Max distance within the cluster (0 when there are fewer than two points)
        max_distance_30_35 = np.max(intra_distances) if len(cluster_points) > 1 else 0
        run_max_distances_30_35.append(max_distance_30_35)

        # Calculate number of points within the radius around the centroid
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_30_35 = np.sum(distances_to_centroid <= radius)

        run_neighbor_counts_30_35.append(neighbors_within_radius_30_35)

    # Append results for this run
    max_distances_30_35.append(run_max_distances_30_35)
    neighbor_counts_30_35.append(run_neighbor_counts_30_35)

# Convert lists to numpy arrays for easier analysis if needed
max_distances_30_35 = np.array(max_distances_30_35)  # Shape: (n_runs, n_clusters)
neighbor_counts_30_35 = np.array(neighbor_counts_30_35)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_30_35 = np.array(kmeans_labels_list_30_35)  # Shape: (n_runs, n_samples)

# Save the max distances, neighbor counts, and KMeans labels arrays.
# The labels file name must match the one the loading cells read
# ('kmeans_labels_list_30_35.npy', without a space before the extension).
np.save('max_intra_cluster_distances_30_35.npy', max_distances_30_35)
np.save('neighbor_counts_within_radius_30_35.npy', neighbor_counts_30_35)
np.save('kmeans_labels_list_30_35.npy', kmeans_labels_list_30_35)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_30_35)
print("\nNeighbor counts within radius for each run and each cluster:\n", neighbor_counts_30_35)
print("\nKMeans labels saved successfully.")

In [52]:
# Reload the stored dynamic-radius results for n_neighbors=30
max_distances_30_35_d = np.load('max_intra_cluster_distances_dynamic_30_35.npy')
neighbor_counts_30_35_d = np.load('neighbor_counts_within_dynamic_radius_30_35.npy')
kmeans_labels_list_30_35_d = np.load('kmeans_labels_list_30_35.npy')

In [None]:
# Smallest centroid-to-centroid distance seen so far, with the cluster pair and run it came from
overall_min_distance_30 = float('inf')
min_distance_clusters_30 = None
min_distance_run_idx_30 = None

for run_idx, run_centroids in enumerate(kmeans_centroids_30_35):
    # Pairwise Euclidean distances between the centroids of this run
    pairwise_distances_30 = cdist(run_centroids, run_centroids, metric='euclidean')

    # Mask the zero self-distances so they can never be the minimum
    np.fill_diagonal(pairwise_distances_30, np.inf)

    # Position (row, column) and value of the closest pair of centroids in this run
    cluster_indices = np.unravel_index(np.argmin(pairwise_distances_30), pairwise_distances_30.shape)
    min_distance = pairwise_distances_30[cluster_indices]

    if min_distance < overall_min_distance_30:
        overall_min_distance_30 = min_distance
        min_distance_clusters_30 = cluster_indices
        min_distance_run_idx_30 = run_idx

# The dynamic radius is half of the smallest centroid distance over all runs
dynamic_radius_30_35 = overall_min_distance_30 / 2
print(f"Dynamic radius: {dynamic_radius_30_35}")
print(f"Minimum distance: {overall_min_distance_30}")
print(f"Clusters contributing to minimum distance: {min_distance_clusters_30}")
print(f"Run index: {min_distance_run_idx_30}")

# Save dynamic radius
np.save('dynamic_radius_results_30_35.npy', dynamic_radius_30_35)


In [None]:
# Number of KMeans clusters and the previously computed dynamic radius
n_clusters = 10
radius = dynamic_radius_30_35

# Per-run accumulators: max intra-cluster distances, neighbor counts and KMeans labels
max_distances_30_35_d, neighbor_counts_30_35_d, kmeans_labels_list_30_35_d = [], [], []

for run_idx, x_umap in enumerate(umap_projections_30_35):
    # Cluster this projection again to recover its labels
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_30_35_d = kmeans.labels_
    kmeans_labels_list_30_35_d.append(kmeans_labels_30_35_d)

    run_max_distances_30_35_d = []
    run_neighbor_counts_30_35_d = []

    for cluster_idx in range(n_clusters):
        # Members of the current cluster
        cluster_points = x_umap[kmeans_labels_30_35_d == cluster_idx]

        # Largest pairwise distance inside the cluster (0 for fewer than two points)
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')
        if len(cluster_points) > 1:
            max_distance_30_35_d = intra_distances.max()
        else:
            max_distance_30_35_d = 0
        run_max_distances_30_35_d.append(max_distance_30_35_d)

        # Number of members no farther than the dynamic radius from the centroid
        centroid = cluster_points.mean(axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_30_35_d = (distances_to_centroid <= radius).sum()
        run_neighbor_counts_30_35_d.append(neighbors_within_radius_30_35_d)

    # Keep the per-cluster results of this run
    max_distances_30_35_d.append(run_max_distances_30_35_d)
    neighbor_counts_30_35_d.append(run_neighbor_counts_30_35_d)

# Stack the per-run lists into arrays
max_distances_30_35_d = np.array(max_distances_30_35_d)  # Shape: (n_runs, n_clusters)
neighbor_counts_30_35_d = np.array(neighbor_counts_30_35_d)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_30_35_d = np.array(kmeans_labels_list_30_35_d)  # Shape: (n_runs, n_samples)

# Persist all three arrays
np.save('max_intra_cluster_distances_dynamic_30_35.npy', max_distances_30_35_d)
np.save('neighbor_counts_within_dynamic_radius_30_35.npy', neighbor_counts_30_35_d)
np.save('kmeans_labels_list_30_35.npy', kmeans_labels_list_30_35_d)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_30_35_d)
print("\nNeighbor counts within dynamic radius for each run and each cluster:\n", neighbor_counts_30_35_d)
print("\nKMeans labels saved successfully.")

In [None]:
# One subplot per cluster showing its neighbor count in every run
plt.figure(figsize=(16, 12))

total_runs = neighbor_counts_30_35_d.shape[0]
run_numbers = range(1, total_runs + 1)
tick_positions = range(1, total_runs + 1, 5)  # every 5th run, for readability

for cluster_idx in range(neighbor_counts_30_35_d.shape[1]):
    # 2 x 5 grid, i.e. room for 10 clusters
    plt.subplot(2, 5, cluster_idx + 1)
    plt.plot(run_numbers, neighbor_counts_30_35_d[:, cluster_idx], marker='o')
    plt.title(f'Cluster {cluster_idx}')
    plt.xlabel('Run')
    plt.ylabel('Neighbor Count')
    plt.xticks(tick_positions)
    plt.grid(True)

# Tidy the layout, then add a global title above the grid
plt.tight_layout()
plt.suptitle('N=30 Neighbor Counts per Cluster Across Runs', y=1.02, fontsize=16)
plt.show()

In [None]:
# Calculate mean and max values across clusters for each run
mean_neighbors_30 = np.mean(neighbor_counts_30_35_d, axis=1)  # Shape: (n_runs,)
max_neighbors_30 = np.max(neighbor_counts_30_35_d, axis=1)    # Shape: (n_runs,)

# Fit a least-squares line through each series (run number is the x value)
runs = np.arange(1, len(mean_neighbors_30) + 1)
mean_slope, mean_intercept, _, _, _ = linregress(runs, mean_neighbors_30)
max_slope, max_intercept, _, _, _ = linregress(runs, max_neighbors_30)

# Calculate trend line values
mean_trend_30 = mean_slope * runs + mean_intercept
max_trend_30 = max_slope * runs + max_intercept

# Plot the results
plt.figure(figsize=(12, 6))

# Mean neighbor counts
plt.plot(runs, mean_neighbors_30, label='Mean Neighbor Count', marker='o', color='blue')

# Max neighbor counts
plt.plot(runs, max_neighbors_30, label='Max Neighbor Count', marker='s', color='orange')

# Trend lines: each gets its own color so the two legend entries can be told apart
plt.plot(runs, mean_trend_30, linestyle='--', color='green', label='Mean Trend Line')
plt.plot(runs, max_trend_30, linestyle='--', color='red', label='Max Trend Line')

# Add labels, legend, and title
plt.title('N=30 Neighbor Counts Across Runs (Mean vs. Max with Trend Lines)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.xticks(range(1, len(mean_neighbors_30) + 1, 5))  # Show every 5th run for readability
plt.legend()
plt.grid(True)
plt.tight_layout()

# Save the plot to a file (before plt.show(), while the figure is still current)
plt.savefig('neighbor_counts_plot_n_30_35.png', dpi=300)

# Show the plot
plt.show()

-------

### n_neighbors = 50, n_runs = 35, n_clusters = 10 (for KMeans)

In [41]:
# Reload every stored artifact of the n_neighbors=50 analysis

# UMAP projections and per-run KMeans centroids
umap_projections_50 = np.load('umap_projections_neighbors_50.npy')
centroid_mean_50_35 = np.load('centroid_mean_50_35.npy')
centroid_std_50_35 = np.load('centroid_std_50_35.npy')
kmeans_centroids_50_35 = np.load("kmeans_centroids_neighbors_50.npy")

# Result table of the 2-standard-deviation check
df_results_v2 = pd.read_csv('result_table_neighbors_50_35.csv')

# Distance matrices (raw and normalized)
mean_distance_matrix_50_35 = np.load('mean_distance_matrix_neighbors_50_35.npy')
distance_matrix_std_50_35 = np.load("distance_matrix_std_50_35.npy")
normalized_distance_matrix_std_50_35 = np.load('normalized_distance_matrix_std_50_35.npy')
normalized_mean_distance_matrix_50_35 = np.load('normalized_mean_distance_matrix_50_35.npy')

# NOTE(review): np.load defaults to allow_pickle=False - confirm that
# mst_50_35.npy holds a plain numeric array and not a pickled graph object.
mst_50_35 = np.load('mst_50_35.npy')

In [None]:
### NO NEED TO RE RUN ###

# Experiment configuration: number of UMAP/KMeans runs, number of KMeans
# clusters and the UMAP n_neighbors value
n_runs, n_clusters, n_neighbors = 35, 10, 50

# Per-run accumulators
kmeans_centroids_list_50_35 = []  # centroids of every run
umap_projections_50_35 = []       # UMAP projection of every run

# Define a helper function to calculate the centroid of each cluster
def calculate_centroids(kmeans, x_umap):
    """Return the mean point of every cluster of a fitted KMeans model.

    The number of clusters is taken from the model itself (kmeans.n_clusters),
    not from a module-level variable, so the result always matches the model
    that produced the labels.

    Args:
        kmeans: Fitted clustering model exposing `labels_` and `n_clusters`.
        x_umap: Array of points that were clustered, one row per point.

    Returns:
        NumPy array with one row per cluster label 0..n_clusters-1 holding the
        mean of the points carrying that label.
    """
    labels = kmeans.labels_
    return np.array([
        np.mean(x_umap[labels == cluster_label], axis=0)
        for cluster_label in range(kmeans.n_clusters)
    ])

# Repeat the UMAP projection + KMeans clustering n_runs times
for run in range(n_runs):
    # Same UMAP parameters every run; random_state=None keeps each run random
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=0.1, n_components=2, random_state=None)
    x_train_umap = umap_model.fit_transform(x_train_flattened)

    # Cluster the 2D projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_train_umap)

    # Centroids of this run
    centroids = calculate_centroids(kmeans, x_train_umap)

    # Keep the projection and the centroids of this run
    umap_projections_50_35.append(x_train_umap)
    kmeans_centroids_list_50_35.append(centroids)

# Stack the centroids of all runs; shape: (n_runs, n_clusters, 2)
kmeans_centroids = np.array(kmeans_centroids_list_50_35)

# Mean and standard deviation of every centroid coordinate across the runs.
# NOTE(review): this compares clusters across runs by their label index -
# confirm that a given index refers to the same cluster in every run.
centroid_mean = kmeans_centroids.mean(axis=0)
centroid_std = kmeans_centroids.std(axis=0)

# Save the UMAP projections and KMeans centroids
np.save(f'umap_projections_neighbors_{n_neighbors}.npy', np.array(umap_projections_50_35))
np.save(f'kmeans_centroids_neighbors_{n_neighbors}.npy', np.array(kmeans_centroids_list_50_35))

In [None]:
# Reload the stored UMAP projections for n_neighbors=50 and show their contents
umap_projections_file_50 = 'umap_projections_neighbors_50.npy'
umap_projections_50_35 = np.load(umap_projections_file_50)
print(umap_projections_50_35)

In [None]:
### NO NEED TO RE RUN ###

# Persist the across-run mean and standard deviation of the centroids
for stats_file, stats_values in (
    ('centroid_mean_50_35.npy', centroid_mean),
    ('centroid_std_50_35.npy', centroid_std),
):
    np.save(stats_file, np.array(stats_values))

In [36]:
# Reload the across-run centroid statistics for n_neighbors=50
centroid_mean_50_35 = np.load('centroid_mean_50_35.npy')
centroid_std_50_35 = np.load('centroid_std_50_35.npy')

In [37]:
# Reload the saved per-run centroids
kmeans_centroids_file_50 = "kmeans_centroids_neighbors_50.npy"
kmeans_centroids_50_35 = np.load(kmeans_centroids_file_50)

------------------------

Standard deviation calculation

In [None]:
### NO NEED TO RE RUN ###
# Standard deviation of every cluster's centroid across all runs, computed
# separately for the x coordinate (index 0) and the y coordinate (index 1).
# kmeans_centroids_50_35 is indexed as [run, cluster, coordinate].
std_dev_x_50_35 = np.array(
    [np.std(kmeans_centroids_50_35[:, cluster_id, 0]) for cluster_id in range(10)],
    dtype=float,
)
std_dev_y_50_35 = np.array(
    [np.std(kmeans_centroids_50_35[:, cluster_id, 1]) for cluster_id in range(10)],
    dtype=float,
)

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_50_35)
print("Standard deviation of y coordinates per cluster:", std_dev_y_50_35)

In [None]:
### NO NEED TO RE RUN ###
# Rows of the result table: one row per (trial, cluster)
data_50_35 = []

for trial in range(35):
    for cluster in range(10):
        # Centroid of this cluster in this trial
        centroid_coord = kmeans_centroids_50_35[trial, cluster]
        x_coord, y_coord = centroid_coord[0], centroid_coord[1]

        # Mean position of the cluster and the 2-standard-deviation half-widths
        mean_x, mean_y = centroid_mean_50_35[cluster]
        two_std_x = 2 * std_dev_x_50_35[cluster]
        two_std_y = 2 * std_dev_y_50_35[cluster]

        # The centroid is "inside" only if both coordinates lie within mean +/- 2 std
        within_x = mean_x - two_std_x <= x_coord <= mean_x + two_std_x
        within_y = mean_y - two_std_y <= y_coord <= mean_y + two_std_y
        inside_2_std = within_x and within_y

        # Trials are numbered from 1 in the table
        data_50_35.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Turn the collected rows into a DataFrame
result_columns = ['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std']
df_results_50_35 = pd.DataFrame(data_50_35, columns=result_columns)

In [None]:
### NO NEED TO RE RUN ###
# Per trial: True only if every cluster of that trial is inside its 2-std range
inside_flags_by_trial = df_results_50_35.groupby('Trial')['Inside 2 std']
trials_all_true_50_35 = inside_flags_by_trial.all()

In [None]:
### NO NEED TO RE RUN ###
# Trial numbers for which all clusters were inside their 2-std range
trials_with_all_true_50_35 = trials_all_true_50_35.index[trials_all_true_50_35].tolist()

In [None]:
### NO NEED TO RE RUN ###
# Output the list of trials in which every cluster centroid was inside its 2-std range
print("Trials where all clusters were True:", trials_with_all_true_50_35)

In [None]:
### NO NEED TO RE RUN ###
# Trial numbers for which at least one cluster was outside its 2-std range
has_outside_cluster = ~trials_all_true_50_35
trials_with_some_false_50_35 = trials_all_true_50_35.index[has_outside_cluster].tolist()

# Output the list of trials where some clusters were False
print("Trials where some clusters were False:", trials_with_some_false_50_35)

In [None]:
### NO NEED TO RE RUN ###
# Write the result table to CSV without the DataFrame index
df_results_50_35.to_csv('result_table_neighbors_50_35.csv', index=False)

-----------------

Outlier removal process

In [38]:
# Reload the stored result table for n_neighbors=50
results_table_file_50 = 'result_table_neighbors_50_35.csv'
df_results_50_35 = pd.read_csv(results_table_file_50)

In [None]:
# Mean centroid of every cluster as a DataFrame with columns
# 'x_mean', 'y_mean' and 'Cluster' (cluster ids 0..9)
centroid_mean_neighbors_50_35_df = pd.DataFrame(
    centroid_mean_50_35, columns=['x_mean', 'y_mean']
).assign(Cluster=np.arange(10))

In [None]:
# Steps 1-2: Convert 'Centroid Coord' from its CSV string form (e.g. "[ 1.23 -4.56]"
# or "[1.23, -4.56]") into a list of floats.
def _parse_centroid_coord(value):
    """Return the coordinates stored in one 'Centroid Coord' cell as a list of floats.

    Strings are split on whitespace and/or commas after removing the square
    brackets, so numbers printed without a fractional digit (e.g. "5.") or in
    scientific notation are parsed too. Values that are already sequences are
    converted element-wise, which makes re-running this cell harmless.
    """
    if isinstance(value, str):
        tokens = value.strip().strip('[]').replace(',', ' ').split()
        return [float(token) for token in tokens]
    return [float(component) for component in value]

df_results_50_35['Centroid Coord'] = df_results_50_35['Centroid Coord'].apply(_parse_centroid_coord)

# Step 3: Verify if each entry in 'Centroid Coord' is a list of length 2;
# invalid_rows collects the rows that are not
invalid_rows = df_results_50_35[df_results_50_35['Centroid Coord'].apply(lambda x: not (isinstance(x, list) and len(x) == 2))]


In [None]:
# Split the [x, y] pairs into separate 'x' and 'y' columns, keeping the row index aligned
coordinate_columns = pd.DataFrame(df_results_50_35['Centroid Coord'].tolist(), index=df_results_50_35.index)
df_results_50_35[['x', 'y']] = coordinate_columns

# Attach every cluster's mean centroid to its rows (left join on 'Cluster')
df_merged_50_35 = df_results_50_35.merge(centroid_mean_neighbors_50_35_df, on='Cluster', how='left')

In [None]:
# Settings of the n_neighbors=50 analysis
n_runs = 35
n_clusters = 10
n_neighbors = 50

# One figure per cluster and coordinate: first all X-coordinate plots, then all
# Y-coordinate plots, each showing the centroid coordinate across the runs
run_numbers = range(1, n_runs + 1)
for coord_idx, axis_name, line_color in ((0, 'X', 'b'), (1, 'Y', 'g')):
    for cluster in range(n_clusters):
        plt.figure(figsize=(10, 6))
        plt.plot(run_numbers, kmeans_centroids_50_35[:, cluster, coord_idx], marker='o', linestyle='-', color=line_color)
        plt.title(f'Cluster {cluster} {axis_name}-coordinate Change Across Runs')
        plt.xlabel('Run')
        plt.ylabel(f'{axis_name} Centroid Coordinate')
        plt.grid(True)
        plt.show()

In [None]:
# Euclidean distance of every centroid to the mean centroid of its cluster
offset_x = df_merged_50_35['x'] - df_merged_50_35['x_mean']
offset_y = df_merged_50_35['y'] - df_merged_50_35['y_mean']
df_merged_50_35['Distance_to_Mean'] = np.sqrt(offset_x**2 + offset_y**2)

# Outlier rule: keep rows up to the 90th percentile of 'Distance_to_Mean'
def filter_outliers(df):
    """Return the rows of *df* whose 'Distance_to_Mean' is at most its 90th percentile.

    The percentile is computed over the rows of *df* itself, so when applied
    per group every cluster gets its own threshold.
    """
    distances = df['Distance_to_Mean']
    keep_mask = distances <= np.percentile(distances, 90)
    return df[keep_mask]

# Filter every cluster with its own threshold, then renumber the rows
filtered_per_cluster = df_merged_50_35.groupby('Cluster').apply(filter_outliers)
df_no_outliers = filtered_per_cluster.reset_index(drop=True)

# Step 7: Remove the helper columns that are no longer needed
helper_columns = ['x', 'y', 'x_mean', 'y_mean']
df_no_outliers_cleaned_50_35 = df_no_outliers.drop(columns=helper_columns)

# Step 8: Compare the table size before and after the filtering
print(f"Original DataFrame size: {df_merged_50_35.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned_50_35.shape}")

# Display the final dataframe (notebook cell output)
df_no_outliers_cleaned_50_35

In [None]:
# Group the outlier-free table by cluster
clusters_grouped_50_35 = df_no_outliers_cleaned_50_35.groupby('Cluster')

# Map every cluster to a NumPy array built from its 'Centroid Coord' entries
# (assumes each entry is an [x, y] pair)
clusters_centroids_50_35 = {
    cluster: np.array(group['Centroid Coord'].tolist())
    for cluster, group in clusters_grouped_50_35
}

In [None]:
# Number of centroids kept for every cluster, reported while the dictionary is built
cluster_sizes = {}
for cluster, centroids in clusters_centroids_50_35.items():
    size = len(centroids)
    cluster_sizes[cluster] = size
    print(f"Cluster {cluster} has {size} centroids considered.")

--------

Check whether it is reasonable for all clusters to have the same number of centroids after filtering out outliers. This is likely due to:
- The Distance Distributions are Likely Very Similar
- Uniform Data structure

In [None]:
# One histogram per cluster showing how the distances to the mean centroid are distributed
for cluster, group in clusters_grouped_50_35:
    cluster_distances = group['Distance_to_Mean']

    plt.figure(figsize=(10, 5))
    plt.hist(cluster_distances, bins=10, edgecolor='black')
    plt.title(f'Cluster {cluster}: Distance to Mean Distribution')
    plt.xlabel('Distance to Mean')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
# Percentile threshold per cluster check
def check_percentiles(df):
    """Return the 90th percentile of the 'Distance_to_Mean' column of *df*."""
    return np.percentile(df['Distance_to_Mean'], 90)

# Evaluate check_percentiles on every cluster group and report the value
for cluster, group in clusters_grouped_50_35:
    threshold = check_percentiles(group)
    report_line = f"Cluster {cluster}: 90th percentile threshold = {threshold}"
    print(report_line)

In [None]:
# Per cluster: find the 70th percentile of the distances, keep the rows at or
# below it, and report how many rows survive
for cluster, group in clusters_grouped_50_35:
    group_distances = group['Distance_to_Mean']

    # 70th percentile cut-off for this cluster
    threshold = np.percentile(group_distances, 70)

    # Rows whose distance does not exceed the cut-off
    filtered_group = group.loc[group_distances <= threshold]

    # Size before vs. after the filter
    print(f"Cluster {cluster}: Original size = {len(group)}, Filtered size = {len(filtered_group)}")

--------

#### Mean Distance matrix n= 50

##### Distance Mean matrix

**Distance matrix**: element $d_{ij}$ is the distance between the center of cluster $i$ and the center of cluster $j$.

In [None]:
# Pairwise Euclidean distances between the centroids, one matrix per run
distance_matrices_50_35 = []
for run_centroids in kmeans_centroids_50_35:
    distance_matrix_50_35 = cdist(run_centroids, run_centroids, metric='euclidean')
    distance_matrices_50_35.append(distance_matrix_50_35)

# Stack the per-run matrices into one array with a leading run axis
distance_matrices_50_35 = np.array(distance_matrices_50_35)

# Element-wise average over the run axis
mean_distance_matrix_50_35 = distance_matrices_50_35.mean(axis=0)

# Min-max rescale the mean matrix
matrix_min_50_35 = mean_distance_matrix_50_35.min()
matrix_max_50_35 = mean_distance_matrix_50_35.max()
normalized_mean_distance_matrix_50_35 = (
    (mean_distance_matrix_50_35 - matrix_min_50_35)
    / (matrix_max_50_35 - matrix_min_50_35)
)

# Annotated heatmap of the rescaled matrix
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_50_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.gca().set(
    title="Normalized Mean Distance Matrix (k=10, n_neighbors=50)",
    xlabel="Cluster",
    ylabel="Cluster",
)
plt.show()

# Persist the per-run, mean and normalized matrices
for out_path, out_array in (
    ('distance_matrices_neighbors_50_all_runs.npy', distance_matrices_50_35),
    ('mean_distance_matrix_neighbors_50_35.npy', mean_distance_matrix_50_35),
    ('normalized_mean_distance_matrix_50_35.npy', normalized_mean_distance_matrix_50_35),
):
    np.save(out_path, out_array)

# Show the mean distance matrix
print(f"Mean distance matrix across all runs:\n{mean_distance_matrix_50_35}")

In [None]:
# Reload the saved mean distance matrix
mean_distance_matrix_50_35 = np.load('mean_distance_matrix_neighbors_50_35.npy')
# mean_distance_matrix_50_35=np.round(mean_distance_matrix_50_35,3)

# Min-max rescale the reloaded matrix
reloaded_min_50_35 = mean_distance_matrix_50_35.min()
reloaded_span_50_35 = mean_distance_matrix_50_35.max() - mean_distance_matrix_50_35.min()
normalized_mean_distance_matrix_50_35 = (mean_distance_matrix_50_35 - reloaded_min_50_35) / reloaded_span_50_35

In [None]:
# Build a weighted graph from the normalized mean distance matrix (weights
# rounded to 3 decimals); nodes are the row/column indices of the matrix
G_50_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_50_35, 3))

# np.save() is meant for arrays: it coerces its argument to an array first, so
# a networkx Graph object is not stored faithfully. Persist the weighted
# adjacency matrix instead; the graph can be rebuilt with
# nx.from_numpy_array(np.load('G_50_35.npy')).
np.save('G_50_35.npy', nx.to_numpy_array(G_50_35))

# Draw the graph
pos = nx.spring_layout(G_50_35, seed=42)  # fixed seed so the layout is reproducible
nx.draw(G_50_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_50_35, 'weight')
nx.draw_networkx_edge_labels(G_50_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_50_35 = nx.minimum_spanning_tree(G_50_35)

# np.save() coerces its argument to an array, so a networkx Graph is not
# stored faithfully. Save the tree's weighted adjacency matrix instead (a 0
# entry means "no edge"); rebuild it with
# nx.from_numpy_array(np.load('mst_50_35.npy')).
np.save('mst_50_35.npy', nx.to_numpy_array(mst_50_35))

# Define positions for all nodes (fixed seed for a reproducible layout)
pos = nx.spring_layout(mst_50_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_50_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_50_35, 'weight')
nx.draw_networkx_edge_labels(mst_50_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST - n_neighbors=50")
plt.show()

-------------

##### Distance Std. dev. Matrix

In [None]:
# Pairwise Euclidean distances between the rows of centroid_std_50_35
distance_matrix_std_50_35 = cdist(centroid_std_50_35, centroid_std_50_35, metric='euclidean')

# Min-max rescale the distance matrix
std_matrix_min_50_35 = distance_matrix_std_50_35.min()
std_matrix_max_50_35 = distance_matrix_std_50_35.max()
normalized_distance_matrix_std_50_35 = (
    (distance_matrix_std_50_35 - std_matrix_min_50_35)
    / (std_matrix_max_50_35 - std_matrix_min_50_35)
)

# Annotated heatmap of the rescaled matrix
plt.figure(figsize=(8, 6))
sns.heatmap(normalized_distance_matrix_std_50_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.gca().set(
    title="Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=50)",
    xlabel="Cluster",
    ylabel="Cluster",
)
plt.tight_layout()
plt.show()

# Persist both the raw and the rescaled matrix for later analysis
for out_path, out_array in (
    ("distance_matrix_std_50_35.npy", distance_matrix_std_50_35),
    ("normalized_distance_matrix_std_50_35.npy", normalized_distance_matrix_std_50_35),
):
    np.save(out_path, out_array)


In [None]:
# Build a weighted graph from the normalized std-deviation distance matrix
# (weights rounded to 3 decimals)
G_std_50_35 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_50_35, 3))

# np.save() coerces its argument to an array, so a networkx Graph is not
# stored faithfully. Persist the weighted adjacency matrix instead; rebuild
# the graph with nx.from_numpy_array(np.load('G_std_50_35.npy')).
np.save('G_std_50_35.npy', nx.to_numpy_array(G_std_50_35))

# Draw the graph
pos = nx.spring_layout(G_std_50_35, seed=42)  # fixed seed so the layout is reproducible
nx.draw(G_std_50_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_std_50_35, 'weight')
nx.draw_networkx_edge_labels(G_std_50_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_std_50_35 = nx.minimum_spanning_tree(G_std_50_35)

# np.save() coerces its argument to an array, so a networkx Graph is not
# stored faithfully. Save the tree's weighted adjacency matrix instead (a 0
# entry means "no edge"); rebuild it with
# nx.from_numpy_array(np.load('mst_std_50_35.npy')).
np.save('mst_std_50_35.npy', nx.to_numpy_array(mst_std_50_35))

# Define positions for all nodes (fixed seed for a reproducible layout)
pos_std_50_35 = nx.spring_layout(mst_std_50_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_std_50_35, pos_std_50_35, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels_std_50_35 = nx.get_edge_attributes(mst_std_50_35, 'weight')
nx.draw_networkx_edge_labels(mst_std_50_35, pos_std_50_35, edge_labels=edge_labels_std_50_35, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - n_neighbors=50")
plt.show()

In [None]:
# Define a function to create heatmaps
def plot_heatmaps_side_by_side(matrices, titles, figsize=(16, 8), cmap="viridis", annot=True):
    """
    Plots multiple heatmaps side by side for given matrices and titles.

    Works for any number of matrices >= 1, including a single one.

    Args:
        matrices (list): List of 2D matrices to plot as heatmaps.
        titles (list): List of titles corresponding to each matrix.
        figsize (tuple): Size of the entire figure (default: (16, 8)).
        cmap (str): Color map to use for all heatmaps (default: "viridis").
        annot (bool): Whether to annotate cells with their values (default: True).
    """
    n = len(matrices)  # Number of heatmaps

    # squeeze=False always returns a 2-D array of Axes. Without it,
    # plt.subplots(1, 1) returns a bare Axes object and indexing it fails
    # when only one matrix is passed.
    fig, axes = plt.subplots(1, n, figsize=figsize, squeeze=False)
    row_axes = axes[0]

    for i, (ax, matrix, title) in enumerate(zip(row_axes, matrices, titles)):
        sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Cluster")
        ax.set_ylabel("Cluster" if i == 0 else "")  # Only label y-axis for the first plot

    plt.tight_layout()
    plt.show()

# Std-deviation heatmap on the left, mean-distance heatmap on the right
plot_heatmaps_side_by_side(
    [normalized_distance_matrix_std_50_35, normalized_mean_distance_matrix_50_35],
    [
        "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=50)",
        "Normalized Mean Distance Matrix (k=10, n_neighbors=50)",
    ],
)

In [None]:
# Define a threshold for "low" values
threshold = 0.65

# Identify pairs of clusters with low values in both matrices.
# Both matrices come from cdist(X, X) and are therefore symmetric, so only the
# upper triangle (j > i) is scanned; this skips the diagonal and avoids
# reporting each pair twice as (i, j) and (j, i).
low_low_pairs = []
n_rows, n_cols = normalized_mean_distance_matrix_50_35.shape
for i in range(n_rows):
    for j in range(i + 1, n_cols):
        mean_value = normalized_mean_distance_matrix_50_35[i, j]
        std_value = normalized_distance_matrix_std_50_35[i, j]
        if mean_value < threshold and std_value < threshold:
            low_low_pairs.append((i, j, mean_value, std_value))

# Display the results
for pair in low_low_pairs:
    print(f"Clusters {pair[0]} and {pair[1]}: Mean Distance = {pair[2]:.2f}, Std Distance = {pair[3]:.2f}")

**0.55  lowest threshold so far.**

Depending on the goal of the analysis we can think of it as:
- If the aim is to identify the strongest relationships between clusters, a lower threshold would make more sense.
- If we want to explore the broader connections, then it is fine.

In [None]:
# Hand-picked cluster pairs to inspect; each tuple has the same layout as the
# entries of low_low_pairs: (cluster_a, cluster_b, mean distance, std distance)
low_low_pairs = [
    (0, 8, 0.65, 0.54),
    (7, 8, 0.62, 0.17),
    (7, 9, 0.60, 0.25),
]

# Saved UMAP projections and KMeans labels for n_neighbors=50
umap_projections = np.load("umap_projections_neighbors_50.npy")
# Labels are expected to be shaped (n_runs, n_samples)
kmeans_labels = np.load("kmeans_labels_list_50_35.npy")

# Scatter plot of two clusters inside one UMAP projection
def plot_clusters(umap_projection, labels, cluster_pair, run_idx):
    """Show the points of two clusters from one projection in a single scatter plot.

    Args:
        umap_projection: 2-column array of projected points.
        labels: Cluster label per point, aligned with ``umap_projection``.
        cluster_pair: The two cluster ids to draw, as ``(cluster_a, cluster_b)``.
        run_idx: Run number, used only in the figure title.
    """
    cluster_a, cluster_b = cluster_pair

    # Select both point sets up front, then draw them in a fixed colour each
    layers = [
        (umap_projection[labels == cluster_a], "blue", f"Cluster {cluster_a}"),
        (umap_projection[labels == cluster_b], "orange", f"Cluster {cluster_b}"),
    ]

    plt.figure(figsize=(8, 6))
    for members, colour, legend_text in layers:
        plt.scatter(members[:, 0], members[:, 1], color=colour, label=legend_text, alpha=0.6)
    plt.title(f"Run {run_idx}: Cluster {cluster_a} vs. Cluster {cluster_b}")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Inspect every selected cluster pair within a single run
for cluster_pair in low_low_pairs:
    cluster_a, cluster_b = cluster_pair[0], cluster_pair[1]
    print(f"Analyzing Cluster Pair: {cluster_a} and {cluster_b}")

    # Only the first run is used for the visualization and the statistics
    run_idx = 0
    run_projection = umap_projections[run_idx]
    run_labels = kmeans_labels[run_idx]
    plot_clusters(run_projection, run_labels, (cluster_a, cluster_b), run_idx)

    # Points that belong to each cluster in this run
    members_a = run_projection[run_labels == cluster_a]
    members_b = run_projection[run_labels == cluster_b]

    # Euclidean distance between the two cluster means
    distances_a_to_b = np.linalg.norm(members_a.mean(axis=0) - members_b.mean(axis=0))
    print(f"Mean Centroid Distance (Run {run_idx}): {distances_a_to_b:.2f}")

    # Per-dimension spread of each cluster
    cluster_a_std = np.std(members_a, axis=0)
    cluster_b_std = np.std(members_b, axis=0)
    print(f"Cluster {cluster_a} Std Dev: {cluster_a_std}")
    print(f"Cluster {cluster_b} Std Dev: {cluster_b_std}")
    print("\n")

**Cluster 1 and Cluster 8** have a moderate spatial relationship with visible overlap in the UMAP space. Their differing variability patterns suggest distinct structures, but the overlap points might represent shared features or transitions between the clusters.
The large spatial separation between their centroids suggests they represent distinct structures or classes in the data.

**Cluster 0 and Cluster 9**: Cluster 9 appears more compact and stable, while Cluster 0 is larger and more variable.
Their distinct regions in the UMAP space and differing standard deviations reinforce their meaningful separation.
Insights from Variability:

The variability of Cluster 0 could indicate sensitivity to UMAP parameters or noise in the data.

Confidence interval

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Per-run centroid distance matrices, as written to disk by the
# "Distance Mean matrix" cell; the leading axis indexes the runs
distance_matrices_50_35 = np.load('distance_matrices_neighbors_50_all_runs.npy')
n_runs = distance_matrices_50_35.shape[0]  # Number of runs, taken from the data

# Step 1: Calculate the Standard Error of the Mean (SEM).
# The SEM of a mean distance needs the run-to-run standard deviation of that
# same distance. distance_matrix_std_50_35 is something else (the distance
# between centroid std-dev vectors), so it must not be used here.
std_across_runs_50_35 = np.std(distance_matrices_50_35, axis=0, ddof=1)  # sample std per cluster pair
sem_matrix_50_35 = std_across_runs_50_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error (normal approximation)
margin_of_error_matrix_50_35 = z_score * sem_matrix_50_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_50_35 = mean_distance_matrix_50_35 - margin_of_error_matrix_50_35
upper_limit_intconf_matrix_50_35 = mean_distance_matrix_50_35 + margin_of_error_matrix_50_35

# Distances cannot be negative, so clip the lower limit at 0
lower_limit_intconf_matrix_50_35 = np.maximum(lower_limit_intconf_matrix_50_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_50_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_50_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_50_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_50_35.npy', lower_limit_intconf_matrix_50_35)
np.save('upper_limit_intconf_matrix_50_35.npy', upper_limit_intconf_matrix_50_35)

In [33]:
def normalize_matrix(matrix):
    """Min-max rescale ``matrix`` so its smallest entry is 0 and its largest is 1.

    A matrix whose entries are all equal has no range to rescale; it is
    returned as all zeros instead of the NaNs a 0/0 division would produce.

    Args:
        matrix: Array-like of numbers.

    Returns:
        A float NumPy array with the same shape as ``matrix``.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: avoid dividing by zero
        return np.zeros_like(matrix)
    return (matrix - lowest) / span

# Min-max rescale the lower and upper confidence-limit matrices
norm_lower_limit_intconf_matrix_50_35, norm_upper_limit_intconf_matrix_50_35 = (
    normalize_matrix(limit_matrix)
    for limit_matrix in (lower_limit_intconf_matrix_50_35, upper_limit_intconf_matrix_50_35)
)

In [None]:
# One row of three panels: mean (left), lower bound (middle), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (axes index, matrix, panel title, y-axis label), listed in drawing order
heatmap_panels = [
    (1, norm_lower_limit_intconf_matrix_50_35, "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=50)", ""),
    (0, normalized_mean_distance_matrix_50_35, "Normalized Mean Dist. Matrix (k=10, n_neighbors=50)", "Cluster"),
    (2, norm_upper_limit_intconf_matrix_50_35, "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=50)", ""),
]

for panel_idx, panel_matrix, panel_title, panel_ylabel in heatmap_panels:
    panel_ax = axes[panel_idx]
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Draw the minimum spanning tree of a normalized distance matrix on an Axes
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree derived from ``matrix`` on ``ax``.

    Args:
        matrix: Square distance matrix; entries are rounded to 3 decimals and
            used as edge weights.
        title: Title to put on ``ax``.
        ax: Matplotlib Axes to draw on.
        color: Edge colour of the tree (default: 'red').
    """
    # Weighted graph -> minimum spanning tree
    rounded_weights = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_weights))

    # Node positions (fixed seed keeps the layout reproducible)
    layout = nx.spring_layout(tree, seed=42)

    # Nodes and edges
    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Edge weights as labels
    weight_labels = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weight_labels, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# One row of three panels, one MST per panel
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, panel title, edge colour) for the mean, lower and upper matrices
mst_panels = (
    (normalized_mean_distance_matrix_50_35, "MST - Mean Distances", 'red'),
    (norm_lower_limit_intconf_matrix_50_35, "MST - Lower Limit", 'blue'),
    (norm_upper_limit_intconf_matrix_50_35, "MST - Upper Limit", 'green'),
)
for panel_ax, (panel_matrix, panel_title, edge_colour) in zip(axes, mst_panels):
    plot_mst(panel_matrix, panel_title, panel_ax, color=edge_colour)

# Tidy the spacing and render
plt.tight_layout()
plt.show()

#### Intra class evaluation

In [54]:
# Reload the saved intra-cluster results for n_neighbors=50 (dynamic radius)
max_distances_50_35_d = np.load('max_intra_cluster_distances_dynamic_50_35.npy')
neighbor_counts_50_35_d = np.load('neighbor_counts_within_dynamic_radius_50_35.npy')
kmeans_labels_list_50_35_d = np.load('kmeans_labels_list_50_35.npy')

In [None]:
# Running record of the smallest centroid-to-centroid distance over all runs
overall_min_distance_50 = float('inf')
min_distance_clusters_50 = None
min_distance_run_idx_50 = None

for run_idx, run_centroids in enumerate(kmeans_centroids_50_35):
    # Pairwise Euclidean distances between this run's centroids
    pairwise_distances_50 = cdist(run_centroids, run_centroids, metric='euclidean')

    # Mask the zero self-distances on the diagonal so they cannot win
    np.fill_diagonal(pairwise_distances_50, np.inf)
    min_distance = pairwise_distances_50.min()

    # Nothing to update unless this run beats the current record
    if not (min_distance < overall_min_distance_50):
        continue

    overall_min_distance_50 = min_distance
    # (row, column) of the smallest entry = the two closest clusters
    cluster_indices = np.unravel_index(pairwise_distances_50.argmin(), pairwise_distances_50.shape)
    min_distance_clusters_50 = cluster_indices
    min_distance_run_idx_50 = run_idx

# The dynamic radius is half of the smallest inter-centroid distance
dynamic_radius_50_35 = overall_min_distance_50 / 2
print(f"Dynamic radius: {dynamic_radius_50_35}")
print(f"Minimum distance: {overall_min_distance_50}")
print(f"Clusters contributing to minimum distance: {min_distance_clusters_50}")
print(f"Run index: {min_distance_run_idx_50}")

# Persist the dynamic radius
np.save('dynamic_radius_results_50_35.npy', dynamic_radius_50_35)


In [None]:
# Clustering settings
n_clusters = 10

# Neighbourhood radius: the dynamic radius computed beforehand
radius = dynamic_radius_50_35

# Per-run accumulators: max intra-cluster distances, neighbour counts, labels
max_distances_50_35_d = []
neighbor_counts_50_35_d = []
kmeans_labels_list_50_35_d = []

# Walk over every UMAP projection and measure each of its clusters
for run_idx, x_umap in enumerate(umap_projections_50_35):
    # Cluster this projection again to obtain its labels
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(x_umap)
    kmeans_labels_50_35_d = kmeans.labels_

    # Keep the labels of this run
    kmeans_labels_list_50_35_d.append(kmeans_labels_50_35_d)

    run_max_distances_50_35_d = []
    run_neighbor_counts_50_35_d = []

    for cluster_idx in range(n_clusters):
        # Points assigned to the current cluster
        cluster_points = x_umap[kmeans_labels_50_35_d == cluster_idx]

        # All pairwise distances inside the cluster
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')

        # Largest pairwise distance (0 when the cluster has at most one point)
        if len(cluster_points) > 1:
            max_distance_50_35_d = np.max(intra_distances)
        else:
            max_distance_50_35_d = 0
        run_max_distances_50_35_d.append(max_distance_50_35_d)

        # Count the points lying within `radius` of the cluster mean
        centroid = cluster_points.mean(axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_50_35_d = (distances_to_centroid <= radius).sum()

        run_neighbor_counts_50_35_d.append(neighbors_within_radius_50_35_d)

    # Record this run's per-cluster results
    max_distances_50_35_d.append(run_max_distances_50_35_d)
    neighbor_counts_50_35_d.append(run_neighbor_counts_50_35_d)

# Turn the accumulators into arrays
max_distances_50_35_d = np.array(max_distances_50_35_d)  # Shape: (n_runs, n_clusters)
neighbor_counts_50_35_d = np.array(neighbor_counts_50_35_d)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_50_35_d = np.array(kmeans_labels_list_50_35_d)  # Shape: (n_runs, n_samples)

# Persist the three result arrays
for out_path, out_array in (
    ('max_intra_cluster_distances_dynamic_50_35.npy', max_distances_50_35_d),
    ('neighbor_counts_within_dynamic_radius_50_35.npy', neighbor_counts_50_35_d),
    ('kmeans_labels_list_50_35.npy', kmeans_labels_list_50_35_d),
):
    np.save(out_path, out_array)

# Show the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_50_35_d)
print("\nNeighbor counts within dynamic radius for each run and each cluster:\n", neighbor_counts_50_35_d)
print("\nKMeans labels saved successfully.")

In [None]:
# Plot neighbor counts for each cluster across all runs
plt.figure(figsize=(16, 12))

# Run numbers start at 1 on the x-axis
n_runs_plotted = neighbor_counts_50_35_d.shape[0]
run_numbers = range(1, n_runs_plotted + 1)

# One subplot per cluster (the 2x5 grid holds 10 clusters)
for cluster_idx in range(neighbor_counts_50_35_d.shape[1]):
    plt.subplot(2, 5, cluster_idx + 1)
    plt.plot(run_numbers, neighbor_counts_50_35_d[:, cluster_idx], marker='o')
    plt.title(f'Cluster {cluster_idx}')
    plt.xlabel('Run')
    plt.ylabel('Neighbor Count')
    plt.xticks(range(1, n_runs_plotted + 1, 5))  # Show every 5th run on the x-axis for clarity
    plt.grid(True)

# Adjust layout and show the plot
plt.tight_layout()
# These are the n_neighbors=50 results, so the title must say N=50 (it said N=30)
plt.suptitle('N=50 Neighbor Counts per Cluster Across Runs', y=1.02, fontsize=16)  # Add a global title
plt.show()

In [None]:
# Calculate mean and max values across clusters for each run
mean_neighbors_50 = np.mean(neighbor_counts_50_35_d, axis=1)  # Shape: (n_runs,)
max_neighbors_50 = np.max(neighbor_counts_50_35_d, axis=1)    # Shape: (n_runs,)

# Compute trend lines (least-squares fits against the run number) for mean and max
runs = np.arange(1, len(mean_neighbors_50) + 1)
mean_slope, mean_intercept, _, _, _ = linregress(runs, mean_neighbors_50)
max_slope, max_intercept, _, _, _ = linregress(runs, max_neighbors_50)

# Calculate trend line values
mean_trend_50 = mean_slope * runs + mean_intercept
max_trend_50 = max_slope * runs + max_intercept

# Plot the results
plt.figure(figsize=(12, 6))

# Mean neighbor counts
plt.plot(runs, mean_neighbors_50, label='Mean Neighbor Count', marker='o', color='blue')

# Max neighbor counts
plt.plot(runs, max_neighbors_50, label='Max Neighbor Count', marker='s', color='orange')

# Trend lines: each one is dashed and shares the colour of its own series, so
# the two can be told apart (both used to be green)
plt.plot(runs, mean_trend_50, linestyle='--', color='blue', label='Mean Trend Line')
plt.plot(runs, max_trend_50, linestyle='--', color='orange', label='Max Trend Line')

# Add labels, legend, and title
plt.title('N=50 Neighbor Counts Across Runs (Mean vs. Max with Trend Lines)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.xticks(range(1, len(mean_neighbors_50) + 1, 5))  # Show every 5th run for readability
plt.legend()
plt.grid(True)
plt.tight_layout()

# Save the plot to a file (before show(), while the figure is still current)
plt.savefig('neighbor_counts_plot_n_50_35.png', dpi=300)

# Show the plot
plt.show()

In [None]:
# Define parameters
n_clusters = 10
radius = 0.5  # Fixed neighbourhood radius around each cluster mean

# Lists to store max distances, neighbor counts, and KMeans labels for each cluster in each run
max_distances_50_35 = []
neighbor_counts_50_35 = []
kmeans_labels_list_50_35 = []

# Calculate intra-cluster metrics for each run and each cluster
for run_idx, x_umap in enumerate(umap_projections_50_35):
    # Re-run KMeans to get labels for each projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_50_35 = kmeans.labels_

    # Store the labels for this run.
    # Fix: this appended `kmeans_labels_5_35`, a name left over from the
    # n_neighbors=5 analysis, instead of the labels just computed.
    kmeans_labels_list_50_35.append(kmeans_labels_50_35)

    run_max_distances_50_35 = []
    run_neighbor_counts_50_35 = []

    # For each cluster, calculate max intra-cluster distance and neighbor count around centroid
    for cluster_idx in range(n_clusters):
        # Get all points in the current cluster
        cluster_points = x_umap[kmeans_labels_50_35 == cluster_idx]

        # Calculate pairwise distances within the cluster
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')

        # Max distance within the cluster (0 when there is at most one point)
        max_distance_50_35 = np.max(intra_distances) if len(cluster_points) > 1 else 0
        run_max_distances_50_35.append(max_distance_50_35)

        # Calculate number of neighbors within the radius around the centroid
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_50_35 = np.sum(distances_to_centroid <= radius)

        run_neighbor_counts_50_35.append(neighbors_within_radius_50_35)

    # Append results for this run
    max_distances_50_35.append(run_max_distances_50_35)
    neighbor_counts_50_35.append(run_neighbor_counts_50_35)

# Convert lists to numpy arrays for easier analysis if needed
max_distances_50_35 = np.array(max_distances_50_35)  # Shape: (n_runs, n_clusters)
neighbor_counts_50_35 = np.array(neighbor_counts_50_35)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_50_35 = np.array(kmeans_labels_list_50_35)  # Shape: (n_runs, n_samples)

# Save the max distances, neighbor counts, and KMeans labels arrays.
# Fix: the file names used the `_5_35` suffix (one even had a stray space
# before ".npy"), so the n_neighbors=50 results were written over the
# n_neighbors=5 files. They now carry the `_50_35` suffix.
np.save('max_intra_cluster_distances_50_35.npy', max_distances_50_35)
np.save('neighbor_counts_within_radius_50_35.npy', neighbor_counts_50_35)
np.save('kmeans_labels_list_50_35.npy', kmeans_labels_list_50_35)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_50_35)
print("\nNeighbor counts within radius for each run and each cluster:\n", neighbor_counts_50_35)
print("\nKMeans labels saved successfully.")

-------

### n_neighbors = 100, n_runs = 35, n_clusters = 10 (for KMeans)

In [10]:
# Reload every saved artifact of the n_neighbors=100 analysis

# Projections and centroid statistics
umap_projections_100 = np.load('umap_projections_neighbors_100.npy')
centroid_mean_100_35 = np.load('centroid_mean_100_35.npy')
centroid_std_100_35 = np.load('centroid_std_100_35.npy')
kmeans_centroids_100_35 = np.load("kmeans_centroids_neighbors_100.npy")

# Per-trial result table
df_results_v2 = pd.read_csv('result_table_neighbors_100_35.csv')

# Distance matrices (raw and normalized)
mean_distance_matrix_100_35 = np.load('mean_distance_matrix_neighbors_100_35.npy')
distance_matrix_std_100_35 = np.load("distance_matrix_std_100_35.npy")
normalized_distance_matrix_std_100_35 = np.load('normalized_distance_matrix_std_100_35.npy')
normalized_mean_distance_matrix_100_35 = np.load('normalized_mean_distance_matrix_100_35.npy')

# Saved minimum-spanning-tree files
mst_std_100_35 = np.load('mst_std_100_35.npy')
mst_100_35 = np.load('mst_100_35.npy')

In [None]:
### NO NEED TO RE RUN ###

# Experiment settings: number of UMAP + KMeans repetitions, number of KMeans
# clusters, and the UMAP n_neighbors value
n_runs, n_clusters, n_neighbors = 35, 10, 100

# Per-run results: the UMAP projection and the cluster centroids of each run
umap_projections_100_35 = list()
kmeans_centroids_list_100_35 = list()

# Define a helper function to calculate the centroid of each cluster
def calculate_centroids(kmeans, x_umap, n_clusters=None):
    """Return the mean point of every cluster as an (n_clusters, n_dims) array.

    Args:
        kmeans: Fitted clustering object exposing ``labels_`` (one label per
            row of ``x_umap``) and ``n_clusters``.
        x_umap: 2-D array of points that were clustered.
        n_clusters: Number of clusters to compute centroids for. Defaults to
            ``kmeans.n_clusters``, so the function no longer depends on a
            global variable of the same name.

    Returns:
        np.ndarray whose row ``i`` is the mean of the points labelled ``i``.
    """
    if n_clusters is None:
        n_clusters = kmeans.n_clusters
    labels = kmeans.labels_
    return np.array([np.mean(x_umap[labels == i], axis=0) for i in range(n_clusters)])

# Repeat UMAP + KMeans n_runs times
for run in range(n_runs):
    # Same UMAP settings every time; random_state=None leaves each run unseeded
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=0.1,
        n_components=2,
        random_state=None,
    )
    x_train_umap = umap_model.fit_transform(x_train_flattened)

    # Cluster the projected points
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(x_train_umap)

    # Mean point of every cluster for this run
    centroids = calculate_centroids(kmeans, x_train_umap)

    # Keep the projection and the centroids of this run
    umap_projections_100_35.append(x_train_umap)
    kmeans_centroids_list_100_35.append(centroids)

# Stack the per-run centroids: (n_runs, n_clusters, 2)
kmeans_centroids = np.array(kmeans_centroids_list_100_35)

# Mean and standard deviation of every centroid coordinate over the runs
centroid_mean = kmeans_centroids.mean(axis=0)
centroid_std = kmeans_centroids.std(axis=0)

# Persist the projections and the centroids
np.save(f'umap_projections_neighbors_{n_neighbors}.npy', np.array(umap_projections_100_35))
np.save(f'kmeans_centroids_neighbors_{n_neighbors}.npy', np.array(kmeans_centroids_list_100_35))

In [None]:
# Reload the saved UMAP projections for n_neighbors=100
umap_projections_100_35 = np.load('umap_projections_neighbors_100.npy')

# Print the loaded array for inspection
print(umap_projections_100_35)

In [9]:
### NO NEED TO RE RUN ###

# Persist the centroid mean and standard deviation arrays
for out_path, out_values in (
    ('centroid_mean_100_35.npy', centroid_mean),
    ('centroid_std_100_35.npy', centroid_std),
):
    np.save(out_path, np.array(out_values))

In [42]:
# Reload the saved centroid mean / standard deviation arrays
centroid_mean_100_35 = np.load('centroid_mean_100_35.npy')
centroid_std_100_35 = np.load('centroid_std_100_35.npy')

In [43]:
# Reload the saved per-run centroids
kmeans_centroids_100_35 = np.load(file="kmeans_centroids_neighbors_100.npy")

------------------------

Standard deviation calculation

In [None]:
### NO NEED TO RE RUN ###
# Per-cluster standard deviation of the centroid x and y coordinates
std_dev_x_100_35, std_dev_y_100_35 = np.zeros(10), np.zeros(10)

# Fill both arrays cluster by cluster
for i in range(10):
    # Coordinates of the i-th cluster's centroid in every run
    cluster_x_coords = kmeans_centroids_100_35[:, i, 0]
    cluster_y_coords = kmeans_centroids_100_35[:, i, 1]

    # Spread of each coordinate over the runs
    std_dev_x_100_35[i] = np.std(cluster_x_coords)
    std_dev_y_100_35[i] = np.std(cluster_y_coords)

# Show the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_100_35)
print("Standard deviation of y coordinates per cluster:", std_dev_y_100_35)

In [13]:
### NO NEED TO RE RUN ###
# Rows of the result table, one per (trial, cluster)
data_100_35 = []

# Test every centroid of every trial against its cluster's 2-std box
for trial in range(35):
    for cluster in range(10):
        # Centroid of this cluster in this trial
        centroid_coord = kmeans_centroids_100_35[trial, cluster]

        # Box centre and bounds: mean +/- 2 standard deviations per axis
        mean_x, mean_y = centroid_mean_100_35[cluster]
        lower_bound_x = mean_x - 2 * std_dev_x_100_35[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x_100_35[cluster]
        lower_bound_y = mean_y - 2 * std_dev_y_100_35[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y_100_35[cluster]

        # The centroid is inside only when both coordinates are within bounds
        x_within = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        y_within = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = x_within and y_within

        # Trials are numbered from 1 in the table
        data_100_35.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Assemble the table
df_results_100_35 = pd.DataFrame(
    data_100_35,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)

In [14]:
### NO NEED TO RE RUN ###
# Per trial: True only when every cluster has 'Inside 2 std' set to True
trials_all_true_100_35 = (
    df_results_100_35
    .groupby('Trial')['Inside 2 std']
    .all()
)

In [15]:
### NO NEED TO RE RUN ###
# Trial numbers whose clusters were all True
trials_with_all_true_100_35 = trials_all_true_100_35.loc[trials_all_true_100_35].index.tolist()

In [None]:
### NO NEED TO RE RUN ###
# Show the trials in which every cluster was True
print(
    "Trials where all clusters were True:",
    trials_with_all_true_100_35,
)

In [None]:
### NO NEED TO RE RUN ###
# Trial numbers in which at least one cluster was False
trials_with_some_false_100_35 = trials_all_true_100_35.loc[~trials_all_true_100_35].index.tolist()

# Show those trials
print(
    "Trials where some clusters were False:",
    trials_with_some_false_100_35,
)

In [18]:
### NO NEED TO RE RUN ###
# Write the result table to CSV without the index column
df_results_100_35.to_csv('result_table_neighbors_100_35.csv', index=False)

-----------------

Removal outliers process

In [44]:
# Reload the saved result table for n_neighbors=100
df_results_100_35 = pd.read_csv(filepath_or_buffer='result_table_neighbors_100_35.csv')

In [20]:
# Tabulate the mean centroids: one row per cluster with 'x_mean', 'y_mean'
# and the cluster id (0-9) in 'Cluster'
centroid_mean_neighbors_100_35_df = pd.DataFrame(
    centroid_mean_100_35,
    columns=['x_mean', 'y_mean'],
).assign(Cluster=np.arange(10))

In [23]:
# Steps 1-2: Convert 'Centroid Coord' from its CSV text form ("[x y]" or
# "[x, y]") into a list of floats.
# The values are parsed with float() directly, so the cell does not depend on
# `ast` (which is not among the notebook's visible imports) and also accepts
# tokens the comma-inserting regex missed, such as "5." or "1.2e-05".
def _parse_centroid_coord(value):
    """Return ``value`` as a list of floats when it is a bracketed string of numbers."""
    if isinstance(value, str):
        tokens = value.strip().strip('[]').replace(',', ' ').split()
        return [float(token) for token in tokens]
    if isinstance(value, (list, tuple, np.ndarray)):
        # Already parsed (e.g. the cell was run twice): just normalize to a list
        return list(value)
    # Anything else is left untouched so the validity check below flags it
    return value

df_results_100_35['Centroid Coord'] = df_results_100_35['Centroid Coord'].apply(_parse_centroid_coord)

# Step 3: Verify if each entry in 'Centroid Coord' is a list of length 2
invalid_rows = df_results_100_35[df_results_100_35['Centroid Coord'].apply(lambda x: not (isinstance(x, list) and len(x) == 2))]


In [24]:
# Split every 'Centroid Coord' pair into separate 'x' and 'y' columns
coord_pairs_100_35 = df_results_100_35['Centroid Coord'].tolist()
df_results_100_35[['x', 'y']] = pd.DataFrame(coord_pairs_100_35, index=df_results_100_35.index)

# Left-join the per-cluster mean centroids onto the results via 'Cluster'
df_merged_100_35 = df_results_100_35.merge(
    centroid_mean_neighbors_100_35_df,
    on='Cluster',
    how='left',
)

In [None]:
# Settings for the n_neighbors=100 experiment
n_runs = 35
n_clusters = 10
n_neighbors = 100

# One figure per cluster and coordinate: all X-coordinate figures first (blue),
# then all Y-coordinate figures (green)
run_numbers = range(1, n_runs + 1)
for coord_idx, coord_name, line_colour in ((0, 'X', 'b'), (1, 'Y', 'g')):
    for cluster in range(n_clusters):
        plt.figure(figsize=(10, 6))
        plt.plot(
            run_numbers,
            kmeans_centroids_100_35[:, cluster, coord_idx],
            marker='o',
            linestyle='-',
            color=line_colour,
        )
        plt.title(f'Cluster {cluster} {coord_name}-coordinate Change Across Runs')
        plt.xlabel('Run')
        plt.ylabel(f'{coord_name} Centroid Coordinate')
        plt.grid(True)
        plt.show()

In [None]:
# Euclidean distance between each centroid (x, y) and its cluster mean (x_mean, y_mean)
df_merged_100_35['Distance_to_Mean'] = np.sqrt(
    df_merged_100_35['x'].sub(df_merged_100_35['x_mean']).pow(2)
    + df_merged_100_35['y'].sub(df_merged_100_35['y_mean']).pow(2)
)

# Outlier rule: drop the rows beyond the 90th percentile of the distances
def filter_outliers(df):
    """Keep the rows of ``df`` whose 'Distance_to_Mean' is at or below its 90th percentile."""
    distances = df['Distance_to_Mean']
    cutoff = np.percentile(distances, 90)
    keep_mask = distances <= cutoff
    return df[keep_mask]

# Run the percentile filter cluster by cluster, then flatten the result to a plain index
df_no_outliers = (
    df_merged_100_35
    .groupby('Cluster')
    .apply(filter_outliers)
    .reset_index(drop=True)
)

# Step 7: Remove the helper coordinate columns; 'Distance_to_Mean' is kept
df_no_outliers_cleaned_100_35 = df_no_outliers.drop(
    ['x', 'y', 'x_mean', 'y_mean'], axis=1
)

# Step 8: Compare the frame shapes before and after filtering
print(f"Original DataFrame size: {df_merged_100_35.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned_100_35.shape}")

# Last expression of the cell: renders the cleaned dataframe in the notebook
df_no_outliers_cleaned_100_35

In [27]:
# Group the outlier-free rows by their 'Cluster' value
clusters_grouped_100_35 = df_no_outliers_cleaned_100_35.groupby('Cluster')

# Mapping: cluster id -> array built from that cluster's 'Centroid Coord' entries
clusters_centroids_100_35 = {}

for cluster, group in clusters_grouped_100_35:
    # Stack the per-row coordinate lists into one array (one row per retained centroid)
    centroids_array = np.asarray(list(group['Centroid Coord']))
    clusters_centroids_100_35[cluster] = centroids_array

In [None]:
# Number of centroids retained for each cluster (length of its centroid array)
cluster_sizes = dict(
    zip(clusters_centroids_100_35, map(len, clusters_centroids_100_35.values()))
)

# Report the retained count per cluster
for cluster, size in cluster_sizes.items():
    print(f"Cluster {cluster} has {size} centroids considered.")

--------

Check whether it is expected that every cluster keeps the same number of centroids after the outliers are filtered out. The likely reasons are:
- The distance distributions are very similar across clusters
- Uniform data structure

In [None]:
# One histogram per cluster of the centroid-to-mean distances
for cluster, group in clusters_grouped_100_35:
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.hist(group['Distance_to_Mean'], bins=10, edgecolor='black')
    ax.set_title(f'Cluster {cluster}: Distance to Mean Distribution')
    ax.set_xlabel('Distance to Mean')
    ax.set_ylabel('Frequency')
    ax.grid(True)
    plt.show()

In [None]:
# Per-cluster percentile check
def check_percentiles(df):
    """Return the 90th percentile of the 'Distance_to_Mean' column of df."""
    return np.percentile(df['Distance_to_Mean'], 90)

# Apply check_percentiles to every cluster group and print the resulting threshold.
# NOTE(review): clusters_grouped_100_35 is built from the already filtered frame
# (df_no_outliers_cleaned_100_35), so this is the 90th percentile of the retained rows.
for cluster, group in clusters_grouped_100_35:
    threshold = check_percentiles(group)
    print(f"Cluster {cluster}: 90th percentile threshold = {threshold}")

In [None]:
# Compare group sizes before and after a 70th-percentile distance cut, cluster by cluster.
# NOTE(review): clusters_grouped_100_35 comes from the frame that was already filtered at the
# 90th percentile, so "Original size" below is the size after that first filter.
for cluster, group in clusters_grouped_100_35:
    distances = group['Distance_to_Mean']

    # Cut-off for this cluster only
    threshold = np.percentile(distances, 70)

    # Rows at or below the cut-off
    filtered_group = group.loc[distances <= threshold]

    print(f"Cluster {cluster}: Original size = {len(group)}, Filtered size = {len(filtered_group)}")

--------

#### Mean Distance matrix n= 100

##### Distance Mean matrix

**Distance matrix**: element $d_{ij}$ is the distance between the centers of cluster $i$ and cluster $j$.

In [None]:
# Pairwise centroid-to-centroid distances, one matrix per run
distance_matrices_100_35 = []

for run_centroids in kmeans_centroids_100_35:
    # Euclidean distance between every pair of centroids of this run
    distance_matrix_100_35 = cdist(run_centroids, run_centroids, metric='euclidean')
    distance_matrices_100_35.append(distance_matrix_100_35)

# Stack the per-run matrices into a single array (first axis = run)
distance_matrices_100_35 = np.array(distance_matrices_100_35)

# Element-wise average over the run axis
mean_distance_matrix_100_35 = distance_matrices_100_35.mean(axis=0)

# Min-max scaling of the mean matrix to [0, 1]
normalized_mean_distance_matrix_100_35 = (
    (mean_distance_matrix_100_35 - mean_distance_matrix_100_35.min())
    / (mean_distance_matrix_100_35.max() - mean_distance_matrix_100_35.min())
)

# Heatmap of the normalized mean distance matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_100_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=100)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices, their mean, and the normalized mean
np.save('distance_matrices_neighbors_100_all_runs.npy', distance_matrices_100_35)
np.save('mean_distance_matrix_neighbors_100_35.npy', mean_distance_matrix_100_35)
np.save('normalized_mean_distance_matrix_100_35.npy', normalized_mean_distance_matrix_100_35)

# Print the mean distance matrix
print(f"Mean distance matrix across all runs:\n{mean_distance_matrix_100_35}")

In [47]:
# Reload the saved mean distance matrix from disk
mean_distance_matrix_100_35 = np.load('mean_distance_matrix_neighbors_100_35.npy')
# mean_distance_matrix_100_35=np.round(mean_distance_matrix_100_35,3)

# Min-max scaling of the mean matrix to [0, 1]
normalized_mean_distance_matrix_100_35 = (
    (mean_distance_matrix_100_35 - mean_distance_matrix_100_35.min())
    / (mean_distance_matrix_100_35.max() - mean_distance_matrix_100_35.min())
)

In [None]:
# Build a weighted graph from the normalized mean distance matrix (rounded to 3 decimals);
# the 'weight' of edge (i, j) is the matrix entry [i, j]
G_100_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_100_35, 3))

# Persist the graph as its weighted adjacency matrix. Passing the Graph object itself to
# np.save does not store the edge weights. Reload with:
#   nx.from_numpy_array(np.load('G_100_35.npy'))
np.save('G_100_35.npy', nx.to_numpy_array(G_100_35))

# Draw the graph (fixed seed keeps the layout reproducible)
pos = nx.spring_layout(G_100_35, seed=42)  # positions for all nodes
nx.draw(G_100_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_100_35, 'weight')
nx.draw_networkx_edge_labels(G_100_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_100_35 = nx.minimum_spanning_tree(G_100_35)

# Persist the tree as its weighted adjacency matrix. Passing the Graph object itself to
# np.save does not store the edge weights. Reload with:
#   nx.from_numpy_array(np.load("mst_100_35.npy"))
np.save("mst_100_35.npy", nx.to_numpy_array(mst_100_35))

# Define positions for all nodes (fixed seed keeps the layout reproducible)
pos = nx.spring_layout(mst_100_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_100_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_100_35, 'weight')
nx.draw_networkx_edge_labels(mst_100_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST - n_neighbors=100")
plt.show()

-------------

##### Distance Std. dev. Matrix

In [None]:
# Pairwise Euclidean distance between the rows of centroid_std_100_35
distance_matrix_std_100_35 = cdist(centroid_std_100_35, centroid_std_100_35, metric='euclidean')

# Min-max scaling of the distance matrix to [0, 1]
normalized_distance_matrix_std_100_35 = (
    (distance_matrix_std_100_35 - distance_matrix_std_100_35.min())
    / (distance_matrix_std_100_35.max() - distance_matrix_std_100_35.min())
)

# Heatmap of the normalized matrix
plt.figure(figsize=(8, 6))
sns.heatmap(
    normalized_distance_matrix_std_100_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=100)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.tight_layout()
plt.show()

# Persist both the raw and the normalized matrix for later analysis
np.save("distance_matrix_std_100_35.npy", distance_matrix_std_100_35)
np.save("normalized_distance_matrix_std_100_35.npy", normalized_distance_matrix_std_100_35)


In [None]:
# Build a weighted graph from the normalized std-deviation distance matrix (rounded to 3
# decimals); the 'weight' of edge (i, j) is the matrix entry [i, j]
G_std_100_35 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_100_35, 3))

# Persist the graph as its weighted adjacency matrix. Passing the Graph object itself to
# np.save does not store the edge weights. np.save appends ".npy" to the name; reload with:
#   nx.from_numpy_array(np.load('G_std_100_35.npy'))
np.save('G_std_100_35', nx.to_numpy_array(G_std_100_35))

# Draw the graph (fixed seed keeps the layout reproducible)
pos = nx.spring_layout(G_std_100_35, seed=42)  # positions for all nodes
nx.draw(G_std_100_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_std_100_35, 'weight')
nx.draw_networkx_edge_labels(G_std_100_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_std_100_35 = nx.minimum_spanning_tree(G_std_100_35)

# Persist the tree as its weighted adjacency matrix. Passing the Graph object itself to
# np.save does not store the edge weights. Reload with:
#   nx.from_numpy_array(np.load('mst_std_100_35.npy'))
np.save('mst_std_100_35.npy', nx.to_numpy_array(mst_std_100_35))

# Define positions for all nodes (fixed seed keeps the layout reproducible)
pos_std_100_35 = nx.spring_layout(mst_std_100_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_std_100_35, pos_std_100_35, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels_std_100_35 = nx.get_edge_attributes(mst_std_100_35, 'weight')
nx.draw_networkx_edge_labels(mst_std_100_35, pos_std_100_35, edge_labels=edge_labels_std_100_35, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - n_neighbors=100")
plt.show()

In [None]:
# Define a function to create heatmaps
def plot_heatmaps_side_by_side(matrices, titles, figsize=(16, 8), cmap="viridis", annot=True):
    """
    Plots multiple heatmaps side by side for given matrices and titles.

    Matrices and titles are paired positionally; if the two lists differ in
    length, only the common prefix is drawn.

    Args:
        matrices (list): List of 2D matrices to plot as heatmaps.
        titles (list): List of titles corresponding to each matrix.
        figsize (tuple): Size of the entire figure (default: (16, 8)).
        cmap (str): Color map to use for all heatmaps (default: "viridis").
        annot (bool): Whether to annotate cells with their values (default: True).
    """
    n = len(matrices)  # Number of heatmaps

    # squeeze=False always returns a 2-D array of Axes, so a single matrix (n == 1)
    # is handled the same way as several; axes[0] is the one row of panels.
    fig, axes = plt.subplots(1, n, figsize=figsize, squeeze=False)

    for i, (ax, matrix, title) in enumerate(zip(axes[0], matrices, titles)):
        sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Cluster")
        ax.set_ylabel("Cluster" if i == 0 else "")  # Only label y-axis for the first plot

    plt.tight_layout()
    plt.show()

# Show the std-deviation distance matrix next to the mean distance matrix.
# Both matrices belong to the n_neighbors=100 experiment, so both titles say 100.
plot_heatmaps_side_by_side(
    matrices=[
        normalized_distance_matrix_std_100_35,
        normalized_mean_distance_matrix_100_35
    ],
    titles=[
        "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=100)",
        "Normalized Mean Distance Matrix (k=10, n_neighbors=100)"
    ]
)

In [None]:
# Cut-off below which a normalized value counts as "low"
threshold = 0.6

# Collect (i, j, mean_value, std_value) for every off-diagonal cell that is below the
# cut-off in BOTH the mean-distance matrix and the std-distance matrix.
# Both (i, j) and (j, i) are visited, so each pair can be listed twice.
low_low_pairs = []
for i, j in np.ndindex(normalized_mean_distance_matrix_100_35.shape):
    if i == j:  # Skip diagonal
        continue
    mean_value = normalized_mean_distance_matrix_100_35[i, j]
    std_value = normalized_distance_matrix_std_100_35[i, j]
    if mean_value < threshold and std_value < threshold:
        low_low_pairs.append((i, j, mean_value, std_value))

# Print one line per qualifying pair
for pair in low_low_pairs:
    print(f"Clusters {pair[0]} and {pair[1]}: Mean Distance = {pair[2]:.2f}, Std Distance = {pair[3]:.2f}")

0.6 is the lowest threshold so far.

Depending on the goal of the analysis we can think of it as:
- If the aim is to identify the strongest relationships between clusters, a lower threshold would make more sense.
- If we want to explore the broader connections, then it is fine.

In [None]:
# Example: Replace with your cluster pairs from low_low_pairs
# NOTE(review): this hard-coded list overwrites any low_low_pairs computed earlier in the
# session. Each tuple is (cluster_a, cluster_b, mean distance, std distance).
low_low_pairs = [(0, 8, 0.59, 0.55), (3, 8, 0.59, 0.25), (7, 8, 0.58, 0.24)]

# UMAP projections and cluster labels (replace with your actual data)
# Both arrays are indexed by run first (see umap_projections[run_idx] / kmeans_labels[run_idx] below).
umap_projections = np.load("umap_projections_neighbors_100.npy")
kmeans_labels = np.load("kmeans_labels_list_100_35.npy")  # Shape: (n_runs, n_samples)

# Scatter plot of two clusters within one projection
def plot_clusters(umap_projection, labels, cluster_pair, run_idx):
    """Scatter the points of two clusters in a single figure.

    Args:
        umap_projection: 2-D point array; columns 0 and 1 are plotted.
        labels: Per-point cluster labels, used as a boolean mask on umap_projection.
        cluster_pair: Two cluster ids (first is drawn in blue, second in orange).
        run_idx: Run number, used only in the figure title.
    """
    cluster_a, cluster_b = cluster_pair

    fig, ax = plt.subplots(figsize=(8, 6))
    for cluster_id, point_color in ((cluster_a, "blue"), (cluster_b, "orange")):
        cluster_points = umap_projection[labels == cluster_id]
        ax.scatter(
            cluster_points[:, 0],
            cluster_points[:, 1],
            color=point_color,
            label=f"Cluster {cluster_id}",
            alpha=0.6,
        )

    ax.set_title(f"Run {run_idx}: Cluster {cluster_a} vs. Cluster {cluster_b}")
    ax.set_xlabel("UMAP Dimension 1")
    ax.set_ylabel("UMAP Dimension 2")
    ax.legend()
    plt.tight_layout()
    plt.show()

# Visualize and summarize every selected cluster pair
for cluster_pair in low_low_pairs:
    cluster_a, cluster_b = cluster_pair[:2]
    print(f"Analyzing Cluster Pair: {cluster_a} and {cluster_b}")

    # Only one run is inspected: the first one
    run_idx = 0
    run_projection = umap_projections[run_idx]
    run_labels = kmeans_labels[run_idx]
    plot_clusters(run_projection, run_labels, (cluster_a, cluster_b), run_idx)

    # Points of each cluster in this run
    points_a = run_projection[run_labels == cluster_a]
    points_b = run_projection[run_labels == cluster_b]

    # Distance between the two cluster means in the projected space
    distances_a_to_b = np.linalg.norm(points_a.mean(axis=0) - points_b.mean(axis=0))
    print(f"Mean Centroid Distance (Run {run_idx}): {distances_a_to_b:.2f}")

    # Per-dimension spread of each cluster
    cluster_a_std = points_a.std(axis=0)
    cluster_b_std = points_b.std(axis=0)
    print(f"Cluster {cluster_a} Std Dev: {cluster_a_std}")
    print(f"Cluster {cluster_b} Std Dev: {cluster_b_std}")
    print("\n")

**Cluster 1 and Cluster 8** have a moderate spatial relationship with visible overlap in the UMAP space. Their differing variability patterns suggest distinct structures, but the overlap points might represent shared features or transitions between the clusters.
The large spatial separation between their centroids suggests they represent distinct structures or classes in the data.

**Cluster 0 and Cluster 9**: Cluster 9 appears more compact and stable, while Cluster 0 is larger and more variable.
Their distinct regions in the UMAP space and differing standard deviations reinforce their meaningful separation.
Insights from Variability:

The variability of Cluster 0 could indicate sensitivity to UMAP parameters or noise in the data.

Confidence interval

In [None]:
# Parameters
confidence_level = 0.95
# Two-sided critical value of the standard normal distribution
z_score = norm.ppf((1 + confidence_level) / 2)

n_runs = 35  # Number of runs

# Step 1: Standard error of the mean (SEM)
# NOTE(review): distance_matrix_std_100_35 is the pairwise distance between the rows of
# centroid_std_100_35 (see its cdist computation), not the run-to-run standard deviation of
# each inter-centroid distance. Confirm this is the intended spread estimate.
sem_matrix_100_35 = distance_matrix_std_100_35 / np.sqrt(n_runs)

# Step 2: Margin of error
margin_of_error_matrix_100_35 = z_score * sem_matrix_100_35

# Step 3: Interval bounds around the mean distance matrix
upper_limit_intconf_matrix_100_35 = mean_distance_matrix_100_35 + margin_of_error_matrix_100_35
lower_limit_intconf_matrix_100_35 = mean_distance_matrix_100_35 - margin_of_error_matrix_100_35

# Distances cannot be negative, so clamp the lower bound at zero
lower_limit_intconf_matrix_100_35 = np.clip(lower_limit_intconf_matrix_100_35, 0, None)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_100_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_100_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_100_35)

# Persist both bound matrices for future use
np.save('lower_limit_intconf_matrix_100_35.npy', lower_limit_intconf_matrix_100_35)
np.save('upper_limit_intconf_matrix_100_35.npy', upper_limit_intconf_matrix_100_35)

In [38]:
def normalize_matrix(matrix):
    """Min-max scale matrix so its smallest value maps to 0 and its largest to 1.

    Args:
        matrix: Numeric array to rescale.

    Returns:
        The rescaled array. When every entry of matrix is identical (max == min) the
        scaling is undefined; an all-zero float array of the same shape is returned
        instead of the NaN matrix that a 0/0 division would produce.
    """
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: avoid 0/0 and return a well-defined result
        return np.zeros_like(matrix, dtype=float)
    return (matrix - lowest) / value_range

# Rescale the lower and upper confidence-limit matrices independently.
# Each matrix is scaled with its own min and max, so values are not directly comparable
# between the two normalized matrices.
norm_lower_limit_intconf_matrix_100_35 = normalize_matrix(lower_limit_intconf_matrix_100_35)
norm_upper_limit_intconf_matrix_100_35 = normalize_matrix(upper_limit_intconf_matrix_100_35)

In [None]:
# One figure with three panels, left to right: mean, lower bound, upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel; only the left panel labels its y-axis
heatmap_panels = (
    (
        normalized_mean_distance_matrix_100_35,
        "Normalized Mean Dist. Matrix (k=10, n_neighbors=100)",
        "Cluster",
    ),
    (
        norm_lower_limit_intconf_matrix_100_35,
        "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=100)",
        "",
    ),
    (
        norm_upper_limit_intconf_matrix_100_35,
        "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=100)",
        "",
    ),
)

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Draw the minimum spanning tree of a distance matrix on a given Axes
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree derived from a distance matrix.

    Args:
        matrix: Square distance matrix; it is rounded to 3 decimals and used as the
            weighted adjacency matrix of the graph.
        title: Title set on ax.
        ax: Matplotlib Axes to draw into.
        color: Edge color of the tree (default: 'red').
    """
    rounded = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded))

    # Fixed seed keeps the node layout reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# One figure with three panels, left to right: mean, lower limit, upper limit
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, panel title, edge color) for each MST panel
mst_panels = (
    (normalized_mean_distance_matrix_100_35, "MST - Mean Distances", 'red'),
    (norm_lower_limit_intconf_matrix_100_35, "MST - Lower Limit", 'blue'),
    (norm_upper_limit_intconf_matrix_100_35, "MST - Upper Limit", 'green'),
)
for panel_ax, (panel_matrix, panel_title, panel_color) in zip(axes, mst_panels):
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

#### Intra class evaluation

In [None]:
# Define parameters
n_clusters = 10
radius = 0.5  # Fixed radius around each cluster centroid used for the neighbor count

# Lists to store max distances, neighbor counts, and KMeans labels for each cluster in each run
max_distances_50_35 = []
neighbor_counts_50_35 = []
kmeans_labels_list_50_35 = []

# Calculate intra-cluster metrics for each run and each cluster
for run_idx, x_umap in enumerate(umap_projections_50_35):
    # Re-run KMeans to get labels for each projection
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_50_35 = kmeans.labels_

    # Store the labels for this run (must be the labels fitted just above)
    kmeans_labels_list_50_35.append(kmeans_labels_50_35)

    run_max_distances_50_35 = []
    run_neighbor_counts_50_35 = []

    # For each cluster, calculate max intra-cluster distance and neighbor count around centroid
    for cluster_idx in range(n_clusters):
        # Get all points in the current cluster
        cluster_points = x_umap[kmeans_labels_50_35 == cluster_idx]

        # Calculate pairwise distances within the cluster
        # (full len(cluster) x len(cluster) matrix, so memory grows quadratically)
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')

        # Max distance within the cluster; defined as 0 when the cluster has fewer than 2 points
        max_distance_50_35 = np.max(intra_distances) if len(cluster_points) > 1 else 0
        run_max_distances_50_35.append(max_distance_50_35)

        # Calculate number of points within the radius around the cluster's mean point
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_50_35 = np.sum(distances_to_centroid <= radius)

        run_neighbor_counts_50_35.append(neighbors_within_radius_50_35)

    # Append results for this run
    max_distances_50_35.append(run_max_distances_50_35)
    neighbor_counts_50_35.append(run_neighbor_counts_50_35)

# Convert lists to numpy arrays for easier analysis if needed
max_distances_50_35 = np.array(max_distances_50_35)  # Shape: (n_runs, n_clusters)
neighbor_counts_50_35 = np.array(neighbor_counts_50_35)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_50_35 = np.array(kmeans_labels_list_50_35)  # Shape: (n_runs, n_samples)

# Save the max distances, neighbor counts, and KMeans labels arrays.
# File names carry the same '_50_35' identifier as the variables, so these results
# cannot overwrite files belonging to another n_neighbors setting.
np.save('max_intra_cluster_distances_50_35.npy', max_distances_50_35)
np.save('neighbor_counts_within_radius_50_35.npy', neighbor_counts_50_35)
np.save('kmeans_labels_list_50_35.npy', kmeans_labels_list_50_35)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_50_35)
print("\nNeighbor counts within radius for each run and each cluster:\n", neighbor_counts_50_35)
print("\nKMeans labels saved successfully.")

In [40]:
# Reload previously saved results for n_neighbors=100 from disk
max_distances_100_35_d = np.load('max_intra_cluster_distances_dynamic_100_35.npy')
neighbor_counts_100_35_d = np.load('neighbor_counts_within_dynamic_radius_100_35.npy')
kmeans_labels_list_100_35_d = np.load('kmeans_labels_list_100_35.npy')

In [None]:
# Running minimum over all runs: smallest centroid-to-centroid distance, the cluster pair
# that produced it, and the run in which it occurred
overall_min_distance_100 = float('inf')
min_distance_clusters_100 = None
min_distance_run_idx_100 = None

for run_idx, run_centroids in enumerate(kmeans_centroids_100_35):
    # Pairwise Euclidean distances between the centroids of this run
    pairwise_distances_100 = cdist(run_centroids, run_centroids, metric='euclidean')

    # Self-distances are zero; set them to inf so they can never be the minimum
    np.fill_diagonal(pairwise_distances_100, np.inf)

    min_distance = pairwise_distances_100.min()
    if min_distance < overall_min_distance_100:
        overall_min_distance_100 = min_distance
        # Convert the flat argmin position back to a (row, column) cluster pair
        cluster_indices = np.unravel_index(
            pairwise_distances_100.argmin(), pairwise_distances_100.shape
        )
        min_distance_clusters_100 = cluster_indices
        min_distance_run_idx_100 = run_idx

# Dynamic radius = half of the smallest inter-centroid distance found
dynamic_radius_100_35 = overall_min_distance_100 / 2
print(f"Dynamic radius: {dynamic_radius_100_35}")
print(f"Minimum distance: {overall_min_distance_100}")
print(f"Clusters contributing to minimum distance: {min_distance_clusters_100}")
print(f"Run index: {min_distance_run_idx_100}")

# Persist the dynamic radius
np.save('dynamic_radius_results_100_35.npy', dynamic_radius_100_35)


In [None]:
# Define parameters
n_clusters = 10

# Neighborhood radius: the dynamic radius computed beforehand
radius = dynamic_radius_100_35

# Per-run accumulators: max intra-cluster distances, neighbor counts, and KMeans labels
max_distances_100_35_d = []
neighbor_counts_100_35_d = []
kmeans_labels_list_100_35_d = []

for run_idx, x_umap in enumerate(umap_projections_100_35):
    # Cluster this projection again to obtain its labels
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans_labels_100_35_d = kmeans.fit_predict(x_umap)

    kmeans_labels_list_100_35_d.append(kmeans_labels_100_35_d)

    run_max_distances_100_35_d = []
    run_neighbor_counts_100_35_d = []

    for cluster_idx in range(n_clusters):
        # Points assigned to this cluster
        cluster_points = x_umap[kmeans_labels_100_35_d == cluster_idx]

        # All pairwise distances inside the cluster
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')

        # Largest pairwise distance; defined as 0 for clusters with fewer than 2 points
        if len(cluster_points) > 1:
            max_distance_100_35_d = np.max(intra_distances)
        else:
            max_distance_100_35_d = 0
        run_max_distances_100_35_d.append(max_distance_100_35_d)

        # Count the points lying within the radius of the cluster's mean point
        centroid = cluster_points.mean(axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_100_35_d = (distances_to_centroid <= radius).sum()

        run_neighbor_counts_100_35_d.append(neighbors_within_radius_100_35_d)

    # Record this run's per-cluster results
    max_distances_100_35_d.append(run_max_distances_100_35_d)
    neighbor_counts_100_35_d.append(run_neighbor_counts_100_35_d)

# Turn the accumulators into arrays
max_distances_100_35_d = np.array(max_distances_100_35_d)  # Shape: (n_runs, n_clusters)
neighbor_counts_100_35_d = np.array(neighbor_counts_100_35_d)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_100_35_d = np.array(kmeans_labels_list_100_35_d)  # Shape: (n_runs, n_samples)

# Persist the three result arrays
np.save('max_intra_cluster_distances_dynamic_100_35.npy', max_distances_100_35_d)
np.save('neighbor_counts_within_dynamic_radius_100_35.npy', neighbor_counts_100_35_d)
np.save('kmeans_labels_list_100_35.npy', kmeans_labels_list_100_35_d)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_100_35_d)
print("\nNeighbor counts within dynamic radius for each run and each cluster:\n", neighbor_counts_100_35_d)
print("\nKMeans labels saved successfully.")

In [None]:
# Grid of line plots: neighbor count of every cluster across all runs
plt.figure(figsize=(16, 12))

# 1-based run numbers shared by all panels
run_numbers = range(1, neighbor_counts_100_35_d.shape[0] + 1)

for cluster_idx in range(neighbor_counts_100_35_d.shape[1]):
    # 2 rows x 5 columns grid, i.e. room for 10 clusters
    plt.subplot(2, 5, cluster_idx + 1)
    plt.plot(run_numbers, neighbor_counts_100_35_d[:, cluster_idx], marker='o')
    plt.title(f'Cluster {cluster_idx}')
    plt.xlabel('Run')
    plt.ylabel('Neighbor Count')
    plt.xticks(run_numbers[::5])  # Tick every 5th run for clarity
    plt.grid(True)

# Tighten the layout, add a figure-level title, and display
plt.tight_layout()
plt.suptitle('N=100 Neighbor Counts per Cluster Across Runs', y=1.02, fontsize=16)
plt.show()

In [None]:
# Calculate mean and max values across clusters for each run
mean_neighbors_100 = np.mean(neighbor_counts_100_35_d, axis=1)  # Shape: (n_runs,)
max_neighbors_100 = np.max(neighbor_counts_100_35_d, axis=1)    # Shape: (n_runs,)

# Fit a straight line (least squares) to each series over the 1-based run number
runs = np.arange(1, len(mean_neighbors_100) + 1)
mean_slope, mean_intercept, _, _, _ = linregress(runs, mean_neighbors_100)
max_slope, max_intercept, _, _, _ = linregress(runs, max_neighbors_100)

# Evaluate the fitted lines at every run
mean_trend_100 = mean_slope * runs + mean_intercept
max_trend_100 = max_slope * runs + max_intercept

# Plot the results
plt.figure(figsize=(12, 6))

# Mean neighbor counts
plt.plot(runs, mean_neighbors_100, label='Mean Neighbor Count', marker='o', color='blue')

# Max neighbor counts
plt.plot(runs, max_neighbors_100, label='Max Neighbor Count', marker='s', color='orange')

# Trend lines: each dashed line uses the color of the series it was fitted to, so the
# two trend lines can be told apart in the plot and in the legend
plt.plot(runs, mean_trend_100, linestyle='--', color='blue', alpha=0.6, label='Mean Trend Line')
plt.plot(runs, max_trend_100, linestyle='--', color='orange', alpha=0.6, label='Max Trend Line')

# Add labels, legend, and title
plt.title('N=100 Neighbor Counts Across Runs (Mean vs. Max with Trend Lines)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.xticks(range(1, len(mean_neighbors_100) + 1, 5))  # Show every 5th run for readability
plt.legend()
plt.grid(True)
plt.tight_layout()

# Save the plot to a file (must happen before plt.show(), which may clear the figure)
plt.savefig('neighbor_counts_plot_n_100_35.png', dpi=300)

# Show the plot
plt.show()

In [None]:
# Calculate mean, min, and max across clusters for each run
mean_intra_cluster_distances = np.mean(max_distances_100_35_d, axis=1)  # Mean across clusters
min_intra_cluster_distances = np.min(max_distances_100_35_d, axis=1)    # Min across clusters
max_intra_cluster_distances = np.max(max_distances_100_35_d, axis=1)    # Max across clusters

mean_neighbor_counts = np.mean(neighbor_counts_100_35_d, axis=1)  # Mean across clusters
min_neighbor_counts = np.min(neighbor_counts_100_35_d, axis=1)    # Min across clusters
max_neighbor_counts = np.max(neighbor_counts_100_35_d, axis=1)    # Max across clusters

# Smoothing using rolling average (optional, if data is noisy)
# window_size = 3
# mean_intra_cluster_distances = np.convolve(mean_intra_cluster_distances, np.ones(window_size)/window_size, mode='valid')
# mean_neighbor_counts = np.convolve(mean_neighbor_counts, np.ones(window_size)/window_size, mode='valid')

# 1-based run numbers for the x-axis, so "Run" starts at 1 like the other per-run plots
distance_run_numbers = np.arange(1, len(mean_intra_cluster_distances) + 1)
neighbor_run_numbers = np.arange(1, len(mean_neighbor_counts) + 1)

# Plot Intra-Cluster Distances: mean line with a shaded min-max band
plt.figure(figsize=(12, 6))
plt.plot(distance_run_numbers, mean_intra_cluster_distances, label='Mean Intra-Cluster Distance', marker='o', color='blue')
plt.fill_between(
    distance_run_numbers,
    min_intra_cluster_distances,
    max_intra_cluster_distances,
    color='blue',
    alpha=0.2,
    label='Range (Min-Max)'
)
plt.title('Intra-Cluster Distance Statistics Across Runs (n_neighbors=100)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Distance', fontsize=12)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Plot Neighbor Counts: mean line with a shaded min-max band
plt.figure(figsize=(12, 6))
plt.plot(neighbor_run_numbers, mean_neighbor_counts, label='Mean Neighbor Count', marker='o', color='orange')
plt.fill_between(
    neighbor_run_numbers,
    min_neighbor_counts,
    max_neighbor_counts,
    color='orange',
    alpha=0.2,
    label='Range (Min-Max)'
)
plt.title('Neighbor Counts Statistics Within Dynamic Radius Across Runs (n_neighbors=100)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


-------

## General UMAP

In [10]:
# Flatten every image to a 1-D vector so each dataset becomes a (n_images, n_values) array for UMAP
x_train_flattened = np.array([np.asarray(img).reshape(-1) for img in x_train])
x_test_flattened = np.array([np.asarray(img).reshape(-1) for img in x_test])

### n=5

In [44]:
# Reload previously saved '_5_01_35' results from disk
# UMAP projections and their mean / standard deviation
umap_projections_5_01_35 = np.load('umap_projections_5_01_35.npy')
mean_projection_5_01_35 = np.load('mean_projection_5_01_35.npy')
std_projection_5_01_35 = np.load('std_projection_5_01_35.npy')

# Confidence-interval limit matrices
lower_limit_intconf_matrix_5_01_35 = np.load('lower_limit_intconf_matrix_5_01_35.npy')
upper_limit_intconf_matrix_5_01_35 = np.load('upper_limit_intconf_matrix_5_01_35.npy')

# Distance matrices and their mean
distance_matrices_5_01_35 = np.load('distance_matrices_neighbors_5_01_35.npy')
mean_distance_matrix_5_01_35 = np.load('mean_distance_matrix_neighbors_5_01_35.npy')

# Normalized confidence-interval limit matrices
norm_lower_limit_intconf_matrix_5_01_35 = np.load('norm_lower_limit_intconf_matrix_5_01_35.npy')
norm_upper_limit_intconf_matrix_5_01_35 = np.load('norm_upper_limit_intconf_matrix_5_01_35.npy')

In [None]:
# UMAP settings for this experiment
n_neighbors = 5
min_dist = 0.1
n_components = 2
n_runs = 35  # Number of runs

# One projection per run
umap_projections_5_01_35 = []

for run in range(n_runs):
    # random_state=None leaves the model unseeded, so every run can produce a different embedding
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )

    # Embed the flattened training images and keep the result
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_5_01_35.append(projection)

# Stack the per-run projections into one array (first axis = run)
umap_projections_5_01_35 = np.array(umap_projections_5_01_35)

# Point-wise mean and standard deviation over the run axis
mean_projection_5_01_35 = umap_projections_5_01_35.mean(axis=0)
std_projection_5_01_35 = umap_projections_5_01_35.std(axis=0)

# Persist the projections and both statistics
np.save('umap_projections_5_01_35.npy', umap_projections_5_01_35)
np.save('mean_projection_5_01_35.npy', mean_projection_5_01_35)
np.save('std_projection_5_01_35.npy', std_projection_5_01_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_5_01_35'.")


In [None]:
### USE THIS ###

# Run-level filter: rather than thresholding individual points, every run is
# summarised by its average point-to-mean-projection distance, and that summary
# is compared with the 90th percentile of the summaries of all runs.

# Euclidean distance of each point, in each run, to the mean projection
distances_to_mean_5_01_35 = np.sqrt(
    np.square(umap_projections_5_01_35 - mean_projection_5_01_35[None, :, :]).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One average distance per run
average_distances_per_run = distances_to_mean_5_01_35.mean(axis=1)  # Shape: (n_runs,)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices (plain Python ints) of the runs at or below the cut-off
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Sanity check on the label container: the cells below index projections with
# `y_train == label`, which relies on element-wise comparison.
print(f"y_train type: {type(y_train)}")  # Should now be <class 'numpy.ndarray'>
print(f"y_train shape: {y_train.shape}")

------

In [45]:
# Fit an independent KMeans on every stored UMAP run and keep its centroids.
n_clusters = 10  # number of clusters requested from KMeans
n_runs = umap_projections_5_01_35.shape[0]  # runs available in the stored array

# One (n_clusters, n_dimensions) block of centroids per run
kmeans_centroids_5_01 = np.zeros((n_runs, n_clusters, umap_projections_5_01_35.shape[2]))

for run, umap_projection in enumerate(umap_projections_5_01_35):
    # fit() returns the fitted estimator, so construction and fitting can be chained
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_5_01[run] = kmeans.cluster_centers_

In [None]:
# Spread of the stored KMeans centroid coordinates across runs, per centroid index.
# NOTE(review): each run's KMeans was fitted independently, so centroid index i
# presumably does not refer to the same group of points in every run - confirm
# the indices are comparable across runs before interpreting these values.

# Initialize arrays to store standard deviations
std_dev_x_5_01 = np.zeros(10)
std_dev_y_5_01 = np.zeros(10)

# Loop through each cluster to calculate std deviation for x and y coordinates
for i in range(10):
    # Extract all x and y coordinates for the i-th cluster over all runs
    cluster_x_coords = kmeans_centroids_5_01[:, i, 0]  # All x coords for cluster i
    cluster_y_coords = kmeans_centroids_5_01[:, i, 1]  # All y coords for cluster i
    
    # Calculate standard deviation in x and y
    std_dev_x_5_01[i] = np.std(cluster_x_coords)
    std_dev_y_5_01[i] = np.std(cluster_y_coords)

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_5_01)
print("Standard deviation of y coordinates per cluster:", std_dev_y_5_01)

------

-------

In [None]:
# KMeans centroids of every valid run: 10 clusters for MNIST digits (0-9),
# with random_state=42 passed to each KMeans.
cluster_centroids_per_run_kmeans = [
    KMeans(n_clusters=10, random_state=42).fit(umap_projections_5_01_35[run]).cluster_centers_
    for run in valid_runs
]

# Pairwise Euclidean centroid-to-centroid distances, one matrix per valid run,
# stacked along a leading run axis
distance_matrices_5_01_35_kmeans = np.array([
    cdist(centroids, centroids, metric='euclidean')
    for centroids in cluster_centroids_per_run_kmeans
])

# Element-wise mean over the run axis
mean_distance_matrix_5_01_35_kmeans = np.mean(distance_matrices_5_01_35_kmeans, axis=0)

# Min-max normalization of the mean matrix (np.ptp is max - min)
normalized_mean_distance_matrix_5_01_35_kmeans = (
    mean_distance_matrix_5_01_35_kmeans - mean_distance_matrix_5_01_35_kmeans.min()
) / np.ptp(mean_distance_matrix_5_01_35_kmeans)

# Heatmap of the normalized mean matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_5_01_35_kmeans,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (KMeans, k=10, n_neighbors=5)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their (un-normalized) mean
np.save('distance_matrices_neighbors_5_01_35_kmeans.npy', distance_matrices_5_01_35_kmeans)
np.save('mean_distance_matrix_neighbors_5_01_35_kmeans.npy', mean_distance_matrix_5_01_35_kmeans)

print(f"Mean distance matrix across all valid runs (KMeans):\n{mean_distance_matrix_5_01_35_kmeans}")

-------

In [None]:
# Label-based centroids: for every valid run, the centroid of digit d is the
# mean embedded position of the points whose y_train label equals d (d = 0..9).
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_5_01_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean centroid-to-centroid distances, one matrix per valid run,
# stacked along a leading run axis
distance_matrices_5_01_35 = np.array([
    cdist(centroids, centroids, metric='euclidean')
    for centroids in cluster_centroids_per_run
])

# Element-wise mean over the run axis
mean_distance_matrix_5_01_35 = np.mean(distance_matrices_5_01_35, axis=0)

# Min-max normalization of the mean matrix (np.ptp is max - min)
normalized_mean_distance_matrix_5_01_35 = (
    mean_distance_matrix_5_01_35 - mean_distance_matrix_5_01_35.min()
) / np.ptp(mean_distance_matrix_5_01_35)

# Heatmap of the normalized mean matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_5_01_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their (un-normalized) mean
np.save('distance_matrices_neighbors_5_01_35.npy', distance_matrices_5_01_35)
np.save('mean_distance_matrix_neighbors_5_01_35.npy', mean_distance_matrix_5_01_35)

print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_5_01_35}")

In [None]:
# Build a weighted graph from the normalized mean distance matrix: one node per
# cluster, edge weights are the matrix entries rounded to 3 decimals (the
# rounded values are what the edge labels below display).
G_5_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_5_01_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save stores array
# data, so passing it the Graph object itself would not keep the edge weights.
# Rebuild later with nx.from_numpy_array(np.load('G_5_01_35.npy')).
np.save('G_5_01_35.npy', nx.to_numpy_array(G_5_01_35))

# Draw the graph with a seeded spring layout (same seed -> same picture)
pos = nx.spring_layout(G_5_01_35, seed=42)  # positions for all nodes
nx.draw(G_5_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_5_01_35, 'weight')
nx.draw_networkx_edge_labels(G_5_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the cluster-distance graph
mst_5_01_35 = nx.minimum_spanning_tree(G_5_01_35)

# Persist the tree as its weighted adjacency matrix. np.save stores array data,
# so passing it the Graph object itself would not keep the tree's edges.
# Rebuild later with nx.from_numpy_array(np.load('mst_5_01_35.npy')).
np.save('mst_5_01_35.npy', nx.to_numpy_array(mst_5_01_35))

# Seeded spring layout so the drawing is reproducible
pos = nx.spring_layout(mst_5_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_5_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_5_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_5_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=5, min_dist=0.1")
plt.show()

**1. MST UMAP**

Structure:

The graph represents a Minimum Spanning Tree (MST) based on the normalized mean distance matrix derived purely from UMAP projections.
Each node corresponds to a cluster, and each edge is one of the lowest-weight links needed to keep all clusters connected; its weight is the normalized distance between the two clusters it joins.
Insights:

The clusters are more tightly interconnected, with nodes such as 3, 5, and 7 serving as central points, connecting multiple clusters.
The distances (weights on the edges) are relatively lower, indicating smaller normalized pairwise distances between clusters.
This suggests that the UMAP embedding preserves local neighborhood structures, where distances reflect proximity in the reduced-dimensional space.

**2. MST UMAP + KMeans**

Structure:

This graph represents an MST derived from the combination of UMAP for dimensionality reduction and KMeans for clustering.
KMeans introduces an additional layer of abstraction, grouping data into clusters and then calculating centroids for these clusters.
Insights:

The structure of this MST is more linear, with node 1 acting as a central hub connecting multiple clusters.
The edge weights (distances) are consistently higher, suggesting that the clustering step has created more separation between cluster centroids compared to distances derived from UMAP alone.
This linear structure likely reflects the influence of KMeans clustering, which tends to impose a more rigid partitioning of the data.

**Comparison**

Cluster Connectivity:

UMAP: Shows a more decentralized network with multiple hubs (3, 5, 7).
UMAP + KMeans: Displays a more centralized structure with one dominant hub (1).
Edge Weights:

UMAP: Lower edge weights suggest tighter groupings and shorter distances between clusters.
UMAP + KMeans: Higher edge weights reflect the increased separation introduced by the clustering process.
Interpretation of Distances:

UMAP: Distances directly reflect the UMAP embedding, which prioritizes local neighborhood preservation.
UMAP + KMeans: Distances represent the centroids derived from KMeans clustering, which may distort or amplify separation compared to the original embedding.

Flexibility vs. Structure:
UMAP: More flexible, as it is based on pairwise distances without enforcing strict cluster boundaries.
UMAP + KMeans: More structured due to KMeans, which enforces boundaries and distances are centroid-based.

**Conclusion**
The MST UMAP graph provides a more natural representation of cluster relationships, capturing the local proximities and overlaps between clusters.
The MST UMAP + KMeans graph introduces a more structured interpretation of the data, but at the cost of potentially oversimplifying relationships between clusters.

In [None]:
# Step 1: Calculate the standard deviation for each pair of clusters across the
# stacked per-run distance matrices (axis 0 indexes the matrices).
# NOTE(review): np.std uses the population formula (ddof=0) by default; confirm
# that is intended where this matrix feeds a standard-error estimate.
distance_matrix_std_5_01_35 = np.std(distance_matrices_5_01_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
np.save("distance_matrix_std_5_01_35.npy", distance_matrix_std_5_01_35)

# Output the results
print("Standard Deviation Distance Matrix (5_01_35):\n", distance_matrix_std_5_01_35)

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
n_runs = 35  # Total number of UMAP runs performed

# Step 1: Calculate the Standard Error of the Mean (SEM).
# The standard deviation was taken over the matrices stacked in
# distance_matrices_5_01_35, so the SEM has to divide by the square root of
# that count. It is smaller than n_runs whenever runs were filtered out before
# the distance matrices were built, and using n_runs then understates the SEM.
sem_matrix_5_01_35 = distance_matrix_std_5_01_35 / np.sqrt(distance_matrices_5_01_35.shape[0])

# Step 2: Calculate the margin of error
margin_of_error_matrix_5_01_35 = z_score * sem_matrix_5_01_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_5_01_35 = mean_distance_matrix_5_01_35 - margin_of_error_matrix_5_01_35
upper_limit_intconf_matrix_5_01_35 = mean_distance_matrix_5_01_35 + margin_of_error_matrix_5_01_35

# Distances cannot be negative, so clip the lower bound at zero
lower_limit_intconf_matrix_5_01_35 = np.maximum(lower_limit_intconf_matrix_5_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_5_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_5_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_5_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_5_01_35.npy', lower_limit_intconf_matrix_5_01_35)
np.save('upper_limit_intconf_matrix_5_01_35.npy', upper_limit_intconf_matrix_5_01_35)

Confidence interval

In [16]:
def normalize_matrix(matrix):
    """Min-max scale *matrix* so its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like of numbers
        Values to rescale; the minimum and maximum are taken over all entries.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as *matrix*. When every entry is equal the
        value range is zero; an all-zero array is returned in that case
        instead of the NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: nothing to spread out, avoid dividing by zero
        return np.zeros(matrix.shape, dtype=float)
    return (matrix - lowest) / value_range

# Min-max normalize each confidence bound on its own scale and persist it
norm_lower_limit_intconf_matrix_5_01_35 = normalize_matrix(lower_limit_intconf_matrix_5_01_35)
np.save('norm_lower_limit_intconf_matrix_5_01_35.npy', norm_lower_limit_intconf_matrix_5_01_35)

norm_upper_limit_intconf_matrix_5_01_35 = normalize_matrix(upper_limit_intconf_matrix_5_01_35)
np.save('norm_upper_limit_intconf_matrix_5_01_35.npy', norm_upper_limit_intconf_matrix_5_01_35)

In [None]:
# Side-by-side heatmaps: lower bound | mean | upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right; only the middle
# panel carries a y-axis label
heatmap_panels = [
    (
        norm_lower_limit_intconf_matrix_5_01_35,
        "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=5, min_dist=0.1)",
        "",
    ),
    (
        normalized_mean_distance_matrix_5_01_35,
        "Normalized Mean Dist. Matrix (k=10, n_neighbors=5, min_dist=0.1)",
        "Cluster",
    ),
    (
        norm_upper_limit_intconf_matrix_5_01_35,
        "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=5, min_dist=0.1)",
        "",
    ),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on the given axes.

    The matrix is rounded to 3 decimals, turned into a weighted graph, reduced
    to its minimum spanning tree and drawn with a seeded spring layout; every
    edge is labelled with its weight.

    Parameters
    ----------
    matrix : square array passed to ``nx.from_numpy_array`` after rounding
    title : title set on *ax*
    ax : matplotlib axes to draw on
    color : edge colour of the tree
    """
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # seed=42 keeps the node placement identical between calls
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        ax=ax,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
    )
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Set up the figure with three subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, subplot column, edge colour); drawn in this order, with the
# mean tree placed in the middle column between the lower and upper bounds
mst_panels = (
    (normalized_mean_distance_matrix_5_01_35, "UMAP MST - Mean Distances - n_neighbors=5 min_dist = 0.1", 1, 'red'),
    (norm_lower_limit_intconf_matrix_5_01_35, "UMAP MST - Lower Limit - n_neighbors=5 min_dist = 0.1", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_5_01_35, "UMAP MST - Upper Limit - n_neighbors=5 min_dist = 0.1", 2, 'green'),
)
for panel_matrix, panel_title, panel_column, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[panel_column], color=panel_color)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

In [None]:
## Could be nice to show but, analysis at the end not used
# Per-sample variability: standard deviation, over the run axis, of each
# point's distance to the mean projection
std_distances_per_point = distances_to_mean_5_01_35.std(axis=0)  # Shape: (n_samples,)

# Histogram of those per-sample standard deviations
plt.hist(std_distances_per_point, bins=50)
plt.title('Distribution of Distance Variability Across Runs')
plt.xlabel('Standard Deviation of Distances')
plt.ylabel('Frequency')
plt.show()

The histogram above shows that point-to-mean distances vary across runs, so it may be necessary to increase the UMAP parameters or to fix random_state.
Most points have a low standard deviation, which suggests that variability across runs is relatively low for the majority of points. However, there is a long tail of points with much higher variability, which could be influencing the filtering logic in stricter approaches.

In [None]:
### FINAL VERSION - IN USE ### (NOTE(review): every statement in this cell is commented out)

# Assuming `umap_projections_5_01_35` has the shape (n_runs, n_points, 2)
# # and `y_train` has the labels for the dataset.

# # Step 1: Calculate Cluster Centers for Full Dataset
# n_clusters = 10  # Number of clusters (digits 0-9)
# cluster_centers_full = []

# for run_idx, x_umap in enumerate(umap_projections_5_01_35):  # Iterate over all runs
#     cluster_centers_run = []
#     for label in np.unique(y_train):  # Iterate over all labels
#         cluster_points = x_umap[y_train == label]
#         if len(cluster_points) > 0:
#             cluster_center = np.mean(cluster_points, axis=0)
#             cluster_centers_run.append(cluster_center)
#     cluster_centers_full.append(np.array(cluster_centers_run))

# cluster_centers_full = np.array(cluster_centers_full)  # Shape: (n_runs, n_clusters, 2)

# # Step 2: Calculate Dynamic Radius Per Cluster
# dynamic_radii_full = []

# for run_idx, x_umap in enumerate(umap_projections_5_01_35):
#     radii_run = []
#     for label, cluster_center in zip(np.unique(y_train), cluster_centers_full[run_idx]):
#         cluster_points = x_umap[y_train == label]
#         if len(cluster_points) > 0:
#             distances_to_center = np.linalg.norm(cluster_points - cluster_center, axis=1)
#             dynamic_radius = np.mean(distances_to_center)  # Use the mean distance
#             radii_run.append(dynamic_radius)
#     dynamic_radii_full.append(radii_run)

# dynamic_radii_full = np.array(dynamic_radii_full)  # Shape: (n_runs, n_clusters)

# # Step 3: Count Neighbors Within Radius
# neighbor_counts_full = []

# for run_idx, x_umap in enumerate(umap_projections_5_01_35):
#     counts_run = []
#     for label, cluster_center, radius in zip(np.unique(y_train), cluster_centers_full[run_idx], dynamic_radii_full[run_idx]):
#         distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
#         count = np.sum(distances_to_center <= radius)  # Count points within the radius
#         counts_run.append(count)
#     neighbor_counts_full.append(counts_run)

# neighbor_counts_full = np.array(neighbor_counts_full)  # Shape: (n_runs, n_clusters)

# # Step 4: Save Results
# np.save("neighbor_counts_dynamic_radius_full.npy", neighbor_counts_full)
# np.save("dynamic_radii_full.npy", dynamic_radii_full)
# np.save("cluster_centers_full.npy", cluster_centers_full)

# # Step 5: Print Results for Verification
# print(f"Dynamic Radii (first run):\n{dynamic_radii_full[0]}")
# print(f"Neighbor Counts (first run):\n{neighbor_counts_full[0]}")
# print(f"Neighbor Counts Shape: {neighbor_counts_full.shape}")

### Adaptable Radius Final Version & Results

In [None]:
## THE FINAL - USE THIS DON'T OVERTHINK
# FOR min_dist = 0.1.

# Function to calculate cluster metrics
def calculate_cluster_metrics(umap_projections, y_labels, n_clusters=10):
    """
    Calculate average cluster radii and neighbor counts for each cluster over all runs.

    Cluster ``i`` is the set of points whose label equals ``i`` for
    ``i`` in ``0 .. n_clusters - 1``. In every run the cluster center is the
    mean position of its points and its radius is the mean point-to-center
    distance. The reported radius is the average of the per-run radii; the
    neighbor count is the number of points (of any label) lying within that
    averaged radius of the run's center, averaged over runs.

    Centers, radii and counts are all addressed by the same cluster index, so
    the result always has ``n_clusters`` entries, even when ``y_labels`` lacks
    one of the indices or contains labels outside ``0 .. n_clusters - 1``.

    Parameters
    ----------
    umap_projections : sequence of arrays
        One embedding per run, each indexable by a boolean mask over samples.
    y_labels : array-like
        One label per sample.
    n_clusters : int, default 10
        Number of cluster indices to evaluate.

    Returns
    -------
    radii_per_cluster : list
        Average radius per cluster index; NaN for an index with no points.
    average_neighbor_counts : numpy.ndarray
        Average neighbor count per cluster index; 0 for an index with no
        points, NaN everywhere if there are no runs.
    """
    y_labels = np.asarray(y_labels)
    projections = [np.asarray(x_umap) for x_umap in umap_projections]

    # The masks depend only on the labels, so build them once for all runs
    cluster_masks = [y_labels == cluster_idx for cluster_idx in range(n_clusters)]
    populated = [bool(np.any(mask)) for mask in cluster_masks]

    # Step 1: Calculate cluster centers and per-run radii, indexed by cluster
    centers_per_run = []
    radii_per_run = []
    for x_umap in projections:
        centers_run = []
        radii_run = []
        for mask, has_points in zip(cluster_masks, populated):
            if not has_points:
                # Keep the slot so later indices stay aligned with their label
                centers_run.append(None)
                radii_run.append(np.nan)
                continue
            cluster_points = x_umap[mask]
            cluster_center = np.mean(cluster_points, axis=0)
            distances_to_center = np.linalg.norm(cluster_points - cluster_center, axis=1)
            centers_run.append(cluster_center)
            radii_run.append(np.mean(distances_to_center))  # Mean distance to center
        centers_per_run.append(centers_run)
        radii_per_run.append(radii_run)

    # Step 2: Average each cluster's radius across runs
    radii_per_cluster = []
    for cluster_idx in range(n_clusters):
        if populated[cluster_idx] and radii_per_run:
            radii_per_cluster.append(np.mean([radii_run[cluster_idx] for radii_run in radii_per_run]))
        else:
            radii_per_cluster.append(np.nan)

    # Step 3: Count, per run, the points within the averaged radius of each center
    neighbor_counts_full = []
    for x_umap, centers_run in zip(projections, centers_per_run):
        counts_run = []
        for cluster_center, radius in zip(centers_run, radii_per_cluster):
            if cluster_center is None:
                counts_run.append(0)
                continue
            distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
            counts_run.append(np.sum(distances_to_center <= radius))
        neighbor_counts_full.append(counts_run)

    if not neighbor_counts_full:
        return radii_per_cluster, np.full(n_clusters, np.nan)

    neighbor_counts_full = np.array(neighbor_counts_full)  # Shape: (n_runs, n_clusters)
    average_neighbor_counts = np.mean(neighbor_counts_full, axis=0)  # Average across runs

    return radii_per_cluster, average_neighbor_counts

# Define n_neighbors values
n_neighbors_values = [5, 10, 20, 30, 50, 100]
results = []

# Iterate over each n_neighbors value
for n_neighbors in n_neighbors_values:
    if n_neighbors == 5:
        umap_projections = umap_projections_5_01_35
    elif n_neighbors == 10:
        umap_projections = umap_projections_10_01_35
    elif n_neighbors == 20:
        umap_projections = umap_projections_20_01_35
    elif n_neighbors == 30:
        umap_projections = umap_projections_30_01_35
    elif n_neighbors == 50:
        umap_projections = umap_projections_50_01_35
    elif n_neighbors == 100:
        umap_projections = umap_projections_100_01_35

    # Calculate metrics
    radii_per_cluster, average_neighbor_counts = calculate_cluster_metrics(umap_projections, y_train)

    # Store results
    for cluster_idx in range(len(radii_per_cluster)):
        results.append({
            "N": n_neighbors,
            "Cluster": cluster_idx,
            "Radius": np.round(radii_per_cluster[cluster_idx], 3),
            "Number of Neighbors": np.round(average_neighbor_counts[cluster_idx], 0)
        })

# Create a DataFrame for the results
df_results = pd.DataFrame(results)

# Save results for later use
df_results.to_csv("radius_neighbor_analysis_merged_MinDist_01.csv", index=False)

# Pivot table for easy visualization
pivot_table = df_results.pivot(index="Cluster", columns="N", values=["Radius", "Number of Neighbors"])
print(pivot_table)

In [55]:
def plot_mean_neighbor_counts_across_runs(umap_projections_list, n_neighbors_values, y_labels, n_clusters=10):
    """
    Plot, per run, the mean number of points inside each cluster's own radius.

    For every run a cluster's center is the mean of the points labelled with
    its index and its radius is the mean point-to-center distance within that
    same run. The value plotted for a run is the mean, over clusters with a
    positive radius, of the number of points (of any label) within the radius.
    One line is drawn per entry of *umap_projections_list*, labelled with the
    matching entry of *n_neighbors_values*.
    """
    series_per_setting = []

    for umap_projections in umap_projections_list:
        means_per_run = []
        for x_umap in umap_projections:
            centers = []
            radii = []
            for cluster_idx in range(n_clusters):
                members = x_umap[y_labels == cluster_idx]
                if len(members) == 0:
                    # Placeholder entry; the zero radius excludes it from the count below
                    radii.append(0)
                    centers.append(np.array([0, 0]))
                    continue
                center = np.mean(members, axis=0)
                centers.append(center)
                radii.append(np.mean(np.linalg.norm(members - center, axis=1)))

            # Points of any label within each cluster's radius (zero-radius clusters skipped)
            counts_in_radius = [
                np.sum(np.linalg.norm(x_umap - center, axis=1) <= radius)
                for center, radius in zip(centers, radii)
                if radius > 0
            ]
            means_per_run.append(np.mean(counts_in_radius))

        series_per_setting.append(means_per_run)

    # One line per n_neighbors setting, run index on the x-axis (1-based)
    plt.figure(figsize=(10, 6))
    for setting_idx, run_means in enumerate(series_per_setting):
        plt.plot(
            range(1, len(run_means) + 1),
            run_means,
            label=f'n_neighbors={n_neighbors_values[setting_idx]}',
            marker='o',
        )

    plt.xlabel("Run Index")
    plt.ylabel("Mean Number of Points")
    plt.title("Mean Number of Points Across Runs for min_dist = 0.1")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # legend outside, right of the axes
    plt.grid(True)
    plt.show()

In [None]:
# Compare the stored min_dist=0.1 runs of every n_neighbors setting; the list
# order has to match the order of n_neighbors_values used for the legend.
plot_mean_neighbor_counts_across_runs(
    y_labels=y_train,
    n_neighbors_values=n_neighbors_values,
    umap_projections_list=[
        umap_projections_5_01_35, umap_projections_10_01_35, umap_projections_20_01_35,
        umap_projections_30_01_35, umap_projections_50_01_35, umap_projections_100_01_35,
    ],
)

In [None]:
# Reload the radius / neighbor-count table saved for min_dist = 0.1
df_results_01 = pd.read_csv('radius_neighbor_analysis_merged_MinDist_01.csv')

# Density is defined here as neighbor count divided by radius
df_results_01['Density'] = df_results_01['Number of Neighbors'].div(df_results_01['Radius'])

# Row holding the largest density, and the fields of interest from it
max_density_row = df_results_01.loc[df_results_01['Density'].idxmax()]
max_density, max_cluster, max_n_neighbors = (
    max_density_row[column] for column in ('Density', 'Cluster', 'N')
)

print(f"Highest Density: {max_density:.2f}")
print(f"Cluster: {int(max_cluster)}")
print(f"n_neighbors: {int(max_n_neighbors)}")

In [None]:
## FOR min_dist=0.0125.

# Function to calculate cluster metrics
def calculate_cluster_metrics(umap_projections, y_labels, n_clusters=10):
    """
    Calculate average cluster radii and neighbor counts for each cluster over all runs.

    Cluster ``i`` is the set of points whose label equals ``i`` for
    ``i`` in ``0 .. n_clusters - 1``. In every run the cluster center is the
    mean position of its points and its radius is the mean point-to-center
    distance. The reported radius is the average of the per-run radii; the
    neighbor count is the number of points (of any label) lying within that
    averaged radius of the run's center, averaged over runs.

    Centers, radii and counts are all addressed by the same cluster index, so
    the result always has ``n_clusters`` entries, even when ``y_labels`` lacks
    one of the indices or contains labels outside ``0 .. n_clusters - 1``.

    Parameters
    ----------
    umap_projections : sequence of arrays
        One embedding per run, each indexable by a boolean mask over samples.
    y_labels : array-like
        One label per sample.
    n_clusters : int, default 10
        Number of cluster indices to evaluate.

    Returns
    -------
    radii_per_cluster : list
        Average radius per cluster index; NaN for an index with no points.
    average_neighbor_counts : numpy.ndarray
        Average neighbor count per cluster index; 0 for an index with no
        points, NaN everywhere if there are no runs.
    """
    y_labels = np.asarray(y_labels)
    projections = [np.asarray(x_umap) for x_umap in umap_projections]

    # The masks depend only on the labels, so build them once for all runs
    cluster_masks = [y_labels == cluster_idx for cluster_idx in range(n_clusters)]
    populated = [bool(np.any(mask)) for mask in cluster_masks]

    # Step 1: Calculate cluster centers and per-run radii, indexed by cluster
    centers_per_run = []
    radii_per_run = []
    for x_umap in projections:
        centers_run = []
        radii_run = []
        for mask, has_points in zip(cluster_masks, populated):
            if not has_points:
                # Keep the slot so later indices stay aligned with their label
                centers_run.append(None)
                radii_run.append(np.nan)
                continue
            cluster_points = x_umap[mask]
            cluster_center = np.mean(cluster_points, axis=0)
            distances_to_center = np.linalg.norm(cluster_points - cluster_center, axis=1)
            centers_run.append(cluster_center)
            radii_run.append(np.mean(distances_to_center))  # Mean distance to center
        centers_per_run.append(centers_run)
        radii_per_run.append(radii_run)

    # Step 2: Average each cluster's radius across runs
    radii_per_cluster = []
    for cluster_idx in range(n_clusters):
        if populated[cluster_idx] and radii_per_run:
            radii_per_cluster.append(np.mean([radii_run[cluster_idx] for radii_run in radii_per_run]))
        else:
            radii_per_cluster.append(np.nan)

    # Step 3: Count, per run, the points within the averaged radius of each center
    neighbor_counts_full = []
    for x_umap, centers_run in zip(projections, centers_per_run):
        counts_run = []
        for cluster_center, radius in zip(centers_run, radii_per_cluster):
            if cluster_center is None:
                counts_run.append(0)
                continue
            distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
            counts_run.append(np.sum(distances_to_center <= radius))
        neighbor_counts_full.append(counts_run)

    if not neighbor_counts_full:
        return radii_per_cluster, np.full(n_clusters, np.nan)

    neighbor_counts_full = np.array(neighbor_counts_full)  # Shape: (n_runs, n_clusters)
    average_neighbor_counts = np.mean(neighbor_counts_full, axis=0)  # Average across runs

    return radii_per_cluster, average_neighbor_counts

# Define n_neighbors values
n_neighbors_values = [5, 10, 20, 30, 50, 100]
results = []

# Iterate over each n_neighbors value
for n_neighbors in n_neighbors_values:
    if n_neighbors == 5:
        umap_projections = umap_projections_5_00125_35
    elif n_neighbors == 10:
        umap_projections = umap_projections_10_00125_35
    elif n_neighbors == 20:
        umap_projections = umap_projections_20_00125_35
    elif n_neighbors == 30:
        umap_projections = umap_projections_30_00125_35
    elif n_neighbors == 50:
        umap_projections = umap_projections_50_00125_35
    elif n_neighbors == 100:
        umap_projections = umap_projections_100_00125_35

    # Calculate metrics
    radii_per_cluster, average_neighbor_counts = calculate_cluster_metrics(umap_projections, y_train)

    # Store results
    for cluster_idx in range(len(radii_per_cluster)):
        results.append({
            "N": n_neighbors,
            "Cluster": cluster_idx,
            "Radius": np.round(radii_per_cluster[cluster_idx], 3),
            "Number of Neighbors": np.round(average_neighbor_counts[cluster_idx], 0)
        })

# Create a DataFrame for the results
df_results = pd.DataFrame(results)

# Save results for later use
df_results.to_csv("radius_neighbor_analysis_merged_MinDist_00125.csv", index=False)

# Pivot table for easy visualization
pivot_table = df_results.pivot(index="Cluster", columns="N", values=["Radius", "Number of Neighbors"])
print(pivot_table)

# # Visualization of mean neighbor counts across runs
# def plot_mean_neighbor_counts_across_runs(umap_projections_list, n_neighbors_values, y_labels):
#     """
#     Plot mean number of neighbors across runs for different n_neighbors values.
#     """
#     neighbor_counts_avg_runs = []
    
#     for umap_projections in umap_projections_list:
#         # Calculate neighbor counts for each run
#         neighbor_counts = []
#         for run_idx, x_umap in enumerate(umap_projections):
#             mean_neighbors = []
#             for cluster_idx in np.unique(y_labels):
#                 cluster_points = x_umap[y_labels == cluster_idx]
#                 cluster_center = np.mean(cluster_points, axis=0)
#                 distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
#                 mean_neighbors.append(np.mean(distances_to_center))
#             neighbor_counts.append(np.mean(mean_neighbors))  # Mean across clusters for a run
#         neighbor_counts_avg_runs.append(neighbor_counts)

#     # Plot results
#     plt.figure(figsize=(10, 6))
#     for i, counts in enumerate(neighbor_counts_avg_runs):
#         plt.plot(range(1, len(counts) + 1), counts, label=f'n_neighbors={n_neighbors_values[i]}', marker='o')
    
#     plt.xlabel("Run Index")
#     plt.ylabel("Mean Number of Neighbors")
#     plt.title("Mean Number of Neighbors Across Runs")
#     plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # Adjust as needed
#     plt.grid(True)
#     plt.show()

# # Call the visualization function
# plot_mean_neighbor_counts_across_runs(
#     umap_projections_list=[umap_projections_5_00125_35,umap_projections_10_00125_35,umap_projections_20_00125_35,umap_projections_30_00125_35, umap_projections_50_00125_35, umap_projections_100_00125_35],
#     n_neighbors_values=n_neighbors_values,
#     y_labels=y_train
# )

In [53]:
def _mean_points_within_cluster_radius(x_umap, y_labels, n_clusters):
    """Return the mean, over usable clusters, of the number of points inside each cluster's radius."""
    counts = []
    for cluster_idx in range(n_clusters):
        cluster_points = x_umap[y_labels == cluster_idx]
        if len(cluster_points) == 0:
            continue  # no points carry this label in the projection

        cluster_center = np.mean(cluster_points, axis=0)
        # "Dynamic radius": mean distance of the cluster's own points to its center
        dynamic_radius = np.mean(np.linalg.norm(cluster_points - cluster_center, axis=1))
        if dynamic_radius > 0:  # a degenerate (zero-radius) cluster is ignored
            # Count *all* points of the projection (any label) inside the radius
            distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
            counts.append(np.sum(distances_to_center <= dynamic_radius))

    # Explicit NaN instead of np.mean([]) and its RuntimeWarning
    return np.mean(counts) if counts else np.nan


def plot_mean_neighbor_counts_across_runs(umap_projections_list, n_neighbors_values, y_labels, n_clusters=10,
                                          *, title=None, figsize=(10, 6), ylim=None):
    """
    Plot, per run, the mean number of points that fall inside each cluster's radius.

    One line is drawn per entry of ``umap_projections_list``; the x axis is the
    1-based run index and the y axis is the per-run mean (over clusters) of the
    number of points within the cluster's mean-distance radius.

    Parameters:
        umap_projections_list: One collection of runs per n_neighbors setting;
            each run is an array of projected points indexable by a boolean mask.
        n_neighbors_values: Legend labels, aligned with ``umap_projections_list``.
        y_labels: Label of every projected point (compared against 0..n_clusters-1).
        n_clusters: Number of labels to evaluate.
        title: Optional plot title; defaults to the min_dist = 0.0125 title.
        figsize: Figure size passed to ``plt.figure``.
        ylim: Optional ``(low, high)`` y-axis limits; autoscaled when None.
    """
    y_labels = np.asarray(y_labels)

    neighbor_counts_avg_runs = [
        [_mean_points_within_cluster_radius(x_umap, y_labels, n_clusters) for x_umap in umap_projections]
        for umap_projections in umap_projections_list
    ]

    # Plot results
    plt.figure(figsize=figsize)
    for i, counts in enumerate(neighbor_counts_avg_runs):
        plt.plot(range(1, len(counts) + 1), counts, label=f'n_neighbors={n_neighbors_values[i]}', marker='o')

    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Run Index")
    plt.ylabel("Mean Number of Points")
    plt.title(title if title is not None else "Mean Number of Points Across Runs for min_dist = 0.0125")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # Legend outside the axes, right side
    plt.grid(True)
    plt.show()

In [None]:
# Runs for min_dist = 0.0125, ordered to match n_neighbors_values (5, 10, 20, 30, 50, 100)
projections_min_dist_00125 = [
    umap_projections_5_00125_35,
    umap_projections_10_00125_35,
    umap_projections_20_00125_35,
    umap_projections_30_00125_35,
    umap_projections_50_00125_35,
    umap_projections_100_00125_35,
]

# One curve per n_neighbors setting, labelled by the true digit labels
plot_mean_neighbor_counts_across_runs(projections_min_dist_00125, n_neighbors_values, y_train)

In [None]:
# Reload the per-cluster radius / point-count table saved for min_dist = 0.0125
df_results_00125 = pd.read_csv('radius_neighbor_analysis_merged_MinDist_00125.csv')

# Density is defined here as points per unit radius (a linear ratio, not points per unit area)
df_results_00125['Density'] = df_results_00125['Number of Neighbors'].div(df_results_00125['Radius'])

# Locate the (cluster, n_neighbors) row with the largest density
densest_row_label = df_results_00125['Density'].idxmax()
max_density_row = df_results_00125.loc[densest_row_label]

max_n_neighbors = max_density_row['N']
max_cluster = max_density_row['Cluster']
max_density = max_density_row['Density']

# Report the winner
print(f"Highest Density: {max_density:.2f}")
print(f"Cluster: {int(max_cluster)}")
print(f"n_neighbors: {int(max_n_neighbors)}")

In [None]:
## For min_dist=0.8.

# Function to calculate cluster metrics
def calculate_cluster_metrics(umap_projections, y_labels, n_clusters=10):
    """
    Calculate average cluster radii and neighbor counts for each cluster over all runs.

    For every run and every label ``0 .. n_clusters-1`` the cluster center is the
    mean of the points carrying that label, and the run radius is the mean distance
    of those points to the center. The radius reported per cluster is the average
    of the run radii. The neighbor count of a cluster in a run is the number of
    points (of any label) within that averaged radius of the run's center; counts
    are then averaged across runs.

    Parameters:
        umap_projections: Sequence of runs; each run is an array of projected
            points, one row per sample, indexable with a boolean mask.
        y_labels: Label of every sample, compared against ``range(n_clusters)``.
        n_clusters: Number of cluster labels to evaluate.

    Returns:
        (radii_per_cluster, average_neighbor_counts): a list with one averaged
        radius per cluster and an array with one averaged count per cluster.
        A label with no samples gets radius NaN and count 0. With zero runs
        both outputs are all-NaN.
    """
    y_labels = np.asarray(y_labels)
    n_runs = len(umap_projections)
    if n_runs == 0:
        return [np.nan] * n_clusters, np.full(n_clusters, np.nan)

    # Masks depend only on the labels, so build them once for all runs.
    cluster_masks = [y_labels == cluster_idx for cluster_idx in range(n_clusters)]

    # Steps 1 + 2: per-run centers and radii, always indexed by cluster label so
    # that an absent label can never shift the centers of the labels after it.
    centers_per_run = []                           # centers_per_run[run][cluster] -> center or None
    radii_samples = [[] for _ in range(n_clusters)]
    for x_umap in umap_projections:
        centers_run = []
        for cluster_idx, mask in enumerate(cluster_masks):
            cluster_points = x_umap[mask]
            if len(cluster_points) == 0:
                centers_run.append(None)
                continue
            cluster_center = np.mean(cluster_points, axis=0)
            centers_run.append(cluster_center)
            distances_to_center = np.linalg.norm(cluster_points - cluster_center, axis=1)
            radii_samples[cluster_idx].append(np.mean(distances_to_center))
        centers_per_run.append(centers_run)

    # Average radius across runs (NaN for labels that never occur)
    radii_per_cluster = [np.mean(samples) if samples else np.nan for samples in radii_samples]

    # Step 3: count points within the averaged radius of each run's center
    neighbor_counts_full = np.zeros((n_runs, n_clusters), dtype=int)
    for run_idx, x_umap in enumerate(umap_projections):
        for cluster_idx, cluster_center in enumerate(centers_per_run[run_idx]):
            if cluster_center is None:
                continue  # empty cluster: count stays 0
            distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
            neighbor_counts_full[run_idx, cluster_idx] = np.sum(
                distances_to_center <= radii_per_cluster[cluster_idx]
            )

    average_neighbor_counts = np.mean(neighbor_counts_full, axis=0)  # Average across runs

    return radii_per_cluster, average_neighbor_counts

# n_neighbors settings analysed for min_dist = 0.8, each paired with its stored runs
n_neighbors_values = [5, 10, 20, 30, 50, 100]
projections_by_n_neighbors = {
    5: umap_projections_5_08_35,
    10: umap_projections_10_08_35,
    20: umap_projections_20_08_35,
    30: umap_projections_30_08_35,
    50: umap_projections_50_08_35,
    100: umap_projections_100_08_35,
}
results = []

# One row per (n_neighbors, cluster): averaged radius and averaged point count
for n_neighbors in n_neighbors_values:
    umap_projections = projections_by_n_neighbors[n_neighbors]
    radii_per_cluster, average_neighbor_counts = calculate_cluster_metrics(umap_projections, y_train)

    for cluster_idx, (radius, neighbor_count) in enumerate(zip(radii_per_cluster, average_neighbor_counts)):
        results.append({
            "N": n_neighbors,
            "Cluster": cluster_idx,
            "Radius": np.round(radius, 3),
            "Number of Neighbors": np.round(neighbor_count, 0)
        })

# Tabulate and persist so later cells can reload the numbers without recomputing
df_results = pd.DataFrame(results)
df_results.to_csv("radius_neighbor_analysis_merged_MinDist_08.csv", index=False)

# Clusters as rows, n_neighbors as columns, for both metrics side by side
pivot_table = df_results.pivot(index="Cluster", columns="N", values=["Radius", "Number of Neighbors"])
print(pivot_table)

# # Visualization of mean neighbor counts across runs
# def plot_mean_neighbor_counts_across_runs(umap_projections_list, n_neighbors_values, y_labels):
#     """
#     Plot mean number of neighbors across runs for different n_neighbors values.
#     """
#     neighbor_counts_avg_runs = []
    
#     for umap_projections in umap_projections_list:
#         # Calculate neighbor counts for each run
#         neighbor_counts = []
#         for run_idx, x_umap in enumerate(umap_projections):
#             mean_neighbors = []
#             for cluster_idx in np.unique(y_labels):
#                 cluster_points = x_umap[y_labels == cluster_idx]
#                 cluster_center = np.mean(cluster_points, axis=0)
#                 distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
#                 mean_neighbors.append(np.mean(distances_to_center))
#             neighbor_counts.append(np.mean(mean_neighbors))  # Mean across clusters for a run
#         neighbor_counts_avg_runs.append(neighbor_counts)

#     # Plot results
#     plt.figure(figsize=(10, 6))
#     for i, counts in enumerate(neighbor_counts_avg_runs):
#         plt.plot(range(1, len(counts) + 1), counts, label=f'n_neighbors={n_neighbors_values[i]}', marker='o')
    
#     plt.xlabel("Run Index")
#     plt.ylabel("Mean Number of Neighbors")
#     plt.title("Mean Number of Neighbors Across Runs")
#     plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # Adjust as needed
#     plt.grid(True)
#     plt.show()

# # Call the visualization function
# plot_mean_neighbor_counts_across_runs(
#     umap_projections_list=[umap_projections_5_08_35,umap_projections_10_08_35,umap_projections_20_08_35,umap_projections_30_08_35, umap_projections_50_08_35, umap_projections_100_08_35],
#     n_neighbors_values=n_neighbors_values,
#     y_labels=y_train
# )

In [51]:
# Adjusted Visualization of mean neighbor counts across runs
def _mean_points_within_cluster_radius(x_umap, y_labels, n_clusters):
    """Return the mean, over usable clusters, of the number of points inside each cluster's radius."""
    counts = []
    for cluster_idx in range(n_clusters):
        cluster_points = x_umap[y_labels == cluster_idx]
        if len(cluster_points) == 0:
            continue  # no points carry this label in the projection

        cluster_center = np.mean(cluster_points, axis=0)
        # "Dynamic radius": mean distance of the cluster's own points to its center
        dynamic_radius = np.mean(np.linalg.norm(cluster_points - cluster_center, axis=1))
        if dynamic_radius > 0:  # a degenerate (zero-radius) cluster is ignored
            # Count *all* points of the projection (any label) inside the radius
            distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
            counts.append(np.sum(distances_to_center <= dynamic_radius))

    # Explicit NaN instead of np.mean([]) and its RuntimeWarning
    return np.mean(counts) if counts else np.nan


def plot_mean_neighbor_counts_across_runs(umap_projections_list, n_neighbors_values, y_labels, n_clusters=10,
                                          *, title=None, figsize=(12, 8), ylim=(3600, 4600)):
    """
    Plot mean number of neighbors across runs for different n_neighbors values with adjusted scaling for readability.

    One line is drawn per entry of ``umap_projections_list``; the x axis is the
    1-based run index and the y axis is the per-run mean (over clusters) of the
    number of points within the cluster's mean-distance radius.

    Parameters:
        umap_projections_list: One collection of runs per n_neighbors setting;
            each run is an array of projected points indexable by a boolean mask.
        n_neighbors_values: Legend labels, aligned with ``umap_projections_list``.
        y_labels: Label of every projected point (compared against 0..n_clusters-1).
        n_clusters: Number of labels to evaluate.
        title: Optional plot title; defaults to the min_dist = 0.8 title.
        figsize: Figure size passed to ``plt.figure``.
        ylim: ``(low, high)`` y-axis limits. The default keeps the original fixed
            3600-4600 window; pass None to autoscale so that curves outside that
            window are not silently cut off.
    """
    y_labels = np.asarray(y_labels)

    neighbor_counts_avg_runs = [
        [_mean_points_within_cluster_radius(x_umap, y_labels, n_clusters) for x_umap in umap_projections]
        for umap_projections in umap_projections_list
    ]

    # Plot results
    plt.figure(figsize=figsize)
    for i, counts in enumerate(neighbor_counts_avg_runs):
        plt.plot(range(1, len(counts) + 1), counts, label=f'n_neighbors={n_neighbors_values[i]}', marker='o')

    # Adjust y-axis scale and appearance
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Run Index")
    plt.ylabel("Mean Number of Points")
    plt.title(title if title is not None else "Mean Number of Points Across Runs for min_dist = 0.8")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # Legend outside the axes, right side
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()


In [None]:
# Runs for min_dist = 0.8, ordered to match n_neighbors_values (5, 10, 20, 30, 50, 100)
projections_min_dist_08 = [
    umap_projections_5_08_35,
    umap_projections_10_08_35,
    umap_projections_20_08_35,
    umap_projections_30_08_35,
    umap_projections_50_08_35,
    umap_projections_100_08_35,
]

# One curve per n_neighbors setting, labelled by the true digit labels
plot_mean_neighbor_counts_across_runs(projections_min_dist_08, n_neighbors_values, y_train)

In [None]:
# Reload the per-cluster radius / point-count table saved for min_dist = 0.8
df_results_08 = pd.read_csv('radius_neighbor_analysis_merged_MinDist_08.csv')

# Density is defined here as points per unit radius (a linear ratio, not points per unit area)
df_results_08['Density'] = df_results_08['Number of Neighbors'].div(df_results_08['Radius'])

# Locate the (cluster, n_neighbors) row with the largest density
densest_row_label = df_results_08['Density'].idxmax()
max_density_row = df_results_08.loc[densest_row_label]

max_n_neighbors = max_density_row['N']
max_cluster = max_density_row['Cluster']
max_density = max_density_row['Density']

# Report the winner
print(f"Highest Density: {max_density:.2f}")
print(f"Cluster: {int(max_cluster)}")
print(f"n_neighbors: {int(max_n_neighbors)}")

In [None]:
## USE THIS - DON'T OVERTHINK ##

# # Function to calculate cluster centers, radii, and neighbor counts
# def calculate_cluster_metrics(umap_projections, y_labels, n_clusters=10, n_runs=35):
#     # Step 1: Calculate Cluster Centers for Each Run
#     cluster_centers_full = []
#     for run_idx, x_umap in enumerate(umap_projections):
#         cluster_centers_run = []
#         for label in np.unique(y_labels):  # Iterate over all labels
#             cluster_points = x_umap[y_labels == label]
#             if len(cluster_points) > 0:
#                 cluster_center = np.mean(cluster_points, axis=0)
#                 cluster_centers_run.append(cluster_center)
#         cluster_centers_full.append(np.array(cluster_centers_run))

#     cluster_centers_full = np.array(cluster_centers_full)  # Shape: (n_runs, n_clusters, 2)

#     # Step 2: Calculate the Average Radius Across Runs for Each Cluster
#     radii_per_cluster = []
#     for cluster_idx in range(n_clusters):
#         radii_cluster = []
#         for run_idx, x_umap in enumerate(umap_projections):
#             cluster_center = cluster_centers_full[run_idx][cluster_idx]
#             cluster_points = x_umap[y_labels == cluster_idx]
#             if len(cluster_points) > 0:
#                 distances_to_center = np.linalg.norm(cluster_points - cluster_center, axis=1)
#                 dynamic_radius = np.mean(distances_to_center)  # Use mean distance as the radius
#                 radii_cluster.append(dynamic_radius)
#         radii_per_cluster.append(np.mean(radii_cluster))  # Average radius across runs

#     # Step 3: Count Neighbors Using the Average Radius
#     neighbor_counts_full = []
#     for run_idx, x_umap in enumerate(umap_projections):
#         counts_run = []
#         for cluster_idx, cluster_center in enumerate(cluster_centers_full[run_idx]):
#             radius = radii_per_cluster[cluster_idx]  # Use the average radius
#             distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
#             count = np.sum(distances_to_center <= radius)  # Count points within the radius
#             counts_run.append(count)
#         neighbor_counts_full.append(counts_run)

#     neighbor_counts_full = np.array(neighbor_counts_full)  # Shape: (n_runs, n_clusters)
#     average_neighbor_counts = np.mean(neighbor_counts_full, axis=0)  # Average across runs

#     return radii_per_cluster, average_neighbor_counts

# # Define n_neighbors values
# n_neighbors_values = [10, 50, 100]

# # Placeholder for results
# results = []

# # Iterate over each n_neighbors value
# for n_neighbors in n_neighbors_values:
#     if n_neighbors == 10:
#         umap_projections = umap_projections_10_01_35
#     elif n_neighbors == 50:
#         umap_projections = umap_projections_50_01_35
#     elif n_neighbors == 100:
#         umap_projections = umap_projections_100_01_35

#     # Calculate metrics
#     radii_per_cluster, average_neighbor_counts = calculate_cluster_metrics(umap_projections, y_train)

#     # Store results
#     for cluster_idx in range(len(radii_per_cluster)):
#         results.append({
#             "N": n_neighbors,
#             "Cluster": cluster_idx,
#             "Radius": np.round(radii_per_cluster[cluster_idx],3),
#             "Number of Neighbors": np.round(average_neighbor_counts[cluster_idx],0)
#         })

# # Create a DataFrame for easier analysis
# df_results = pd.DataFrame(results)

# # Save results
# df_results.to_csv("radius_neighbor_analysis_merged.csv", index=False)

# # Pivot table for easier comparison
# pivot_table = df_results.pivot(index="Cluster", columns="N", values=["Radius", "Number of Neighbors"])
# print(pivot_table)

In [None]:
from matplotlib.patches import Circle

# Visualization of UMAP clusters with dynamic radii for each n_neighbors value
def plot_umap_clusters_with_radii(umap_projections, cluster_centers, radii_per_cluster, n_neighbors, y_labels):
    """
    Scatter the first run of a UMAP projection set and overlay one dashed circle per cluster.

    Parameters:
        umap_projections: Runs for a single n_neighbors value; only run 0 is drawn.
        cluster_centers: One (x, y) center per cluster, used as the circle centers.
        radii_per_cluster: One radius per cluster, paired with ``cluster_centers``.
        n_neighbors: Value shown in the plot title.
        y_labels: Per-point labels used to colour the scatter.
    """
    first_run = umap_projections[0]

    plt.figure(figsize=(12, 8))
    plt.scatter(first_run[:, 0], first_run[:, 1], s=1, c=y_labels, cmap='Spectral', alpha=0.5, label='UMAP Points')

    # Outline each cluster and write its index at the center
    axes = plt.gca()
    for cluster_idx, (center, radius) in enumerate(zip(cluster_centers, radii_per_cluster)):
        outline = Circle(center, radius, color='black', fill=False, linestyle='--', linewidth=1.5)
        axes.add_patch(outline)
        axes.text(center[0], center[1], f'{cluster_idx}', color='black', fontsize=10, ha='center', va='center')

    plt.title(f"UMAP Clusters with Dynamic Radii (n_neighbors={n_neighbors})")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.colorbar(label='Digit Labels')
    plt.grid(True)
    plt.legend(loc='upper right')
    plt.show()

# Iterate over each n_neighbors value and visualize
for n_neighbors, umap_projections in zip(
    [10, 50, 100],
    [umap_projections_10_01_35, umap_projections_50_01_35, umap_projections_100_01_35]
):
    # The scatter shows run 0, so the circles are centered on run 0's own centroids.
    # (Centers averaged over unaligned runs would not correspond to any plotted point
    # cloud, and `cluster_centers_full` only exists inside calculate_cluster_metrics.)
    first_run = umap_projections[0]
    cluster_centers_first_run = np.array([
        np.mean(first_run[y_train == digit], axis=0) for digit in range(10)
    ])

    # Radii are recomputed for *this* projection set instead of reusing whatever
    # `radii_per_cluster` was left over from an earlier cell.
    radii_per_cluster, _ = calculate_cluster_metrics(umap_projections, y_train)

    # Visualize the clusters
    plot_umap_clusters_with_radii(
        umap_projections=umap_projections,
        cluster_centers=cluster_centers_first_run,
        radii_per_cluster=radii_per_cluster,
        n_neighbors=n_neighbors,
        y_labels=y_train
    )

--------------

### n=5 with matrices aligned

**Instead of using the mean UMAP projection, I used the ARI to choose the best run and used it as the reference to align all the runs with Procrustes analysis.
The resulting aligned_projections then replace mean_projections.**

In [155]:
# Persist the Procrustes-aligned runs and their per-run cluster centroids for n_neighbors=5, min_dist=0.1.
# NOTE(review): this cell appears before the cells that compute both arrays; run it only after they exist.
np.save('aligned_projections_5_01_35.npy', aligned_projections)
np.save('aligned_cluster_centroids_per_run_5_01_35.npy', aligned_cluster_centroids_per_run)

In [2]:
# Reload previously saved aligned projections.
# NOTE(review): the save cell writes 'aligned_projections_5_01_35.npy'; this reads
# 'aligned_projections.npy' — confirm this is the intended file and not an older export.
aligned_projections = np.load('aligned_projections.npy')

In [None]:
# Reload the stored UMAP runs for n_neighbors=5, min_dist=0.1 (iterated run by run in the cells below).
umap_projections_5_01_35= np.load('umap_projections_5_01_35.npy')

In [None]:
# ARI of a 10-cluster K-Means partition against the true digit labels, one score per run
ari_scores = []

for run_idx, projection in enumerate(umap_projections_5_01_35):
    # Cluster this run's 2-D embedding into 10 groups (digits 0-9), fixed seed
    kmeans = KMeans(n_clusters=10, random_state=42)
    cluster_labels = kmeans.fit(projection).labels_

    # Agreement between the K-Means partition and the ground truth
    ari_score = adjusted_rand_score(y_train, cluster_labels)
    ari_scores.append(ari_score)
    print(f"Run {run_idx}: Adjusted Rand Index (ARI): {ari_score}")

# The run with the highest ARI becomes the alignment reference
best_run_index = np.argmax(ari_scores)
best_ari_score = ari_scores[best_run_index]

print(f"Best run based on ARI: Run {best_run_index} with ARI Score: {best_ari_score}")

In [None]:
# `procrustes` is not part of the notebook's import cell; import it here so the cell is self-contained.
from scipy.spatial import procrustes

# Procrustes alignment using the best run (highest ARI, see best_run_index) as the reference
reference_projection = umap_projections_5_01_35[best_run_index]

aligned_projections = []
procrustes_distances = []

for i, run_projection in enumerate(umap_projections_5_01_35):
    # scipy standardizes both inputs (centered, unit Frobenius norm) and returns the
    # standardized reference, the run transformed to best fit it, and their disparity.
    # Only the transformed run and the disparity are needed here.
    _, mtx2, disparity = procrustes(reference_projection, run_projection)
    aligned_projections.append(mtx2)
    procrustes_distances.append(disparity)
    print(f"Run {i}: Procrustes disparity: {disparity}")

# Stack the aligned runs into one array, one entry per run
aligned_projections = np.array(aligned_projections)

In [None]:
# Define a color map for 10 classes (digits 0-9)
colors = plt.cm.tab10(np.linspace(0, 1, 10))

# aligned_projections holds one aligned embedding per run; average over the run axis to get
# one mean position per sample. (The label mask must be applied to this per-sample array,
# not to the run axis of the stacked array.)
mean_aligned_projection = np.mean(aligned_projections, axis=0)

# Create a scatter plot of the mean projections
plt.figure(figsize=(10, 8))
for label in range(10):  # Assuming digits 0-9
    # Select points corresponding to the current label
    points = mean_aligned_projection[y_train == label]
    plt.scatter(points[:, 0], points[:, 1], color=colors[label], label=f"Digit {label}")

# Add labels, legend, and title
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.title("Mean UMAP Projections (n_neighbors=5, min_dist=0.1)")
plt.legend(loc='best', title="Digits")
plt.grid(True)
plt.show()

In [None]:
# Overlay the first three aligned runs to eyeball how well they coincide
plt.figure(figsize=(10, 8))
for i in range(3):
    run_xy = aligned_projections[i]
    plt.scatter(run_xy[:, 0], run_xy[:, 1], s=10, label=f"Run {i}")

plt.title("Procrustes-Aligned UMAP Projections")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Summary statistics of the per-run Procrustes disparities
mean_disparity = np.mean(procrustes_distances)
std_disparity = np.std(procrustes_distances)

# A run counts as "good" when its disparity is at most one standard deviation above the mean
threshold = mean_disparity + std_disparity
good_runs = [
    run_idx
    for run_idx, run_disparity in enumerate(procrustes_distances)
    if run_disparity <= threshold
]

print(f"Mean Procrustes disparity: {mean_disparity}")
print(f"Standard deviation of disparities: {std_disparity}")
print(f"Threshold for good runs: {threshold}")
print(f"Good runs based on Procrustes disparity: {good_runs}")

In [None]:
# Sanity check: recompute the ARI of every run after Procrustes alignment
aligned_ari_scores = []

for aligned_run in aligned_projections:
    kmeans = KMeans(n_clusters=10, random_state=42)
    cluster_labels = kmeans.fit(aligned_run).labels_
    ari_score = adjusted_rand_score(y_train, cluster_labels)
    aligned_ari_scores.append(ari_score)

# Show the per-run scores
print(f"Aligned ARI scores: {aligned_ari_scores}")

In [None]:
# Centroid of every digit class (true labels 0-9) in every aligned run.
# Outer level: one entry per run; inner level: one centroid per digit.
aligned_cluster_centroids_per_run = np.array([
    [np.mean(aligned_run[y_train == digit], axis=0) for digit in range(10)]
    for aligned_run in aligned_projections
])  # Shape: (n_runs, n_clusters, 2)

# Uncomment to inspect the centroids of the first run
# print(f"Cluster centroids for first aligned run:\n{aligned_cluster_centroids_per_run[0]}")


In [None]:

# Spread of each cluster's centroid across runs, separately for the x and y coordinate.
# Axis 0 of the centroid array is the run, axis 1 the cluster, axis 2 the coordinate.
std_dev_x_n5 = np.array([np.std(aligned_cluster_centroids_per_run[:, cluster, 0]) for cluster in range(10)])
std_dev_y_n5 = np.array([np.std(aligned_cluster_centroids_per_run[:, cluster, 1]) for cluster in range(10)])

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_n5)
print("Standard deviation of y coordinates per cluster:", std_dev_y_n5)

In [None]:
# Per-cluster acceptance box: centroid mean across runs +/- 3 standard deviations in x and y.
# The box does not depend on the trial, so it is built once per cluster.
bounds_per_cluster = []
for cluster in range(10):
    mean_x, mean_y = np.mean(aligned_cluster_centroids_per_run[:, cluster, :], axis=0)
    bounds_per_cluster.append((
        mean_x - 3 * std_dev_x_n5[cluster], mean_x + 3 * std_dev_x_n5[cluster],
        mean_y - 3 * std_dev_y_n5[cluster], mean_y + 3 * std_dev_y_n5[cluster],
    ))

# One row per (trial, cluster): the centroid and whether it lies inside its cluster's box
data_umap_n5 = []
for trial in range(len(aligned_cluster_centroids_per_run)):
    for cluster, (lower_bound_x, upper_bound_x, lower_bound_y, upper_bound_y) in enumerate(bounds_per_cluster):
        centroid_coord = aligned_cluster_centroids_per_run[trial, cluster]
        inside_3_std = (lower_bound_x <= centroid_coord[0] <= upper_bound_x) and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)

        # Trials are reported 1-based
        data_umap_n5.append([trial + 1, cluster, centroid_coord, inside_3_std])

# Tabulate the checks
df_results_umap_n5 = pd.DataFrame(data_umap_n5, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 3 std'])

In [None]:
# Group the per-(trial, cluster) checks by trial
grouped_by_trial = df_results_umap_n5.groupby('Trial')

# A trial passes only if every one of its clusters has 'Inside 3 std' == True
trials_with_all_true = grouped_by_trial['Inside 3 std'].all()

# Keep the (1-based) trial numbers that passed
valid_trials = trials_with_all_true.loc[trials_with_all_true].index.tolist()

print(f"Trials where all clusters are inside 3 std: {valid_trials}")

In [159]:
# Filter aligned projections to keep only valid runs.
# valid_trials holds 1-based trial numbers, hence the `run - 1` conversion to array indices.
# (Snapshot of valid_trials from an earlier execution, kept for reference only:)
# valid_runs = [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 29, 30, 31, 33, 34, 35]
aligned_projections_5_01_35 = aligned_projections[[run - 1 for run in valid_trials]]

In [None]:
# Display the shape of the filtered array; its first axis has one entry per trial kept in valid_trials.
aligned_projections_5_01_35.shape

In [None]:
# Keep only the centroids of the valid runs (valid_trials is 1-based)
aligned_cluster_centroids_per_run_filtered = aligned_cluster_centroids_per_run[[run - 1 for run in valid_trials]]

# Pairwise Euclidean distances between the 10 centroids, one (10, 10) matrix per valid run
distance_matrices_5_01_35_al = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in aligned_cluster_centroids_per_run_filtered
])  # Shape: (n_valid_runs, 10, 10)

# Element-wise mean over the valid runs
mean_distance_matrix_5_01_35_al = np.mean(distance_matrices_5_01_35_al, axis=0)  # Shape: (10, 10)

# Min-max normalization of the mean matrix to the [0, 1] range
smallest_distance = np.min(mean_distance_matrix_5_01_35_al)
largest_distance = np.max(mean_distance_matrix_5_01_35_al)
normalized_mean_distance_matrix_5_01_35_al = (mean_distance_matrix_5_01_35_al - smallest_distance) / (
    largest_distance - smallest_distance
)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_5_01_35_al, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_5_01_35_al.npy', distance_matrices_5_01_35_al)
np.save('mean_distance_matrix_neighbors_5_01_35_al.npy', mean_distance_matrix_5_01_35_al)

# Show the (un-normalized) mean matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_5_01_35_al}")

In [None]:
# Placeholder for cluster centroids (using true labels)
aligned_cluster_centroids_true_labels_filtered = []

# Calculate centroids for each valid run using true labels.
# This is the n_neighbors=5 analysis and every output below is named `_5_01_35`, so the
# loop must read the filtered n=5 projections (not the n=100 ones).
for run_projection in aligned_projections_5_01_35:  # Already filtered projections
    centroids = []
    for cluster_label in range(10):  # Assuming 10 clusters (digits 0-9)
        # Select points belonging to the current cluster
        cluster_points = run_projection[y_train == cluster_label]
        # Compute the centroid
        centroid = np.mean(cluster_points, axis=0)
        centroids.append(centroid)
    # Store centroids for this run
    aligned_cluster_centroids_true_labels_filtered.append(np.array(centroids))

# Placeholder for pairwise distance matrices
distance_matrices_5_01_35_labels = []

# Calculate pairwise distances between centroids for each valid run
for centroids in aligned_cluster_centroids_true_labels_filtered:
    # Calculate the pairwise Euclidean distance between centroids for this run
    distance_matrix = cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    distance_matrices_5_01_35_labels.append(distance_matrix)

# Convert the list of distance matrices to a NumPy array
distance_matrices_5_01_35_labels = np.array(distance_matrices_5_01_35_labels)  # Shape: (n_valid_runs, 10, 10)

# Calculate the mean distance matrix across all valid runs
mean_distance_matrix_5_01_35_labels = np.mean(distance_matrices_5_01_35_labels, axis=0)  # Shape: (10, 10)

# Normalize the mean distance matrix (min-max scaling to [0, 1])
normalized_mean_distance_matrix_5_01_35_labels = (mean_distance_matrix_5_01_35_labels - np.min(mean_distance_matrix_5_01_35_labels)) / (
    np.max(mean_distance_matrix_5_01_35_labels) - np.min(mean_distance_matrix_5_01_35_labels)
)

# Plot the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_5_01_35_labels, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (True Labels, k=10, n_neighbors=5)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
np.save('distance_matrices_neighbors_5_01_35_labels.npy', distance_matrices_5_01_35_labels)
np.save('mean_distance_matrix_neighbors_5_01_35_labels.npy', mean_distance_matrix_5_01_35_labels)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs (True Labels):\n{mean_distance_matrix_5_01_35_labels}")



In [None]:
# For each valid trial, print the norm of the difference between the centroids stored in
# cluster_centroids_per_run and the filtered Procrustes-aligned centroids.
# NOTE(review): `run` (the 1-based trial number) is never used; both arrays are indexed by the
# position `i`. If cluster_centroids_per_run still holds every run rather than only the valid
# ones, it should presumably be indexed with `run - 1` — confirm against where it is built.
for i, run in enumerate(valid_trials):
    print(f"Run {i}: Centroids difference (A vs B):")
    print(np.linalg.norm(cluster_centroids_per_run[i] - aligned_cluster_centroids_per_run_filtered[i]))


In [None]:
# Element-wise absolute difference between the mean centroid-distance matrix loaded/computed
# earlier (mean_distance_matrix_5_01_35) and the one built from the aligned, filtered runs.
# NOTE(review): the two matrices may be on different scales if the aligned runs were
# rescaled during alignment — confirm before interpreting the raw differences.
distance_diff = np.abs(mean_distance_matrix_5_01_35 - mean_distance_matrix_5_01_35_al)
print(f"Difference between mean distance matrices:\n{distance_diff}")

In [None]:
# Same comparison as above, but on the min-max normalized matrices.
normalized_diff = np.abs(normalized_mean_distance_matrix_5_01_35 - normalized_mean_distance_matrix_5_01_35_al)
print(f"Difference between normalized mean distance matrices:\n{normalized_diff}")


-----------

### n=10

In [16]:
# Reload every stored artifact of the n_neighbors=10, min_dist=0.1 analysis.
# Raw runs and their per-sample mean / standard deviation:
umap_projections_10_01_35 = np.load('umap_projections_10_01_35.npy')
mean_projection_10_01_35 = np.load('mean_projection_10_01_35.npy')
std_projection_10_01_35 = np.load('std_projection_10_01_35.npy')
# Interval limit matrices, raw and normalized:
lower_limit_intconf_matrix_10_01_35 = np.load('lower_limit_intconf_matrix_10_01_35.npy')
upper_limit_intconf_matrix_10_01_35 = np.load('upper_limit_intconf_matrix_10_01_35.npy')
norm_lower_limit_intconf_matrix_10_01_35 = np.load('norm_lower_limit_intconf_matrix_10_01_35.npy')
norm_upper_limit_intconf_matrix_10_01_35 = np.load('norm_upper_limit_intconf_matrix_10_01_35.npy')
# Distance matrices and their mean:
distance_matrices_10_01_35 = np.load('distance_matrices_neighbors_10_01_35.npy')
mean_distance_matrix_10_01_35 = np.load('mean_distance_matrix_neighbors_10_01_35.npy')

In [None]:
# Experiment configuration: UMAP hyper-parameters and number of repetitions
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 35  # Number of runs

# One embedding per repetition is collected here
umap_projections_10_01_35 = []

# Repeat the embedding n_runs times; random_state=None passes no fixed seed
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_10_01_35.append(projection)

# Stack the per-run embeddings along a new leading (run) axis
umap_projections_10_01_35 = np.asarray(umap_projections_10_01_35)

# Point-wise mean and standard deviation over the run axis
mean_projection_10_01_35 = umap_projections_10_01_35.mean(axis=0)
std_projection_10_01_35 = umap_projections_10_01_35.std(axis=0)

# Cache the embeddings and their summary statistics on disk
np.save('umap_projections_10_01_35.npy', umap_projections_10_01_35)
np.save('mean_projection_10_01_35.npy', mean_projection_10_01_35)
np.save('std_projection_10_01_35.npy', std_projection_10_01_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_10_01_35'.")

In [None]:
### USE THIS ###

# Run-level outlier filter: rather than thresholding individual points, each
# run is summarised by its average distance to the mean projection and kept
# only if that average does not exceed the 90th percentile over all runs.

# Euclidean distance of every point, in every run, to its mean position
distances_to_mean_10_01_35 = np.sqrt(
    np.square(umap_projections_10_01_35 - mean_projection_10_01_35[np.newaxis, :, :]).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One summary value per run
average_distances_per_run = distances_to_mean_10_01_35.mean(axis=1)  # Shape: (n_runs,)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off
valid_runs = [
    run_index
    for run_index, run_average in enumerate(average_distances_per_run)
    if run_average <= average_distance_threshold
]

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [20]:
# Min-max scale the mean distance matrix: smallest entry -> 0, largest -> 1
normalized_mean_distance_matrix_10_01_35 = (
    mean_distance_matrix_10_01_35 - mean_distance_matrix_10_01_35.min()
) / (mean_distance_matrix_10_01_35.max() - mean_distance_matrix_10_01_35.min())

In [None]:
# Centroid of every label in every valid run, then the pairwise centroid
# distances per run and their average across runs.
cluster_centroids_per_run = []

for run in valid_runs:
    # Embedding produced by this run
    projections = umap_projections_10_01_35[run]  # Shape: (n_samples, 2)

    # Mean position of the points carrying each label 0-9 in y_train
    centroids = []
    for cluster_label in range(10):
        cluster_points = projections[y_train == cluster_label]
        centroid = cluster_points.mean(axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.asarray(centroids))

# One (10, 10) matrix of centroid-to-centroid distances per valid run
distance_matrices_10_01_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids)  # cdist defaults to the Euclidean metric
    distance_matrices_10_01_35.append(distance_matrix)

# Stack into a single array
distance_matrices_10_01_35 = np.asarray(distance_matrices_10_01_35)  # Shape: (n_valid_runs, 10, 10)

# Average over the run axis
mean_distance_matrix_10_01_35 = distance_matrices_10_01_35.mean(axis=0)  # Shape: (10, 10)

# Min-max scale the averaged matrix to [0, 1]
normalized_mean_distance_matrix_10_01_35 = (
    mean_distance_matrix_10_01_35 - mean_distance_matrix_10_01_35.min()
) / (mean_distance_matrix_10_01_35.max() - mean_distance_matrix_10_01_35.min())

# Heatmap of the normalized matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_10_01_35,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=10)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Cache the per-run matrices and their mean
np.save('distance_matrices_neighbors_10_01_35.npy', distance_matrices_10_01_35)
np.save('mean_distance_matrix_neighbors_10_01_35.npy', mean_distance_matrix_10_01_35)

# Show the (un-normalized) mean matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_10_01_35}")

In [None]:
# Build a weighted graph from the normalized mean centroid-distance matrix
# (entries rounded to 3 decimals so the edge labels stay readable).
G_10_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_10_01_35, 3))

# Persist the graph as its weighted adjacency matrix. Handing the Graph object
# itself to np.save is not a faithful serialization (edges and weights are not
# written); the matrix can be turned back into a graph with
# nx.from_numpy_array(np.load('G_10_01_35.npy')).
np.save('G_10_01_35.npy', nx.to_numpy_array(G_10_01_35, nodelist=sorted(G_10_01_35.nodes)))

# Draw the graph; the fixed seed keeps the layout reproducible
pos = nx.spring_layout(G_10_01_35, seed=42)  # positions for all nodes
nx.draw(G_10_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_10_01_35, 'weight')
nx.draw_networkx_edge_labels(G_10_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph first: everything below
# (total weight, saved file, drawing) depends on it.
mst_10_01_35 = nx.minimum_spanning_tree(G_10_01_35)

# Persist the tree as its weighted adjacency matrix. Handing the Graph object
# itself to np.save is not a faithful serialization (edges and weights are not
# written); reload with nx.from_numpy_array(np.load('mst_10_01_35.npy')).
np.save('mst_10_01_35.npy', nx.to_numpy_array(mst_10_01_35, nodelist=sorted(mst_10_01_35.nodes)))

# Calculate the total weight of the MST (sum of its edge weights)
total_weight = sum(nx.get_edge_attributes(mst_10_01_35, 'weight').values())

# Print the total weight
print(f"Total weight of the MST: {total_weight}")

# Define positions for all nodes; the fixed seed keeps the layout reproducible
pos = nx.spring_layout(mst_10_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_10_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_10_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_10_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=10, min_dist=0.1")
plt.show()

In [None]:
# Compute the minimum spanning tree (MST) of the graph
mst_10_01_35 = nx.minimum_spanning_tree(G_10_01_35)

# Save the MST for later use, as its weighted adjacency matrix. Handing the
# Graph object itself to np.save is not a faithful serialization (edges and
# weights are not written); reload with
# nx.from_numpy_array(np.load('mst_10_01_35.npy')).
np.save('mst_10_01_35.npy', nx.to_numpy_array(mst_10_01_35, nodelist=sorted(mst_10_01_35.nodes)))

# Define positions for all nodes in the MST using a spring layout
# (fixed seed so the picture is reproducible)
pos = nx.spring_layout(mst_10_01_35, seed=42)

# Increase figure size for better visibility
plt.figure(figsize=(12, 8))

# Draw the MST with larger nodes, thicker edges, and a larger font for labels
nx.draw(
    mst_10_01_35,
    pos,
    with_labels=True,
    node_color='lightblue',
    edge_color='red',
    node_size=1000,  # Larger node size for better visibility
    font_size=12,    # Larger font size for node labels
    width=3          # Thicker edge lines
)

# Get edge weights and format them to 2 decimal places for clarity
edge_labels = nx.get_edge_attributes(mst_10_01_35, 'weight')
formatted_edge_labels = {k: f"{v:.2f}" for k, v in edge_labels.items()}

# Draw edge labels with formatted weights
nx.draw_networkx_edge_labels(
    mst_10_01_35,
    pos,
    edge_labels=formatted_edge_labels,
    font_size=20,    # Font size for edge labels
    label_pos=0.5    # Position edge labels at the center of edges
)

# Add a title to the plot
plt.title("MST UMAP - n_neighbors=10, min_dist=0.1", fontsize=16)

# Display the plot
plt.show()

In [None]:
# Spread of each centroid-pair distance over the run axis (axis 0)
distance_matrix_std_10_01_35 = distance_matrices_10_01_35.std(axis=0)  # Shape: (n_clusters, n_clusters)

# Cache the result
np.save("distance_matrix_std_10_01_35.npy", distance_matrix_std_10_01_35)

# Show it
print("Standard Deviation Distance Matrix (10_01_35):\n", distance_matrix_std_10_01_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
n_runs = 35  # Number of UMAP runs launched (kept for the other cells)

# The distance matrices are stacked along axis 0, one per run that survived
# the outlier filter, so the sample size behind the mean/std is the length of
# that axis rather than n_runs.
n_valid_runs = distance_matrices_10_01_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_10_01_35 = distance_matrix_std_10_01_35 / np.sqrt(n_valid_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_10_01_35 = z_score * sem_matrix_10_01_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_10_01_35 = mean_distance_matrix_10_01_35 - margin_of_error_matrix_10_01_35
upper_limit_intconf_matrix_10_01_35 = mean_distance_matrix_10_01_35 + margin_of_error_matrix_10_01_35

# Distances cannot be negative, so clip the lower bound at 0
lower_limit_intconf_matrix_10_01_35 = np.maximum(lower_limit_intconf_matrix_10_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_10_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_10_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_10_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_10_01_35.npy', lower_limit_intconf_matrix_10_01_35)
np.save('upper_limit_intconf_matrix_10_01_35.npy', upper_limit_intconf_matrix_10_01_35)

In [25]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry is 0 and its largest is 1.

    A matrix whose entries are all equal has no range to scale by; it is
    returned as all zeros instead of the NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        return np.zeros_like(matrix, dtype=float)
    return (matrix - lowest) / value_range

# Min-max scale both confidence-interval bounds, then cache them
norm_lower_limit_intconf_matrix_10_01_35 = normalize_matrix(
    lower_limit_intconf_matrix_10_01_35
)
norm_upper_limit_intconf_matrix_10_01_35 = normalize_matrix(
    upper_limit_intconf_matrix_10_01_35
)
np.save(
    'norm_lower_limit_intconf_matrix_10_01_35.npy',
    norm_lower_limit_intconf_matrix_10_01_35,
)
np.save(
    'norm_upper_limit_intconf_matrix_10_01_35.npy',
    norm_upper_limit_intconf_matrix_10_01_35,
)

In [None]:
# One row of three heatmaps: lower bound | mean | upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# Left panel: normalized lower confidence bound
sns.heatmap(
    norm_lower_limit_intconf_matrix_10_01_35,
    ax=axes[0], annot=True, fmt=".2f", cmap="viridis", linewidths=0.5,
)
axes[0].set(
    title="Normalized Lower bound Dist. Matrix (k=10, n_neighbors=10, min_dist=0.1)",
    xlabel="Cluster",
    ylabel="",
)

# Middle panel: normalized mean distances (the only panel with a y label)
sns.heatmap(
    normalized_mean_distance_matrix_10_01_35,
    ax=axes[1], annot=True, fmt=".2f", cmap="viridis", linewidths=0.5,
)
axes[1].set(
    title="Normalized Mean Dist. Matrix (k=10, n_neighbors=10, min_dist=0.1)",
    xlabel="Cluster",
    ylabel="Cluster",
)

# Right panel: normalized upper confidence bound
sns.heatmap(
    norm_upper_limit_intconf_matrix_10_01_35,
    ax=axes[2], annot=True, fmt=".2f", cmap="viridis", linewidths=0.5,
)
axes[2].set(
    title="Normalized Upper bound Dist. Matrix (k=10, n_neighbors=10, min_dist=0.1)",
    xlabel="Cluster",
    ylabel="",
)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of a normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of ``matrix`` on ``ax``.

    ``matrix`` is rounded to 3 decimals and read as a weighted adjacency
    matrix; ``title`` becomes the axes title and ``color`` the edge colour.
    """
    graph = nx.from_numpy_array(np.round(matrix, 3))
    tree = nx.minimum_spanning_tree(graph)

    # Fixed seed so the node placement is reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    # Tree itself
    nx.draw(
        tree, layout, ax=ax, with_labels=True, node_color='lightblue',
        edge_color=color, node_size=500, font_size=10, width=2,
    )

    # Edge weights (distances) as labels
    nx.draw_networkx_edge_labels(
        tree, layout, ax=ax, edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8, label_pos=0.3,
    )

    ax.set_title(title)

# One row of three panels: lower bound (left), mean (middle), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# MST of the mean distances goes in the middle panel
plot_mst(
    normalized_mean_distance_matrix_10_01_35,
    title="UMAP MST - Mean Distances - n_neighbors=10, min_dist=0.1",
    ax=axes[1],
    color='red',
)
# MST of the lower confidence bound goes on the left
plot_mst(
    norm_lower_limit_intconf_matrix_10_01_35,
    title="UMAP MST - Lower Limit- n_neighbors=10, min_dist=0.1",
    ax=axes[0],
    color='blue',
)
# MST of the upper confidence bound goes on the right
plot_mst(
    norm_upper_limit_intconf_matrix_10_01_35,
    title="UMAP MST - Upper Limit- n_neighbors=10, min_dist=0.1",
    ax=axes[2],
    color='green',
)

# Tighten the spacing between panels before showing
plt.tight_layout()
plt.show()

### n=20

In [9]:
# Reload the cached results of the n_neighbors=20 / min_dist=0.1 / 35-run experiment.

# Embeddings and their point-wise mean / standard deviation
umap_projections_20_01_35 = np.load('umap_projections_20_01_35.npy')
mean_projection_20_01_35 = np.load('mean_projection_20_01_35.npy')
std_projection_20_01_35 = np.load('std_projection_20_01_35.npy')

# Confidence-interval bounds of the centroid distances
lower_limit_intconf_matrix_20_01_35 = np.load('lower_limit_intconf_matrix_20_01_35.npy')
upper_limit_intconf_matrix_20_01_35 = np.load('upper_limit_intconf_matrix_20_01_35.npy')

# Per-run centroid distance matrices and their mean
distance_matrices_20_01_35 = np.load('distance_matrices_neighbors_20_01_35.npy')
mean_distance_matrix_20_01_35 = np.load('mean_distance_matrix_neighbors_20_01_35.npy')

# Min-max normalized confidence-interval bounds
norm_lower_limit_intconf_matrix_20_01_35 = np.load('norm_lower_limit_intconf_matrix_20_01_35.npy')
norm_upper_limit_intconf_matrix_20_01_35 = np.load('norm_upper_limit_intconf_matrix_20_01_35.npy')

In [None]:
# Experiment configuration: UMAP hyper-parameters and number of repetitions
n_neighbors, min_dist, n_components = 20, 0.1, 2
n_runs = 35  # Number of runs

# One embedding per repetition is collected here
umap_projections_20_01_35 = []

# Repeat the embedding n_runs times; random_state=None passes no fixed seed
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_20_01_35.append(projection)

# Stack the per-run embeddings along a new leading (run) axis
umap_projections_20_01_35 = np.asarray(umap_projections_20_01_35)

# Point-wise mean and standard deviation over the run axis
mean_projection_20_01_35 = umap_projections_20_01_35.mean(axis=0)
std_projection_20_01_35 = umap_projections_20_01_35.std(axis=0)

# Cache the embeddings and their summary statistics on disk
np.save('umap_projections_20_01_35.npy', umap_projections_20_01_35)
np.save('mean_projection_20_01_35.npy', mean_projection_20_01_35)
np.save('std_projection_20_01_35.npy', std_projection_20_01_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_20_01_35'.")

In [None]:
### USE THIS ###

# Run-level outlier filter: rather than thresholding individual points, each
# run is summarised by its average distance to the mean projection and kept
# only if that average does not exceed the 90th percentile over all runs.

# Euclidean distance of every point, in every run, to its mean position
distances_to_mean_20_01_35 = np.sqrt(
    np.square(umap_projections_20_01_35 - mean_projection_20_01_35[np.newaxis, :, :]).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One summary value per run
average_distances_per_run = distances_to_mean_20_01_35.mean(axis=1)  # Shape: (n_runs,)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off
valid_runs = [
    run_index
    for run_index, run_average in enumerate(average_distances_per_run)
    if run_average <= average_distance_threshold
]

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Centroid of every label in every valid run, then the pairwise centroid
# distances per run and their average across runs.
cluster_centroids_per_run = []

for run in valid_runs:
    # Embedding produced by this run
    projections = umap_projections_20_01_35[run]  # Shape: (n_samples, 2)

    # Mean position of the points carrying each label 0-9 in y_train
    centroids = []
    for cluster_label in range(10):
        cluster_points = projections[y_train == cluster_label]
        centroid = cluster_points.mean(axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.asarray(centroids))

# One (10, 10) matrix of centroid-to-centroid distances per valid run
distance_matrices_20_01_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids)  # cdist defaults to the Euclidean metric
    distance_matrices_20_01_35.append(distance_matrix)

# Stack into a single array
distance_matrices_20_01_35 = np.asarray(distance_matrices_20_01_35)  # Shape: (n_valid_runs, 10, 10)

# Average over the run axis
mean_distance_matrix_20_01_35 = distance_matrices_20_01_35.mean(axis=0)  # Shape: (10, 10)

# Min-max scale the averaged matrix to [0, 1]
normalized_mean_distance_matrix_20_01_35 = (
    mean_distance_matrix_20_01_35 - mean_distance_matrix_20_01_35.min()
) / (mean_distance_matrix_20_01_35.max() - mean_distance_matrix_20_01_35.min())

# Heatmap of the normalized matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_20_01_35,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=20, min_dist=0.1)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Cache the per-run matrices and their mean
np.save('distance_matrices_neighbors_20_01_35.npy', distance_matrices_20_01_35)
np.save('mean_distance_matrix_neighbors_20_01_35.npy', mean_distance_matrix_20_01_35)

# Show the (un-normalized) mean matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_20_01_35}")

In [None]:
# Build a weighted graph from the normalized mean centroid-distance matrix
# (entries rounded to 3 decimals so the edge labels stay readable).
G_20_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_20_01_35, 3))

# Persist the graph as its weighted adjacency matrix. Handing the Graph object
# itself to np.save is not a faithful serialization (edges and weights are not
# written); the matrix can be turned back into a graph with
# nx.from_numpy_array(np.load('G_20_01_35.npy')).
np.save('G_20_01_35.npy', nx.to_numpy_array(G_20_01_35, nodelist=sorted(G_20_01_35.nodes)))

# Draw the graph; the fixed seed keeps the layout reproducible
pos = nx.spring_layout(G_20_01_35, seed=42)  # positions for all nodes
nx.draw(G_20_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_20_01_35, 'weight')
nx.draw_networkx_edge_labels(G_20_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_20_01_35 = nx.minimum_spanning_tree(G_20_01_35)

# Persist the tree as its weighted adjacency matrix. Handing the Graph object
# itself to np.save is not a faithful serialization (edges and weights are not
# written); reload with nx.from_numpy_array(np.load('mst_20_01_35.npy')).
np.save('mst_20_01_35.npy', nx.to_numpy_array(mst_20_01_35, nodelist=sorted(mst_20_01_35.nodes)))

# Define positions for all nodes; the fixed seed keeps the layout reproducible
pos = nx.spring_layout(mst_20_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_20_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_20_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_20_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=20, min_dist=0.1")
plt.show()

In [None]:
# Spread of each centroid-pair distance over the run axis (axis 0)
distance_matrix_std_20_01_35 = distance_matrices_20_01_35.std(axis=0)  # Shape: (n_clusters, n_clusters)

# Cache the result
np.save("distance_matrix_std_20_01_35.npy", distance_matrix_std_20_01_35)

# Show it
print("Standard Deviation Distance Matrix (20_01_35):\n", distance_matrix_std_20_01_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
n_runs = 35  # Number of UMAP runs launched (kept for the other cells)

# The distance matrices are stacked along axis 0, one per run that survived
# the outlier filter, so the sample size behind the mean/std is the length of
# that axis rather than n_runs.
n_valid_runs = distance_matrices_20_01_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_20_01_35 = distance_matrix_std_20_01_35 / np.sqrt(n_valid_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_20_01_35 = z_score * sem_matrix_20_01_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_20_01_35 = mean_distance_matrix_20_01_35 - margin_of_error_matrix_20_01_35
upper_limit_intconf_matrix_20_01_35 = mean_distance_matrix_20_01_35 + margin_of_error_matrix_20_01_35

# Distances cannot be negative, so clip the lower bound at 0
lower_limit_intconf_matrix_20_01_35 = np.maximum(lower_limit_intconf_matrix_20_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_20_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_20_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_20_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_20_01_35.npy', lower_limit_intconf_matrix_20_01_35)
np.save('upper_limit_intconf_matrix_20_01_35.npy', upper_limit_intconf_matrix_20_01_35)

In [37]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry is 0 and its largest is 1.

    A matrix whose entries are all equal has no range to scale by; it is
    returned as all zeros instead of the NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        return np.zeros_like(matrix, dtype=float)
    return (matrix - lowest) / value_range

# Min-max scale both confidence-interval bounds, then cache them
norm_lower_limit_intconf_matrix_20_01_35 = normalize_matrix(
    lower_limit_intconf_matrix_20_01_35
)
norm_upper_limit_intconf_matrix_20_01_35 = normalize_matrix(
    upper_limit_intconf_matrix_20_01_35
)
np.save(
    'norm_lower_limit_intconf_matrix_20_01_35.npy',
    norm_lower_limit_intconf_matrix_20_01_35,
)
np.save(
    'norm_upper_limit_intconf_matrix_20_01_35.npy',
    norm_upper_limit_intconf_matrix_20_01_35,
)

In [None]:
# One row of three heatmaps: lower bound | mean | upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# Left panel: normalized lower confidence bound
sns.heatmap(
    norm_lower_limit_intconf_matrix_20_01_35,
    ax=axes[0], annot=True, fmt=".2f", cmap="viridis", linewidths=0.5,
)
axes[0].set(
    title="Normalized Lower bound Dist. Matrix (k=10, n_neighbors=20, min_dist=0.1)",
    xlabel="Cluster",
    ylabel="",
)

# Middle panel: normalized mean distances (the only panel with a y label)
sns.heatmap(
    normalized_mean_distance_matrix_20_01_35,
    ax=axes[1], annot=True, fmt=".2f", cmap="viridis", linewidths=0.5,
)
axes[1].set(
    title="Normalized Mean Dist. Matrix (k=10, n_neighbors=20, min_dist=0.1)",
    xlabel="Cluster",
    ylabel="Cluster",
)

# Right panel: normalized upper confidence bound
sns.heatmap(
    norm_upper_limit_intconf_matrix_20_01_35,
    ax=axes[2], annot=True, fmt=".2f", cmap="viridis", linewidths=0.5,
)
axes[2].set(
    title="Normalized Upper bound Dist. Matrix (k=10, n_neighbors=20, min_dist=0.1)",
    xlabel="Cluster",
    ylabel="",
)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of a normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of ``matrix`` on ``ax``.

    ``matrix`` is rounded to 3 decimals and read as a weighted adjacency
    matrix; ``title`` becomes the axes title and ``color`` the edge colour.
    """
    graph = nx.from_numpy_array(np.round(matrix, 3))
    tree = nx.minimum_spanning_tree(graph)

    # Fixed seed so the node placement is reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    # Tree itself
    nx.draw(
        tree, layout, ax=ax, with_labels=True, node_color='lightblue',
        edge_color=color, node_size=500, font_size=10, width=2,
    )

    # Edge weights (distances) as labels
    nx.draw_networkx_edge_labels(
        tree, layout, ax=ax, edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8, label_pos=0.3,
    )

    ax.set_title(title)

# One row of three panels: lower bound (left), mean (middle), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# MST of the mean distances goes in the middle panel
plot_mst(
    normalized_mean_distance_matrix_20_01_35,
    title="UMAP MST - Mean Distances - n_neighbors=20, min_dist=0.1",
    ax=axes[1],
    color='red',
)
# MST of the lower confidence bound goes on the left
plot_mst(
    norm_lower_limit_intconf_matrix_20_01_35,
    title="UMAP MST - Lower Limit - n_neighbors=20, min_dist=0.1",
    ax=axes[0],
    color='blue',
)
# MST of the upper confidence bound goes on the right
plot_mst(
    norm_upper_limit_intconf_matrix_20_01_35,
    title="UMAP MST - Upper Limit - n_neighbors=20, min_dist=0.1",
    ax=axes[2],
    color='green',
)

# Tighten the spacing between panels before showing
plt.tight_layout()
plt.show()

### n=30

In [10]:
# Reload the cached results of the n_neighbors=30 / min_dist=0.1 / 35-run experiment.

# Embeddings and their point-wise mean / standard deviation
umap_projections_30_01_35 = np.load('umap_projections_30_01_35.npy')
mean_projection_30_01_35 = np.load('mean_projection_30_01_35.npy')
std_projection_30_01_35 = np.load('std_projection_30_01_35.npy')

# Confidence-interval bounds of the centroid distances
lower_limit_intconf_matrix_30_01_35 = np.load('lower_limit_intconf_matrix_30_01_35.npy')
upper_limit_intconf_matrix_30_01_35 = np.load('upper_limit_intconf_matrix_30_01_35.npy')

# Per-run centroid distance matrices and their mean
distance_matrices_30_01_35 = np.load('distance_matrices_neighbors_30_01_35.npy')
mean_distance_matrix_30_01_35 = np.load('mean_distance_matrix_neighbors_30_01_35.npy')

# Min-max normalized confidence-interval bounds
norm_lower_limit_intconf_matrix_30_01_35 = np.load('norm_lower_limit_intconf_matrix_30_01_35.npy')
norm_upper_limit_intconf_matrix_30_01_35 = np.load('norm_upper_limit_intconf_matrix_30_01_35.npy')

In [None]:
# Experiment configuration: UMAP hyper-parameters and number of repetitions
n_neighbors, min_dist, n_components = 30, 0.1, 2
n_runs = 35  # Number of runs

# One embedding per repetition is collected here
umap_projections_30_01_35 = []

# Repeat the embedding n_runs times; random_state=None passes no fixed seed
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_30_01_35.append(projection)

# Stack the per-run embeddings along a new leading (run) axis
umap_projections_30_01_35 = np.asarray(umap_projections_30_01_35)

# Point-wise mean and standard deviation over the run axis
mean_projection_30_01_35 = umap_projections_30_01_35.mean(axis=0)
std_projection_30_01_35 = umap_projections_30_01_35.std(axis=0)

# Cache the embeddings and their summary statistics on disk
np.save('umap_projections_30_01_35.npy', umap_projections_30_01_35)
np.save('mean_projection_30_01_35.npy', mean_projection_30_01_35)
np.save('std_projection_30_01_35.npy', std_projection_30_01_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_30_01_35'.")

In [None]:
### USE THIS ###

# Run-level outlier filter: rather than thresholding individual points, each
# run is summarised by its average distance to the mean projection and kept
# only if that average does not exceed the 90th percentile over all runs.

# Euclidean distance of every point, in every run, to its mean position
distances_to_mean_30_01_35 = np.sqrt(
    np.square(umap_projections_30_01_35 - mean_projection_30_01_35[np.newaxis, :, :]).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One summary value per run
average_distances_per_run = distances_to_mean_30_01_35.mean(axis=1)  # Shape: (n_runs,)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off
valid_runs = [
    run_index
    for run_index, run_average in enumerate(average_distances_per_run)
    if run_average <= average_distance_threshold
]

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Centroid of every label in every valid run, then the pairwise centroid
# distances per run and their average across runs.
cluster_centroids_per_run = []

for run in valid_runs:
    # Embedding produced by this run
    projections = umap_projections_30_01_35[run]  # Shape: (n_samples, 2)

    # Mean position of the points carrying each label 0-9 in y_train
    centroids = []
    for cluster_label in range(10):
        cluster_points = projections[y_train == cluster_label]
        centroid = cluster_points.mean(axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.asarray(centroids))

# One (10, 10) matrix of centroid-to-centroid distances per valid run
distance_matrices_30_01_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids)  # cdist defaults to the Euclidean metric
    distance_matrices_30_01_35.append(distance_matrix)

# Stack into a single array
distance_matrices_30_01_35 = np.asarray(distance_matrices_30_01_35)  # Shape: (n_valid_runs, 10, 10)

# Average over the run axis
mean_distance_matrix_30_01_35 = distance_matrices_30_01_35.mean(axis=0)  # Shape: (10, 10)

# Min-max scale the averaged matrix to [0, 1]
normalized_mean_distance_matrix_30_01_35 = (
    mean_distance_matrix_30_01_35 - mean_distance_matrix_30_01_35.min()
) / (mean_distance_matrix_30_01_35.max() - mean_distance_matrix_30_01_35.min())

# Heatmap of the normalized matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_30_01_35,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10,  n_neighbors=30, min_dist=0.1)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Cache the per-run matrices and their mean
np.save('distance_matrices_neighbors_30_01_35.npy', distance_matrices_30_01_35)
np.save('mean_distance_matrix_neighbors_30_01_35.npy', mean_distance_matrix_30_01_35)

# Show the (un-normalized) mean matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_30_01_35}")

In [None]:
# Build a weighted graph from the normalized mean centroid-distance matrix
# (entries rounded to 3 decimals so the edge labels stay readable).
G_30_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_30_01_35, 3))

# Persist the graph as its weighted adjacency matrix. Handing the Graph object
# itself to np.save is not a faithful serialization (edges and weights are not
# written); the matrix can be turned back into a graph with
# nx.from_numpy_array(np.load('G_30_01_35.npy')).
np.save('G_30_01_35.npy', nx.to_numpy_array(G_30_01_35, nodelist=sorted(G_30_01_35.nodes)))

# Draw the graph; the fixed seed keeps the layout reproducible
pos = nx.spring_layout(G_30_01_35, seed=42)  # positions for all nodes
nx.draw(G_30_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_30_01_35, 'weight')
nx.draw_networkx_edge_labels(G_30_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_30_01_35 = nx.minimum_spanning_tree(G_30_01_35)

# Persist the tree as its weighted adjacency matrix. Handing the Graph object
# itself to np.save is not a faithful serialization (edges and weights are not
# written); reload with nx.from_numpy_array(np.load('mst_30_01_35.npy')).
np.save('mst_30_01_35.npy', nx.to_numpy_array(mst_30_01_35, nodelist=sorted(mst_30_01_35.nodes)))

# Define positions for all nodes; the fixed seed keeps the layout reproducible
pos = nx.spring_layout(mst_30_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_30_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_30_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_30_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=30, min_dist=0.1")
plt.show()

In [None]:
# Spread of each centroid-pair distance over the run axis (axis 0)
distance_matrix_std_30_01_35 = distance_matrices_30_01_35.std(axis=0)  # Shape: (n_clusters, n_clusters)

# Cache the result
np.save("distance_matrix_std_30_01_35.npy", distance_matrix_std_30_01_35)

# Show it
print("Standard Deviation Distance Matrix (30_01_35):\n", distance_matrix_std_30_01_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
n_runs = 35  # Number of UMAP runs launched (kept for the other cells)

# The distance matrices are stacked along axis 0, one per run that survived
# the outlier filter, so the sample size behind the mean/std is the length of
# that axis rather than n_runs.
n_valid_runs = distance_matrices_30_01_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_30_01_35 = distance_matrix_std_30_01_35 / np.sqrt(n_valid_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_30_01_35 = z_score * sem_matrix_30_01_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_30_01_35 = mean_distance_matrix_30_01_35 - margin_of_error_matrix_30_01_35
upper_limit_intconf_matrix_30_01_35 = mean_distance_matrix_30_01_35 + margin_of_error_matrix_30_01_35

# Distances cannot be negative, so clip the lower bound at 0
lower_limit_intconf_matrix_30_01_35 = np.maximum(lower_limit_intconf_matrix_30_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_30_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_30_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_30_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_30_01_35.npy', lower_limit_intconf_matrix_30_01_35)
np.save('upper_limit_intconf_matrix_30_01_35.npy', upper_limit_intconf_matrix_30_01_35)

In [47]:
def normalize_matrix(matrix):
    """Min-max scale *matrix* so its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. When every entry is equal there is no
        range to scale by, so an all-zero array of the same shape is returned
        instead of the NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: avoid dividing by zero.
        return np.zeros(matrix.shape)
    return (matrix - lowest) / span

# Lower confidence bound: rescale with normalize_matrix, then store it.
norm_lower_limit_intconf_matrix_30_01_35 = normalize_matrix(lower_limit_intconf_matrix_30_01_35)
np.save('norm_lower_limit_intconf_matrix_30_01_35.npy', norm_lower_limit_intconf_matrix_30_01_35)

# Upper confidence bound: same treatment, scaled independently of the lower bound.
norm_upper_limit_intconf_matrix_30_01_35 = normalize_matrix(upper_limit_intconf_matrix_30_01_35)
np.save('norm_upper_limit_intconf_matrix_30_01_35.npy', norm_upper_limit_intconf_matrix_30_01_35)

In [None]:
# One figure with three side-by-side panels: lower bound, mean, upper bound.
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right. Only the middle
# panel carries a y-axis label.
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_30_01_35,
     "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=30, min_dist=0.1)", ""),
    (normalized_mean_distance_matrix_30_01_35,
     "Normalized Mean Dist. Matrix (k=10, n_neighbors=30, min_dist=0.1)", "Cluster"),
    (norm_upper_limit_intconf_matrix_30_01_35,
     "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=30, min_dist=0.1)", ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on a Matplotlib axis.

    Parameters
    ----------
    matrix : array-like
        Square matrix of pairwise distances; it is rounded to 3 decimals
        before the graph is built.
    title : str
        Title placed on ``ax``.
    ax : matplotlib axes
        Axis that receives the drawing.
    color : str, optional
        Colour of the tree edges (default ``'red'``).
    """
    # Graph from the rounded matrix, reduced straight to its spanning tree.
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed so repeated calls lay out the same tree identically.
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        ax=ax,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
    )

    # Annotate each edge with its weight (the rounded distance).
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Three panels: lower limit (left), mean (centre), upper limit (right).
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, panel index, edge colour), drawn in this order.
mst_panels = [
    (normalized_mean_distance_matrix_30_01_35, "UMAP MST - Mean Distances - n_neighbors=30, min_dist=0.1", 1, 'red'),
    (norm_lower_limit_intconf_matrix_30_01_35, "UMAP MST - Lower Limit - n_neighbors=30, min_dist=0.1", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_30_01_35, "UMAP MST - Upper Limit - n_neighbors=30, min_dist=0.1", 2, 'green'),
]
for mst_matrix, mst_title, mst_panel_index, mst_color in mst_panels:
    plot_mst(mst_matrix, mst_title, axes[mst_panel_index], color=mst_color)

plt.tight_layout()
plt.show()

### n=50

In [2]:
# Reload the cached n_neighbors=50 artefacts that the cells below write with
# np.save. The distance-matrix files carry an extra "neighbors" infix in their
# file names.
umap_projections_50_01_35 = np.load('umap_projections_50_01_35.npy')
mean_projection_50_01_35 = np.load('mean_projection_50_01_35.npy')
std_projection_50_01_35 = np.load('std_projection_50_01_35.npy')

lower_limit_intconf_matrix_50_01_35 = np.load('lower_limit_intconf_matrix_50_01_35.npy')
upper_limit_intconf_matrix_50_01_35 = np.load('upper_limit_intconf_matrix_50_01_35.npy')

distance_matrices_50_01_35 = np.load('distance_matrices_neighbors_50_01_35.npy')
mean_distance_matrix_50_01_35 = np.load('mean_distance_matrix_neighbors_50_01_35.npy')

norm_lower_limit_intconf_matrix_50_01_35 = np.load('norm_lower_limit_intconf_matrix_50_01_35.npy')
norm_upper_limit_intconf_matrix_50_01_35 = np.load('norm_upper_limit_intconf_matrix_50_01_35.npy')

In [None]:
# UMAP settings for this sweep
n_neighbors = 50
min_dist = 0.1
n_components = 2
n_runs = 35  # how many independent embeddings to compute

# One embedding per run is collected here.
umap_projections_50_01_35 = []

for run in range(n_runs):
    # No fixed seed is passed (random_state=None), so every run builds a fresh,
    # unseeded model.
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )

    # Embed the flattened training images and keep the result.
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_50_01_35.append(projection)

# Stack the runs along a new leading axis.
umap_projections_50_01_35 = np.array(umap_projections_50_01_35)

# Point-wise mean and standard deviation over the run axis.
mean_projection_50_01_35 = umap_projections_50_01_35.mean(axis=0)
std_projection_50_01_35 = umap_projections_50_01_35.std(axis=0)

# Write all three arrays to disk.
np.save('umap_projections_50_01_35.npy', umap_projections_50_01_35)
np.save('mean_projection_50_01_35.npy', mean_projection_50_01_35)
np.save('std_projection_50_01_35.npy', std_projection_50_01_35)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_50_01_35'.")

In [None]:
### USE THIS ###

# Run-level filter: rather than thresholding individual points, score each run
# by its average point-to-mean distance and keep the runs whose score does not
# exceed the 90th percentile of all run scores.

# Euclidean distance of every point in every run to the mean projection.
distances_to_mean_50_01_35 = np.sqrt(
    ((umap_projections_50_01_35 - mean_projection_50_01_35[None, :, :]) ** 2).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One score per run: the mean of its point distances.
average_distances_per_run = distances_to_mean_50_01_35.mean(axis=1)  # Shape: (n_runs,)

# Cut-off taken from the distribution of the run scores themselves.
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off.
valid_runs = [
    run_index
    for run_index, run_average in enumerate(average_distances_per_run)
    if run_average <= average_distance_threshold
]

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

------

In [39]:
# KMeans centroids of every n_neighbors=50 run.
n_clusters = 10

# One entry per stored UMAP run.
n_runs = len(umap_projections_50_01_35)

# Output buffer: (run, cluster, embedding dimension).
kmeans_centroids_50_01 = np.zeros((n_runs, n_clusters, umap_projections_50_01_35.shape[2]))

# Fit an independent KMeans model on each run's embedding.
# NOTE(review): each run is clustered separately, so nothing here matches
# cluster index i of one run to cluster index i of another -- confirm before
# comparing centroids by index across runs.
for run, umap_projection in enumerate(umap_projections_50_01_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_50_01[run] = kmeans.cluster_centers_

In [None]:
# Per-cluster spread of the KMeans centroid coordinates over all runs.
std_dev_x_50_01 = np.zeros(10)
std_dev_y_50_01 = np.zeros(10)

for i in range(10):
    # Coordinates of centroid i in every run: first axis is the run,
    # last axis selects x (0) or y (1).
    cluster_x_coords, cluster_y_coords = kmeans_centroids_50_01[:, i, 0], kmeans_centroids_50_01[:, i, 1]

    std_dev_x_50_01[i] = cluster_x_coords.std()
    std_dev_y_50_01[i] = cluster_y_coords.std()

print("Standard deviation of x coordinates per cluster:", std_dev_x_50_01)
print("Standard deviation of y coordinates per cluster:", std_dev_y_50_01)

------

In [None]:
# Class centroids (one per label 0-9) for every run that passed the run-level filter.
cluster_centroids_per_run = []

for run in valid_runs:
    # Embedding of this run.
    projections = umap_projections_50_01_35[run]  # Shape: (n_samples, 2)

    # Mean position of the points carrying each label.
    centroids = []
    for cluster_label in range(10):
        cluster_points = projections[y_train == cluster_label]
        centroid = cluster_points.mean(axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.array(centroids))

# Euclidean centroid-to-centroid distances, one matrix per kept run.
distance_matrices_50_01_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    distance_matrices_50_01_35.append(distance_matrix)

# Stack the per-run matrices along a leading run axis.
distance_matrices_50_01_35 = np.array(distance_matrices_50_01_35)  # Shape: (n_valid_runs, 10, 10)

# Average over the run axis.
mean_distance_matrix_50_01_35 = distance_matrices_50_01_35.mean(axis=0)  # Shape: (10, 10)

# Min-max rescale the mean matrix (np.ptp is max - min).
normalized_mean_distance_matrix_50_01_35 = (
    mean_distance_matrix_50_01_35 - np.min(mean_distance_matrix_50_01_35)
) / np.ptp(mean_distance_matrix_50_01_35)

# Heatmap of the rescaled mean distances.
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_50_01_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=50, min_dist=0.1)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run stack and its mean.
np.save('distance_matrices_neighbors_50_01_35.npy', distance_matrices_50_01_35)
np.save('mean_distance_matrix_neighbors_50_01_35.npy', mean_distance_matrix_50_01_35)

print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_50_01_35}")

In [None]:
# Weighted graph built from the normalized mean distance matrix (rounded to
# 3 decimals); the matrix entries become the edge weights.
G_50_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_50_01_35,3))

# np.save converts its argument to an array first, and a networkx Graph is not
# array data, so passing the Graph itself does not store the edge weights in a
# usable form. Save the weighted adjacency matrix instead; it can be turned
# back into the graph with nx.from_numpy_array(np.load('G_50_01_35.npy')).
np.save('G_50_01_35.npy', nx.to_numpy_array(G_50_01_35))

# Draw the graph
pos = nx.spring_layout(G_50_01_35, seed=42)  # positions for all nodes (fixed seed for a repeatable layout)
nx.draw(G_50_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_50_01_35, 'weight')
nx.draw_networkx_edge_labels(G_50_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [3]:
# Min-max rescale the mean distance matrix: subtract the smallest entry and
# divide by the range (np.ptp is max - min).
normalized_mean_distance_matrix_50_01_35 = (
    mean_distance_matrix_50_01_35 - np.min(mean_distance_matrix_50_01_35)
) / np.ptp(mean_distance_matrix_50_01_35)

In [None]:
# Compute the minimum spanning tree of the graph. This has to happen before
# the total weight is calculated: on a fresh kernel the tree does not exist
# yet, and on a re-run a stale tree from an earlier execution would be summed.
mst_50_01_35 = nx.minimum_spanning_tree(G_50_01_35)

# np.save converts its argument to an array first, and a networkx Graph is not
# array data, so passing the Graph itself does not store the edge weights in a
# usable form. Save the tree's weighted adjacency matrix instead; reload it
# with nx.from_numpy_array(np.load('mst_50_01_35.npy')).
np.save('mst_50_01_35.npy', nx.to_numpy_array(mst_50_01_35))

# Calculate the total weight of the MST
total_weight_50 = sum(nx.get_edge_attributes(mst_50_01_35, 'weight').values())

# Print the total weight
print(f"Total weight of the MST: {total_weight_50}")

# Define positions for all nodes (fixed seed for a repeatable layout)
pos = nx.spring_layout(mst_50_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_50_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_50_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_50_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=50, min_dist=0.1")
plt.show()

In [None]:
# Spread of every centroid-to-centroid distance across runs: reduce over the
# run axis (axis 0) so one standard deviation remains per pair of clusters.
distance_matrix_std_50_01_35 = np.asanyarray(distance_matrices_50_01_35).std(axis=0)  # Shape: (n_clusters, n_clusters)

# Keep a copy on disk so the matrix can be reloaded without recomputing it.
np.save("distance_matrix_std_50_01_35.npy", distance_matrix_std_50_01_35)

# Display the matrix.
print("Standard Deviation Distance Matrix (50_01_35):\n", distance_matrix_std_50_01_35)


In [None]:
# 95% confidence interval (normal approximation) for the mean inter-centroid
# distances of the n_neighbors=50, min_dist=0.1 sweep.
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Sample size for the SEM. The distance matrices are built from `valid_runs`
# only, so the mean and standard deviation are taken over fewer than the 35
# UMAP runs; use the real length of the run axis rather than a hard-coded 35,
# which would make the interval too narrow.
n_runs = len(distance_matrices_50_01_35)

# Step 1: Standard Error of the Mean (SEM).
# NOTE(review): distance_matrix_std_50_01_35 is computed in this notebook with
# np.std's default ddof=0 -- confirm whether the sample standard deviation
# (ddof=1) is what is wanted for a confidence interval.
sem_matrix_50_01_35 = distance_matrix_std_50_01_35 / np.sqrt(n_runs)

# Step 2: Margin of error = critical value * SEM.
margin_of_error_matrix_50_01_35 = z_score * sem_matrix_50_01_35

# Step 3: Lower and upper confidence limits around the mean distance matrix.
lower_limit_intconf_matrix_50_01_35 = mean_distance_matrix_50_01_35 - margin_of_error_matrix_50_01_35
upper_limit_intconf_matrix_50_01_35 = mean_distance_matrix_50_01_35 + margin_of_error_matrix_50_01_35

# Distances cannot be negative, so clip the lower limit at zero.
lower_limit_intconf_matrix_50_01_35 = np.maximum(lower_limit_intconf_matrix_50_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_50_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_50_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_50_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_50_01_35.npy', lower_limit_intconf_matrix_50_01_35)
np.save('upper_limit_intconf_matrix_50_01_35.npy', upper_limit_intconf_matrix_50_01_35)

In [57]:
def normalize_matrix(matrix):
    """Min-max scale *matrix* so its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. When every entry is equal there is no
        range to scale by, so an all-zero array of the same shape is returned
        instead of the NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: avoid dividing by zero.
        return np.zeros(matrix.shape)
    return (matrix - lowest) / span

# Lower confidence bound: rescale with normalize_matrix, then store it.
norm_lower_limit_intconf_matrix_50_01_35 = normalize_matrix(lower_limit_intconf_matrix_50_01_35)
np.save('norm_lower_limit_intconf_matrix_50_01_35.npy', norm_lower_limit_intconf_matrix_50_01_35)

# Upper confidence bound: same treatment, scaled independently of the lower bound.
norm_upper_limit_intconf_matrix_50_01_35 = normalize_matrix(upper_limit_intconf_matrix_50_01_35)
np.save('norm_upper_limit_intconf_matrix_50_01_35.npy', norm_upper_limit_intconf_matrix_50_01_35)

In [None]:
# One figure with three side-by-side panels: lower bound, mean, upper bound.
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right. Only the middle
# panel carries a y-axis label.
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_50_01_35,
     "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=50, min_dist=0.1)", ""),
    (normalized_mean_distance_matrix_50_01_35,
     "Normalized Mean Dist. Matrix (k=10, n_neighbors=50, min_dist=0.1)", "Cluster"),
    (norm_upper_limit_intconf_matrix_50_01_35,
     "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=50, min_dist=0.1)", ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on a Matplotlib axis.

    Parameters
    ----------
    matrix : array-like
        Square matrix of pairwise distances; it is rounded to 3 decimals
        before the graph is built.
    title : str
        Title placed on ``ax``.
    ax : matplotlib axes
        Axis that receives the drawing.
    color : str, optional
        Colour of the tree edges (default ``'red'``).
    """
    # Graph from the rounded matrix, reduced straight to its spanning tree.
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed so repeated calls lay out the same tree identically.
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        ax=ax,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
    )

    # Annotate each edge with its weight (the rounded distance).
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Three panels: lower limit (left), mean (centre), upper limit (right).
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, panel index, edge colour), drawn in this order.
mst_panels = [
    (normalized_mean_distance_matrix_50_01_35, "UMAP MST - Mean Distances - n_neighbors=50, min_dist=0.1", 1, 'red'),
    (norm_lower_limit_intconf_matrix_50_01_35, "UMAP MST - Lower Limit - n_neighbors=50, min_dist=0.1", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_50_01_35, "UMAP MST - Upper Limit - n_neighbors=50, min_dist=0.1", 2, 'green'),
]
for mst_matrix, mst_title, mst_panel_index, mst_color in mst_panels:
    plot_mst(mst_matrix, mst_title, axes[mst_panel_index], color=mst_color)

plt.tight_layout()
plt.show()

### n=100

In [41]:
# Reload the cached n_neighbors=100 artefacts that the cells below write with
# np.save. The distance-matrix files carry an extra "neighbors" infix in their
# file names.
umap_projections_100_01_35 = np.load('umap_projections_100_01_35.npy')
mean_projection_100_01_35 = np.load('mean_projection_100_01_35.npy')
std_projection_100_01_35 = np.load('std_projection_100_01_35.npy')

lower_limit_intconf_matrix_100_01_35 = np.load('lower_limit_intconf_matrix_100_01_35.npy')
upper_limit_intconf_matrix_100_01_35 = np.load('upper_limit_intconf_matrix_100_01_35.npy')

distance_matrices_100_01_35 = np.load('distance_matrices_neighbors_100_01_35.npy')
mean_distance_matrix_100_01_35 = np.load('mean_distance_matrix_neighbors_100_01_35.npy')

norm_lower_limit_intconf_matrix_100_01_35 = np.load('norm_lower_limit_intconf_matrix_100_01_35.npy')
norm_upper_limit_intconf_matrix_100_01_35 = np.load('norm_upper_limit_intconf_matrix_100_01_35.npy')

In [None]:
# UMAP settings for this sweep
n_neighbors = 100
min_dist = 0.1
n_components = 2
n_runs = 35  # how many independent embeddings to compute

# One embedding per run is collected here.
umap_projections_100_01_35 = []

for run in range(n_runs):
    # No fixed seed is passed (random_state=None), so every run builds a fresh,
    # unseeded model.
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )

    # Embed the flattened training images and keep the result.
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_100_01_35.append(projection)

# Stack the runs along a new leading axis.
umap_projections_100_01_35 = np.array(umap_projections_100_01_35)

# Point-wise mean and standard deviation over the run axis.
mean_projection_100_01_35 = umap_projections_100_01_35.mean(axis=0)
std_projection_100_01_35 = umap_projections_100_01_35.std(axis=0)

# Write all three arrays to disk.
np.save('umap_projections_100_01_35.npy', umap_projections_100_01_35)
np.save('mean_projection_100_01_35.npy', mean_projection_100_01_35)
np.save('std_projection_100_01_35.npy', std_projection_100_01_35)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_100_01_35'.")

In [29]:
# Min-max rescale the mean distance matrix of every n_neighbors setting
# (100, 50, 30, 20, 10, 5). Each matrix is scaled by its own minimum and its
# own range (np.ptp is max - min).
normalized_mean_distance_matrix_100_01_35 = (
    mean_distance_matrix_100_01_35 - np.min(mean_distance_matrix_100_01_35)
) / np.ptp(mean_distance_matrix_100_01_35)
normalized_mean_distance_matrix_50_01_35 = (
    mean_distance_matrix_50_01_35 - np.min(mean_distance_matrix_50_01_35)
) / np.ptp(mean_distance_matrix_50_01_35)
normalized_mean_distance_matrix_30_01_35 = (
    mean_distance_matrix_30_01_35 - np.min(mean_distance_matrix_30_01_35)
) / np.ptp(mean_distance_matrix_30_01_35)
normalized_mean_distance_matrix_20_01_35 = (
    mean_distance_matrix_20_01_35 - np.min(mean_distance_matrix_20_01_35)
) / np.ptp(mean_distance_matrix_20_01_35)
normalized_mean_distance_matrix_10_01_35 = (
    mean_distance_matrix_10_01_35 - np.min(mean_distance_matrix_10_01_35)
) / np.ptp(mean_distance_matrix_10_01_35)
normalized_mean_distance_matrix_5_01_35 = (
    mean_distance_matrix_5_01_35 - np.min(mean_distance_matrix_5_01_35)
) / np.ptp(mean_distance_matrix_5_01_35)

In [None]:
### USE THIS ###

# Run-level filter: rather than thresholding individual points, score each run
# by its average point-to-mean distance and keep the runs whose score does not
# exceed the 90th percentile of all run scores.

# Euclidean distance of every point in every run to the mean projection.
distances_to_mean_100_01_35 = np.sqrt(
    ((umap_projections_100_01_35 - mean_projection_100_01_35[None, :, :]) ** 2).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One score per run: the mean of its point distances.
average_distances_per_run = distances_to_mean_100_01_35.mean(axis=1)  # Shape: (n_runs,)

# Cut-off taken from the distribution of the run scores themselves.
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off.
valid_runs = [
    run_index
    for run_index, run_average in enumerate(average_distances_per_run)
    if run_average <= average_distance_threshold
]

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Class centroids (one per label 0-9) for every run that passed the run-level filter.
cluster_centroids_per_run = []

for run in valid_runs:
    # Embedding of this run.
    projections = umap_projections_100_01_35[run]  # Shape: (n_samples, 2)

    # Mean position of the points carrying each label.
    centroids = []
    for cluster_label in range(10):
        cluster_points = projections[y_train == cluster_label]
        centroid = cluster_points.mean(axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.array(centroids))

# Euclidean centroid-to-centroid distances, one matrix per kept run.
distance_matrices_100_01_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    distance_matrices_100_01_35.append(distance_matrix)

# Stack the per-run matrices along a leading run axis.
distance_matrices_100_01_35 = np.array(distance_matrices_100_01_35)  # Shape: (n_valid_runs, 10, 10)

# Average over the run axis.
mean_distance_matrix_100_01_35 = distance_matrices_100_01_35.mean(axis=0)  # Shape: (10, 10)

# Min-max rescale the mean matrix (np.ptp is max - min).
normalized_mean_distance_matrix_100_01_35 = (
    mean_distance_matrix_100_01_35 - np.min(mean_distance_matrix_100_01_35)
) / np.ptp(mean_distance_matrix_100_01_35)

# Heatmap of the rescaled mean distances.
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_100_01_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=100, min_dist=0.1)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run stack and its mean.
np.save('distance_matrices_neighbors_100_01_35.npy', distance_matrices_100_01_35)
np.save('mean_distance_matrix_neighbors_100_01_35.npy', mean_distance_matrix_100_01_35)

print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_100_01_35}")

In [None]:
# Weighted graph built from the normalized mean distance matrix (rounded to
# 3 decimals); the matrix entries become the edge weights.
G_100_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_100_01_35,3))

# np.save converts its argument to an array first, and a networkx Graph is not
# array data, so passing the Graph itself does not store the edge weights in a
# usable form. Save the weighted adjacency matrix instead; it can be turned
# back into the graph with nx.from_numpy_array(np.load('G_100_01_35.npy')).
np.save('G_100_01_35.npy', nx.to_numpy_array(G_100_01_35))

# Draw the graph
pos = nx.spring_layout(G_100_01_35, seed=42)  # positions for all nodes (fixed seed for a repeatable layout)
nx.draw(G_100_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_100_01_35, 'weight')
nx.draw_networkx_edge_labels(G_100_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [43]:
# Min-max rescale the mean distance matrix: subtract the smallest entry and
# divide by the range (np.ptp is max - min).
normalized_mean_distance_matrix_100_01_35 = (
    mean_distance_matrix_100_01_35 - np.min(mean_distance_matrix_100_01_35)
) / np.ptp(mean_distance_matrix_100_01_35)

In [None]:
# Compute the minimum spanning tree of the graph. This has to happen before
# the total weight is calculated: on a fresh kernel the tree does not exist
# yet, and on a re-run a stale tree from an earlier execution would be summed.
mst_100_01_35 = nx.minimum_spanning_tree(G_100_01_35)

# np.save converts its argument to an array first, and a networkx Graph is not
# array data, so passing the Graph itself does not store the edge weights in a
# usable form. Save the tree's weighted adjacency matrix instead; reload it
# with nx.from_numpy_array(np.load('mst_100_01_35.npy')).
np.save('mst_100_01_35.npy', nx.to_numpy_array(mst_100_01_35))

# Calculate the total weight of the MST
total_weight_100 = sum(nx.get_edge_attributes(mst_100_01_35, 'weight').values())

# Print the total weight
print(f"Total weight of the MST: {total_weight_100}")

# Define positions for all nodes (fixed seed for a repeatable layout)
pos = nx.spring_layout(mst_100_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_100_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_100_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_100_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=100, min_dist=0.1")
plt.show()

In [None]:
# Spread of every centroid-to-centroid distance across runs: reduce over the
# run axis (axis 0) so one standard deviation remains per pair of clusters.
distance_matrix_std_100_01_35 = np.asanyarray(distance_matrices_100_01_35).std(axis=0)  # Shape: (n_clusters, n_clusters)

# Keep a copy on disk so the matrix can be reloaded without recomputing it.
np.save("distance_matrix_std_100_01_35.npy", distance_matrix_std_100_01_35)

# Display the matrix.
print("Standard Deviation Distance Matrix (100_01_35):\n", distance_matrix_std_100_01_35)


In [None]:
# 95% confidence interval (normal approximation) for the mean inter-centroid
# distances of the n_neighbors=100, min_dist=0.1 sweep.
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Sample size for the SEM. The distance matrices are built from `valid_runs`
# only, so the mean and standard deviation are taken over fewer than the 35
# UMAP runs; use the real length of the run axis rather than a hard-coded 35,
# which would make the interval too narrow.
n_runs = len(distance_matrices_100_01_35)

# Step 1: Standard Error of the Mean (SEM).
# NOTE(review): distance_matrix_std_100_01_35 is computed in this notebook
# with np.std's default ddof=0 -- confirm whether the sample standard
# deviation (ddof=1) is what is wanted for a confidence interval.
sem_matrix_100_01_35 = distance_matrix_std_100_01_35 / np.sqrt(n_runs)

# Step 2: Margin of error = critical value * SEM.
margin_of_error_matrix_100_01_35 = z_score * sem_matrix_100_01_35

# Step 3: Lower and upper confidence limits around the mean distance matrix.
lower_limit_intconf_matrix_100_01_35 = mean_distance_matrix_100_01_35 - margin_of_error_matrix_100_01_35
upper_limit_intconf_matrix_100_01_35 = mean_distance_matrix_100_01_35 + margin_of_error_matrix_100_01_35

# Distances cannot be negative, so clip the lower limit at zero.
lower_limit_intconf_matrix_100_01_35 = np.maximum(lower_limit_intconf_matrix_100_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_100_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_100_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_100_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_100_01_35.npy', lower_limit_intconf_matrix_100_01_35)
np.save('upper_limit_intconf_matrix_100_01_35.npy', upper_limit_intconf_matrix_100_01_35)

In [67]:
def normalize_matrix(matrix):
    """Min-max scale *matrix* so its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. When every entry is equal there is no
        range to scale by, so an all-zero array of the same shape is returned
        instead of the NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: avoid dividing by zero.
        return np.zeros(matrix.shape)
    return (matrix - lowest) / span

# Lower confidence bound: rescale with normalize_matrix, then store it.
norm_lower_limit_intconf_matrix_100_01_35 = normalize_matrix(lower_limit_intconf_matrix_100_01_35)
np.save('norm_lower_limit_intconf_matrix_100_01_35.npy', norm_lower_limit_intconf_matrix_100_01_35)

# Upper confidence bound: same treatment, scaled independently of the lower bound.
norm_upper_limit_intconf_matrix_100_01_35 = normalize_matrix(upper_limit_intconf_matrix_100_01_35)
np.save('norm_upper_limit_intconf_matrix_100_01_35.npy', norm_upper_limit_intconf_matrix_100_01_35)

In [None]:
# One figure with three side-by-side panels: lower bound, mean, upper bound.
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right. Only the middle
# panel carries a y-axis label.
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_100_01_35,
     "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=100, min_dist=0.1)", ""),
    (normalized_mean_distance_matrix_100_01_35,
     "Normalized Mean Dist. Matrix (k=10, n_neighbors=100, min_dist=0.1)", "Cluster"),
    (norm_upper_limit_intconf_matrix_100_01_35,
     "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=100, min_dist=0.1)", ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on a Matplotlib axis.

    Parameters
    ----------
    matrix : array-like
        Square matrix of pairwise distances; it is rounded to 3 decimals
        before the graph is built.
    title : str
        Title placed on ``ax``.
    ax : matplotlib axes
        Axis that receives the drawing.
    color : str, optional
        Colour of the tree edges (default ``'red'``).
    """
    # Graph from the rounded matrix, reduced straight to its spanning tree.
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed so repeated calls lay out the same tree identically.
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        ax=ax,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
    )

    # Annotate each edge with its weight (the rounded distance).
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Three panels: lower limit (left), mean (centre), upper limit (right).
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, panel index, edge colour), drawn in this order.
mst_panels = [
    (normalized_mean_distance_matrix_100_01_35, "UMAP MST - Mean Distances - n_neighbors=100, min_dist=0.1", 1, 'red'),
    (norm_lower_limit_intconf_matrix_100_01_35, "UMAP MST - Lower Limit - n_neighbors=100, min_dist=0.1", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_100_01_35, "UMAP MST - Upper Limit - n_neighbors=100, min_dist=0.1", 2, 'green'),
]
for mst_matrix, mst_title, mst_panel_index, mst_color in mst_panels:
    plot_mst(mst_matrix, mst_title, axes[mst_panel_index], color=mst_color)

plt.tight_layout()
plt.show()

--------

#### ALIGNED

In [None]:
# Persist the n_neighbors=100 Procrustes-aligned results. The cells of this
# section build `aligned_projections_100` and
# `aligned_cluster_centroids_per_run_100`; the unsuffixed names used before
# are not assigned anywhere in this section, so they would either be undefined
# or hold another configuration's data under the `_100_01_35` file names.
# Run this cell after the cells below that compute both arrays.
np.save('aligned_projections_100_01_35.npy', aligned_projections_100)
np.save('aligned_cluster_centroids_per_run_100_01_35.npy', aligned_cluster_centroids_per_run_100)

In [None]:
# Adjusted Rand Index of a 10-cluster KMeans partition against the true
# labels, computed separately for every n_neighbors=100 UMAP run.
ari_scores = []

for i, run_projection in enumerate(umap_projections_100_01_35):
    # Cluster this run's embedding (10 clusters, one per MNIST digit).
    kmeans = KMeans(n_clusters=10, random_state=42)
    cluster_labels = kmeans.fit(run_projection).labels_

    # Agreement between the clustering and the ground-truth digits.
    ari_score = adjusted_rand_score(labels_true=y_train, labels_pred=cluster_labels)
    ari_scores.append(ari_score)
    print(f"Run {i}: Adjusted Rand Index (ARI): {ari_score}")

# The run with the highest ARI serves as the reference for later alignment.
best_run_index = np.argmax(ari_scores)
best_ari_score = ari_scores[best_run_index]

print(f"Best run based on ARI: Run {best_run_index} with ARI Score: {best_ari_score}")

In [None]:
# `procrustes` is not among the imports in the notebook's import cell, so
# bring it into scope here; the import is harmless if it already happened.
from scipy.spatial import procrustes

# Procrustes alignment of every run onto the best run (highest ARI).
# The previously recorded result was: Run 18 with ARI Score: 0.915 -- the
# actual reference is whatever best_run_index currently holds.
reference_projection_100 = umap_projections_100_01_35[best_run_index]

aligned_projections_100 = []   # one aligned embedding per run
procrustes_distances_100 = []  # matching disparity per run

for i, run_projection in enumerate(umap_projections_100_01_35):
    # scipy's procrustes returns (standardized reference, transformed run,
    # disparity). Both outputs are centred and rescaled, so the aligned
    # coordinates are no longer in the original UMAP units.
    mtx1, mtx2, disparity = procrustes(reference_projection_100, run_projection)
    aligned_projections_100.append(mtx2)
    procrustes_distances_100.append(disparity)
    print(f"Run {i}: Procrustes disparity: {disparity}")

# Stack the aligned runs along a leading run axis.
aligned_projections_100 = np.array(aligned_projections_100)

In [None]:
# Overlay the first three aligned runs to inspect the alignment visually.
plt.figure(figsize=(10, 8))
for i in range(3):
    aligned_run = aligned_projections_100[i]
    plt.scatter(aligned_run[:, 0], aligned_run[:, 1], s=10, label=f"Run {i}")

plt.title("Procrustes-Aligned UMAP Projections")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Summary statistics of the per-run Procrustes disparities.
disparities_100 = np.asarray(procrustes_distances_100)
mean_disparity = disparities_100.mean()
std_disparity = disparities_100.std()

# A run counts as "good" when its disparity is at most one standard deviation
# above the mean.
threshold = mean_disparity + std_disparity

# Indices of the runs that meet the criterion.
good_runs = [
    run_index
    for run_index, run_disparity in enumerate(procrustes_distances_100)
    if run_disparity <= threshold
]

print(f"Mean Procrustes disparity: {mean_disparity}")
print(f"Standard deviation of disparities: {std_disparity}")
print(f"Threshold for good runs: {threshold}")
print(f"Good runs based on Procrustes disparity: {good_runs}")

In [None]:
# TO CORROBORATE
# Sanity check: recompute the ARI on the Procrustes-aligned embeddings.
aligned_ari_scores_100 = []

for aligned_projection in aligned_projections_100:
    # Same clustering setup as for the unaligned runs.
    kmeans = KMeans(n_clusters=10, random_state=42)
    cluster_labels = kmeans.fit(aligned_projection).labels_
    ari_score = adjusted_rand_score(labels_true=y_train, labels_pred=cluster_labels)
    aligned_ari_scores_100.append(ari_score)

print(f"Aligned ARI scores: {aligned_ari_scores_100}")

In [186]:
# Per-run, per-digit centroids of the Procrustes-aligned n_neighbors=100 projections.
# The runs are taken from aligned_projections_100, the array built for this section,
# so the centroids match the *_100 names they are stored under.
aligned_cluster_centroids_per_run_100 = []

# Compute centroids for each aligned run
for run, aligned_projection in enumerate(aligned_projections_100):
    centroids = []
    for cluster_label in range(10):  # Assuming 10 clusters (digits 0-9)
        # Points whose true label (y_train) equals the current digit
        cluster_points = aligned_projection[y_train == cluster_label]
        # Centroid = mean position of those points
        centroid = np.mean(cluster_points, axis=0)
        centroids.append(centroid)

    # Store centroids for this run
    aligned_cluster_centroids_per_run_100.append(np.array(centroids))

aligned_cluster_centroids_per_run_100 = np.array(aligned_cluster_centroids_per_run_100)  # Shape: (n_runs, n_clusters, 2)

# Output centroids for the first run (for verification)
# print(f"Cluster centroids for first aligned run:\n{aligned_cluster_centroids_per_run_100[0]}")


In [None]:

# Initialize arrays to store standard deviations
std_dev_x_100 = np.zeros(10)
std_dev_y_100 = np.zeros(10)

# Loop through each cluster to calculate std deviation for x and y coordinates
for i in range(10):
    # Extract all x and y coordinates for the i-th cluster over all runs
    cluster_x_coords = aligned_cluster_centroids_per_run_100[:, i, 0]  # All x coords for cluster i
    cluster_y_coords = aligned_cluster_centroids_per_run_100[:, i, 1]  # All y coords for cluster i
    
    # Calculate standard deviation in x and y
    std_dev_x_n5[i] = np.std(cluster_x_coords)
    std_dev_y_n5[i] = np.std(cluster_y_coords)

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_100)
print("Standard deviation of y coordinates per cluster:", std_dev_y_100)

In [188]:
# Create an empty list to hold the data for the DataFrame
data_umap_100 = []

# Loop through each trial and each cluster to evaluate the condition
for trial in range(len(aligned_cluster_centroids_per_run_100)):
    for cluster in range(10):
        # Extract the centroid coordinates for the current trial and cluster
        centroid_coord = aligned_cluster_centroids_per_run_100[trial, cluster]

        # Calculate the bounds for the 3 standard deviations range for x and y
        mean_x, mean_y = np.mean(aligned_cluster_centroids_per_run_100[:, cluster, :], axis=0)
        lower_bound_x, upper_bound_x = mean_x - 3 * std_dev_x_n5[cluster], mean_x + 3 * std_dev_x_n5[cluster]
        lower_bound_y, upper_bound_y = mean_y - 3 * std_dev_y_n5[cluster], mean_y + 3 * std_dev_y_n5[cluster]

        # Check if the centroid is inside the 3 std range
        inside_3_std = (lower_bound_x <= centroid_coord[0] <= upper_bound_x) and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)

        # Append the data as a new row in the list
        data_umap_100.append([trial + 1, cluster, centroid_coord, inside_3_std])

# Create a DataFrame from the list of data
df_results_umap_100 = pd.DataFrame(data_umap_100, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 3 std'])

In [None]:
# Step 1: Group the per-(trial, cluster) table by trial
grouped_by_trial = df_results_umap_100.groupby('Trial')

# Step 2: For each trial, check whether every cluster is True in 'Inside 3 std'
trials_with_all_true = grouped_by_trial['Inside 3 std'].all()

# Step 3: Keep the trial numbers (1-based, as stored in the 'Trial' column) where all clusters were True
valid_trials_100 = trials_with_all_true[trials_with_all_true].index.tolist()

# Step 4: Show the list that was just computed for this section
print(f"Trials where all clusters are inside 3 std: {valid_trials_100}")

In [190]:
# Restrict the aligned projections to the runs that passed the 3-std centroid check.
# valid_runs = [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 29, 30, 31, 33, 34, 35]
# Trial numbers are 1-based while array rows are 0-based, hence the shift by one.
valid_run_rows_100 = [trial_number - 1 for trial_number in valid_trials_100]
aligned_projections_100 = aligned_projections_100[valid_run_rows_100]

In [None]:
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Centroid-to-centroid distance matrix of every run (all runs, not only the valid
# ones), its mean across runs, and a min-max normalised version of that mean.
# NOTE(review): the names and plot title say "KMeans", but the input is
# aligned_cluster_centroids_per_run_100 -- confirm that these are the intended centroids.
distance_matrices_kmeans_100 = []

# Compute pairwise distances between centroids for each run
for centroids in aligned_cluster_centroids_per_run_100:
    # Calculate the pairwise Euclidean distance between centroids for this run
    distance_matrix = cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    distance_matrices_kmeans_100.append(distance_matrix)

# Convert the list of distance matrices to a NumPy array
distance_matrices_kmeans_100 = np.array(distance_matrices_kmeans_100)  # Shape: (n_runs, 10, 10)

# Calculate the mean distance matrix across all runs
mean_distance_matrix_kmeans_100 = np.mean(distance_matrices_kmeans_100, axis=0)  # Shape: (10, 10)

# Min-max normalise the mean distance matrix: (m - min) / (max - min).
# The diagonal (distance of a centroid to itself) is part of the matrix,
# so the minimum used here is normally 0.
normalized_mean_distance_matrix_kmeans_100 = (
    mean_distance_matrix_kmeans_100 - np.min(mean_distance_matrix_kmeans_100)
) / (np.max(mean_distance_matrix_kmeans_100) - np.min(mean_distance_matrix_kmeans_100))

# Plot the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_kmeans_100, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (Precomputed KMeans Centroids, n_neighbors=100, min_dist=0.1)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save results as .npy files in the current working directory
np.save('distance_matrices_kmeans_100.npy', distance_matrices_kmeans_100)
np.save('mean_distance_matrix_kmeans_100.npy', mean_distance_matrix_kmeans_100)
np.save('normalized_mean_distance_matrix_kmeans_100.npy', normalized_mean_distance_matrix_kmeans_100)

# Output results
print("Mean Distance Matrix (KMeans, Precomputed Centroids):")
print(mean_distance_matrix_kmeans_100)


In [193]:
##########

In [None]:
# Keep only the centroid sets of the valid n_neighbors=100 runs.
# The source array is the *_100 one computed for this section; valid_trials_100
# holds 1-based trial numbers, hence the "- 1" to obtain array rows.
aligned_cluster_centroids_per_run_filtered_100 = aligned_cluster_centroids_per_run_100[[run - 1 for run in valid_trials_100]]

# Placeholder for pairwise distance matrices
distance_matrices_100_01_35_al = []

# Calculate pairwise distances between centroids for each valid run
for centroids in aligned_cluster_centroids_per_run_filtered_100:
    # Calculate the pairwise Euclidean distance between centroids for this run
    distance_matrix = cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    distance_matrices_100_01_35_al.append(distance_matrix)

# Convert the list of distance matrices to a NumPy array
distance_matrices_100_01_35_al = np.array(distance_matrices_100_01_35_al)  # Shape: (n_valid_runs, 10, 10)

# Calculate the mean distance matrix across all valid runs
mean_distance_matrix_100_01_35_al = np.mean(distance_matrices_100_01_35_al, axis=0)  # Shape: (10, 10)

# Min-max normalise the mean distance matrix: (m - min) / (max - min)
normalized_mean_distance_matrix_100_01_35_al = (mean_distance_matrix_100_01_35_al - np.min(mean_distance_matrix_100_01_35_al)) / (
    np.max(mean_distance_matrix_100_01_35_al) - np.min(mean_distance_matrix_100_01_35_al)
)

# Plot the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_100_01_35_al, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=100)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
np.save('distance_matrices_neighbors_100_01_35_al.npy', distance_matrices_100_01_35_al)
np.save('mean_distance_matrix_neighbors_100_01_35_al.npy', mean_distance_matrix_100_01_35_al)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_100_01_35_al}")

--------

### General UMAP Comparison

In [191]:
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# Grouped bar charts (one page per base cluster) of the mean distance from the
# base cluster to every other cluster, with confidence-interval error bars,
# for each n_neighbors setting. All pages go into a single PDF.

# Define clusters
clusters = np.arange(10)  # Clusters from 0 to 9

# Define colors for each n_neighbors
colors = {5: "orange", 10: "blue", 20: "yellow", 30: "grey", 50: "green", 100: "red"}

# (mean, lower CI limit, upper CI limit) distance matrices for each n_neighbors.
# Built once: the matrices do not depend on the base cluster.
matrices_by_n_neighbors = {
    5: (mean_distance_matrix_5_01_35, lower_limit_intconf_matrix_5_01_35, upper_limit_intconf_matrix_5_01_35),
    10: (mean_distance_matrix_10_01_35, lower_limit_intconf_matrix_10_01_35, upper_limit_intconf_matrix_10_01_35),
    20: (mean_distance_matrix_20_01_35, lower_limit_intconf_matrix_20_01_35, upper_limit_intconf_matrix_20_01_35),
    30: (mean_distance_matrix_30_01_35, lower_limit_intconf_matrix_30_01_35, upper_limit_intconf_matrix_30_01_35),
    50: (mean_distance_matrix_50_01_35, lower_limit_intconf_matrix_50_01_35, upper_limit_intconf_matrix_50_01_35),
    100: (mean_distance_matrix_100_01_35, lower_limit_intconf_matrix_100_01_35, upper_limit_intconf_matrix_100_01_35),
}

# Create a PDF to save all the plots
with PdfPages("Cluster_Confidence_Intervals.pdf") as pdf:
    # Iterate over each cluster as the base cluster
    for base_cluster in clusters:

        # Row of every matrix for the base cluster, with the self-distance entry removed
        data = {
            n_neighbors: {
                "mean": np.delete(mean_matrix[base_cluster], base_cluster),
                "lower": np.delete(lower_matrix[base_cluster], base_cluster),
                "upper": np.delete(upper_matrix[base_cluster], base_cluster),
            }
            for n_neighbors, (mean_matrix, lower_matrix, upper_matrix) in matrices_by_n_neighbors.items()
        }

        # Define clusters to be compared against (excluding the base cluster)
        compare_clusters = np.delete(clusters, base_cluster)

        # Plotting
        fig, ax = plt.subplots(figsize=(16, 8))

        width = 0.15  # Bar width
        x = np.arange(len(compare_clusters))  # X positions for clusters

        for idx, (n_neighbors, values) in enumerate(data.items()):
            # Offsets are symmetric about 0, so each group of bars is centred
            # on its tick (bars are centre-aligned by default).
            x_positions = x + (idx - (len(data) - 1) / 2) * width

            # Bars for the mean distances; asymmetric error bars from the CI limits
            ax.bar(
                x_positions,
                values["mean"],  # Mean distances
                yerr=[
                    values["mean"] - values["lower"],  # Lower error
                    values["upper"] - values["mean"]   # Upper error
                ],
                width=width,
                color=colors[n_neighbors],
                alpha=0.7,
                label=f"n={n_neighbors}",
                capsize=5
            )

        # Add labels, title, and legend
        ax.set_xlabel("Clusters", fontsize=14)
        ax.set_ylabel("Distance", fontsize=14)
        ax.set_title(f"Confidence Intervals of Distances from Cluster {base_cluster} to Other Clusters", fontsize=16)
        ax.set_xticks(x)
        ax.set_xticklabels([f"{i}" for i in compare_clusters], fontsize=12)
        ax.legend(title="n_neighbors", fontsize=10)
        ax.grid(axis="y", linestyle="--", alpha=0.7)

        plt.tight_layout()

        # Save the figure to the PDF and release it
        pdf.savefig(fig)
        plt.close(fig)

print("PDF with cluster confidence intervals has been successfully created.")

In [None]:
# Same grouped bar charts as the PDF export, but shown inline: for every base
# cluster, the mean distance to each other cluster with confidence-interval
# error bars, one bar per n_neighbors setting.

# Define clusters
clusters = np.arange(10)  # Clusters from 0 to 9

# Define colors for each n_neighbors
colors = {5: "orange", 10: "blue", 20: "yellow", 30: "grey", 50: "green", 100: "red"}

# (mean, lower CI limit, upper CI limit) distance matrices for each n_neighbors.
# Built once: the matrices do not depend on the base cluster.
matrices_by_n_neighbors = {
    5: (mean_distance_matrix_5_01_35, lower_limit_intconf_matrix_5_01_35, upper_limit_intconf_matrix_5_01_35),
    10: (mean_distance_matrix_10_01_35, lower_limit_intconf_matrix_10_01_35, upper_limit_intconf_matrix_10_01_35),
    20: (mean_distance_matrix_20_01_35, lower_limit_intconf_matrix_20_01_35, upper_limit_intconf_matrix_20_01_35),
    30: (mean_distance_matrix_30_01_35, lower_limit_intconf_matrix_30_01_35, upper_limit_intconf_matrix_30_01_35),
    50: (mean_distance_matrix_50_01_35, lower_limit_intconf_matrix_50_01_35, upper_limit_intconf_matrix_50_01_35),
    100: (mean_distance_matrix_100_01_35, lower_limit_intconf_matrix_100_01_35, upper_limit_intconf_matrix_100_01_35),
}

# Iterate over each cluster as the base cluster
for base_cluster in clusters:

    # Row of every matrix for the base cluster, with the self-distance entry removed
    data = {
        n_neighbors: {
            "mean": np.delete(mean_matrix[base_cluster], base_cluster),
            "lower": np.delete(lower_matrix[base_cluster], base_cluster),
            "upper": np.delete(upper_matrix[base_cluster], base_cluster),
        }
        for n_neighbors, (mean_matrix, lower_matrix, upper_matrix) in matrices_by_n_neighbors.items()
    }

    # Define clusters to be compared against (excluding the base cluster)
    compare_clusters = np.delete(clusters, base_cluster)

    # Plotting
    fig, ax = plt.subplots(figsize=(16, 8))

    width = 0.15  # Bar width
    x = np.arange(len(compare_clusters))  # X positions for clusters

    for idx, (n_neighbors, values) in enumerate(data.items()):
        # Offsets are symmetric about 0, so each group of bars is centred
        # on its tick (bars are centre-aligned by default).
        x_positions = x + (idx - (len(data) - 1) / 2) * width

        # Bars for the mean distances; asymmetric error bars from the CI limits
        ax.bar(
            x_positions,
            values["mean"],  # Mean distances
            yerr=[
                values["mean"] - values["lower"],  # Lower error
                values["upper"] - values["mean"]   # Upper error
            ],
            width=width,
            color=colors[n_neighbors],
            alpha=0.7,
            label=f"n={n_neighbors}",
            capsize=5
        )

    # Add labels, title, and legend
    ax.set_xlabel("Clusters", fontsize=14)
    ax.set_ylabel("Distance", fontsize=14)
    ax.set_title(f"Confidence Intervals of Distances from Cluster {base_cluster} to Other Clusters", fontsize=16)
    ax.set_xticks(x)
    ax.set_xticklabels([f"{i}" for i in compare_clusters], fontsize=12)
    ax.legend(title="n_neighbors", fontsize=10)
    ax.grid(axis="y", linestyle="--", alpha=0.7)

    plt.tight_layout()
    plt.show()

In [None]:
# Helper that draws the minimum spanning tree of a distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree (MST) of ``matrix`` on ``ax``.

    Parameters:
        matrix: square array used as the weighted adjacency matrix of the graph.
        title: text set as the title of ``ax``.
        ax: matplotlib axes that receives the drawing.
        color: edge colour of the tree (default ``'red'``).

    The weights are rounded to 3 decimals before the graph is built, so the
    edge labels show the rounded values. Node positions come from a spring
    layout with a fixed seed (42).
    """
    rounded_weights = np.round(matrix, 3)
    graph = nx.from_numpy_array(rounded_weights)
    tree = nx.minimum_spanning_tree(graph)

    # Fixed seed keeps the layout reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    # Nodes and edges of the tree
    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Edge weights (distances) as labels
    weights_by_edge = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(
        tree, layout, edge_labels=weights_by_edge, font_size=8, label_pos=0.3, ax=ax
    )

    ax.set_title(title)

# Normalised mean / lower-limit / upper-limit distance matrices for each n_neighbors
# value. The "_08_" names match the "min_dis=0.8" used in the titles and file name below.
matrices = {
    5: {
        "mean": normalized_mean_distance_matrix_5_08_35,
        "lower": norm_lower_limit_intconf_matrix_5_08_35,
        "upper": norm_upper_limit_intconf_matrix_5_08_35
    },
    10: {
        "mean": normalized_mean_distance_matrix_10_08_35,
        "lower": norm_lower_limit_intconf_matrix_10_08_35,
        "upper": norm_upper_limit_intconf_matrix_10_08_35
    },
    20: {
        "mean": normalized_mean_distance_matrix_20_08_35,
        "lower": norm_lower_limit_intconf_matrix_20_08_35,
        "upper": norm_upper_limit_intconf_matrix_20_08_35
    },
    30: {
        "mean": normalized_mean_distance_matrix_30_08_35,
        "lower": norm_lower_limit_intconf_matrix_30_08_35,
        "upper": norm_upper_limit_intconf_matrix_30_08_35
    },
    50: {
        "mean": normalized_mean_distance_matrix_50_08_35,
        "lower": norm_lower_limit_intconf_matrix_50_08_35,
        "upper": norm_upper_limit_intconf_matrix_50_08_35
    },
    100: {
        "mean": normalized_mean_distance_matrix_100_08_35,
        "lower": norm_lower_limit_intconf_matrix_100_08_35,
        "upper": norm_upper_limit_intconf_matrix_100_08_35
    }
}

# Open a PDF to save the plots: one page per n_neighbors value
with PdfPages('MST_UMAP_Comparisons min_dis=0.8.pdf') as pdf:
    for n_neighbors, matrix_set in matrices.items():
        # Set up the figure with three subplots
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))
        
        # Plot MSTs for mean, lower, and upper matrices.
        # Panel order on the page is lower (left), mean (centre), upper (right);
        # the calls below are simply not written in that order.
        plot_mst(matrix_set["mean"], f"MST UMAP - Mean Distances (n_neighbors={n_neighbors}, min_dis=0.8)", axes[1], color='red')
        plot_mst(matrix_set["lower"], f"MST UMAP - Lower Limit (n_neighbors={n_neighbors}, min_dis=0.8)", axes[0], color='blue')
        plot_mst(matrix_set["upper"], f"MST UMAP - Upper Limit (n_neighbors={n_neighbors}, min_dis=0.8)", axes[2], color='green')
        
        # Adjust layout for better spacing
        plt.tight_layout()
        
        # Save the current figure to the PDF, then close it to free the figure
        pdf.savefig(fig)
        plt.close(fig)

print("PDF with MST UMAP Comparisons has been successfully created.")

### MDS vs UMAP Sammon's stress

In [None]:
# Step 1: Downsample the Dataset Consistently
def downsample_mnist_consistent(x_data, y_labels, sample_fraction=0.1, random_state=None):
    """
    Pick a per-label random subset of sample indices.

    Only indices are returned, so the caller can apply the same selection to
    every representation of the data (original space and embeddings).

    Parameters:
        x_data: not used by the function; kept so existing calls keep working.
        y_labels (np.ndarray): label of every sample.
        sample_fraction (float): fraction of each label's samples to keep;
            the per-label count is int(len(label_indices) * sample_fraction).
        random_state: forwarded to sklearn.utils.resample. The default (None)
            keeps the previous unseeded behaviour; pass an int to obtain the
            same indices on every call.

    Returns:
        np.ndarray: selected indices, grouped by label in np.unique order.
    """
    sampled_indices = []
    for label in np.unique(y_labels):
        # Positions of all samples carrying the current label
        label_indices = np.where(y_labels == label)[0]
        # Draw without replacement so no index is selected twice
        sampled_indices.extend(
            resample(
                label_indices,
                n_samples=int(len(label_indices) * sample_fraction),
                replace=False,
                random_state=random_state,
            )
        )
    return np.array(sampled_indices)

# One index sample (10% per label) shared by every representation of the data
sampled_indices = downsample_mnist_consistent(x_train_flattened, y_train, sample_fraction=0.1)

# Step 2: apply the same indices to the original space and to the labels
x_sampled, y_sampled = x_train_flattened[sampled_indices], y_train[sampled_indices]

# Saved mean projections, keyed by n_neighbors, reduced to the sampled rows
projection_files = {
    10: "mean_projection_10_01_35.npy",
    50: "mean_projection_50_01_35.npy",
    100: "mean_projection_100_01_35.npy",
}
umap_projections_downsampled = {
    k: np.load(path)[sampled_indices] for k, path in projection_files.items()
}

# Shapes, for verification
print(f"x_sampled shape: {x_sampled.shape}")
print(f"y_sampled shape: {y_sampled.shape}")
for n_neighbors in umap_projections_downsampled:
    projection = umap_projections_downsampled[n_neighbors]
    print(f"UMAP (n_neighbors={n_neighbors}) downsampled shape: {projection.shape}")

In [None]:
# Use sklearn's pairwise_distances for better handling of large arrays
# "True" distances between points in the original/high dimensional space
# NOTE(review): this binds the name `pairwise_distances` to a distance MATRIX.
# Another cell of the notebook imports sklearn's function under the same name
# (`from sklearn.metrics import pairwise_distances`), so what the name refers
# to depends on the order in which cells are executed.
pairwise_distances = sklearn_pairwise_distances(x_sampled, metric='euclidean')

# Initialize and fit MDS
# Uses MDS to create a reference (ideal or baseline) embedding in 2D while preserving the global structure of pairwise distances
# dissimilarity='precomputed': the input is the distance matrix itself, not feature vectors
mds_model = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
mds_embedding = mds_model.fit_transform(pairwise_distances)

def sammons_stress(original_distances, embedding_distances):
    """
    Return the normalized Sammon's stress between two distance matrices.

    Parameters:
        original_distances: reference pairwise distances.
        embedding_distances: pairwise distances of the embedding, same shape.

    The reference distances are floored at 1e-9 (zeros included, e.g. the
    diagonal) to avoid dividing by zero. The floored values are used both in
    the per-pair terms and in the normalizing sum.
    """
    floored = np.maximum(original_distances, 1e-9)
    squared_error = (floored - embedding_distances) ** 2
    # Per-pair error relative to the reference distance, summed, then divided
    # by the sum of all reference distances
    return np.sum(squared_error / floored) / np.sum(floored)

# The function above quantifies how well the low-dimensional embedding preserves
# the pairwise distances of the original space. Lower stress indicates better preservation.

# Calculate Sammon's stress for UMAP embeddings
# Evaluates how well each UMAP embedding preserves global structures compared to the original distances
stress_results = {}  # n_neighbors -> scalar stress
# Here `pairwise_distances` is the matrix computed from x_sampled in this cell
original_distances = pairwise_distances
for n_neighbors, umap_embedding in umap_projections_downsampled.items():
    # Compute pairwise distances for the UMAP embedding
    umap_distances = sklearn_pairwise_distances(umap_embedding, metric='euclidean')
    # Calculate Sammon's stress
    stress = sammons_stress(original_distances, umap_distances)
    stress_results[n_neighbors] = stress

-----

Everything between this line and the bottom line works and should be considered the final version - READY FOR OVERLEAF

Using the MDS indices and results from the MNIST Silhouette Score + ARI + Acc... section

The last version is the one to be used.

In [22]:
# Index samples used for the MDS experiment (train / test)
sampled_indices_train_mds = np.load("sampled_indices_train_mds.npy")
sampled_indices_test_mds = np.load("sampled_indices_test_mds.npy")

# Downsampled data and labels saved for the MDS experiment
x_train_sampled_mds = np.load("x_train_sampled_mds.npy")
y_train_sampled_mds = np.load("y_train_sampled_mds.npy")
x_test_sampled_mds = np.load("x_test_sampled_mds.npy")
y_test_sampled_mds = np.load("y_test_sampled_mds.npy")

# Saved MDS embeddings (train / test)
x_train_mds_c2 = np.load("x_train_mds_c2.npy")
x_test_mds_c2 = np.load("x_test_mds_c2.npy")

In [23]:
# Saved mean projections, keyed by n_neighbors, reduced to the rows listed in
# sampled_indices_train_mds
projection_files = {
    10: "mean_projection_10_01_35.npy",
    50: "mean_projection_50_01_35.npy",
    100: "mean_projection_100_01_35.npy",
}
umap_projections_downsampled = {
    k: np.load(path)[sampled_indices_train_mds] for k, path in projection_files.items()
}

In [None]:
# Report the shape of every downsampled projection
for n_neighbors in umap_projections_downsampled:
    projection = umap_projections_downsampled[n_neighbors]
    print(f"UMAP (n_neighbors={n_neighbors}) downsampled shape: {projection.shape}")

In [None]:
# Normalized Sammon's stress between a reference and an embedding distance matrix
def sammons_stress(original_distances, embedding_distances):
    """
    Compute Sammon's stress/error, normalized by the total reference distance.

    Parameters:
        original_distances: reference pairwise distances.
        embedding_distances: pairwise distances of the embedding, same shape.

    Reference distances below 1e-9 (including exact zeros such as the
    diagonal) are raised to 1e-9 first, so the division never hits zero.
    """
    min_distance = 1e-9
    reference = np.maximum(original_distances, min_distance)
    weighted_sq_diff = ((reference - embedding_distances) ** 2) / reference
    total_reference = np.sum(reference)
    return np.sum(weighted_sq_diff) / total_reference

# Compute pairwise distances for the original MDS embedding
# NOTE(review): the reference ("original") distances in this section are taken
# from the 2-D MDS embedding x_train_mds_c2, not from the high-dimensional data,
# so stress computed against them measures deviation from the MDS layout.
original_distances = sklearn_pairwise_distances(x_train_mds_c2, metric='euclidean')
print(f"Original distances shape: {original_distances.shape}")  # Should be (5996, 5996)

In [None]:
# Sammon's stress of every downsampled UMAP projection against original_distances
stress_results = {}
for n_neighbors in umap_projections_downsampled:
    umap_embedding = umap_projections_downsampled[n_neighbors]

    # Pairwise Euclidean distances inside the UMAP embedding
    umap_distances = sklearn_pairwise_distances(umap_embedding, metric='euclidean')
    print(f"UMAP (n_neighbors={n_neighbors}) distances shape: {umap_distances.shape}")  # Should be (5996, 5996)

    # One scalar stress value per n_neighbors
    stress = sammons_stress(original_distances, umap_distances)
    stress_results[n_neighbors] = stress

# Report the results
for n_neighbors in stress_results:
    stress = stress_results[n_neighbors]
    print(f"Sammon's stress for UMAP (n_neighbors={n_neighbors}): {stress}")

In [None]:
# Step 3: Visualize Results
# 2x2 grid: panel 1 is the MDS embedding, panels 2-4 are the UMAP projections
# for n_neighbors = 10, 50, 100. Points are coloured by y_train_sampled_mds.
plt.figure(figsize=(12, 8))

# Plot MDS Embedding
plt.subplot(2, 2, 1)
plt.scatter(x_train_mds_c2[:, 0], x_train_mds_c2[:, 1], c=y_train_sampled_mds, cmap='Spectral', s=5)
plt.title("MDS Embedding")
plt.colorbar(label="Digit Label")

# Plot UMAP Embeddings for different n_neighbors
# start=2 because subplot position 1 is already used by the MDS panel
for idx, n_neighbors in enumerate([10, 50, 100], start=2):
    plt.subplot(2, 2, idx)
    plt.scatter(
        umap_projections_downsampled[n_neighbors][:, 0], 
        umap_projections_downsampled[n_neighbors][:, 1], 
        c=y_train_sampled_mds, cmap='Spectral', s=5
    )
    plt.title(f"UMAP Embedding (n_neighbors={n_neighbors})")
    plt.colorbar(label="Digit Label")

plt.tight_layout()
plt.show()

Procrustes

In [None]:
from scipy.spatial import procrustes

# Calculate Procrustes distance between MDS and UMAP embeddings
procrustes_results = {}  # n_neighbors -> Procrustes disparity
for n_neighbors, umap_embedding in umap_projections_downsampled.items():
    # Perform Procrustes analysis
    # NOTE: this rebinds the notebook-level name `mds_embedding` to the loaded
    # x_train_mds_c2 on every iteration; cells that read `mds_embedding`
    # afterwards see this array.
    mds_embedding = x_train_mds_c2  # Reference embedding (MDS)
    # First return value (standardised reference) is discarded; umap_aligned is not used further in this cell
    _, umap_aligned, disparity = procrustes(mds_embedding, umap_embedding)
    # Store the Procrustes distance (disparity)
    procrustes_results[n_neighbors] = disparity

# Print results, rounded to 3 decimals
for n_neighbors, distance in procrustes_results.items():
    print(f"Procrustes Distance for UMAP (n_neighbors={n_neighbors}): {np.round(distance,3)}")

Sammon's stress and run-to-run variability using Student's t-distribution (because the number of runs, 10, is below 30)

In [83]:
# Iterate over stress results
# NOTE(review): the converted array is only bound to the loop-local name and is
# never stored back into stress_results or used in this cell, so the loop has no
# lasting effect beyond leaving `run_stress_values` set to the last entry.
for n_neighbors, run_stress_values in stress_results.items():
    # Ensure run_stress_values is an array
    run_stress_values = np.array(run_stress_values)

    if run_stress_values.ndim == 0:  # Handle scalar case (not iterable)
        run_stress_values = np.array([run_stress_values])  # Convert scalar to array

In [None]:
# Import necessary libraries
from sklearn.metrics import pairwise_distances
from scipy.stats import t
import numpy as np

# Step 1: Load the UMAP Projections for the First 10 Runs
# Each entry keeps runs 0-9 and only the rows listed in sampled_indices_train_mds.
umap_projections_dict = {
    10: umap_projections_10_01_35[:10, sampled_indices_train_mds, :],  # First 10 runs for n_neighbors=10
    50: umap_projections_50_01_35[:10, sampled_indices_train_mds, :],  # First 10 runs for n_neighbors=50
    100: umap_projections_100_01_35[:10, sampled_indices_train_mds, :]  # First 10 runs for n_neighbors=100
}

# Step 2: Calculate Sammon's Stress for Each Run
# NOTE(review): `original_distances` is whatever an earlier cell bound to that
# name -- confirm it is the intended reference before trusting the numbers.
sammon_results = {}  # n_neighbors -> list of (run_number, stress) tuples
for n_neighbors, projections in umap_projections_dict.items():
    stress_values = []
    for run_number, run_projection in enumerate(projections, start=1):
        # Compute pairwise distances for the UMAP embedding
        try:
            embedding_distances = pairwise_distances(run_projection, metric='euclidean')
            # Calculate Sammon's stress
            stress = sammons_stress(original_distances, embedding_distances)
            stress_values.append((run_number, stress))
        except Exception as e:
            # Best effort: a failing run is reported and skipped, which means
            # the statistics below may be based on fewer than 10 runs.
            print(f"Error calculating stress for n_neighbors={n_neighbors}, run={run_number}: {e}")
            continue
    sammon_results[n_neighbors] = stress_values

# Step 3: Print the Results for the First 10 Runs
print("Sammon's Stress Results for the First 10 Runs:")
for n_neighbors, stress_values in sammon_results.items():
    print(f"\nUMAP (n_neighbors={n_neighbors}):")
    for run_number, stress in stress_values:
        print(f"  Run {run_number}: Sammon's Stress = {stress:.6f}")

# Step 4: Update Variability Computation with Multiple Runs
final_results = {}    # n_neighbors -> dict with mean, std, 95% CI and the raw run values
run_variability = {}  # n_neighbors -> sample standard deviation across runs

for n_neighbors, stress_values in sammon_results.items():
    run_stress_values = [stress for _, stress in stress_values]
    mean_stress = np.mean(run_stress_values)
    std_stress = np.std(run_stress_values, ddof=1)  # Use ddof=1 for sample standard deviation

    # Calculate confidence interval using Student's t-distribution
    if len(run_stress_values) > 1:  # Ensure enough data points for CI calculation
        # Two-sided 95% interval: 0.975 quantile of t with n-1 degrees of freedom
        t_score = t.ppf(0.975, df=len(run_stress_values) - 1)
        # Margin = t * standard error of the mean
        margin_of_error = t_score * (std_stress / np.sqrt(len(run_stress_values)))
        confidence_interval = (mean_stress - margin_of_error, mean_stress + margin_of_error)
    else:
        # With a single run there is no spread to estimate; the interval collapses to the mean
        confidence_interval = (mean_stress, mean_stress)

    # Store final results
    final_results[n_neighbors] = {
        "mean": mean_stress,
        "std": std_stress,
        "95% CI": confidence_interval,
        "run_values": run_stress_values
    }
    # Store standard deviation for run-to-run variability
    run_variability[n_neighbors] = std_stress

# Step 5: Print Final Results with Variability
print("\nSammon's Stress Results with Variability (10 Runs):")
for n_neighbors, stats in final_results.items():
    print(f"n_neighbors={n_neighbors}:")
    print(f"  Mean Stress: {stats['mean']:.4f}")
    print(f"  Standard Deviation Across Runs: {stats['std']:.4f}")
    print(f"  95% Confidence Interval: {stats['95% CI']}")
    # run_idx restarts at 1 over the stored values, so it differs from the
    # original run number whenever a run was skipped above
    for run_idx, stress in enumerate(stats['run_values'], start=1):
        print(f"    Run {run_idx}: Stress={stress:.4f}")


The code below adds a standard deviation per run, but it gives extremely high values, either because the values are not normalized or because of errors in the formulation.

In [None]:
# # Function to calculate Sammon's stress and its standard deviation per run
# def calculate_stress_with_variability_fixed(umap_projections_dict, original_distances):
#     stress_results_with_variability = {}
    
#     for n_neighbors, projections in umap_projections_dict.items():
#         run_results = []
#         for run_number, run_projection in enumerate(projections, start=1):
#             try:
#                 # Calculate pairwise distances for the embedding
#                 embedding_distances = pairwise_distances(run_projection, metric='euclidean')
                
#                 # Compute Sammon's stress
#                 stress = sammons_stress(original_distances, embedding_distances)
                
#                 # Compute standard deviation of pointwise stress values
#                 pointwise_stress = ((original_distances - embedding_distances) ** 2) / np.maximum(original_distances, 1e-9)
#                 std_dev = np.std(pointwise_stress.flatten())  # Flatten to compute correct standard deviation
                
#                 # Normalize the standard deviation relative to the stress
#                 normalized_std_dev = std_dev / stress
                
#                 run_results.append((run_number, stress, std_dev, normalized_std_dev))
#             except Exception as e:
#                 print(f"Error calculating stress for n_neighbors={n_neighbors}, run={run_number}: {e}")
        
#         stress_results_with_variability[n_neighbors] = run_results
    
#     return stress_results_with_variability

# # Call the fixed function
# sammon_results_with_variability_fixed = calculate_stress_with_variability_fixed(umap_projections_dict, original_distances)

# # Print results
# print("Sammon's Stress Results with Per-Run Variability (Fixed):")
# for n_neighbors, results in sammon_results_with_variability_fixed.items():
#     print(f"\nn_neighbors={n_neighbors}:")
#     for run_number, stress, std_dev, normalized_std_dev in results:
#         print(f"  Run {run_number}: Stress={stress:.4f}, Std Dev={std_dev:.4f}, Normalized Std Dev={normalized_std_dev:.4f}")

--------

In [None]:
# Step 3: Visualize Results
# 2x2 grid: panel 1 is `mds_embedding`, panels 2-4 are the UMAP projections for
# n_neighbors = 10, 50, 100. Points are coloured by y_sampled.
# NOTE(review): `mds_embedding`, `y_sampled` and `umap_projections_downsampled`
# are each assigned in more than one cell of the notebook; the three must come
# from the same index sample for the colours to match the points -- verify the
# execution order.
plt.figure(figsize=(12, 8))

# Plot MDS Embedding
plt.subplot(2, 2, 1)
plt.scatter(mds_embedding[:, 0], mds_embedding[:, 1], c=y_sampled, cmap='Spectral', s=5)
plt.title("MDS Embedding")
plt.colorbar(label="Digit Label")

# Plot UMAP Embeddings for different n_neighbors
# start=2 because subplot position 1 is already used by the MDS panel
for idx, n_neighbors in enumerate([10, 50, 100], start=2):
    plt.subplot(2, 2, idx)
    plt.scatter(
        umap_projections_downsampled[n_neighbors][:, 0], 
        umap_projections_downsampled[n_neighbors][:, 1], 
        c=y_sampled, cmap='Spectral', s=5
    )
    plt.title(f"UMAP Embedding (n_neighbors={n_neighbors})")
    plt.colorbar(label="Digit Label")

plt.tight_layout()
plt.show()

In [None]:
# Step 4: report the stress stored for every n_neighbors (4 decimals)
print("Sammon's Stress/Error for UMAP:")
for n_neighbors in stress_results:
    stress = stress_results[n_neighbors]
    print(f"n_neighbors={n_neighbors}: {stress:.4f}")

In [None]:
from scipy.spatial import procrustes

# Calculate Procrustes distance for UMAP embeddings compared to MDS
# Result: {n_neighbors: disparity}; a lower disparity means the two point
# configurations are more similar in shape.
# NOTE(review): assumes mds_embedding and every UMAP embedding list the same
# points in the same row order and have the same shape - confirm.
procrustes_distances = {}
for n_neighbors, umap_embedding in umap_projections_downsampled.items():
    # Apply Procrustes analysis
    # Only the disparity is kept; the two standardized matrices are discarded.
    mtx1, mtx2, disparity = procrustes(mds_embedding, umap_embedding)
    procrustes_distances[n_neighbors] = disparity

# Print Procrustes distances
print("Procrustes Distances (MDS vs UMAP):")
for n_neighbors, distance in procrustes_distances.items():
    print(f"n_neighbors={n_neighbors}: {distance:.4f}")


In [39]:
def normalize_coordinates(data):
    """Min-max normalize each column of ``data`` to the range [0, 1].

    Parameters:
        data (array-like): 2-D array of shape (n_samples, n_dims).
    Returns:
        np.ndarray: Array of the same shape where every column spans [0, 1].
        A constant column (max == min) is mapped to all zeros instead of NaN.
    """
    data = np.asarray(data)
    min_val = np.min(data, axis=0)
    span = np.max(data, axis=0) - min_val
    # A constant column has zero span; dividing by it would give 0/0 = NaN.
    # Divide by 1 there instead, which leaves the column at 0.
    safe_span = np.where(span == 0, 1, span)
    return (data - min_val) / safe_span

# Normalize MDS embedding
mds_embedding_normalized = normalize_coordinates(mds_embedding)

# Normalize UMAP embeddings
# Each embedding is rescaled independently and stored under its n_neighbors key.
umap_embeddings_normalized = {
    n_neighbors: normalize_coordinates(embedding)
    for n_neighbors, embedding in umap_projections_downsampled.items()
}

In [None]:
# Overlay the min-max-normalized MDS and UMAP embeddings, one panel per
# n_neighbors value. The normalization is per axis and per embedding, and no
# rotation/alignment is applied here, so only the rough layouts are comparable.
plt.figure(figsize=(18, 6))

for idx, n_neighbors in enumerate([10, 50, 100], start=1):
    plt.subplot(1, 3, idx)
    # Plot MDS in blue
    plt.scatter(
        mds_embedding_normalized[:, 0],
        mds_embedding_normalized[:, 1],
        c="blue",
        s=10,
        alpha=0.5,
        label="MDS"
    )
    # Plot UMAP in red
    plt.scatter(
        umap_embeddings_normalized[n_neighbors][:, 0],
        umap_embeddings_normalized[n_neighbors][:, 1],
        c="red",
        s=10,
        alpha=0.5,
        marker="x",
        label=f"UMAP (n_neighbors={n_neighbors})"
    )
    plt.title(f"MDS vs UMAP Overlay (n_neighbors={n_neighbors})")
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
def calculate_neighborhood_consistency(mds_embedding, umap_embedding, k=10):
    """
    Calculate neighborhood consistency between MDS and UMAP embeddings.

    For every point, the k nearest neighbors are found in each embedding and
    the fraction of shared neighbors (overlap / k) is averaged over all points.

    Parameters:
        mds_embedding (np.ndarray): The MDS embedding of shape (n_samples, 2).
        umap_embedding (np.ndarray): The UMAP embedding of shape (n_samples, 2).
        k (int): Number of neighbors to consider (must be >= 1).
    Returns:
        avg_consistency (float): Average neighborhood consistency in [0, 1].
    Raises:
        ValueError: If k < 1 or the embeddings have different sample counts.
    """
    # Local import so the function does not depend on which notebook cell
    # happened to import a distance helper first.
    from scipy.spatial.distance import cdist

    mds_embedding = np.asarray(mds_embedding)
    umap_embedding = np.asarray(umap_embedding)
    n_samples = mds_embedding.shape[0]
    if umap_embedding.shape[0] != n_samples:
        raise ValueError(
            "Embeddings must contain the same number of samples, got "
            f"{n_samples} and {umap_embedding.shape[0]}"
        )
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    # A point has at most n_samples - 1 neighbors other than itself.
    n_keep = min(k, max(n_samples - 1, 0))

    def _knn_indices(points):
        """Return the indices of the n_keep nearest other points for each row."""
        distances = cdist(points, points, metric="euclidean")
        # Exclude self explicitly. Dropping column 0 of the argsort is wrong
        # when duplicate points exist, because self is then not guaranteed to
        # sort first and a point could be counted as its own neighbor.
        np.fill_diagonal(distances, np.inf)
        # Stable sort makes tie-breaking deterministic (lowest index first).
        return np.argsort(distances, axis=1, kind="stable")[:, :n_keep]

    mds_neighbors = _knn_indices(mds_embedding)
    umap_neighbors = _knn_indices(umap_embedding)

    # Fraction of shared neighbors for each point
    overlap_ratios = [
        len(set(mds_row).intersection(umap_row)) / k
        for mds_row, umap_row in zip(mds_neighbors, umap_neighbors)
    ]

    # Compute average consistency
    avg_consistency = np.mean(overlap_ratios)
    return avg_consistency

# Calculate neighborhood consistency for different n_neighbors in UMAP
# Result: {n_neighbors: average fraction of shared k-nearest neighbors}.
neighborhood_consistency_results = {}
k_neighbors = 10  # Number of neighbors to consider
for n_neighbors, umap_embedding in umap_projections_downsampled.items():
    consistency = calculate_neighborhood_consistency(mds_embedding, umap_embedding, k=k_neighbors)
    neighborhood_consistency_results[n_neighbors] = consistency

# Print results
print("Neighborhood Consistency between MDS and UMAP:")
for n_neighbors, consistency in neighborhood_consistency_results.items():
    print(f"n_neighbors={n_neighbors}: {consistency:.4f}")

-------------

### Sammon's stress and Gaussian

In [10]:
from sklearn.metrics import pairwise_distances
import numpy as np
from sklearn.manifold import MDS
from sklearn.utils import resample
from scipy.stats import t
import matplotlib.pyplot as plt

In [None]:
## Same analysis but with downsampling to 25% and 10 runs instead of all 35

# Step 1: Downsample the Dataset Consistently
def downsample_mnist_consistent(x_data, y_labels, sample_fraction=0.25, random_state=None):
    """
    Downsample the dataset consistently, returning indices to ensure
    the same points are selected in both spaces.

    Sampling is stratified: the same fraction is drawn, without replacement,
    from the indices of every distinct label.

    Parameters:
        x_data: Unused; kept so existing calls keep working.
        y_labels (array-like): Label of every sample.
        sample_fraction (float): Fraction of each label's samples to keep.
        random_state (int, RandomState instance or None): Forwarded to
            ``resample``. Pass an int to make the selection reproducible;
            the default None keeps the previous unseeded behavior.
    Returns:
        np.ndarray: Selected sample indices, grouped by label.
    """
    y_labels = np.asarray(y_labels)
    sampled_indices = []
    for label in np.unique(y_labels):
        # Select indices for the current label
        label_indices = np.where(y_labels == label)[0]
        # Sample a fraction of points for this label
        sampled_indices_label = resample(
            label_indices,
            n_samples=int(len(label_indices) * sample_fraction),
            replace=False,
            random_state=random_state,
        )
        sampled_indices.extend(sampled_indices_label)
    return np.array(sampled_indices)

In [None]:
# Sample the dataset
# The sampling is random, so re-running this cell picks a different subset;
# everything derived below must then be recomputed together.
sampled_indices = downsample_mnist_consistent(x_train_flattened, y_train, sample_fraction=0.25)
x_sampled = x_train_flattened[sampled_indices]  # Original high-dimensional space
y_sampled = y_train[sampled_indices]            # Corresponding labels

# Precompute pairwise distances for the original high-dimensional data
# This is a dense (n_sampled x n_sampled) matrix.
original_distances = pairwise_distances(x_sampled, metric='euclidean')

# Load the UMAP projections for 10 runs
# The arrays are indexed as [run, sample, component]; the same sampled_indices
# are applied so rows line up with original_distances.
umap_projections_dict = {
    10: umap_projections_10_01_35[:10, sampled_indices, :],  # First 10 runs
    50: umap_projections_50_01_35[:10, sampled_indices, :],
    100: umap_projections_100_01_35[:10, sampled_indices, :]
}

In [None]:
# Step 2: Calculate Sammon's Stress for Each Run
# Result: {n_neighbors: [(run_number, stress), ...]} with run numbers from 1.
# sammons_stress is not defined in the visible cells (only a commented-out
# copy is) - presumably defined earlier in the notebook.
# NOTE: embedding_distances stays defined after the loop (last run of the last
# n_neighbors value) and is read by later debugging cells.
sammon_results = {}
for n_neighbors, projections in umap_projections_dict.items():
    stress_values = []
    for run_number, run_projection in enumerate(projections, start=1):
        embedding_distances = pairwise_distances(run_projection, metric='euclidean')
        stress = sammons_stress(original_distances, embedding_distances)
        stress_values.append((run_number, stress))
    sammon_results[n_neighbors] = stress_values

In [None]:
from scipy.stats import norm

# Summarize the per-run Sammon's stress for every n_neighbors value:
# mean, standard deviation across runs and a Gaussian 95% confidence interval
# for the mean.
final_results = {}
run_variability = {}  # To store standard deviation across runs

# z-score for a two-sided 95% confidence interval; loop-invariant.
z_score = norm.ppf(0.975)

for n_neighbors, stress_values in sammon_results.items():
    run_stress_values = [stress for _, stress in stress_values]
    mean_stress = np.mean(run_stress_values)
    # Sample standard deviation (ddof=1): the runs are a small sample, and the
    # population formula (ddof=0) underestimates the spread and therefore the
    # standard error. With a single run ddof=1 is undefined, so fall back to 0.
    ddof = 1 if len(run_stress_values) > 1 else 0
    std_stress = np.std(run_stress_values, ddof=ddof)  # Standard deviation across runs

    # Calculate confidence interval using Gaussian assumptions
    margin_of_error = z_score * (std_stress / np.sqrt(len(run_stress_values)))
    confidence_interval = (
        mean_stress - margin_of_error,
        mean_stress + margin_of_error
    )

    # Store final results
    final_results[n_neighbors] = {
        "mean": mean_stress,
        "std": std_stress,
        "95% CI": confidence_interval,
        "run_values": run_stress_values  # Per-run stress values for visualization
    }
    # Store standard deviation for run-to-run variability
    run_variability[n_neighbors] = std_stress

# Print Results with Run Variability
print("Sammon's Stress Results (10 Runs, Gaussian):")
for n_neighbors, stats in final_results.items():
    print(f"n_neighbors={n_neighbors}:")
    print(f"  Mean Stress: {stats['mean']:.4f}")
    print(f"  Standard Deviation Across Runs: {stats['std']:.4f}")
    print(f"  95% Confidence Interval: {stats['95% CI']}")
    for run_idx, stress in enumerate(stats['run_values'], start=1):
        print(f"    Run {run_idx}: Stress={stress:.4f}")


In [17]:
# # Step 1: Downsample the Dataset Consistently
# def downsample_mnist_consistent(x_data, y_labels, sample_fraction=0.15):
#     """
#     Downsample the dataset consistently, returning indices to ensure
#     the same points are selected in both spaces.
#     """
#     sampled_indices = []
#     unique_labels = np.unique(y_labels)
#     for label in unique_labels:
#         # Select indices for the current label
#         label_indices = np.where(y_labels == label)[0]
#         # Sample a fraction of points for this label
#         sampled_indices_label = resample(
#             label_indices, n_samples=int(len(label_indices) * sample_fraction), replace=False
#         )
#         sampled_indices.extend(sampled_indices_label)
#     return np.array(sampled_indices)

In [18]:
# # Step 2: Calculate Sammon's Stress
# def sammons_stress(original_distances, embedding_distances):
#     """
#     Calculate Sammon's stress/error with normalization.
#     """
#     epsilon = 1e-9  # Avoid division by zero
#     original_distances = np.maximum(original_distances, epsilon)
#     normalization = np.sum(original_distances)  # Sum of all original distances
#     stress = np.sum(((original_distances - embedding_distances) ** 2) / original_distances)
#     return stress / normalization  # Normalize by the total sum of original distances

# # Sample the dataset
# sampled_indices = downsample_mnist_consistent(x_train_flattened, y_train, sample_fraction=0.15)
# x_sampled = x_train_flattened[sampled_indices]  # Original high-dimensional space
# y_sampled = y_train[sampled_indices]            # Corresponding labels

# # Precompute pairwise distances for the original high-dimensional data
# original_distances = pairwise_distances(x_sampled, metric='euclidean')

# # Load the UMAP projections for multiple runs
# umap_projections_dict = {
#     10: umap_projections_10_01_35[:, sampled_indices, :],
#     50: umap_projections_50_01_35[:, sampled_indices, :],
#     100: umap_projections_100_01_35[:, sampled_indices, :]
# }

In [19]:
# # Step 3: Calculate Sammon's Stress for Each Run and print the run number
# sammon_results = {}
# for n_neighbors, projections in umap_projections_dict.items():
#     stress_values = []
#     for run_number, run_projection in enumerate(projections, start=1):
#         embedding_distances = pairwise_distances(run_projection, metric='euclidean')
#         stress = sammons_stress(original_distances, embedding_distances)
#         stress_values.append((run_number, stress))
#     sammon_results[n_neighbors] = stress_values

In [26]:
# # Step 4: Compute Mean, Standard Deviation, and Confidence Intervals
# final_results = {}
# for n_neighbors, stress_values in sammon_results.items():
#     mean_stress = np.mean([stress for _, stress in stress_values])
#     std_stress = np.std([stress for _, stress in stress_values])
#     confidence_interval = t.interval(
#         0.95, len(stress_values) - 1,
#         loc=mean_stress, scale=std_stress / np.sqrt(len(stress_values))
#     )
#     final_results[n_neighbors] = {
#         "mean": mean_stress,
#         "std": std_stress,
#         "95% CI": confidence_interval
#     }

# # Print Results with run number
# print("Sammon's Stress Results:")
# for n_neighbors, stats in final_results.items():
#     print(f"n_neighbors={n_neighbors}: Mean={stats['mean']:.4f}, Std={stats['std']:.4f}, 95% CI={stats['95% CI']}")
#     for run_number, stress in sammon_results[n_neighbors]:
#         print(f"Run {run_number}: Stress={stress:.4f}")

In [27]:
# # Print sampled indices for each run
# for run_number in range(35):
#     sampled_indices = downsample_mnist_consistent(x_train_flattened, y_train, sample_fraction=0.1)
#     print(f"Run {run_number + 1}: Sampled Indices = {sampled_indices}")

In [28]:
# # Visualize embeddings for different runs
# for n_neighbors, projections in umap_projections_dict.items():
#     plt.figure(figsize=(10, 6))
#     for run_number, run_projection in enumerate(projections, start=1):
#         plt.scatter(run_projection[:, 0], run_projection[:, 1], label=f'Run {run_number}', alpha=0.5)
#     plt.title(f'UMAP Projections for n_neighbors={n_neighbors}')
#     plt.legend()
#     plt.show()

In [None]:
# Quick bar chart of the run-to-run standard deviation per n_neighbors value
# (no labels, title or plt.show() here; a fully labelled version appears in a later cell).
plt.bar(list(run_variability.keys()), list(run_variability.values()), color='skyblue')
plt.xticks(list(run_variability.keys()))


In [None]:
# Visualization: Plot Stress Values for Each Run
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
for n_neighbors, stats in final_results.items():
    run_stress_values = stats.get('run_values', [])
    plt.plot(
        range(1, len(run_stress_values) + 1), run_stress_values,
        label=f'n_neighbors={n_neighbors}', marker='o'
    )

plt.xlabel("Run Index")
plt.ylabel("Sammon's Stress")
plt.title("Sammon's Stress Across Runs for Different n_neighbors")
plt.legend()
plt.grid(True)
plt.show()

# Visualization: Bar Chart for Standard Deviation Across Runs
plt.figure(figsize=(8, 5))
plt.bar(list(run_variability.keys()), list(run_variability.values()), color='skyblue')
plt.xticks(list(run_variability.keys()))
plt.xlabel("n_neighbors")
plt.ylabel("Standard Deviation of Sammon's Stress")
plt.title("Run-to-Run Variability (Std. Dev.) of Sammon's Stress")
plt.grid(axis='y')
plt.show()

In [29]:
# #### Using Gaussian distribution ####
# from scipy.stats import norm

# # Step 4: Compute Mean, Standard Deviation, and Confidence Intervals using Gaussian metric
# final_results = {}
# for n_neighbors, stress_values in sammon_results.items():
#     mean_stress = np.mean([stress for _, stress in stress_values])  # Mean of stress values
#     std_stress = np.std([stress for _, stress in stress_values])   # Standard deviation of stress values

#     # Calculate 95% confidence interval using Gaussian assumptions
#     z_score = norm.ppf(0.975)  # z-score for 95% confidence interval (two-tailed)
#     margin_of_error = z_score * (std_stress / np.sqrt(len(stress_values)))
#     confidence_interval = (
#         mean_stress - margin_of_error,
#         mean_stress + margin_of_error
#     )

#     final_results[n_neighbors] = {
#         "mean": mean_stress,
#         "std": std_stress,
#         "95% CI": confidence_interval
#     }

# # Print Results with run number
# print("Sammon's Stress Results (Gaussian):")
# for n_neighbors, stats in final_results.items():
#     print(f"n_neighbors={n_neighbors}: Mean={stats['mean']:.4f}, Std={stats['std']:.4f}, 95% CI={stats['95% CI']}")
#     for run_number, stress in sammon_results[n_neighbors]:
#         print(f"Run {run_number}: Stress={stress:.4f}")

In [30]:
# # Step 3: Compute Mean, Standard Deviation, and Confidence Intervals Using Gaussian Metric
# from scipy.stats import norm

# final_results = {}
# for n_neighbors, stress_values in sammon_results.items():
#     mean_stress = np.mean([stress for _, stress in stress_values])  # Mean of stress values
#     std_stress = np.std([stress for _, stress in stress_values])   # Standard deviation of stress values

#     # Calculate 95% confidence interval using Gaussian assumptions
#     z_score = norm.ppf(0.975)  # z-score for 95% confidence interval (two-tailed)
#     margin_of_error = z_score * (std_stress / np.sqrt(len(stress_values)))
#     confidence_interval = (
#         mean_stress - margin_of_error,
#         mean_stress + margin_of_error
#     )

#     final_results[n_neighbors] = {
#         "mean": mean_stress,
#         "std": std_stress,
#         "95% CI": confidence_interval
#     }

# # Print Results
# print("Sammon's Stress Results (10 Runs, Gaussian):")
# for n_neighbors, stats in final_results.items():
#     print(f"n_neighbors={n_neighbors}: Mean={stats['mean']:.4f}, Std={stats['std']:.4f}, 95% CI={stats['95% CI']}")
#     for run_number, stress in sammon_results[n_neighbors]:
#         print(f"Run {run_number}: Stress={stress:.4f}")

### Sammon's stress and Gaussian for T-SNE

In [37]:
# Step 3: Calculate Sammon's Stress for t-SNE
# Downsample t-SNE embedding
# The same sampled_indices as for original_distances are used so that the two
# distance matrices describe the same points in the same order.
tsne_embedding_downsampled = x_train_tsne_c2[sampled_indices]  # Downsampled t-SNE embedding

# Compute pairwise distances for t-SNE embedding
tsne_distances = pairwise_distances(tsne_embedding_downsampled, metric='euclidean')

# Calculate Sammon's Stress
# (norm is imported here but not used in this cell.)
from scipy.stats import norm

tsne_stress = sammons_stress(original_distances, tsne_distances)

In [None]:
# Step 4: Report Results
# Since t-SNE is run only once, we use the stress value directly
# (no mean, standard deviation or confidence interval across runs).
print("Sammon's Stress for t-SNE:")
print(f"Stress: {tsne_stress:.4f}")

In [None]:
# Step 5: Visualize t-SNE Embedding (Optional)
import matplotlib.pyplot as plt

# Scatter of the downsampled t-SNE embedding, coloured by y_sampled.
plt.figure(figsize=(8, 6))
plt.scatter(
    tsne_embedding_downsampled[:, 0], tsne_embedding_downsampled[:, 1],
    c=y_sampled, cmap='Spectral', s=10, alpha=0.7
)
plt.colorbar(label='Digit Labels')
# NOTE(review): the title hard-codes "25%"; keep it in sync with the
# sample_fraction used when sampled_indices was drawn.
plt.title("t-SNE Embedding with Downsampling (25% of MNIST)")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.grid(True)
plt.show()

In [None]:
# Step 3: Calculate Sammon's Stress for PCA
# Downsample PCA embedding (same sampled_indices as original_distances)
pca_embedding_downsampled = x_train_pca_c2[sampled_indices]  # Downsampled pca embedding

# Compute pairwise distances for PCA embedding
pca_distances = pairwise_distances(pca_embedding_downsampled, metric='euclidean')

# Calculate Sammon's Stress
# (norm is imported here but not used in this cell.)
from scipy.stats import norm

# debug_sammons_stress also prints the numerator and the normalization term.
# NOTE: it is defined in a later cell, so that cell must be run first.
pca_stress = debug_sammons_stress (original_distances, pca_distances)

In [None]:
# Step 4: Report Results
# PCA is computed only once, so the stress value is reported directly
print("Sammon's Stress for PCA:")
print(f"Stress: {pca_stress:.4f}")

In [44]:
def debug_sammons_stress(original_distances, embedding_distances):
    """Compute normalized Sammon's stress and print its two components.

    Original distances are floored at 1e-9 so that zero entries (for example
    the diagonal) do not cause a division by zero.

    Parameters:
        original_distances: Pairwise distances in the original space.
        embedding_distances: Pairwise distances in the embedding, same shape.
    Returns:
        The stress numerator divided by the sum of the floored original distances.
    """
    floor = 1e-9
    reference = np.maximum(original_distances, floor)
    residual = reference - embedding_distances
    weighted_squared_error = (residual ** 2) / reference
    stress_numerator = np.sum(weighted_squared_error)
    normalization = np.sum(reference)
    print(f"Numerator: {stress_numerator}, Normalization: {normalization}")
    return stress_numerator / normalization

In [None]:
# Top-left 5x5 corner of the embedding distance matrix, for eyeballing values.
# NOTE(review): the inline comment says t-SNE, but in the visible cells
# `embedding_distances` is only assigned inside the UMAP stress loop, and the
# t-SNE distances are stored in `tsne_distances` - confirm which was intended.
embedding_distances_sample = embedding_distances[:5, :5]  # Example for t-SNE
print("Embedding Distances Matrix (Sample):")
print(embedding_distances_sample)


In [47]:
# Rescale the embedding distances so that their mean equals the mean of the
# original distances (a single global factor applied to every entry).
# NOTE(review): embedding_distances_rescaled is not used in the visible cells.
scaling_factor = np.mean(original_distances) / np.mean(embedding_distances)
embedding_distances_rescaled = embedding_distances * scaling_factor

### Adaptable radius: min(mean intracluster distance, half of the minimum inter-cluster distance). Mix of local and global details

In [None]:
# The radius is adapted per cluster: within each run, a cluster's radius is the smaller of
# (a) half of the minimum inter-centroid distance in that run and (b) the cluster's mean point-to-centroid distance.
# These per-run radii are averaged across runs, and the averaged radius is then used to count the points around each cluster centroid.

from scipy.spatial.distance import cdist
import numpy as np

# Function to calculate combined cluster metrics
def calculate_combined_cluster_metrics(umap_projections, y_labels, n_clusters=10, n_runs=35):
    """Compute per-cluster radii, neighbor counts and inter-cluster separation.

    A "cluster" is the set of points sharing one label. For every run:
      * the cluster center is the mean of the cluster's points;
      * the cluster radius is min(half of the smallest inter-center distance
        in that run, mean distance of the cluster's points to its center).
    Radii are averaged across runs. The averaged radius is then used to count,
    per run, how many points (of any label) lie within it around each center.

    Parameters:
        umap_projections: Iterable of runs, each an array (n_samples, n_dims).
        y_labels (array-like): Label of every sample; labels may be arbitrary
            values (they need not be 0..n-1).
        n_clusters (int): Unused; kept for backward compatibility. The clusters
            are taken from the distinct values in ``y_labels``.
        n_runs (int): Unused; kept for backward compatibility. The runs are
            taken from ``umap_projections``.
    Returns:
        tuple: (radii_per_cluster_mean, average_neighbor_counts,
        average_intercluster_separation). The first two are arrays with one
        entry per distinct label in sorted label order; the last is the
        minimum inter-center distance averaged over runs.
    """
    y_labels = np.asarray(y_labels)
    # One boolean mask per distinct label, in sorted label order. Selecting
    # points through these masks everywhere keeps centers and radii tied to
    # the same cluster even when labels are not the integers 0..n-1.
    label_masks = [y_labels == label for label in np.unique(y_labels)]

    cluster_centers_full = []
    radii_per_cluster = []
    separation_metrics = []
    for x_umap in umap_projections:
        x_umap = np.asarray(x_umap)

        # Step 1: cluster centers for this run
        cluster_centers = np.array([x_umap[mask].mean(axis=0) for mask in label_masks])
        cluster_centers_full.append(cluster_centers)

        # Step 2: smallest distance between two different centers; computed
        # once and reused for both the radius and the separation metric
        intercluster_distances = cdist(cluster_centers, cluster_centers, metric='euclidean')
        np.fill_diagonal(intercluster_distances, np.inf)  # Ignore self-distances
        min_separation = np.min(intercluster_distances)
        separation_metrics.append(min_separation)
        half_min_separation = min_separation / 2

        # Combine the global (inter-cluster) and local (intra-cluster) scale
        radii_per_cluster.append([
            min(
                half_min_separation,
                np.mean(np.linalg.norm(x_umap[mask] - center, axis=1)),
            )
            for mask, center in zip(label_masks, cluster_centers)
        ])

    # Average radius across runs for each cluster
    radii_per_cluster_mean = np.mean(radii_per_cluster, axis=0)

    # Step 3: count points within the averaged radius of each center
    neighbor_counts_full = np.array([
        [
            np.sum(np.linalg.norm(np.asarray(x_umap) - center, axis=1) <= radius)
            for center, radius in zip(cluster_centers, radii_per_cluster_mean)
        ]
        for x_umap, cluster_centers in zip(umap_projections, cluster_centers_full)
    ])  # Shape: (n_runs, n_clusters)
    average_neighbor_counts = np.mean(neighbor_counts_full, axis=0)  # Average across runs

    # Step 4: average of the per-run minimum inter-center distance
    average_intercluster_separation = np.mean(separation_metrics)

    return radii_per_cluster_mean, average_neighbor_counts, average_intercluster_separation

# Define n_neighbors values
n_neighbors_values = [10, 50, 100]

# Placeholder for results
# One dict (one future DataFrame row) per (n_neighbors, cluster) pair.
results = []

# Iterate over each n_neighbors value
for n_neighbors in n_neighbors_values:
    # Pick the stack of UMAP runs that was generated with this n_neighbors.
    if n_neighbors == 10:
        umap_projections = umap_projections_10_01_35
    elif n_neighbors == 50:
        umap_projections = umap_projections_50_01_35
    elif n_neighbors == 100:
        umap_projections = umap_projections_100_01_35

    # Calculate combined metrics
    radii_per_cluster, average_neighbor_counts, avg_intercluster_sep = calculate_combined_cluster_metrics(
        umap_projections, y_train
    )

    # Store results
    # The inter-cluster separation is a single value per n_neighbors, so it is
    # repeated on every cluster row.
    for cluster_idx in range(len(radii_per_cluster)):
        results.append({
            "N": n_neighbors,
            "Cluster": cluster_idx,
            "Radius": np.round(radii_per_cluster[cluster_idx], 3),
            "Number of Neighbors": np.round(average_neighbor_counts[cluster_idx], 0),
            "Intercluster Separation": np.round(avg_intercluster_sep, 3)
        })

# Create a DataFrame for easier analysis
df_results = pd.DataFrame(results)

# Save results
df_results.to_csv("combined_radius_neighbor_analysis.csv", index=False)

# Pivot table for easy comparison
# Rows: cluster; columns: (metric, n_neighbors).
pivot_table = df_results.pivot(index="Cluster", columns="N", values=["Radius", "Number of Neighbors", "Intercluster Separation"])
print(pivot_table)

### n=5, min_dist=0.0125

In [14]:
# Load the saved UMAP runs (n_neighbors=5, min_dist=0.0125, 35 runs) and their
# per-point mean / standard deviation across runs.
# NOTE: the mean and std are bound to names containing "umap"
# (mean_umap_projection_*, std_projection_umap_*), which differ from the
# variable names used by the cell that generates and saves these files.
umap_projections_5_00125_35= np.load('umap_projections_5_00125_35.npy')
mean_umap_projection_5_00125_35= np.load('mean_projection_5_00125_35.npy')
std_projection_umap_5_00125_35= np.load('std_projection_5_00125_35.npy')

In [None]:
# Generate, summarize and save repeated UMAP runs for n_neighbors=5, min_dist=0.0125.
# Define parameters
n_neighbors = 5
min_dist = 0.0125
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_5_00125_35 = []

# Run UMAP multiple times
for run in range(n_runs):
    # Create UMAP model
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components,random_state=None)  # Allow randomness

    # Fit and transform the data
    projection = umap_model.fit_transform(x_train_flattened)

    # Store the projection
    umap_projections_5_00125_35.append(projection)

# Convert the list of projections to a numpy array
umap_projections_5_00125_35 = np.array(umap_projections_5_00125_35)  # indexed [run, sample, component]

# Calculate mean and standard deviation of projections across runs
mean_projection_5_00125_35 = np.mean(umap_projections_5_00125_35, axis=0)
std_projection_5_00125_35 = np.std(umap_projections_5_00125_35, axis=0)

# The analysis cells read the mean/std through the names bound by the np.load
# cell. Rebind those names too, so freshly generated projections are never
# paired with a stale mean/std loaded from an older file.
mean_umap_projection_5_00125_35 = mean_projection_5_00125_35
std_projection_umap_5_00125_35 = std_projection_5_00125_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_5_00125_35.npy', umap_projections_5_00125_35)
np.save('mean_projection_5_00125_35.npy', mean_projection_5_00125_35)
np.save('std_projection_5_00125_35.npy', std_projection_5_00125_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_5_00125_35'.")


In [None]:
### USE THIS ###

# Instead of applying thresholds on a per-point basis.
# Calculate the average distance for all points in a run and compare it against an aggregated threshold (e.g., mean or percentile of all average distances across runs).

# Calculate distances from each point to the mean projection
# [None, :, :] adds a leading run axis so the mean broadcasts against every run.
# NOTE(review): the runs are compared point-wise without any alignment step in
# this cell - confirm that is intended.
distances_to_mean_5_00125_35  = np.sqrt(np.sum((umap_projections_5_00125_35 - mean_umap_projection_5_00125_35[None, :, :])**2, axis=2))  # Shape: (n_runs, n_samples)

# Calculate average distance per run
average_distances_per_run = np.mean(distances_to_mean_5_00125_35 , axis=1)  # Shape: (n_runs,)

# Define a threshold based on the 90th percentile of the average distances
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Identify runs that pass the threshold
# Runs whose average distance exceeds the 90th percentile are dropped, so
# valid_runs (0-based run indices) is a subset of all runs.
valid_runs = [run for run, avg_dist in enumerate(average_distances_per_run)
              if avg_dist <= average_distance_threshold]

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Placeholder for cluster centroids (center of cluster i for each valid run)
cluster_centroids_per_run = []

# Iterate over valid runs to calculate centroids for each cluster
for run in valid_runs:
    # Extract the UMAP projections for this run
    projections = umap_projections_5_00125_35 [run]

    # Calculate centroids for each cluster (digits 0-9)
    centroids = []
    for cluster_label in range(10):  # Assuming 10 clusters (digits 0-9)
        cluster_points = projections[y_train == cluster_label]  # Points in this cluster
        centroid = np.mean(cluster_points, axis=0)  # Calculate the centroid
        centroids.append(centroid)
    
    cluster_centroids_per_run.append(np.array(centroids))  # Store centroids for this run

# Calculate pairwise distances between centroids for each run
distance_matrices_5_00125_35  = []
for centroids in cluster_centroids_per_run:
    # Calculate the pairwise Euclidean distance between centroids for this run
    distance_matrix = cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    distance_matrices_5_00125_35 .append(distance_matrix)

# Convert the list of distance matrices to a NumPy array
distance_matrices_5_00125_35  = np.array(distance_matrices_5_00125_35 )  # Shape: (n_valid_runs, 10, 10)

# Calculate the mean distance matrix across all valid runs
mean_distance_matrix_5_00125_35  = np.mean(distance_matrices_5_00125_35 , axis=0)  # Shape: (10, 10)

# Normalize the mean distance matrix
# Min-max over the whole matrix; the diagonal (self-distances) is 0, so the
# minimum is 0 and this amounts to dividing by the largest mean distance.
normalized_mean_distance_matrix_5_00125_35  = (mean_distance_matrix_5_00125_35  - np.min(mean_distance_matrix_5_00125_35 )) / (np.max(mean_distance_matrix_5_00125_35 ) - np.min(mean_distance_matrix_5_00125_35 ))

# Plot the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_5_00125_35 , annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5, , min_dists=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
# NOTE(review): both file names contain a space before ".npy"; any code that
# loads these files must use exactly the same names - confirm this is intended.
np.save('distance_matrices_neighbors_5_00125_35 .npy', distance_matrices_5_00125_35)
np.save('mean_distance_matrix_neighbors_5_00125_35 .npy', mean_distance_matrix_5_00125_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_5_00125_35 }")

In [None]:
# Create a graph from the distance matrix
# Nodes are the clusters; edge weights are the normalized mean centroid
# distances rounded to 3 decimals.
G_5_00125_35  = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_5_00125_35 ,3))
# NOTE(review): np.save is meant for arrays; verify that a networkx Graph
# written this way can actually be loaded back (pickle or a networkx writer
# may be more appropriate). The file name also contains a space before ".npy".
np.save('G_5_00125_35 .npy',G_5_00125_35 )

# Draw the graph
# A fixed seed keeps the spring layout reproducible between executions.
pos = nx.spring_layout(G_5_00125_35 , seed=42)  # positions for all nodes
nx.draw(G_5_00125_35 , pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_5_00125_35 , 'weight')
nx.draw_networkx_edge_labels(G_5_00125_35 , pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_5_00125_35  = nx.minimum_spanning_tree(G_5_00125_35 )
# NOTE(review): np.save is meant for arrays; verify that the tree (a networkx
# graph) can be loaded back from this file. The file name contains a space
# before ".npy".
np.save('mst_5_00125_35 .npy', mst_5_00125_35 )

# Define positions for all nodes
pos = nx.spring_layout(mst_5_00125_35 , seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_5_00125_35 , pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_5_00125_35 , 'weight')
nx.draw_networkx_edge_labels(mst_5_00125_35 , pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=5, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Calculate the standard deviation for each pair of clusters across all runs
# axis=0 is the run axis of distance_matrices_5_00125_35, which holds only the
# valid runs. np.std is called with its default ddof.
distance_matrix_std_5_00125_35  = np.std(distance_matrices_5_00125_35 , axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
# NOTE(review): the file name contains a space before ".npy".
np.save("distance_matrix_std_5_00125_35 .npy", distance_matrix_std_5_00125_35 )

# Output the results
print("Standard Deviation Distance Matrix (5_00125_35):\n", distance_matrix_std_5_00125_35 )


In [None]:
# Gaussian 95% confidence interval for every entry of the mean centroid
# distance matrix (n_neighbors=5, min_dist=0.0125).
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Critical value for the normal distribution
n_runs = 35  # Number of runs that were generated
# The mean and standard deviation matrices were computed over the valid runs
# only (runs above the distance threshold were dropped), so the standard error
# must use that count rather than the total number of generated runs.
n_valid_runs = distance_matrices_5_00125_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_5_00125_35 = distance_matrix_std_5_00125_35 / np.sqrt(n_valid_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_5_00125_35 = z_score * sem_matrix_5_00125_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_5_00125_35 = mean_distance_matrix_5_00125_35 - margin_of_error_matrix_5_00125_35
upper_limit_intconf_matrix_5_00125_35 = mean_distance_matrix_5_00125_35 + margin_of_error_matrix_5_00125_35

# Ensure no negative values in the lower limit matrix (optional);
# distances cannot be negative.
lower_limit_intconf_matrix_5_00125_35 = np.maximum(lower_limit_intconf_matrix_5_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_5_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_5_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_5_00125_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_5_00125_35 .npy', lower_limit_intconf_matrix_5_00125_35)
np.save('upper_limit_intconf_matrix_5_00125_35 .npy', upper_limit_intconf_matrix_5_00125_35)

In [77]:
def normalize_matrix(matrix):
    """Min-max normalize a matrix to [0, 1] using its global min and max.

    Parameters:
        matrix (array-like): Values to normalize.
    Returns:
        np.ndarray: Array of the same shape scaled to [0, 1]. A constant
        matrix (max == min) is returned as all zeros instead of NaN.
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Every entry is identical; dividing would give 0/0 = NaN.
        return np.zeros_like(matrix, dtype=float)
    return (matrix - lowest) / value_range

# Normalize the lower and upper confidence-limit matrices. Each matrix is
# scaled with its own min and max, so the two are not on a common scale.
norm_lower_limit_intconf_matrix_5_00125_35  = normalize_matrix(lower_limit_intconf_matrix_5_00125_35 )
norm_upper_limit_intconf_matrix_5_00125_35  = normalize_matrix(upper_limit_intconf_matrix_5_00125_35 )
# Save the normalized matrices for later use.
np.save('norm_lower_limit_intconf_matrix_5_00125_35.npy', norm_lower_limit_intconf_matrix_5_00125_35 )
np.save('norm_upper_limit_intconf_matrix_5_00125_35.npy', norm_upper_limit_intconf_matrix_5_00125_35 )

In [None]:
# Set up the figure with three subplots
# Left to right: lower confidence limit, mean, upper confidence limit.
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# Plot each normalized matrix as a heatmap
sns.heatmap(norm_lower_limit_intconf_matrix_5_00125_35 , annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=axes[0])
axes[0].set_title("Normalized Lower bound Dist. Matrix - n_neighbors=5, min_dist=0.0125")
axes[0].set_xlabel("Cluster")
axes[0].set_ylabel("")

sns.heatmap(normalized_mean_distance_matrix_5_00125_35 , annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=axes[1])
axes[1].set_title("Normalized Mean Dist. Matrix - n_neighbors=5, min_dist=0.0125")
axes[1].set_xlabel("Cluster")
axes[1].set_ylabel("Cluster")

sns.heatmap(norm_upper_limit_intconf_matrix_5_00125_35 , annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=axes[2])
axes[2].set_title("Normalized Upper bound Dist. Matrix - n_neighbors=5, min_dist=0.0125")
axes[2].set_xlabel("Cluster")
axes[2].set_ylabel("")

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    The matrix is rounded to three decimals, converted to a weighted graph,
    and only the edges of that graph's minimum spanning tree are drawn, each
    labelled with its weight.

    Parameters
    ----------
    matrix : square array of pairwise distances.
    title : text placed above the axes.
    ax : matplotlib axes to draw on.
    color : edge colour of the tree (default ``'red'``).
    """
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed so the node layout is the same on every call
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree, layout, ax=ax,
        with_labels=True, node_color='lightblue', edge_color=color,
        node_size=500, font_size=10, width=2,
    )
    nx.draw_networkx_edge_labels(
        tree, layout, ax=ax,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8, label_pos=0.3,
    )

    ax.set_title(title)

# Three side-by-side panels, one spanning tree per matrix
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# (matrix, title, column index, edge colour); drawn in this order:
# mean in the middle, lower limit on the left, upper limit on the right
_mst_panels = (
    (normalized_mean_distance_matrix_5_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=5, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_5_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=5, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_5_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=5, min_dist=0.0125", 2, 'green'),
)
for _matrix, _title, _column, _color in _mst_panels:
    plot_mst(_matrix, _title, axes[_column], color=_color)

# Tighten spacing between panels before displaying
plt.tight_layout()
plt.show()

### n=10, min_dist=0.0125

In [15]:
# Reload the cached UMAP runs and the stored mean / std projection arrays
(
    umap_projections_10_00125_35,
    mean_umap_projection_10_00125_35,
    std_projection_umap_10_00125_35,
) = map(np.load, (
    'umap_projections_10_00125_35.npy',
    'mean_projection_10_00125_35.npy',
    'std_projection_10_00125_35.npy',
))

In [None]:
# Define parameters
n_neighbors = 10
min_dist = 0.0125
n_components = 2
n_runs = 35  # Number of runs

# Collect one projection per run. random_state=None leaves UMAP unseeded,
# so the individual runs are allowed to differ.
umap_projections_10_00125_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_10_00125_35.append(projection)

# Stack the runs along a new leading axis
umap_projections_10_00125_35 = np.array(umap_projections_10_00125_35)

# Mean and standard deviation of the projections across runs (axis 0)
mean_projection_10_00125_35 = np.mean(umap_projections_10_00125_35, axis=0)
std_projection_10_00125_35 = np.std(umap_projections_10_00125_35, axis=0)

# The np.load cell assigns these statistics to `mean_umap_projection_*` /
# `std_projection_umap_*`, and the distance-to-mean cell reads the former.
# Define the same names here so this cell can be used instead of the load cell.
mean_umap_projection_10_00125_35 = mean_projection_10_00125_35
std_projection_umap_10_00125_35 = std_projection_10_00125_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_10_00125_35.npy', umap_projections_10_00125_35)
np.save('mean_projection_10_00125_35.npy', mean_projection_10_00125_35)
np.save('std_projection_10_00125_35.npy', std_projection_10_00125_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_10_00125_35'.")


In [None]:
### USE THIS ###

# Run-level filtering: rather than thresholding individual points, each run is
# summarised by its average point-to-mean distance and compared against an
# aggregate cut-off taken over all runs.

# Euclidean distance of every point in every run to the mean projection
distances_to_mean_10_00125_35 = np.sqrt(
    np.square(umap_projections_10_00125_35 - mean_umap_projection_10_00125_35[np.newaxis, :, :]).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One average distance per run
average_distances_per_run = distances_to_mean_10_00125_35.mean(axis=1)  # Shape: (n_runs,)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs whose average does not exceed the cut-off
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Centroid of every digit class (labels 0-9) for each run that passed the filter
cluster_centroids_per_run = []
for run in valid_runs:
    projections = umap_projections_10_00125_35[run]
    # Mean position of the points carrying each label in this run
    centroids = np.array([
        projections[y_train == cluster_label].mean(axis=0)
        for cluster_label in range(10)
    ])
    cluster_centroids_per_run.append(centroids)

# Pairwise Euclidean centroid distances, one (10, 10) matrix per valid run
distance_matrices_10_00125_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])  # Shape: (n_valid_runs, 10, 10)

# Average the matrices over the valid runs
mean_distance_matrix_10_00125_35 = distance_matrices_10_00125_35.mean(axis=0)  # Shape: (10, 10)

# Min-max normalize the mean distance matrix
_smallest = np.min(mean_distance_matrix_10_00125_35)
_largest = np.max(mean_distance_matrix_10_00125_35)
normalized_mean_distance_matrix_10_00125_35 = (mean_distance_matrix_10_00125_35 - _smallest) / (_largest - _smallest)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_10_00125_35,
    annot=True, cmap="viridis", fmt=".2f", linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=10, , min_dists=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
# NOTE(review): the file names contain a space before ".npy"; kept unchanged.
np.save('distance_matrices_neighbors_10_00125_35 .npy', distance_matrices_10_00125_35)
np.save('mean_distance_matrix_neighbors_10_00125_35 .npy', mean_distance_matrix_10_00125_35)

# Show the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_10_00125_35 }")

In [None]:
# Build a weighted graph from the normalized mean distance matrix (rounded to 3 decimals)
G_10_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_10_00125_35, 3))

# np.save stores arrays, and a networkx Graph is not one, so passing the graph
# directly does not write its edge weights as an adjacency array. Save the
# weighted adjacency matrix instead; nx.from_numpy_array(np.load(...)) rebuilds
# the graph.
np.save('G_10_00125_35 .npy', nx.to_numpy_array(G_10_00125_35))

# Draw the graph
pos = nx.spring_layout(G_10_00125_35, seed=42)  # fixed seed for a reproducible layout
nx.draw(G_10_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_10_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_10_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_10_00125_35 = nx.minimum_spanning_tree(G_10_00125_35)

# np.save stores arrays, not Graph objects: write the tree's weighted
# adjacency matrix (zero entries mean "no edge") so it can be reloaded with
# nx.from_numpy_array(np.load(...)).
np.save('mst_10_00125_35 .npy', nx.to_numpy_array(mst_10_00125_35))

# Define positions for all nodes (fixed seed for a reproducible layout)
pos = nx.spring_layout(mst_10_00125_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_10_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_10_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_10_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=10, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Standard deviation of each centroid-pair distance across the runs
# stacked in distance_matrices_10_00125_35 (np.std default, i.e. ddof=0)
distance_matrix_std_10_00125_35 = np.std(distance_matrices_10_00125_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
# NOTE(review): the file name contains a space before ".npy"; kept unchanged.
np.save("distance_matrix_std_10_00125_35 .npy", distance_matrix_std_10_00125_35)

# Output the results (label now matches this configuration instead of 5_00125_35)
print("Standard Deviation Distance Matrix (10_00125_35):\n", distance_matrix_std_10_00125_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the normal distribution
n_runs = 35  # Number of UMAP runs performed

# The mean and standard deviation matrices are computed over the runs stacked
# in distance_matrices_10_00125_35 (the valid runs only), so the standard
# error must use that sample size rather than the total number of runs.
n_valid_runs = distance_matrices_10_00125_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_10_00125_35 = distance_matrix_std_10_00125_35 / np.sqrt(n_valid_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_10_00125_35 = z_score * sem_matrix_10_00125_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_10_00125_35 = mean_distance_matrix_10_00125_35 - margin_of_error_matrix_10_00125_35
upper_limit_intconf_matrix_10_00125_35 = mean_distance_matrix_10_00125_35 + margin_of_error_matrix_10_00125_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_10_00125_35 = np.maximum(lower_limit_intconf_matrix_10_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_10_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_10_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_10_00125_35)

# Save the matrices for future use
# NOTE(review): the file names contain a space before ".npy"; kept unchanged.
np.save('lower_limit_intconf_matrix_10_00125_35 .npy', lower_limit_intconf_matrix_10_00125_35)
np.save('upper_limit_intconf_matrix_10_00125_35 .npy', upper_limit_intconf_matrix_10_00125_35)

In [87]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape. If every entry of ``matrix`` is equal
        there is no spread to rescale, and an all-zero array is returned
        instead of dividing by zero (which would produce NaN).
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: avoid 0 / 0
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Min-max normalize the lower confidence bound and store it
norm_lower_limit_intconf_matrix_10_00125_35 = normalize_matrix(lower_limit_intconf_matrix_10_00125_35)
np.save('norm_lower_limit_intconf_matrix_10_00125_35.npy', norm_lower_limit_intconf_matrix_10_00125_35)

# Same treatment for the upper confidence bound
norm_upper_limit_intconf_matrix_10_00125_35 = normalize_matrix(upper_limit_intconf_matrix_10_00125_35)
np.save('norm_upper_limit_intconf_matrix_10_00125_35.npy', norm_upper_limit_intconf_matrix_10_00125_35)

In [None]:
# One row of three panels: lower bound (left), mean (middle), upper bound (right)
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(21, 9))

# Heatmap options shared by all three panels
_heatmap_style = dict(annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)

# (matrix, title, y-axis label) for each panel, in left-to-right order
_panels = (
    (norm_lower_limit_intconf_matrix_10_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=10, min_dist=0.0125", ""),
    (normalized_mean_distance_matrix_10_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=10, min_dist=0.0125", "Cluster"),
    (norm_upper_limit_intconf_matrix_10_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=10, min_dist=0.0125", ""),
)
for _ax, (_matrix, _title, _ylabel) in zip(axes, _panels):
    sns.heatmap(_matrix, ax=_ax, **_heatmap_style)
    _ax.set_title(_title)
    _ax.set_xlabel("Cluster")
    _ax.set_ylabel(_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    The matrix is rounded to three decimals, converted to a weighted graph,
    and only the edges of that graph's minimum spanning tree are drawn, each
    labelled with its weight.

    Parameters
    ----------
    matrix : square array of pairwise distances.
    title : text placed above the axes.
    ax : matplotlib axes to draw on.
    color : edge colour of the tree (default ``'red'``).
    """
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed so the node layout is the same on every call
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree, layout, ax=ax,
        with_labels=True, node_color='lightblue', edge_color=color,
        node_size=500, font_size=10, width=2,
    )
    nx.draw_networkx_edge_labels(
        tree, layout, ax=ax,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8, label_pos=0.3,
    )

    ax.set_title(title)

# Three side-by-side panels, one spanning tree per matrix
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# (matrix, title, column index, edge colour); drawn in this order:
# mean in the middle, lower limit on the left, upper limit on the right
_mst_panels = (
    (normalized_mean_distance_matrix_10_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=10, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_10_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=10, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_10_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=10, min_dist=0.0125", 2, 'green'),
)
for _matrix, _title, _column, _color in _mst_panels:
    plot_mst(_matrix, _title, axes[_column], color=_color)

# Tighten spacing between panels before displaying
plt.tight_layout()
plt.show()

### n=20, min_dist=0.0125

In [16]:
# Reload the cached UMAP runs and the stored mean / std projection arrays
(
    umap_projections_20_00125_35,
    mean_umap_projection_20_00125_35,
    std_projection_umap_20_00125_35,
) = map(np.load, (
    'umap_projections_20_00125_35.npy',
    'mean_projection_20_00125_35.npy',
    'std_projection_20_00125_35.npy',
))

In [None]:
# Define parameters
n_neighbors = 20
min_dist = 0.0125
n_components = 2
n_runs = 35  # Number of runs

# Collect one projection per run. random_state=None leaves UMAP unseeded,
# so the individual runs are allowed to differ.
umap_projections_20_00125_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_20_00125_35.append(projection)

# Stack the runs along a new leading axis
umap_projections_20_00125_35 = np.array(umap_projections_20_00125_35)

# Mean and standard deviation of the projections across runs (axis 0)
mean_projection_20_00125_35 = np.mean(umap_projections_20_00125_35, axis=0)
std_projection_20_00125_35 = np.std(umap_projections_20_00125_35, axis=0)

# The np.load cell assigns these statistics to `mean_umap_projection_*` /
# `std_projection_umap_*`, and the distance-to-mean cell reads the former.
# Define the same names here so this cell can be used instead of the load cell.
mean_umap_projection_20_00125_35 = mean_projection_20_00125_35
std_projection_umap_20_00125_35 = std_projection_20_00125_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_20_00125_35.npy', umap_projections_20_00125_35)
np.save('mean_projection_20_00125_35.npy', mean_projection_20_00125_35)
np.save('std_projection_20_00125_35.npy', std_projection_20_00125_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_20_00125_35'.")


In [None]:
### USE THIS ###

# Run-level filtering: rather than thresholding individual points, each run is
# summarised by its average point-to-mean distance and compared against an
# aggregate cut-off taken over all runs.

# Euclidean distance of every point in every run to the mean projection
distances_to_mean_20_00125_35 = np.sqrt(
    np.square(umap_projections_20_00125_35 - mean_umap_projection_20_00125_35[np.newaxis, :, :]).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One average distance per run
average_distances_per_run = distances_to_mean_20_00125_35.mean(axis=1)  # Shape: (n_runs,)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs whose average does not exceed the cut-off
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Boolean masks selecting the points of each digit class (labels 0-9);
# they do not depend on the run, so build them once.
_label_masks = [y_train == cluster_label for cluster_label in range(10)]

# Centroid of every digit class for each run that passed the filter
cluster_centroids_per_run = []
for run in valid_runs:
    projections = umap_projections_20_00125_35[run]
    # Mean position of the points carrying each label in this run
    centroids = np.array([projections[mask].mean(axis=0) for mask in _label_masks])
    cluster_centroids_per_run.append(centroids)

# Pairwise Euclidean centroid distances, one (10, 10) matrix per valid run
distance_matrices_20_00125_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])  # Shape: (n_valid_runs, 10, 10)

# Average the matrices over the valid runs
mean_distance_matrix_20_00125_35 = np.mean(distance_matrices_20_00125_35, axis=0)  # Shape: (10, 10)

# Min-max normalize the mean distance matrix
_smallest = np.min(mean_distance_matrix_20_00125_35)
_largest = np.max(mean_distance_matrix_20_00125_35)
normalized_mean_distance_matrix_20_00125_35 = (mean_distance_matrix_20_00125_35 - _smallest) / (_largest - _smallest)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_20_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
# Title previously said n_neighbors=10; this section uses n_neighbors=20
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=20, min_dists=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
# NOTE(review): the file names contain a space before ".npy"; kept unchanged.
np.save('distance_matrices_neighbors_20_00125_35 .npy', distance_matrices_20_00125_35)
np.save('mean_distance_matrix_neighbors_20_00125_35 .npy', mean_distance_matrix_20_00125_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_20_00125_35 }")

In [None]:
# Build a weighted graph from the normalized mean distance matrix (rounded to 3 decimals)
G_20_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_20_00125_35, 3))

# np.save stores arrays, and a networkx Graph is not one, so passing the graph
# directly does not write its edge weights as an adjacency array. Save the
# weighted adjacency matrix instead; nx.from_numpy_array(np.load(...)) rebuilds
# the graph.
np.save('G_20_00125_35 .npy', nx.to_numpy_array(G_20_00125_35))

# Draw the graph
pos = nx.spring_layout(G_20_00125_35, seed=42)  # fixed seed for a reproducible layout
nx.draw(G_20_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_20_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_20_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_20_00125_35 = nx.minimum_spanning_tree(G_20_00125_35)

# np.save stores arrays, not Graph objects: write the tree's weighted
# adjacency matrix (zero entries mean "no edge") so it can be reloaded with
# nx.from_numpy_array(np.load(...)).
np.save('mst_20_00125_35 .npy', nx.to_numpy_array(mst_20_00125_35))

# Define positions for all nodes (fixed seed for a reproducible layout)
pos = nx.spring_layout(mst_20_00125_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_20_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_20_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_20_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=20, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Standard deviation of each centroid-pair distance across the runs
# stacked in distance_matrices_20_00125_35 (np.std default, i.e. ddof=0)
distance_matrix_std_20_00125_35 = np.std(distance_matrices_20_00125_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
# NOTE(review): the file name contains a space before ".npy"; kept unchanged.
np.save("distance_matrix_std_20_00125_35 .npy", distance_matrix_std_20_00125_35)

# Output the results (label now matches this configuration instead of 5_00125_35)
print("Standard Deviation Distance Matrix (20_00125_35):\n", distance_matrix_std_20_00125_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the normal distribution
n_runs = 35  # Number of UMAP runs performed

# The mean and standard deviation matrices are computed over the runs stacked
# in distance_matrices_20_00125_35 (the valid runs only), so the standard
# error must use that sample size rather than the total number of runs.
n_valid_runs = distance_matrices_20_00125_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_20_00125_35 = distance_matrix_std_20_00125_35 / np.sqrt(n_valid_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_20_00125_35 = z_score * sem_matrix_20_00125_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_20_00125_35 = mean_distance_matrix_20_00125_35 - margin_of_error_matrix_20_00125_35
upper_limit_intconf_matrix_20_00125_35 = mean_distance_matrix_20_00125_35 + margin_of_error_matrix_20_00125_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_20_00125_35 = np.maximum(lower_limit_intconf_matrix_20_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_20_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_20_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_20_00125_35)

# Save the matrices for future use
# NOTE(review): the file names contain a space before ".npy"; kept unchanged.
np.save('lower_limit_intconf_matrix_20_00125_35 .npy', lower_limit_intconf_matrix_20_00125_35)
np.save('upper_limit_intconf_matrix_20_00125_35 .npy', upper_limit_intconf_matrix_20_00125_35)

In [97]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape. If every entry of ``matrix`` is equal
        there is no spread to rescale, and an all-zero array is returned
        instead of dividing by zero (which would produce NaN).
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: avoid 0 / 0
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Min-max normalize the lower confidence bound and store it
norm_lower_limit_intconf_matrix_20_00125_35 = normalize_matrix(lower_limit_intconf_matrix_20_00125_35)
np.save('norm_lower_limit_intconf_matrix_20_00125_35.npy', norm_lower_limit_intconf_matrix_20_00125_35)

# Same treatment for the upper confidence bound
norm_upper_limit_intconf_matrix_20_00125_35 = normalize_matrix(upper_limit_intconf_matrix_20_00125_35)
np.save('norm_upper_limit_intconf_matrix_20_00125_35.npy', norm_upper_limit_intconf_matrix_20_00125_35)

In [None]:
# One row of three panels: lower bound (left), mean (middle), upper bound (right)
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(21, 9))

# Heatmap options shared by all three panels
_heatmap_style = dict(annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)

# (matrix, title, y-axis label) for each panel, in left-to-right order
_panels = (
    (norm_lower_limit_intconf_matrix_20_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=20, min_dist=0.0125", ""),
    (normalized_mean_distance_matrix_20_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=20, min_dist=0.0125", "Cluster"),
    (norm_upper_limit_intconf_matrix_20_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=20, min_dist=0.0125", ""),
)
for _ax, (_matrix, _title, _ylabel) in zip(axes, _panels):
    sns.heatmap(_matrix, ax=_ax, **_heatmap_style)
    _ax.set_title(_title)
    _ax.set_xlabel("Cluster")
    _ax.set_ylabel(_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    The matrix is rounded to three decimals, converted to a weighted graph,
    and only the edges of that graph's minimum spanning tree are drawn, each
    labelled with its weight.

    Parameters
    ----------
    matrix : square array of pairwise distances.
    title : text placed above the axes.
    ax : matplotlib axes to draw on.
    color : edge colour of the tree (default ``'red'``).
    """
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed so the node layout is the same on every call
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree, layout, ax=ax,
        with_labels=True, node_color='lightblue', edge_color=color,
        node_size=500, font_size=10, width=2,
    )
    nx.draw_networkx_edge_labels(
        tree, layout, ax=ax,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8, label_pos=0.3,
    )

    ax.set_title(title)

# Three side-by-side panels, one spanning tree per matrix
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# (matrix, title, column index, edge colour); drawn in this order:
# mean in the middle, lower limit on the left, upper limit on the right
_mst_panels = (
    (normalized_mean_distance_matrix_20_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=20, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_20_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=20, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_20_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=20, min_dist=0.0125", 2, 'green'),
)
for _matrix, _title, _column, _color in _mst_panels:
    plot_mst(_matrix, _title, axes[_column], color=_color)

# Tighten spacing between panels before displaying
plt.tight_layout()
plt.show()

### n=30, min_dist=0.0125

In [17]:
# Reload the cached UMAP runs and the stored mean / std projection arrays
(
    umap_projections_30_00125_35,
    mean_umap_projection_30_00125_35,
    std_projection_umap_30_00125_35,
) = map(np.load, (
    'umap_projections_30_00125_35.npy',
    'mean_projection_30_00125_35.npy',
    'std_projection_30_00125_35.npy',
))

In [None]:
# Define parameters
n_neighbors = 30
min_dist = 0.0125
n_components = 2
n_runs = 35  # Number of runs

# Collect one projection per run. random_state=None leaves UMAP unseeded,
# so the individual runs are allowed to differ.
umap_projections_30_00125_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_30_00125_35.append(projection)

# Stack the runs along a new leading axis
umap_projections_30_00125_35 = np.array(umap_projections_30_00125_35)

# Mean and standard deviation of the projections across runs (axis 0)
mean_projection_30_00125_35 = np.mean(umap_projections_30_00125_35, axis=0)
std_projection_30_00125_35 = np.std(umap_projections_30_00125_35, axis=0)

# The np.load cell assigns these statistics to `mean_umap_projection_*` /
# `std_projection_umap_*`, and the distance-to-mean cell reads the former.
# Define the same names here so this cell can be used instead of the load cell.
mean_umap_projection_30_00125_35 = mean_projection_30_00125_35
std_projection_umap_30_00125_35 = std_projection_30_00125_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_30_00125_35.npy', umap_projections_30_00125_35)
np.save('mean_projection_30_00125_35.npy', mean_projection_30_00125_35)
np.save('std_projection_30_00125_35.npy', std_projection_30_00125_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_30_00125_35'.")


In [None]:
### USE THIS ###

# Run-level filtering: rather than thresholding individual points, each run is
# summarised by its average point-to-mean distance and compared against an
# aggregate cut-off taken over all runs.

# Euclidean distance of every point in every run to the mean projection
distances_to_mean_30_00125_35 = np.sqrt(
    np.square(umap_projections_30_00125_35 - mean_umap_projection_30_00125_35[np.newaxis, :, :]).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One average distance per run
average_distances_per_run = distances_to_mean_30_00125_35.mean(axis=1)  # Shape: (n_runs,)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs whose average does not exceed the cut-off
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Boolean masks selecting the points of each digit class (labels 0-9);
# they do not depend on the run, so build them once.
_label_masks = [y_train == cluster_label for cluster_label in range(10)]

# Centroid of every digit class for each run that passed the filter
cluster_centroids_per_run = []
for run in valid_runs:
    projections = umap_projections_30_00125_35[run]
    # Mean position of the points carrying each label in this run
    centroids = np.array([projections[mask].mean(axis=0) for mask in _label_masks])
    cluster_centroids_per_run.append(centroids)

# Pairwise Euclidean centroid distances, one (10, 10) matrix per valid run
distance_matrices_30_00125_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])  # Shape: (n_valid_runs, 10, 10)

# Average the matrices over the valid runs
mean_distance_matrix_30_00125_35 = np.mean(distance_matrices_30_00125_35, axis=0)  # Shape: (10, 10)

# Min-max normalize the mean distance matrix
_smallest = np.min(mean_distance_matrix_30_00125_35)
_largest = np.max(mean_distance_matrix_30_00125_35)
normalized_mean_distance_matrix_30_00125_35 = (mean_distance_matrix_30_00125_35 - _smallest) / (_largest - _smallest)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_30_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
# Title previously said n_neighbors=10; this section uses n_neighbors=30
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=30, min_dists=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
# NOTE(review): the file names contain a space before ".npy"; kept unchanged.
np.save('distance_matrices_neighbors_30_00125_35 .npy', distance_matrices_30_00125_35)
np.save('mean_distance_matrix_neighbors_30_00125_35 .npy', mean_distance_matrix_30_00125_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_30_00125_35 }")

In [None]:
# Build a weighted graph from the normalized mean distance matrix (rounded to 3 decimals)
G_30_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_30_00125_35, 3))

# np.save stores arrays, and a networkx Graph is not one, so passing the graph
# directly does not write its edge weights as an adjacency array. Save the
# weighted adjacency matrix instead; nx.from_numpy_array(np.load(...)) rebuilds
# the graph.
np.save('G_30_00125_35 .npy', nx.to_numpy_array(G_30_00125_35))

# Draw the graph
pos = nx.spring_layout(G_30_00125_35, seed=42)  # fixed seed for a reproducible layout
nx.draw(G_30_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_30_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_30_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_30_00125_35 = nx.minimum_spanning_tree(G_30_00125_35)

# np.save stores arrays, not Graph objects: write the tree's weighted
# adjacency matrix (zero entries mean "no edge") so it can be reloaded with
# nx.from_numpy_array(np.load(...)).
np.save('mst_30_00125_35 .npy', nx.to_numpy_array(mst_30_00125_35))

# Define positions for all nodes (fixed seed for a reproducible layout)
pos = nx.spring_layout(mst_30_00125_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_30_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_30_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_30_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=30, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Standard deviation of each centroid-pair distance across the runs
# stacked in distance_matrices_30_00125_35 (np.std default, i.e. ddof=0)
distance_matrix_std_30_00125_35 = np.std(distance_matrices_30_00125_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
# NOTE(review): the file name contains a space before ".npy"; kept unchanged.
np.save("distance_matrix_std_30_00125_35 .npy", distance_matrix_std_30_00125_35)

# Output the results (label now matches this configuration instead of 5_00125_35)
print("Standard Deviation Distance Matrix (30_00125_35):\n", distance_matrix_std_30_00125_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the normal distribution
n_runs = 35  # Number of UMAP runs performed

# The mean and standard deviation matrices are computed over the runs stacked
# in distance_matrices_30_00125_35 (the valid runs only), so the standard
# error must use that sample size rather than the total number of runs.
n_valid_runs = distance_matrices_30_00125_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_30_00125_35 = distance_matrix_std_30_00125_35 / np.sqrt(n_valid_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_30_00125_35 = z_score * sem_matrix_30_00125_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_30_00125_35 = mean_distance_matrix_30_00125_35 - margin_of_error_matrix_30_00125_35
upper_limit_intconf_matrix_30_00125_35 = mean_distance_matrix_30_00125_35 + margin_of_error_matrix_30_00125_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_30_00125_35 = np.maximum(lower_limit_intconf_matrix_30_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_30_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_30_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_30_00125_35)

# Save the matrices for future use
# NOTE(review): the file names contain a space before ".npy"; kept unchanged.
np.save('lower_limit_intconf_matrix_30_00125_35 .npy', lower_limit_intconf_matrix_30_00125_35)
np.save('upper_limit_intconf_matrix_30_00125_35 .npy', upper_limit_intconf_matrix_30_00125_35)

In [107]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Values to rescale.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``matrix``. When every entry is equal the
        range is zero and no scaling is defined; an all-zero array is returned
        instead of dividing by zero (which would produce NaN everywhere).
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: avoid 0/0 and report "no spread" as zeros.
        return np.zeros(matrix.shape)
    return (matrix - lowest) / value_range

# Rescale each confidence bound on its own with normalize_matrix and store it right away
norm_lower_limit_intconf_matrix_30_00125_35 = normalize_matrix(lower_limit_intconf_matrix_30_00125_35)
np.save('norm_lower_limit_intconf_matrix_30_00125_35.npy', norm_lower_limit_intconf_matrix_30_00125_35)

norm_upper_limit_intconf_matrix_30_00125_35 = normalize_matrix(upper_limit_intconf_matrix_30_00125_35)
np.save('norm_upper_limit_intconf_matrix_30_00125_35.npy', norm_upper_limit_intconf_matrix_30_00125_35)

In [None]:
# (matrix, title, y-axis label) for the left, centre and right heatmap
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_30_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=30, min_dist=0.0125", ""),
    (normalized_mean_distance_matrix_30_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=30, min_dist=0.0125", "Cluster"),
    (norm_upper_limit_intconf_matrix_30_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=30, min_dist=0.0125", ""),
]

# One row of three annotated heatmaps
fig, axes = plt.subplots(1, 3, figsize=(21, 9))
for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    ``matrix`` is rounded to three decimals and read as a weighted adjacency
    matrix. The tree's edges are drawn in ``color`` and labelled with their
    weights, and ``title`` is set on the axes. Nothing is returned.
    """
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # A fixed seed keeps the node layout the same from call to call
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree, layout, ax=ax, with_labels=True, node_color='lightblue',
        edge_color=color, node_size=500, font_size=10, width=2,
    )

    # Annotate every tree edge with its (rounded) distance
    weights = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, ax=ax, edge_labels=weights, font_size=8, label_pos=0.3)

    ax.set_title(title)

# Three side-by-side panels: lower bound (left), mean (centre), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, panel index, edge colour), drawn mean first, then lower, then upper
mst_panels = [
    (normalized_mean_distance_matrix_30_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=30, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_30_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=30, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_30_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=30, min_dist=0.0125", 2, 'green'),
]
for panel_matrix, panel_title, panel_index, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

# Tidy the spacing and render
plt.tight_layout()
plt.show()

### n=50, min_dist=0.0125

In [6]:
# Reload previously saved results for n_neighbors=50, min_dist=0.0125 from the working directory
umap_projections_50_00125_35 = np.load('umap_projections_50_00125_35.npy')
mean_umap_projection_50_00125_35 = np.load('mean_projection_50_00125_35.npy')
std_projection_umap_50_00125_35 = np.load('std_projection_50_00125_35.npy')

# These two file names contain a space before ".npy", matching the names they are saved under
distance_matrices_50_00125_35 = np.load('distance_matrices_neighbors_50_00125_35 .npy')
mean_distance_matrix_50_00125_35 = np.load('mean_distance_matrix_neighbors_50_00125_35 .npy')

In [None]:
# Define parameters
n_neighbors = 50
min_dist = 0.0125
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_50_00125_35 = []

# Run UMAP multiple times; random_state=None leaves every run unseeded
for run in range(n_runs):
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)
    umap_projections_50_00125_35.append(umap_model.fit_transform(x_train_flattened))

# Stack the per-run projections along a new leading axis
umap_projections_50_00125_35 = np.array(umap_projections_50_00125_35)

# Mean and standard deviation of every point's coordinates across runs
mean_projection_50_00125_35 = np.mean(umap_projections_50_00125_35, axis=0)
std_projection_50_00125_35 = np.std(umap_projections_50_00125_35, axis=0)

# The later cells read these statistics under the names used by the np.load cell.
# Bind them here as well so a fresh computation is not paired with a stale
# (or undefined) mean loaded from disk.
mean_umap_projection_50_00125_35 = mean_projection_50_00125_35
std_projection_umap_50_00125_35 = std_projection_50_00125_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_50_00125_35.npy', umap_projections_50_00125_35)
np.save('mean_projection_50_00125_35.npy', mean_projection_50_00125_35)
np.save('std_projection_50_00125_35.npy', std_projection_50_00125_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_50_00125_35'.")


------

In [47]:
# Number of clusters fitted per run
n_clusters = 10

# Run count and embedding dimensionality come straight from the projection stack
n_runs, _, n_dims = umap_projections_50_00125_35.shape

# One (n_clusters, n_dims) block of KMeans centroids per run
kmeans_centroids_50_00125 = np.zeros((n_runs, n_clusters, n_dims))

# NOTE(review): KMeans numbers its clusters per fit, so index i may not refer to the
# same group of points in every run - confirm before comparing centroids across runs.
for run, umap_projection in enumerate(umap_projections_50_00125_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_50_00125[run] = kmeans.cluster_centers_

In [None]:
# Standard deviation, over all runs, of the centroid stored at each cluster index:
# coordinate 0 is reported as x, coordinate 1 as y
std_dev_x_50_00125 = np.array([np.std(kmeans_centroids_50_00125[:, i, 0]) for i in range(10)])
std_dev_y_50_00125 = np.array([np.std(kmeans_centroids_50_00125[:, i, 1]) for i in range(10)])

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_50_00125)
print("Standard deviation of y coordinates per cluster:", std_dev_y_50_00125)

------

In [None]:
### USE THIS ###

# Run-level filter: each run is summarised by the average distance of its points to the
# mean projection and compared with one aggregate threshold (no per-point thresholding).

# Euclidean distance of every point in every run to the mean projection
offsets_from_mean = umap_projections_50_00125_35 - mean_umap_projection_50_00125_35[None, :, :]
distances_to_mean_50_00125_35 = np.sqrt(np.sum(offsets_from_mean**2, axis=2))  # Shape: (n_runs, n_samples)

# One average distance per run
average_distances_per_run = np.mean(distances_to_mean_50_00125_35, axis=1)  # Shape: (n_runs,)

# Threshold: 90th percentile of those per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the threshold
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Centroid of every digit label (0-9) in each valid run: the mean embedding position
# of the points whose y_train label equals that digit
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_50_00125_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids, one matrix per valid run
distance_matrices_50_00125_35 = np.array([
    cdist(centroids, centroids, metric='euclidean')
    for centroids in cluster_centroids_per_run
])  # Shape: (n_valid_runs, 10, 10)

# Mean distance matrix across all valid runs
mean_distance_matrix_50_00125_35 = np.mean(distance_matrices_50_00125_35, axis=0)  # Shape: (10, 10)

# Min-max scale the mean distance matrix
matrix_min = np.min(mean_distance_matrix_50_00125_35)
matrix_max = np.max(mean_distance_matrix_50_00125_35)
normalized_mean_distance_matrix_50_00125_35 = (mean_distance_matrix_50_00125_35 - matrix_min) / (matrix_max - matrix_min)

# Plot the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_50_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
# Title fixed: the stray ", ," between the two parameters is removed
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=50, min_dists=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
# (file names keep the space before ".npy" because the load cell reads exactly these names)
np.save('distance_matrices_neighbors_50_00125_35 .npy', distance_matrices_50_00125_35)
np.save('mean_distance_matrix_neighbors_50_00125_35 .npy', mean_distance_matrix_50_00125_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_50_00125_35}")

In [7]:
# Min-max scale the mean distance matrix: subtract its minimum, divide by its value range
mean_matrix_min = np.min(mean_distance_matrix_50_00125_35)
mean_matrix_range = np.max(mean_distance_matrix_50_00125_35) - mean_matrix_min
normalized_mean_distance_matrix_50_00125_35 = (mean_distance_matrix_50_00125_35 - mean_matrix_min) / mean_matrix_range

In [None]:
# Build a weighted graph from the normalized mean distances, rounded to 3 decimals
G_50_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_50_00125_35, 3))

# np.save converts its argument to an array, which does not capture a Graph's weighted
# edges. Store the weighted adjacency matrix instead (same file name); the graph can be
# rebuilt with nx.from_numpy_array(np.load('G_50_00125_35 .npy')).
np.save('G_50_00125_35 .npy', nx.to_numpy_array(G_50_00125_35))

# Draw the graph with a reproducible spring layout
pos = nx.spring_layout(G_50_00125_35, seed=42)  # positions for all nodes
nx.draw(G_50_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_50_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_50_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree first: the total weight below needs it
# (it was previously read before being assigned, which fails on a fresh kernel)
mst_50_00125_35 = nx.minimum_spanning_tree(G_50_00125_35)

# np.save converts its argument to an array, which does not capture a Graph's weighted
# edges. Store the tree's weighted adjacency matrix instead (same file name); rebuild it
# with nx.from_numpy_array(np.load('mst_50_00125_35 .npy')).
np.save('mst_50_00125_35 .npy', nx.to_numpy_array(mst_50_00125_35))

# Total weight of the MST = sum of its edge weights
total_weight_50_00125 = sum(nx.get_edge_attributes(mst_50_00125_35, 'weight').values())
print(f"Total weight of the MST: {total_weight_50_00125}")

# Define positions for all nodes
pos = nx.spring_layout(mst_50_00125_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_50_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_50_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_50_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=50, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Spread of every pairwise centroid distance, taken over the first axis of the
# stack (one distance matrix per run). np.std defaults to the population form (ddof=0).
distance_matrix_std_50_00125_35 = np.std(distance_matrices_50_00125_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
# (file name kept exactly as before, including the space in front of ".npy")
np.save("distance_matrix_std_50_00125_35 .npy", distance_matrix_std_50_00125_35)

# Output the results; the label now names this configuration (it used to read 5_00125_35)
print("Standard Deviation Distance Matrix (50_00125_35):\n", distance_matrix_std_50_00125_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
# The standard error must use the number of matrices that actually entered the mean
# and standard deviation. Only the runs that passed the validity filter are in the
# stack, so count it instead of hard-coding the 35 launched runs.
n_runs = len(distance_matrices_50_00125_35)

# Step 1: Standard Error of the Mean (SEM) for every cluster pair
sem_matrix_50_00125_35 = distance_matrix_std_50_00125_35 / np.sqrt(n_runs)

# Step 2: Margin of error (half-width of the confidence interval)
margin_of_error_matrix_50_00125_35 = z_score * sem_matrix_50_00125_35

# Step 3: Lower and upper confidence interval matrices around the mean distance
lower_limit_intconf_matrix_50_00125_35 = mean_distance_matrix_50_00125_35 - margin_of_error_matrix_50_00125_35
upper_limit_intconf_matrix_50_00125_35 = mean_distance_matrix_50_00125_35 + margin_of_error_matrix_50_00125_35

# A distance cannot be negative, so clip the lower bound at zero
lower_limit_intconf_matrix_50_00125_35 = np.maximum(lower_limit_intconf_matrix_50_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_50_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_50_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_50_00125_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_50_00125_35 .npy', lower_limit_intconf_matrix_50_00125_35)
np.save('upper_limit_intconf_matrix_50_00125_35 .npy', upper_limit_intconf_matrix_50_00125_35)

In [117]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Values to rescale.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``matrix``. When every entry is equal the
        range is zero and no scaling is defined; an all-zero array is returned
        instead of dividing by zero (which would produce NaN everywhere).
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: avoid 0/0 and report "no spread" as zeros.
        return np.zeros(matrix.shape)
    return (matrix - lowest) / value_range

# Rescale each confidence bound on its own with normalize_matrix and store it right away
norm_lower_limit_intconf_matrix_50_00125_35 = normalize_matrix(lower_limit_intconf_matrix_50_00125_35)
np.save('norm_lower_limit_intconf_matrix_50_00125_35.npy', norm_lower_limit_intconf_matrix_50_00125_35)

norm_upper_limit_intconf_matrix_50_00125_35 = normalize_matrix(upper_limit_intconf_matrix_50_00125_35)
np.save('norm_upper_limit_intconf_matrix_50_00125_35.npy', norm_upper_limit_intconf_matrix_50_00125_35)

In [None]:
# (matrix, title, y-axis label) for the left, centre and right heatmap
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_50_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=50, min_dist=0.0125", ""),
    (normalized_mean_distance_matrix_50_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=50, min_dist=0.0125", "Cluster"),
    (norm_upper_limit_intconf_matrix_50_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=50, min_dist=0.0125", ""),
]

# One row of three annotated heatmaps
fig, axes = plt.subplots(1, 3, figsize=(21, 9))
for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    ``matrix`` is rounded to three decimals and read as a weighted adjacency
    matrix. The tree's edges are drawn in ``color`` and labelled with their
    weights, and ``title`` is set on the axes. Nothing is returned.
    """
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # A fixed seed keeps the node layout the same from call to call
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree, layout, ax=ax, with_labels=True, node_color='lightblue',
        edge_color=color, node_size=500, font_size=10, width=2,
    )

    # Annotate every tree edge with its (rounded) distance
    weights = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, ax=ax, edge_labels=weights, font_size=8, label_pos=0.3)

    ax.set_title(title)

# Three side-by-side panels: lower bound (left), mean (centre), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, panel index, edge colour), drawn mean first, then lower, then upper
mst_panels = [
    (normalized_mean_distance_matrix_50_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=50, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_50_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=50, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_50_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=50, min_dist=0.0125", 2, 'green'),
]
for panel_matrix, panel_title, panel_index, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

# Tidy the spacing and render
plt.tight_layout()
plt.show()

### n=100, min_dist=0.0125

In [19]:
# Reload previously saved results for n_neighbors=100, min_dist=0.0125 from the working directory
umap_projections_100_00125_35 = np.load('umap_projections_100_00125_35.npy')
mean_umap_projection_100_00125_35 = np.load('mean_projection_100_00125_35.npy')
std_projection_umap_100_00125_35 = np.load('std_projection_100_00125_35.npy')

In [None]:
# Define parameters
n_neighbors = 100
min_dist = 0.0125
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_100_00125_35 = []

# Run UMAP multiple times; random_state=None leaves every run unseeded
for run in range(n_runs):
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)
    umap_projections_100_00125_35.append(umap_model.fit_transform(x_train_flattened))

# Stack the per-run projections along a new leading axis
umap_projections_100_00125_35 = np.array(umap_projections_100_00125_35)

# Mean and standard deviation of every point's coordinates across runs
mean_projection_100_00125_35 = np.mean(umap_projections_100_00125_35, axis=0)
std_projection_100_00125_35 = np.std(umap_projections_100_00125_35, axis=0)

# The later cells read these statistics under the names used by the np.load cell.
# Bind them here as well so a fresh computation is not paired with a stale
# (or undefined) mean loaded from disk.
mean_umap_projection_100_00125_35 = mean_projection_100_00125_35
std_projection_umap_100_00125_35 = std_projection_100_00125_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_100_00125_35.npy', umap_projections_100_00125_35)
np.save('mean_projection_100_00125_35.npy', mean_projection_100_00125_35)
np.save('std_projection_100_00125_35.npy', std_projection_100_00125_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_100_00125_35'.")


In [None]:
### USE THIS ###

# Run-level filter: each run is summarised by the average distance of its points to the
# mean projection and compared with one aggregate threshold (no per-point thresholding).

# Euclidean distance of every point in every run to the mean projection
offsets_from_mean = umap_projections_100_00125_35 - mean_umap_projection_100_00125_35[None, :, :]
distances_to_mean_100_00125_35 = np.sqrt(np.sum(offsets_from_mean**2, axis=2))  # Shape: (n_runs, n_samples)

# One average distance per run
average_distances_per_run = np.mean(distances_to_mean_100_00125_35, axis=1)  # Shape: (n_runs,)

# Threshold: 90th percentile of those per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the threshold
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Centroid of every digit label (0-9) in each valid run: the mean embedding position
# of the points whose y_train label equals that digit
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_100_00125_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids, one matrix per valid run
distance_matrices_100_00125_35 = np.array([
    cdist(centroids, centroids, metric='euclidean')
    for centroids in cluster_centroids_per_run
])  # Shape: (n_valid_runs, 10, 10)

# Mean distance matrix across all valid runs
mean_distance_matrix_100_00125_35 = np.mean(distance_matrices_100_00125_35, axis=0)  # Shape: (10, 10)

# Min-max scale the mean distance matrix
matrix_min = np.min(mean_distance_matrix_100_00125_35)
matrix_max = np.max(mean_distance_matrix_100_00125_35)
normalized_mean_distance_matrix_100_00125_35 = (mean_distance_matrix_100_00125_35 - matrix_min) / (matrix_max - matrix_min)

# Plot the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_100_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=100, min_dists=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix (file names contain a space before ".npy")
np.save('distance_matrices_neighbors_100_00125_35 .npy', distance_matrices_100_00125_35)
np.save('mean_distance_matrix_neighbors_100_00125_35 .npy', mean_distance_matrix_100_00125_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_100_00125_35}")

In [None]:
# Build a weighted graph from the normalized mean distances, rounded to 3 decimals
G_100_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_100_00125_35, 3))

# np.save converts its argument to an array, which does not capture a Graph's weighted
# edges. Store the weighted adjacency matrix instead (same file name); the graph can be
# rebuilt with nx.from_numpy_array(np.load('G_100_00125_35 .npy')).
np.save('G_100_00125_35 .npy', nx.to_numpy_array(G_100_00125_35))

# Draw the graph with a reproducible spring layout
pos = nx.spring_layout(G_100_00125_35, seed=42)  # positions for all nodes
nx.draw(G_100_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_100_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_100_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_100_00125_35 = nx.minimum_spanning_tree(G_100_00125_35)

# np.save converts its argument to an array, which does not capture a Graph's weighted
# edges. Store the tree's weighted adjacency matrix instead (same file name); rebuild it
# with nx.from_numpy_array(np.load('mst_100_00125_35 .npy')).
np.save('mst_100_00125_35 .npy', nx.to_numpy_array(mst_100_00125_35))

# Define positions for all nodes
pos = nx.spring_layout(mst_100_00125_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_100_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_100_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_100_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=100, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Spread of every pairwise centroid distance, taken over the first axis of the
# stack (one distance matrix per run). np.std defaults to the population form (ddof=0).
distance_matrix_std_100_00125_35 = np.std(distance_matrices_100_00125_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
# (file name kept exactly as before, including the space in front of ".npy")
np.save("distance_matrix_std_100_00125_35 .npy", distance_matrix_std_100_00125_35)

# Output the results; the label now names this configuration (it used to read 5_00125_35)
print("Standard Deviation Distance Matrix (100_00125_35):\n", distance_matrix_std_100_00125_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
# The standard error must use the number of matrices that actually entered the mean
# and standard deviation. Only the runs that passed the validity filter are in the
# stack, so count it instead of hard-coding the 35 launched runs.
n_runs = len(distance_matrices_100_00125_35)

# Step 1: Standard Error of the Mean (SEM) for every cluster pair
sem_matrix_100_00125_35 = distance_matrix_std_100_00125_35 / np.sqrt(n_runs)

# Step 2: Margin of error (half-width of the confidence interval)
margin_of_error_matrix_100_00125_35 = z_score * sem_matrix_100_00125_35

# Step 3: Lower and upper confidence interval matrices around the mean distance
lower_limit_intconf_matrix_100_00125_35 = mean_distance_matrix_100_00125_35 - margin_of_error_matrix_100_00125_35
upper_limit_intconf_matrix_100_00125_35 = mean_distance_matrix_100_00125_35 + margin_of_error_matrix_100_00125_35

# A distance cannot be negative, so clip the lower bound at zero
lower_limit_intconf_matrix_100_00125_35 = np.maximum(lower_limit_intconf_matrix_100_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_100_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_100_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_100_00125_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_100_00125_35 .npy', lower_limit_intconf_matrix_100_00125_35)
np.save('upper_limit_intconf_matrix_100_00125_35 .npy', upper_limit_intconf_matrix_100_00125_35)

In [127]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Values to rescale.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``matrix``. When every entry is equal the
        range is zero and no scaling is defined; an all-zero array is returned
        instead of dividing by zero (which would produce NaN everywhere).
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: avoid 0/0 and report "no spread" as zeros.
        return np.zeros(matrix.shape)
    return (matrix - lowest) / value_range

# Rescale each confidence bound on its own with normalize_matrix and store it right away
norm_lower_limit_intconf_matrix_100_00125_35 = normalize_matrix(lower_limit_intconf_matrix_100_00125_35)
np.save('norm_lower_limit_intconf_matrix_100_00125_35.npy', norm_lower_limit_intconf_matrix_100_00125_35)

norm_upper_limit_intconf_matrix_100_00125_35 = normalize_matrix(upper_limit_intconf_matrix_100_00125_35)
np.save('norm_upper_limit_intconf_matrix_100_00125_35.npy', norm_upper_limit_intconf_matrix_100_00125_35)

In [None]:
# (matrix, title, y-axis label) for the left, centre and right heatmap
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_100_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=100, min_dist=0.0125", ""),
    (normalized_mean_distance_matrix_100_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=100, min_dist=0.0125", "Cluster"),
    (norm_upper_limit_intconf_matrix_100_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=100, min_dist=0.0125", ""),
]

# One row of three annotated heatmaps
fig, axes = plt.subplots(1, 3, figsize=(21, 9))
for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    ``matrix`` is rounded to three decimals and read as a weighted adjacency
    matrix. The tree's edges are drawn in ``color`` and labelled with their
    weights, and ``title`` is set on the axes. Nothing is returned.
    """
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # A fixed seed keeps the node layout the same from call to call
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree, layout, ax=ax, with_labels=True, node_color='lightblue',
        edge_color=color, node_size=500, font_size=10, width=2,
    )

    # Annotate every tree edge with its (rounded) distance
    weights = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, ax=ax, edge_labels=weights, font_size=8, label_pos=0.3)

    ax.set_title(title)

# Three side-by-side panels: lower bound (left), mean (centre), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, panel index, edge colour), drawn mean first, then lower, then upper
mst_panels = [
    (normalized_mean_distance_matrix_100_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=100, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_100_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=100, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_100_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=100, min_dist=0.0125", 2, 'green'),
]
for panel_matrix, panel_title, panel_index, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

# Tidy the spacing and render
plt.tight_layout()
plt.show()

### n=5, min_dist=0.8

In [20]:
# Reload previously saved results for n_neighbors=5, min_dist=0.8 from the working directory
umap_projections_5_08_35 = np.load('umap_projections_5_08_35.npy')
mean_umap_projection_5_08_35 = np.load('mean_projection_5_08_35.npy')
std_projection_umap_5_08_35 = np.load('std_projection_5_08_35.npy')

In [None]:
# Define parameters
n_neighbors = 5
min_dist = 0.8
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_5_08_35 = []

# Run UMAP multiple times; random_state=None leaves every run unseeded
for run in range(n_runs):
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)
    umap_projections_5_08_35.append(umap_model.fit_transform(x_train_flattened))

# Stack the per-run projections along a new leading axis
umap_projections_5_08_35 = np.array(umap_projections_5_08_35)

# Mean and standard deviation of every point's coordinates across runs
mean_projection_5_08_35 = np.mean(umap_projections_5_08_35, axis=0)
std_projection_5_08_35 = np.std(umap_projections_5_08_35, axis=0)

# The later cells read these statistics under the names used by the np.load cell.
# Bind them here as well so a fresh computation is not paired with a stale
# (or undefined) mean loaded from disk.
mean_umap_projection_5_08_35 = mean_projection_5_08_35
std_projection_umap_5_08_35 = std_projection_5_08_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_5_08_35.npy', umap_projections_5_08_35)
np.save('mean_projection_5_08_35.npy', mean_projection_5_08_35)
np.save('std_projection_5_08_35.npy', std_projection_5_08_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_5_08_35'.")


In [None]:
### USE THIS ###

# Run-level filter: each run is summarised by the average distance of its points to the
# mean projection and compared with one aggregate threshold (no per-point thresholding).

# Euclidean distance of every point in every run to the mean projection
offsets_from_mean = umap_projections_5_08_35 - mean_umap_projection_5_08_35[None, :, :]
distances_to_mean_5_08_35 = np.sqrt(np.sum(offsets_from_mean**2, axis=2))  # Shape: (n_runs, n_samples)

# One average distance per run
average_distances_per_run = np.mean(distances_to_mean_5_08_35, axis=1)  # Shape: (n_runs,)

# Threshold: 90th percentile of those per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the threshold
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Centroid of every digit label (0-9) in each valid run: the mean embedding position
# of the points whose y_train label equals that digit
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_5_08_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids, one matrix per valid run
distance_matrices_5_08_35 = np.array([
    cdist(centroids, centroids, metric='euclidean')
    for centroids in cluster_centroids_per_run
])  # Shape: (n_valid_runs, 10, 10)

# Mean distance matrix across all valid runs
mean_distance_matrix_5_08_35 = np.mean(distance_matrices_5_08_35, axis=0)  # Shape: (10, 10)

# Min-max scale the mean distance matrix
matrix_min = np.min(mean_distance_matrix_5_08_35)
matrix_max = np.max(mean_distance_matrix_5_08_35)
normalized_mean_distance_matrix_5_08_35 = (mean_distance_matrix_5_08_35 - matrix_min) / (matrix_max - matrix_min)

# Plot the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_5_08_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
# Title fixed: it carried the parameters of another configuration (n_neighbors=100, 0.0125)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix (file names contain a space before ".npy")
np.save('distance_matrices_neighbors_5_08_35 .npy', distance_matrices_5_08_35)
np.save('mean_distance_matrix_neighbors_5_08_35 .npy', mean_distance_matrix_5_08_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_5_08_35}")

In [None]:
# Build a weighted graph from the normalized mean distances, rounded to 3 decimals
G_5_08_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_5_08_35, 3))

# np.save converts its argument to an array, which does not capture a Graph's weighted
# edges. Store the weighted adjacency matrix instead (same file name); the graph can be
# rebuilt with nx.from_numpy_array(np.load('G_5_08_35 .npy')).
np.save('G_5_08_35 .npy', nx.to_numpy_array(G_5_08_35))

# Draw the graph with a reproducible spring layout
pos = nx.spring_layout(G_5_08_35, seed=42)  # positions for all nodes
nx.draw(G_5_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_5_08_35, 'weight')
nx.draw_networkx_edge_labels(G_5_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_5_08_35 = nx.minimum_spanning_tree(G_5_08_35)

# np.save converts its argument to an array, which does not capture a Graph's weighted
# edges. Store the tree's weighted adjacency matrix instead (same file name); rebuild it
# with nx.from_numpy_array(np.load('mst_5_08_35 .npy')).
np.save('mst_5_08_35 .npy', nx.to_numpy_array(mst_5_08_35))

# Define positions for all nodes
pos = nx.spring_layout(mst_5_08_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_5_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_5_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_5_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=5, min_dist=0.8")
plt.show()

In [None]:
# Step 1: Spread of every pairwise centroid distance, taken over the first axis of the
# stack (one distance matrix per run). np.std defaults to the population form (ddof=0).
distance_matrix_std_5_08_35 = np.std(distance_matrices_5_08_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
# (file name kept exactly as before, including the space in front of ".npy")
np.save("distance_matrix_std_5_08_35 .npy", distance_matrix_std_5_08_35)

# Output the results; the label now names this configuration (it used to read 5_00125_35)
print("Standard Deviation Distance Matrix (5_08_35):\n", distance_matrix_std_5_08_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
# The standard error must use the number of matrices that actually entered the mean
# and standard deviation. Only the runs that passed the validity filter are in the
# stack, so count it instead of hard-coding the 35 launched runs.
n_runs = len(distance_matrices_5_08_35)

# Step 1: Standard Error of the Mean (SEM) for every cluster pair
sem_matrix_5_08_35 = distance_matrix_std_5_08_35 / np.sqrt(n_runs)

# Step 2: Margin of error (half-width of the confidence interval)
margin_of_error_matrix_5_08_35 = z_score * sem_matrix_5_08_35

# Step 3: Lower and upper confidence interval matrices around the mean distance
lower_limit_intconf_matrix_5_08_35 = mean_distance_matrix_5_08_35 - margin_of_error_matrix_5_08_35
upper_limit_intconf_matrix_5_08_35 = mean_distance_matrix_5_08_35 + margin_of_error_matrix_5_08_35

# A distance cannot be negative, so clip the lower bound at zero
lower_limit_intconf_matrix_5_08_35 = np.maximum(lower_limit_intconf_matrix_5_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_5_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_5_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_5_08_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_5_08_35 .npy', lower_limit_intconf_matrix_5_08_35)
np.save('upper_limit_intconf_matrix_5_08_35 .npy', upper_limit_intconf_matrix_5_08_35)

In [137]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry is 0 and its largest is 1.

    Args:
        matrix: NumPy array to rescale.

    Returns:
        ``(matrix - min) / (max - min)``. When every entry is identical the
        range is zero; an all-zero float array of the same shape is returned
        instead of the NaNs a 0/0 division would produce.
    """
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: avoid the 0/0 division
        return np.zeros_like(matrix, dtype=float)
    return (matrix - lowest) / span

# Rescale the lower confidence bound to [0, 1] and persist it
norm_lower_limit_intconf_matrix_5_08_35 = normalize_matrix(lower_limit_intconf_matrix_5_08_35)
np.save('norm_lower_limit_intconf_matrix_5_08_35.npy', norm_lower_limit_intconf_matrix_5_08_35)

# Same treatment for the upper confidence bound
norm_upper_limit_intconf_matrix_5_08_35 = normalize_matrix(upper_limit_intconf_matrix_5_08_35)
np.save('norm_upper_limit_intconf_matrix_5_08_35.npy', norm_upper_limit_intconf_matrix_5_08_35)

In [None]:
# One row with three panels: lower bound | mean | upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right; only the
# middle panel carries a y-axis label.
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_5_08_35, "Normalized Lower bound Dist. Matrix - n_neighbors=5, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_5_08_35, "Normalized Mean Dist. Matrix - n_neighbors=5, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_5_08_35, "Normalized Upper bound Dist. Matrix - n_neighbors=5, min_dist=0.8", ""),
]

for ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("Cluster")
    ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the MST of a (normalized) distance matrix on a given axis
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    Args:
        matrix: Square distance matrix; rounded to 3 decimals before use.
        title: Title set on ``ax``.
        ax: Matplotlib axis that receives the drawing.
        color: Edge colour of the tree.
    """
    # Weighted graph from the rounded matrix, reduced to its spanning tree
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Seeded layout so repeated calls place the nodes identically
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(tree, layout, with_labels=True, node_color='lightblue', edge_color=color, node_size=500, font_size=10, width=2, ax=ax)

    # Annotate every tree edge with its weight (the rounded distance)
    weights = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weights, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three side-by-side axes: lower limit (left), mean (middle), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, target axis, edge colour), drawn in the listed order
mst_panels = [
    (normalized_mean_distance_matrix_5_08_35, "UMAP MST - Mean Distances - n_neighbors=5, min_dist=0.8", axes[1], 'red'),
    (norm_lower_limit_intconf_matrix_5_08_35, "UMAP MST - Lower Limit - n_neighbors=5, min_dist=0.8", axes[0], 'blue'),
    (norm_upper_limit_intconf_matrix_5_08_35, "UMAP MST - Upper Limit - n_neighbors=5, min_dist=0.8", axes[2], 'green'),
]
for panel_matrix, panel_title, panel_ax, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Tighten spacing between the panels, then render
plt.tight_layout()
plt.show()

### n=10, min_dist=0.8

In [21]:
# Reload the cached n_neighbors=10 / min_dist=0.8 results from disk:
# all stacked projections, then their per-point mean and standard deviation.
umap_projections_10_08_35 = np.load('umap_projections_10_08_35.npy')
mean_umap_projection_10_08_35 = np.load('mean_projection_10_08_35.npy')
std_projection_umap_10_08_35 = np.load('std_projection_10_08_35.npy')

In [None]:
# Define parameters
n_neighbors = 10
min_dist = 0.8
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_10_08_35 = []

# Run UMAP multiple times; random_state=None leaves every run unseeded so the
# runs differ from each other.
for run in range(n_runs):
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)

    # Fit and transform the data
    projection = umap_model.fit_transform(x_train_flattened)

    # Store the projection
    umap_projections_10_08_35.append(projection)

# Stack the per-run projections along a leading run axis
umap_projections_10_08_35 = np.array(umap_projections_10_08_35)

# Mean and standard deviation of every point's coordinates across runs
mean_projection_10_08_35 = np.mean(umap_projections_10_08_35, axis=0)
std_projection_10_08_35 = np.std(umap_projections_10_08_35, axis=0)

# The cached-load cell and the run-filter cell use the names below; refresh
# them here so a re-run never compares new projections against a stale
# (or undefined) mean.
mean_umap_projection_10_08_35 = mean_projection_10_08_35
std_projection_umap_10_08_35 = std_projection_10_08_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_10_08_35.npy', umap_projections_10_08_35)
np.save('mean_projection_10_08_35.npy', mean_projection_10_08_35)
np.save('std_projection_10_08_35.npy', std_projection_10_08_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_10_08_35'.")


In [None]:
### USE THIS ###

# Run-level filter: rather than thresholding individual points, each run is
# summarised by its average point-to-mean distance and compared against the
# 90th percentile of those averages.

# Euclidean distance of every point to its mean position, per run -> (n_runs, n_samples)
offsets_from_mean = umap_projections_10_08_35 - mean_umap_projection_10_08_35[None, :, :]
distances_to_mean_10_08_35 = np.sqrt(np.sum(offsets_from_mean**2, axis=2))

# One average distance per run -> (n_runs,)
average_distances_per_run = np.mean(distances_to_mean_10_08_35, axis=1)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Per-run cluster centroids: for every valid run, average the projected
# coordinates of the points carrying each label 0-9.
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_10_08_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids of each run, stacked
# along a leading run axis -> (n_valid_runs, 10, 10)
distance_matrices_10_08_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Element-wise average over the valid runs -> (10, 10)
mean_distance_matrix_10_08_35 = np.mean(distance_matrices_10_08_35, axis=0)

# Min-max rescale the mean matrix
smallest_mean = np.min(mean_distance_matrix_10_08_35)
largest_mean = np.max(mean_distance_matrix_10_08_35)
normalized_mean_distance_matrix_10_08_35 = (mean_distance_matrix_10_08_35 - smallest_mean) / (largest_mean - smallest_mean)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_10_08_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=10, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run stack and its mean
np.save('distance_matrices_neighbors_10_08_35 .npy', distance_matrices_10_08_35)
np.save('mean_distance_matrix_neighbors_10_08_35 .npy', mean_distance_matrix_10_08_35)

# Report the (un-normalized) mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_10_08_35 }")

In [None]:
# Weighted graph built from the normalized mean distances (rounded to 3 decimals)
rounded_distances_10_08_35 = np.round(normalized_mean_distance_matrix_10_08_35, 3)
G_10_08_35 = nx.from_numpy_array(rounded_distances_10_08_35)
# NOTE(review): np.save is handed a networkx Graph rather than an array --
# confirm the resulting file can be reloaded, or persist the adjacency
# matrix / a pickle instead.
np.save('G_10_08_35 .npy', G_10_08_35)

# Seeded spring layout so the picture is reproducible
pos = nx.spring_layout(G_10_08_35, seed=42)
nx.draw(G_10_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Label every edge with its weight (the rounded distance)
edge_labels = nx.get_edge_attributes(G_10_08_35, 'weight')
nx.draw_networkx_edge_labels(G_10_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Reduce the full distance graph to its minimum spanning tree
mst_10_08_35 = nx.minimum_spanning_tree(G_10_08_35)
# NOTE(review): np.save is handed a networkx Graph rather than an array --
# confirm the resulting file can be reloaded.
np.save('mst_10_08_35 .npy', mst_10_08_35)

# Seeded layout computed on the tree itself
pos = nx.spring_layout(mst_10_08_35, seed=42)

# Draw only the tree edges, in red
nx.draw(mst_10_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Annotate the tree edges with their weights
edge_labels = nx.get_edge_attributes(mst_10_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_10_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=10, min_dist=0.8")
plt.show()

In [None]:
# Spread of every centroid-to-centroid distance across the stacked runs
# (standard deviation along axis 0) -> (n_clusters, n_clusters)
distance_matrix_std_10_08_35 = np.std(distance_matrices_10_08_35, axis=0)

# Keep a copy on disk for later reuse
np.save("distance_matrix_std_10_08_35 .npy", distance_matrix_std_10_08_35)

# Show the matrix
print("Standard Deviation Distance Matrix (10_08_35):\n", distance_matrix_std_10_08_35)


In [None]:
# Normal-approximation confidence interval around the mean distance matrix.
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # two-sided critical value of the standard normal
n_runs = 35  # Nominal number of UMAP runs (kept for reference)

# Mean and standard deviation were taken over the stacked distance matrices,
# which hold only the runs that passed the filter. The sample size of the
# mean is therefore the stack length, not the nominal run count.
n_valid_runs = distance_matrices_10_08_35.shape[0]

# Step 1: Standard Error of the Mean (SEM)
sem_matrix_10_08_35 = distance_matrix_std_10_08_35 / np.sqrt(n_valid_runs)

# Step 2: Margin of error
margin_of_error_matrix_10_08_35 = z_score * sem_matrix_10_08_35

# Step 3: Lower and upper confidence-interval matrices
lower_limit_intconf_matrix_10_08_35 = mean_distance_matrix_10_08_35 - margin_of_error_matrix_10_08_35
upper_limit_intconf_matrix_10_08_35 = mean_distance_matrix_10_08_35 + margin_of_error_matrix_10_08_35

# Distances cannot be negative, so clip the lower bound at zero
lower_limit_intconf_matrix_10_08_35 = np.maximum(lower_limit_intconf_matrix_10_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_10_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_10_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_10_08_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_10_08_35 .npy', lower_limit_intconf_matrix_10_08_35)
np.save('upper_limit_intconf_matrix_10_08_35 .npy', upper_limit_intconf_matrix_10_08_35)

In [147]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry is 0 and its largest is 1.

    Args:
        matrix: NumPy array to rescale.

    Returns:
        ``(matrix - min) / (max - min)``. When every entry is identical the
        range is zero; an all-zero float array of the same shape is returned
        instead of the NaNs a 0/0 division would produce.
    """
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: avoid the 0/0 division
        return np.zeros_like(matrix, dtype=float)
    return (matrix - lowest) / span

# Rescale the lower confidence bound to [0, 1] and persist it
norm_lower_limit_intconf_matrix_10_08_35 = normalize_matrix(lower_limit_intconf_matrix_10_08_35)
np.save('norm_lower_limit_intconf_matrix_10_08_35.npy', norm_lower_limit_intconf_matrix_10_08_35)

# Same treatment for the upper confidence bound
norm_upper_limit_intconf_matrix_10_08_35 = normalize_matrix(upper_limit_intconf_matrix_10_08_35)
np.save('norm_upper_limit_intconf_matrix_10_08_35.npy', norm_upper_limit_intconf_matrix_10_08_35)

In [None]:
# One row with three panels: lower bound | mean | upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right; only the
# middle panel carries a y-axis label.
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_10_08_35, "Normalized Lower bound Dist. Matrix - n_neighbors=10, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_10_08_35, "Normalized Mean Dist. Matrix - n_neighbors=10, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_10_08_35, "Normalized Upper bound Dist. Matrix - n_neighbors=10, min_dist=0.8", ""),
]

for ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("Cluster")
    ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the MST of a (normalized) distance matrix on a given axis
def plot_mst(matrix, title, ax, color='red'):
    """Render the minimum spanning tree derived from ``matrix`` onto ``ax``.

    Args:
        matrix: Square distance matrix; rounded to 3 decimals before use.
        title: Title set on ``ax``.
        ax: Matplotlib axis that receives the drawing.
        color: Edge colour of the tree.
    """
    # Build the weighted graph, then keep only its spanning tree
    full_graph = nx.from_numpy_array(np.round(matrix, 3))
    tree = nx.minimum_spanning_tree(full_graph)

    # Fixed seed keeps node placement stable between calls
    node_positions = nx.spring_layout(tree, seed=42)

    nx.draw(tree, node_positions, with_labels=True, node_color='lightblue', edge_color=color, node_size=500, font_size=10, width=2, ax=ax)

    # Edge weights double as the distance labels
    nx.draw_networkx_edge_labels(tree, node_positions, edge_labels=nx.get_edge_attributes(tree, 'weight'), font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three side-by-side axes: lower limit (left), mean (middle), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, target axis, edge colour), drawn in the listed order
mst_panels = [
    (normalized_mean_distance_matrix_10_08_35, "UMAP MST - Mean Distances - n_neighbors=10, min_dist=0.8", axes[1], 'red'),
    (norm_lower_limit_intconf_matrix_10_08_35, "UMAP MST - Lower Limit - n_neighbors=10, min_dist=0.8", axes[0], 'blue'),
    (norm_upper_limit_intconf_matrix_10_08_35, "UMAP MST - Upper Limit - n_neighbors=10, min_dist=0.8", axes[2], 'green'),
]
for panel_matrix, panel_title, panel_ax, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Tighten spacing between the panels, then render
plt.tight_layout()
plt.show()

### n=20, min_dist=0.8

In [22]:
# Reload the cached n_neighbors=20 / min_dist=0.8 results from disk:
# all stacked projections, then their per-point mean and standard deviation.
umap_projections_20_08_35 = np.load('umap_projections_20_08_35.npy')
mean_umap_projection_20_08_35 = np.load('mean_projection_20_08_35.npy')
std_projection_umap_20_08_35 = np.load('std_projection_20_08_35.npy')

In [None]:
# Define parameters
n_neighbors = 20
min_dist = 0.8
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_20_08_35 = []

# Run UMAP multiple times; random_state=None leaves every run unseeded so the
# runs differ from each other.
for run in range(n_runs):
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)

    # Fit and transform the data
    projection = umap_model.fit_transform(x_train_flattened)

    # Store the projection
    umap_projections_20_08_35.append(projection)

# Stack the per-run projections along a leading run axis
umap_projections_20_08_35 = np.array(umap_projections_20_08_35)

# Mean and standard deviation of every point's coordinates across runs
mean_projection_20_08_35 = np.mean(umap_projections_20_08_35, axis=0)
std_projection_20_08_35 = np.std(umap_projections_20_08_35, axis=0)

# The cached-load cell and the run-filter cell use the names below; refresh
# them here so a re-run never compares new projections against a stale
# (or undefined) mean.
mean_umap_projection_20_08_35 = mean_projection_20_08_35
std_projection_umap_20_08_35 = std_projection_20_08_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_20_08_35.npy', umap_projections_20_08_35)
np.save('mean_projection_20_08_35.npy', mean_projection_20_08_35)
np.save('std_projection_20_08_35.npy', std_projection_20_08_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_20_08_35'.")


In [None]:
### USE THIS ###

# Run-level filter: rather than thresholding individual points, each run is
# summarised by its average point-to-mean distance and compared against the
# 90th percentile of those averages.

# Euclidean distance of every point to its mean position, per run -> (n_runs, n_samples)
offsets_from_mean = umap_projections_20_08_35 - mean_umap_projection_20_08_35[None, :, :]
distances_to_mean_20_08_35 = np.sqrt(np.sum(offsets_from_mean**2, axis=2))

# One average distance per run -> (n_runs,)
average_distances_per_run = np.mean(distances_to_mean_20_08_35, axis=1)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Centroid of every cluster (label 0-9) for each valid run
cluster_centroids_per_run = []

for run in valid_runs:
    # UMAP projection produced by this run
    projections = umap_projections_20_08_35[run]

    # Average the projected coordinates of the points carrying each label
    centroids = []
    for cluster_label in range(10):  # Assuming 10 clusters (digits 0-9)
        cluster_points = projections[y_train == cluster_label]
        centroid = np.mean(cluster_points, axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.array(centroids))

# Pairwise Euclidean distances between the centroids of each run
distance_matrices_20_08_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    distance_matrices_20_08_35.append(distance_matrix)

# Stack along a leading run axis -> (n_valid_runs, 10, 10)
distance_matrices_20_08_35 = np.array(distance_matrices_20_08_35)

# Element-wise average over the valid runs -> (10, 10)
mean_distance_matrix_20_08_35 = np.mean(distance_matrices_20_08_35, axis=0)

# Min-max rescale the mean matrix
normalized_mean_distance_matrix_20_08_35 = (mean_distance_matrix_20_08_35 - np.min(mean_distance_matrix_20_08_35)) / (np.max(mean_distance_matrix_20_08_35) - np.min(mean_distance_matrix_20_08_35))

# Heatmap of the normalized mean distances. The title states this cell's own
# configuration (n_neighbors=20, min_dist=0.8).
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_20_08_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=20, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
np.save('distance_matrices_neighbors_20_08_35 .npy', distance_matrices_20_08_35)
np.save('mean_distance_matrix_neighbors_20_08_35 .npy', mean_distance_matrix_20_08_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_20_08_35 }")

In [None]:
# Weighted graph built from the normalized mean distances (rounded to 3 decimals)
rounded_distances_20_08_35 = np.round(normalized_mean_distance_matrix_20_08_35, 3)
G_20_08_35 = nx.from_numpy_array(rounded_distances_20_08_35)
# NOTE(review): np.save is handed a networkx Graph rather than an array --
# confirm the resulting file can be reloaded, or persist the adjacency
# matrix / a pickle instead.
np.save('G_20_08_35 .npy', G_20_08_35)

# Seeded spring layout so the picture is reproducible
pos = nx.spring_layout(G_20_08_35, seed=42)
nx.draw(G_20_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Label every edge with its weight (the rounded distance)
edge_labels = nx.get_edge_attributes(G_20_08_35, 'weight')
nx.draw_networkx_edge_labels(G_20_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Reduce the full distance graph to its minimum spanning tree
mst_20_08_35 = nx.minimum_spanning_tree(G_20_08_35)
# NOTE(review): np.save is handed a networkx Graph rather than an array --
# confirm the resulting file can be reloaded.
np.save('mst_20_08_35 .npy', mst_20_08_35)

# Seeded layout computed on the tree itself
pos = nx.spring_layout(mst_20_08_35, seed=42)

# Draw only the tree edges, in red
nx.draw(mst_20_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Annotate the tree edges with their weights
edge_labels = nx.get_edge_attributes(mst_20_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_20_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=20, min_dist=0.8")
plt.show()

In [None]:
# Spread of every centroid-to-centroid distance across the stacked runs
# (standard deviation along axis 0) -> (n_clusters, n_clusters)
distance_matrix_std_20_08_35 = np.std(distance_matrices_20_08_35, axis=0)

# Keep a copy on disk for later reuse
np.save("distance_matrix_std_20_08_35 .npy", distance_matrix_std_20_08_35)

# Show the matrix
print("Standard Deviation Distance Matrix (20_08_35):\n", distance_matrix_std_20_08_35)


In [None]:
# Normal-approximation confidence interval around the mean distance matrix.
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # two-sided critical value of the standard normal
n_runs = 35  # Nominal number of UMAP runs (kept for reference)

# Mean and standard deviation were taken over the stacked distance matrices,
# which hold only the runs that passed the filter. The sample size of the
# mean is therefore the stack length, not the nominal run count.
n_valid_runs = distance_matrices_20_08_35.shape[0]

# Step 1: Standard Error of the Mean (SEM)
sem_matrix_20_08_35 = distance_matrix_std_20_08_35 / np.sqrt(n_valid_runs)

# Step 2: Margin of error
margin_of_error_matrix_20_08_35 = z_score * sem_matrix_20_08_35

# Step 3: Lower and upper confidence-interval matrices
lower_limit_intconf_matrix_20_08_35 = mean_distance_matrix_20_08_35 - margin_of_error_matrix_20_08_35
upper_limit_intconf_matrix_20_08_35 = mean_distance_matrix_20_08_35 + margin_of_error_matrix_20_08_35

# Distances cannot be negative, so clip the lower bound at zero
lower_limit_intconf_matrix_20_08_35 = np.maximum(lower_limit_intconf_matrix_20_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_20_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_20_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_20_08_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_20_08_35 .npy', lower_limit_intconf_matrix_20_08_35)
np.save('upper_limit_intconf_matrix_20_08_35 .npy', upper_limit_intconf_matrix_20_08_35)

In [157]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry is 0 and its largest is 1.

    Args:
        matrix: NumPy array to rescale.

    Returns:
        ``(matrix - min) / (max - min)``. When every entry is identical the
        range is zero; an all-zero float array of the same shape is returned
        instead of the NaNs a 0/0 division would produce.
    """
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: avoid the 0/0 division
        return np.zeros_like(matrix, dtype=float)
    return (matrix - lowest) / span

# Rescale the lower confidence bound to [0, 1] and persist it
norm_lower_limit_intconf_matrix_20_08_35 = normalize_matrix(lower_limit_intconf_matrix_20_08_35)
np.save('norm_lower_limit_intconf_matrix_20_08_35.npy', norm_lower_limit_intconf_matrix_20_08_35)

# Same treatment for the upper confidence bound
norm_upper_limit_intconf_matrix_20_08_35 = normalize_matrix(upper_limit_intconf_matrix_20_08_35)
np.save('norm_upper_limit_intconf_matrix_20_08_35.npy', norm_upper_limit_intconf_matrix_20_08_35)

In [None]:
# One row with three panels: lower bound | mean | upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right; only the
# middle panel carries a y-axis label.
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_20_08_35, "Normalized Lower bound Dist. Matrix - n_neighbors=20, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_20_08_35, "Normalized Mean Dist. Matrix - n_neighbors=20, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_20_08_35, "Normalized Upper bound Dist. Matrix - n_neighbors=20, min_dist=0.8", ""),
]

for ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("Cluster")
    ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the MST of a (normalized) distance matrix on a given axis
def plot_mst(matrix, title, ax, color='red'):
    """Plot the minimum spanning tree of ``matrix`` on the supplied axis.

    Args:
        matrix: Square distance matrix; rounded to 3 decimals before use.
        title: Title set on ``ax``.
        ax: Matplotlib axis that receives the drawing.
        color: Edge colour of the tree.
    """
    rounded = np.round(matrix, 3)
    spanning_tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded))

    # Seeded spring layout gives identical node positions on every call
    coords = nx.spring_layout(spanning_tree, seed=42)

    nx.draw(spanning_tree, coords, with_labels=True, node_color='lightblue', edge_color=color, node_size=500, font_size=10, width=2, ax=ax)

    # Show each edge's weight (the rounded distance) along the edge
    distance_labels = nx.get_edge_attributes(spanning_tree, 'weight')
    nx.draw_networkx_edge_labels(spanning_tree, coords, edge_labels=distance_labels, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three side-by-side axes: lower limit (left), mean (middle), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, target axis, edge colour), drawn in the listed order
mst_panels = [
    (normalized_mean_distance_matrix_20_08_35, "UMAP MST - Mean Distances - n_neighbors=20, min_dist=0.8", axes[1], 'red'),
    (norm_lower_limit_intconf_matrix_20_08_35, "UMAP MST - Lower Limit - n_neighbors=20, min_dist=0.8", axes[0], 'blue'),
    (norm_upper_limit_intconf_matrix_20_08_35, "UMAP MST - Upper Limit - n_neighbors=20, min_dist=0.8", axes[2], 'green'),
]
for panel_matrix, panel_title, panel_ax, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Tighten spacing between the panels, then render
plt.tight_layout()
plt.show()

### n=30, min_dist=0.8

In [23]:
# Reload the cached n_neighbors=30 / min_dist=0.8 results from disk:
# all stacked projections, then their per-point mean and standard deviation.
umap_projections_30_08_35 = np.load('umap_projections_30_08_35.npy')
mean_umap_projection_30_08_35 = np.load('mean_projection_30_08_35.npy')
std_projection_umap_30_08_35 = np.load('std_projection_30_08_35.npy')

In [None]:
# Define parameters
n_neighbors = 30
min_dist = 0.8
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_30_08_35 = []

# Run UMAP multiple times; random_state=None leaves every run unseeded so the
# runs differ from each other.
for run in range(n_runs):
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)

    # Fit and transform the data
    projection = umap_model.fit_transform(x_train_flattened)

    # Store the projection
    umap_projections_30_08_35.append(projection)

# Stack the per-run projections along a leading run axis
umap_projections_30_08_35 = np.array(umap_projections_30_08_35)

# Mean and standard deviation of every point's coordinates across runs
mean_projection_30_08_35 = np.mean(umap_projections_30_08_35, axis=0)
std_projection_30_08_35 = np.std(umap_projections_30_08_35, axis=0)

# The cached-load cell and the run-filter cell use the names below; refresh
# them here so a re-run never compares new projections against a stale
# (or undefined) mean.
mean_umap_projection_30_08_35 = mean_projection_30_08_35
std_projection_umap_30_08_35 = std_projection_30_08_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_30_08_35.npy', umap_projections_30_08_35)
np.save('mean_projection_30_08_35.npy', mean_projection_30_08_35)
np.save('std_projection_30_08_35.npy', std_projection_30_08_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_30_08_35'.")


In [None]:
### USE THIS ###

# Run-level filter: rather than thresholding individual points, each run is
# summarised by its average point-to-mean distance and compared against the
# 90th percentile of those averages.

# Euclidean distance of every point to its mean position, per run -> (n_runs, n_samples)
offsets_from_mean = umap_projections_30_08_35 - mean_umap_projection_30_08_35[None, :, :]
distances_to_mean_30_08_35 = np.sqrt(np.sum(offsets_from_mean**2, axis=2))

# One average distance per run -> (n_runs,)
average_distances_per_run = np.mean(distances_to_mean_30_08_35, axis=1)

# Cut-off: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Per-run cluster centroids: for every valid run, average the projected
# coordinates of the points carrying each label 0-9.
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_30_08_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids of each run, stacked
# along a leading run axis -> (n_valid_runs, 10, 10)
distance_matrices_30_08_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Element-wise average over the valid runs -> (10, 10)
mean_distance_matrix_30_08_35 = np.mean(distance_matrices_30_08_35, axis=0)

# Min-max rescale the mean matrix
smallest_mean = np.min(mean_distance_matrix_30_08_35)
largest_mean = np.max(mean_distance_matrix_30_08_35)
normalized_mean_distance_matrix_30_08_35 = (mean_distance_matrix_30_08_35 - smallest_mean) / (largest_mean - smallest_mean)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_30_08_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=30, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run stack and its mean
np.save('distance_matrices_neighbors_30_08_35 .npy', distance_matrices_30_08_35)
np.save('mean_distance_matrix_neighbors_30_08_35 .npy', mean_distance_matrix_30_08_35)

# Report the (un-normalized) mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_30_08_35 }")

In [None]:
# Weighted graph built from the normalized mean distances (rounded to 3 decimals)
rounded_distances_30_08_35 = np.round(normalized_mean_distance_matrix_30_08_35, 3)
G_30_08_35 = nx.from_numpy_array(rounded_distances_30_08_35)
# NOTE(review): np.save is handed a networkx Graph rather than an array --
# confirm the resulting file can be reloaded, or persist the adjacency
# matrix / a pickle instead.
np.save('G_30_08_35 .npy', G_30_08_35)

# Seeded spring layout so the picture is reproducible
pos = nx.spring_layout(G_30_08_35, seed=42)
nx.draw(G_30_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Label every edge with its weight (the rounded distance)
edge_labels = nx.get_edge_attributes(G_30_08_35, 'weight')
nx.draw_networkx_edge_labels(G_30_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Reduce the full distance graph to its minimum spanning tree
mst_30_08_35 = nx.minimum_spanning_tree(G_30_08_35)
# NOTE(review): np.save is handed a networkx Graph rather than an array --
# confirm the resulting file can be reloaded.
np.save('mst_30_08_35 .npy', mst_30_08_35)

# Seeded layout computed on the tree itself
pos = nx.spring_layout(mst_30_08_35, seed=42)

# Draw only the tree edges, in red
nx.draw(mst_30_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Annotate the tree edges with their weights
edge_labels = nx.get_edge_attributes(mst_30_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_30_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=30, min_dist=0.8")
plt.show()

In [None]:
# Spread of every centroid-to-centroid distance across the stacked runs
# (standard deviation along axis 0) -> (n_clusters, n_clusters)
distance_matrix_std_30_08_35 = np.std(distance_matrices_30_08_35, axis=0)

# Keep a copy on disk for later reuse
np.save("distance_matrix_std_30_08_35 .npy", distance_matrix_std_30_08_35)

# Show the matrix
print("Standard Deviation Distance Matrix (30_08_35):\n", distance_matrix_std_30_08_35)


In [None]:
# Normal-approximation confidence interval around the mean distance matrix.
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # two-sided critical value of the standard normal
n_runs = 35  # Nominal number of UMAP runs (kept for reference)

# Mean and standard deviation were taken over the stacked distance matrices,
# which hold only the runs that passed the filter. The sample size of the
# mean is therefore the stack length, not the nominal run count.
n_valid_runs = distance_matrices_30_08_35.shape[0]

# Step 1: Standard Error of the Mean (SEM)
sem_matrix_30_08_35 = distance_matrix_std_30_08_35 / np.sqrt(n_valid_runs)

# Step 2: Margin of error
margin_of_error_matrix_30_08_35 = z_score * sem_matrix_30_08_35

# Step 3: Lower and upper confidence-interval matrices
lower_limit_intconf_matrix_30_08_35 = mean_distance_matrix_30_08_35 - margin_of_error_matrix_30_08_35
upper_limit_intconf_matrix_30_08_35 = mean_distance_matrix_30_08_35 + margin_of_error_matrix_30_08_35

# Distances cannot be negative, so clip the lower bound at zero
lower_limit_intconf_matrix_30_08_35 = np.maximum(lower_limit_intconf_matrix_30_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_30_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_30_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_30_08_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_30_08_35 .npy', lower_limit_intconf_matrix_30_08_35)
np.save('upper_limit_intconf_matrix_30_08_35 .npy', upper_limit_intconf_matrix_30_08_35)

In [167]:
def normalize_matrix(matrix):
    """Min-max scale *matrix* so its smallest entry is 0 and its largest is 1.

    Args:
        matrix: Array-like of numbers.

    Returns:
        A float array of the same shape. When every entry is equal the range
        is zero, so an all-zero array is returned instead of dividing 0 by 0.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: avoid the 0/0 division that would fill the result with NaN.
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Rescale each confidence-limit matrix with normalize_matrix and store it right away.
norm_lower_limit_intconf_matrix_30_08_35 = normalize_matrix(lower_limit_intconf_matrix_30_08_35)
np.save('norm_lower_limit_intconf_matrix_30_08_35.npy', norm_lower_limit_intconf_matrix_30_08_35)

norm_upper_limit_intconf_matrix_30_08_35 = normalize_matrix(upper_limit_intconf_matrix_30_08_35)
np.save('norm_upper_limit_intconf_matrix_30_08_35.npy', norm_upper_limit_intconf_matrix_30_08_35)

In [None]:
# One row of three heatmaps: lower limit, mean, upper limit (left to right).
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, in drawing order.
heatmap_panels_30_08_35 = [
    (norm_lower_limit_intconf_matrix_30_08_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=30, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_30_08_35,
     "Normalized Mean Dist. Matrix - n_neighbors=30, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_30_08_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=30, min_dist=0.8", ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels_30_08_35):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of one distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on a given axis.

    Args:
        matrix: Square distance matrix; entries are rounded to 3 decimals
            before being used as edge weights.
        title: Title placed above the axis.
        ax: Matplotlib axis that receives the drawing.
        color: Colour of the tree edges.
    """
    # Weighted graph from the rounded matrix, reduced to its spanning tree
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed so the node placement is reproducible
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Three side-by-side axes: lower limit (left), mean (middle), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, axis index, edge colour), drawn in the listed order
mst_panels_30_08_35 = [
    (normalized_mean_distance_matrix_30_08_35, "UMAP MST - Mean Distances - n_neighbors=30, min_dist=0.8", 1, 'red'),
    (norm_lower_limit_intconf_matrix_30_08_35, "UMAP MST - Lower Limit - n_neighbors=30, min_dist=0.8", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_30_08_35, "UMAP MST - Upper Limit - n_neighbors=30, min_dist=0.8", 2, 'green'),
]
for panel_matrix, panel_title, panel_index, panel_color in mst_panels_30_08_35:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

### n=50, min_dist=0.8

In [12]:
# Reload the saved arrays of the n_neighbors=50, min_dist=0.8 experiment.
# Stacked projections plus their per-point mean and standard deviation:
umap_projections_50_08_35 = np.load('umap_projections_50_08_35.npy')
mean_umap_projection_50_08_35 = np.load('mean_projection_50_08_35.npy')
std_projection_umap_50_08_35 = np.load('std_projection_50_08_35.npy')

# Centroid distance matrices and their mean.
# NOTE: these two file names contain a space before ".npy", matching how they were saved.
distance_matrices_50_08_35 = np.load('distance_matrices_neighbors_50_08_35 .npy')
mean_distance_matrix_50_08_35 = np.load('mean_distance_matrix_neighbors_50_08_35 .npy')

In [None]:
# Repeat UMAP with n_neighbors=50, min_dist=0.8 and collect every projection.
# Define parameters
n_neighbors = 50
min_dist = 0.8
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_50_08_35 = []

# Run UMAP multiple times
for run in range(n_runs):
    # random_state=None leaves the seed unset, so each run can differ
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)

    # Fit and transform the data
    projection = umap_model.fit_transform(x_train_flattened)

    # Store the projection
    umap_projections_50_08_35.append(projection)

# Stack the projections along a new leading axis (one entry per run)
umap_projections_50_08_35 = np.array(umap_projections_50_08_35)

# Per-point mean and standard deviation across runs
mean_projection_50_08_35 = np.mean(umap_projections_50_08_35, axis=0)
std_projection_50_08_35 = np.std(umap_projections_50_08_35, axis=0)

# The follow-up cells read the names bound by the np.load cell
# (mean_umap_projection_* / std_projection_umap_*). Bind them here too, so a
# fresh computation does not hit a NameError or silently reuse stale data.
mean_umap_projection_50_08_35 = mean_projection_50_08_35
std_projection_umap_50_08_35 = std_projection_50_08_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_50_08_35.npy', umap_projections_50_08_35)
np.save('mean_projection_50_08_35.npy', mean_projection_50_08_35)
np.save('std_projection_50_08_35.npy', std_projection_50_08_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_50_08_35'.")

In [None]:
### USE THIS ###

# Run-level filter: rather than thresholding individual points, summarise each
# run by its average point-to-mean distance and compare that single number
# against the 90th percentile over all runs.

# Euclidean distance of every point to the mean projection, per run
distances_to_mean_50_08_35 = np.sqrt(
    np.square(umap_projections_50_08_35 - mean_umap_projection_50_08_35[np.newaxis]).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One average distance per run
average_distances_per_run = distances_to_mean_50_08_35.mean(axis=1)  # Shape: (n_runs,)

# 90th percentile of those averages is the cut-off
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# For every valid run, the centroid of each label 0-9 in the 2-D projection.
# Each list entry is an array holding the ten centroids of one run.
cluster_centroids_per_run = [
    np.array([
        umap_projections_50_08_35[run][y_train == cluster_label].mean(axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids of each run,
# stacked into one array with shape (n_valid_runs, 10, 10)
distance_matrices_50_08_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Average over the valid runs -> shape (10, 10)
mean_distance_matrix_50_08_35 = distance_matrices_50_08_35.mean(axis=0)

# Min-max normalisation of the mean distance matrix
smallest_mean_distance = np.min(mean_distance_matrix_50_08_35)
largest_mean_distance = np.max(mean_distance_matrix_50_08_35)
normalized_mean_distance_matrix_50_08_35 = (
    (mean_distance_matrix_50_08_35 - smallest_mean_distance)
    / (largest_mean_distance - smallest_mean_distance)
)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_50_08_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=50, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean (file names keep the space before ".npy")
np.save('distance_matrices_neighbors_50_08_35 .npy', distance_matrices_50_08_35)
np.save('mean_distance_matrix_neighbors_50_08_35 .npy', mean_distance_matrix_50_08_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_50_08_35}")

In [14]:
# Min-max normalisation of the mean distance matrix (smallest entry -> 0, largest -> 1)
smallest_mean_distance = np.min(mean_distance_matrix_50_08_35)
largest_mean_distance = np.max(mean_distance_matrix_50_08_35)
normalized_mean_distance_matrix_50_08_35 = (
    (mean_distance_matrix_50_08_35 - smallest_mean_distance)
    / (largest_mean_distance - smallest_mean_distance)
)

In [None]:
# Weighted graph whose edge weights are the normalized mean distances, rounded to 3 decimals
G_50_08_35 = nx.from_numpy_array(normalized_mean_distance_matrix_50_08_35.round(3))
# NOTE(review): np.save is meant for array data; confirm that a networkx Graph
# written this way can be read back as intended.
np.save('G_50_08_35 .npy', G_50_08_35)

# Node positions from a seeded spring layout (reproducible placement)
pos = nx.spring_layout(G_50_08_35, seed=42)
nx.draw(
    G_50_08_35,
    pos,
    with_labels=True,
    node_color='lightblue',
    edge_color='gray',
    node_size=800,
    font_size=10,
)

# Label every edge with its weight (distance)
edge_labels = nx.get_edge_attributes(G_50_08_35, 'weight')
nx.draw_networkx_edge_labels(
    G_50_08_35,
    pos,
    edge_labels=edge_labels,
    font_size=8,
    label_pos=0.3,
)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph.
# This has to come first: the total weight below reads mst_50_08_35, and
# computing it before the tree exists raises a NameError on a fresh kernel
# (or silently reports a stale tree from an earlier execution).
mst_50_08_35 = nx.minimum_spanning_tree(G_50_08_35)

# Total weight of the MST = sum of the weights of its edges
total_weight_50_08_35 = sum(nx.get_edge_attributes(mst_50_08_35, 'weight').values())

# Print the total weight
print(f"Total weight of the MST: {total_weight_50_08_35}")

# NOTE(review): np.save is meant for array data; confirm that a networkx Graph
# written this way can be read back as intended.
np.save('mst_50_08_35 .npy', mst_50_08_35)

# Define positions for all nodes (seeded for a reproducible layout)
pos = nx.spring_layout(mst_50_08_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_50_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_50_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_50_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=50, min_dist=0.8")
plt.show()

In [None]:
# Element-wise standard deviation of the stacked distance matrices.
# Reducing over axis 0 leaves one value per pair of clusters.
distance_matrix_std_50_08_35 = np.std(distance_matrices_50_08_35, axis=0)

# Show the matrix, then persist it for later sessions.
# NOTE: the file name contains a space before ".npy"; it is kept unchanged.
print("Standard Deviation Distance Matrix (50_08_35):\n", distance_matrix_std_50_08_35)
np.save("distance_matrix_std_50_08_35 .npy", distance_matrix_std_50_08_35)


In [None]:
# 95% confidence interval (normal approximation) for the mean distance matrix,
# n_neighbors=50, min_dist=0.8.
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
n_runs = 35  # Number of UMAP runs launched; kept so cells that read n_runs still see 35

# distance_matrices_50_08_35 stacks one matrix per *valid* run, and both the mean
# and the standard deviation were reduced over that axis. The sample size of the
# mean is therefore the stack length, not the 35 runs that were launched.
n_samples_50_08_35 = len(distance_matrices_50_08_35)

# Step 1: Standard Error of the Mean (SEM) for every cluster pair
sem_matrix_50_08_35 = distance_matrix_std_50_08_35 / np.sqrt(n_samples_50_08_35)

# Step 2: Half-width of the confidence interval
margin_of_error_matrix_50_08_35 = z_score * sem_matrix_50_08_35

# Step 3: Lower and upper confidence limits around the mean distances
lower_limit_intconf_matrix_50_08_35 = mean_distance_matrix_50_08_35 - margin_of_error_matrix_50_08_35
upper_limit_intconf_matrix_50_08_35 = mean_distance_matrix_50_08_35 + margin_of_error_matrix_50_08_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_50_08_35 = np.maximum(lower_limit_intconf_matrix_50_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_50_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_50_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_50_08_35)

# Save the matrices for future use (file names keep the space before ".npy")
np.save('lower_limit_intconf_matrix_50_08_35 .npy', lower_limit_intconf_matrix_50_08_35)
np.save('upper_limit_intconf_matrix_50_08_35 .npy', upper_limit_intconf_matrix_50_08_35)

In [177]:
def normalize_matrix(matrix):
    """Min-max scale *matrix* so its smallest entry is 0 and its largest is 1.

    Args:
        matrix: Array-like of numbers.

    Returns:
        A float array of the same shape. When every entry is equal the range
        is zero, so an all-zero array is returned instead of dividing 0 by 0.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: avoid the 0/0 division that would fill the result with NaN.
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Rescale each confidence-limit matrix with normalize_matrix and store it right away.
norm_lower_limit_intconf_matrix_50_08_35 = normalize_matrix(lower_limit_intconf_matrix_50_08_35)
np.save('norm_lower_limit_intconf_matrix_50_08_35.npy', norm_lower_limit_intconf_matrix_50_08_35)

norm_upper_limit_intconf_matrix_50_08_35 = normalize_matrix(upper_limit_intconf_matrix_50_08_35)
np.save('norm_upper_limit_intconf_matrix_50_08_35.npy', norm_upper_limit_intconf_matrix_50_08_35)

In [None]:
# One row of three heatmaps: lower limit, mean, upper limit (left to right).
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, in drawing order.
heatmap_panels_50_08_35 = [
    (norm_lower_limit_intconf_matrix_50_08_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=50, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_50_08_35,
     "Normalized Mean Dist. Matrix - n_neighbors=50, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_50_08_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=50, min_dist=0.8", ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels_50_08_35):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [20]:
# Reload the normalized confidence-limit matrices saved for n_neighbors=50, min_dist=0.8.
norm_lower_limit_intconf_matrix_50_08_35 = np.load('norm_lower_limit_intconf_matrix_50_08_35.npy')
norm_upper_limit_intconf_matrix_50_08_35 = np.load('norm_upper_limit_intconf_matrix_50_08_35.npy')

In [None]:
# Helper that draws the minimum spanning tree of one distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on a given axis.

    Args:
        matrix: Square distance matrix; entries are rounded to 3 decimals
            before being used as edge weights.
        title: Title placed above the axis.
        ax: Matplotlib axis that receives the drawing.
        color: Colour of the tree edges.
    """
    # Weighted graph from the rounded matrix, reduced to its spanning tree
    rounded_weights = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_weights))

    # Fixed seed so the node placement is reproducible
    layout = nx.spring_layout(tree, seed=42)

    node_and_edge_style = dict(
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
    )
    nx.draw(tree, layout, ax=ax, **node_and_edge_style)

    # Annotate every tree edge with its weight (the rounded distance)
    weights_by_edge = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weights_by_edge, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three side-by-side axes: lower limit (left), mean (middle), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, axis index, edge colour), drawn in the listed order
mst_panels_50_08_35 = [
    (normalized_mean_distance_matrix_50_08_35, "UMAP MST - Mean Distances - n_neighbors=50, min_dist=0.8", 1, 'red'),
    (norm_lower_limit_intconf_matrix_50_08_35, "UMAP MST - Lower Limit - n_neighbors=50, min_dist=0.8", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_50_08_35, "UMAP MST - Upper Limit - n_neighbors=50, min_dist=0.8", 2, 'green'),
]
for panel_matrix, panel_title, panel_index, panel_color in mst_panels_50_08_35:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

### n=100, min_dist=0.8

In [25]:
# Reload the saved arrays of the n_neighbors=100, min_dist=0.8 experiment:
# the stacked projections plus their per-point mean and standard deviation.
umap_projections_100_08_35 = np.load('umap_projections_100_08_35.npy')
mean_umap_projection_100_08_35 = np.load('mean_projection_100_08_35.npy')
std_projection_umap_100_08_35 = np.load('std_projection_100_08_35.npy')

In [None]:
# Repeat UMAP with n_neighbors=100, min_dist=0.8 and collect every projection.
# Define parameters
n_neighbors = 100
min_dist = 0.8
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_100_08_35 = []

# Run UMAP multiple times
for run in range(n_runs):
    # random_state=None leaves the seed unset, so each run can differ
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)

    # Fit and transform the data
    projection = umap_model.fit_transform(x_train_flattened)

    # Store the projection
    umap_projections_100_08_35.append(projection)

# Stack the projections along a new leading axis (one entry per run)
umap_projections_100_08_35 = np.array(umap_projections_100_08_35)

# Per-point mean and standard deviation across runs
mean_projection_100_08_35 = np.mean(umap_projections_100_08_35, axis=0)
std_projection_100_08_35 = np.std(umap_projections_100_08_35, axis=0)

# The follow-up cells read the names bound by the np.load cell
# (mean_umap_projection_* / std_projection_umap_*). Bind them here too, so a
# fresh computation does not hit a NameError or silently reuse stale data.
mean_umap_projection_100_08_35 = mean_projection_100_08_35
std_projection_umap_100_08_35 = std_projection_100_08_35

# Save the projections, mean, and standard deviation
np.save('umap_projections_100_08_35.npy', umap_projections_100_08_35)
np.save('mean_projection_100_08_35.npy', mean_projection_100_08_35)
np.save('std_projection_100_08_35.npy', std_projection_100_08_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_100_08_35'.")


In [None]:
### USE THIS ###

# Run-level filter: rather than thresholding individual points, summarise each
# run by its average point-to-mean distance and compare that single number
# against the 90th percentile over all runs.

# Euclidean distance of every point to the mean projection, per run
distances_to_mean_100_08_35 = np.sqrt(
    np.square(umap_projections_100_08_35 - mean_umap_projection_100_08_35[np.newaxis]).sum(axis=2)
)  # Shape: (n_runs, n_samples)

# One average distance per run
average_distances_per_run = distances_to_mean_100_08_35.mean(axis=1)  # Shape: (n_runs,)

# 90th percentile of those averages is the cut-off
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs at or below the cut-off
valid_runs = np.flatnonzero(average_distances_per_run <= average_distance_threshold).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# For every valid run, the centroid of each label 0-9 in the 2-D projection.
# Each list entry is an array holding the ten centroids of one run.
cluster_centroids_per_run = [
    np.array([
        umap_projections_100_08_35[run][y_train == cluster_label].mean(axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids of each run,
# stacked into one array with shape (n_valid_runs, 10, 10)
distance_matrices_100_08_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Average over the valid runs -> shape (10, 10)
mean_distance_matrix_100_08_35 = distance_matrices_100_08_35.mean(axis=0)

# Min-max normalisation of the mean distance matrix
smallest_mean_distance = np.min(mean_distance_matrix_100_08_35)
largest_mean_distance = np.max(mean_distance_matrix_100_08_35)
normalized_mean_distance_matrix_100_08_35 = (
    (mean_distance_matrix_100_08_35 - smallest_mean_distance)
    / (largest_mean_distance - smallest_mean_distance)
)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_100_08_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=100, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean (file names keep the space before ".npy")
np.save('distance_matrices_neighbors_100_08_35 .npy', distance_matrices_100_08_35)
np.save('mean_distance_matrix_neighbors_100_08_35 .npy', mean_distance_matrix_100_08_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_100_08_35}")

In [None]:
# Weighted graph whose edge weights are the normalized mean distances, rounded to 3 decimals
G_100_08_35 = nx.from_numpy_array(normalized_mean_distance_matrix_100_08_35.round(3))
# NOTE(review): np.save is meant for array data; confirm that a networkx Graph
# written this way can be read back as intended.
np.save('G_100_08_35 .npy', G_100_08_35)

# Node positions from a seeded spring layout (reproducible placement)
pos = nx.spring_layout(G_100_08_35, seed=42)
nx.draw(
    G_100_08_35,
    pos,
    with_labels=True,
    node_color='lightblue',
    edge_color='gray',
    node_size=800,
    font_size=10,
)

# Label every edge with its weight (distance)
edge_labels = nx.get_edge_attributes(G_100_08_35, 'weight')
nx.draw_networkx_edge_labels(
    G_100_08_35,
    pos,
    edge_labels=edge_labels,
    font_size=8,
    label_pos=0.3,
)
plt.show()

In [None]:
# Minimum spanning tree of the full distance graph
mst_100_08_35 = nx.minimum_spanning_tree(G_100_08_35)
# NOTE(review): np.save is meant for array data; confirm that a networkx Graph
# written this way can be read back as intended.
np.save('mst_100_08_35 .npy', mst_100_08_35)

# Seeded spring layout so the node placement is reproducible
pos = nx.spring_layout(mst_100_08_35, seed=42)

# Draw only the tree: light-blue nodes, red edges
nx.draw(
    mst_100_08_35,
    pos,
    with_labels=True,
    node_color='lightblue',
    edge_color='red',
    node_size=500,
    font_size=10,
    width=2,
)

# Label every tree edge with its weight (distance)
edge_labels = nx.get_edge_attributes(mst_100_08_35, 'weight')
nx.draw_networkx_edge_labels(
    mst_100_08_35,
    pos,
    edge_labels=edge_labels,
    font_size=8,
    label_pos=0.3,
)

plt.title("MST UMAP - n_neighbors=100, min_dist=0.8")
plt.show()

In [None]:
# Element-wise standard deviation of the stacked distance matrices.
# Reducing over axis 0 leaves one value per pair of clusters.
distance_matrix_std_100_08_35 = np.std(distance_matrices_100_08_35, axis=0)

# Show the matrix, then persist it for later sessions.
# NOTE: the file name contains a space before ".npy"; it is kept unchanged.
print("Standard Deviation Distance Matrix (100_08_35):\n", distance_matrix_std_100_08_35)
np.save("distance_matrix_std_100_08_35 .npy", distance_matrix_std_100_08_35)


In [None]:
# 95% confidence interval (normal approximation) for the mean distance matrix,
# n_neighbors=100, min_dist=0.8.
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
n_runs = 35  # Number of UMAP runs launched; kept so cells that read n_runs still see 35

# distance_matrices_100_08_35 stacks one matrix per *valid* run, and both the mean
# and the standard deviation were reduced over that axis. The sample size of the
# mean is therefore the stack length, not the 35 runs that were launched.
n_samples_100_08_35 = len(distance_matrices_100_08_35)

# Step 1: Standard Error of the Mean (SEM) for every cluster pair
sem_matrix_100_08_35 = distance_matrix_std_100_08_35 / np.sqrt(n_samples_100_08_35)

# Step 2: Half-width of the confidence interval
margin_of_error_matrix_100_08_35 = z_score * sem_matrix_100_08_35

# Step 3: Lower and upper confidence limits around the mean distances
lower_limit_intconf_matrix_100_08_35 = mean_distance_matrix_100_08_35 - margin_of_error_matrix_100_08_35
upper_limit_intconf_matrix_100_08_35 = mean_distance_matrix_100_08_35 + margin_of_error_matrix_100_08_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_100_08_35 = np.maximum(lower_limit_intconf_matrix_100_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_100_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_100_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_100_08_35)

# Save the matrices for future use (file names keep the space before ".npy")
np.save('lower_limit_intconf_matrix_100_08_35 .npy', lower_limit_intconf_matrix_100_08_35)
np.save('upper_limit_intconf_matrix_100_08_35 .npy', upper_limit_intconf_matrix_100_08_35)

In [187]:
def normalize_matrix(matrix):
    """Min-max scale *matrix* so its smallest entry is 0 and its largest is 1.

    Args:
        matrix: Array-like of numbers.

    Returns:
        A float array of the same shape. When every entry is equal the range
        is zero, so an all-zero array is returned instead of dividing 0 by 0.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: avoid the 0/0 division that would fill the result with NaN.
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Rescale each confidence-limit matrix with normalize_matrix and store it right away.
norm_lower_limit_intconf_matrix_100_08_35 = normalize_matrix(lower_limit_intconf_matrix_100_08_35)
np.save('norm_lower_limit_intconf_matrix_100_08_35.npy', norm_lower_limit_intconf_matrix_100_08_35)

norm_upper_limit_intconf_matrix_100_08_35 = normalize_matrix(upper_limit_intconf_matrix_100_08_35)
np.save('norm_upper_limit_intconf_matrix_100_08_35.npy', norm_upper_limit_intconf_matrix_100_08_35)

In [None]:
# One row of three heatmaps: lower limit, mean, upper limit (left to right).
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, in drawing order.
heatmap_panels_100_08_35 = [
    (norm_lower_limit_intconf_matrix_100_08_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=100, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_100_08_35,
     "Normalized Mean Dist. Matrix - n_neighbors=100, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_100_08_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=100, min_dist=0.8", ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels_100_08_35):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of one distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on a given axis.

    Args:
        matrix: Square distance matrix; entries are rounded to 3 decimals
            before being used as edge weights.
        title: Title placed above the axis.
        ax: Matplotlib axis that receives the drawing.
        color: Colour of the tree edges.
    """
    # Build the weighted graph, then keep only its minimum spanning tree
    full_graph = nx.from_numpy_array(np.round(matrix, 3))
    spanning_tree = nx.minimum_spanning_tree(full_graph)

    # Fixed seed so the node placement is reproducible
    node_positions = nx.spring_layout(spanning_tree, seed=42)

    nx.draw(
        spanning_tree,
        node_positions,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        spanning_tree,
        node_positions,
        edge_labels=nx.get_edge_attributes(spanning_tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Three side-by-side axes: lower limit (left), mean (middle), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, axis index, edge colour), drawn in the listed order
mst_panels_100_08_35 = [
    (normalized_mean_distance_matrix_100_08_35, "UMAP MST - Mean Distances - n_neighbors=100, min_dist=0.8", 1, 'red'),
    (norm_lower_limit_intconf_matrix_100_08_35, "UMAP MST - Lower Limit - n_neighbors=100, min_dist=0.8", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_100_08_35, "UMAP MST - Upper Limit - n_neighbors=100, min_dist=0.8", 2, 'green'),
]
for panel_matrix, panel_title, panel_index, panel_color in mst_panels_100_08_35:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

## General comparison

In [None]:
# 2 x 3 grid: one heatmap of the normalized mean distance matrix per n_neighbors value
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# (n_neighbors value, matrix), listed in row-major panel order
mean_matrix_by_setting = [
    (5, normalized_mean_distance_matrix_5_35),
    (10, normalized_mean_distance_matrix_10_35),
    (20, normalized_mean_distance_matrix_20_35),
    (30, normalized_mean_distance_matrix_30_35),
    (50, normalized_mean_distance_matrix_50_35),
    (100, normalized_mean_distance_matrix_100_35),
]

for panel_index, (panel_ax, (setting, panel_matrix)) in enumerate(zip(axes.flat, mean_matrix_by_setting)):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(f"Normalized Mean Dist. Matrix (k=10, n_neighbors={setting})")
    panel_ax.set_xlabel("Cluster")
    # Only the first panel carries a y-axis label
    panel_ax.set_ylabel("Cluster" if panel_index == 0 else "")

# Adjusted layout
plt.tight_layout()
plt.show()

**Smaller n_neighbors (5, 10):**

- Clusters exhibit closer relationships, with smaller normalized distances between certain cluster pairs (e.g., clusters 2 and 3, 2 and 5).
- Cluster overlap is visible in the UMAP embeddings, as smaller n_neighbors focus on local structures, resulting in tighter but more interconnected clusters.
- Variations in distances between cluster pairs suggest some **instability** in global separation.

**Larger n_neighbors (50, 100):**

- Clusters become more distinct, with higher mean distances across most cluster pairs.
- The normalized values tend to converge across different pairs, indicating that clusters are uniformly separated.
- There is a loss of local relationships, but the global structure is more stable.

**Insights:**
- Smaller n_neighbors: Ideal for capturing fine-grained local relationships, but may lead to cluster overlap or ambiguity.
- Larger n_neighbors: Better for creating distinct, globally separated clusters, though at the expense of local details.

The *lack of a consistent* trend for certain cluster pairs (like Cluster 2 and Cluster 3) as n_neighbors increases—specifically the fact that normalized distances at n=50 and n=100 differ—can be explained by the *inherent trade-offs in UMAP's parameterization and how it balances local vs. global structure at different scales.*

**- UMAP is nonlinear:**

The algorithm uses local distances between neighbors to construct the high-dimensional graph, which it then embeds into a lower-dimensional space. Nonlinearities in how UMAP maps points into the embedding space can create irregularities in the behavior of normalized mean distances.

**- Global vs. Local Trade-offs:**

When n_neighbors is small (e.g., 5 or 10), the embedding is heavily focused on local connections. For cluster pairs that are naturally distinct (like Cluster 2 and Cluster 3), the local focus ensures strong separation.
As n_neighbors increases, UMAP prioritizes global consistency, which can reduce the relative separation between certain cluster pairs. However, for n_neighbors=50 vs. n_neighbors=100, small shifts in the graph structure (how clusters are connected globally) can lead to inconsistent distances.

**- Stochasticity in UMAP:**

Even with fixed parameters, UMAP embeddings can exhibit small variations due to its stochastic nature. These variations may amplify when focusing on normalized metrics like distances between clusters.

In [None]:
# 2 x 3 grid: one heatmap of the normalized std-dev distance matrix per n_neighbors value
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# (n_neighbors value, matrix), listed in row-major panel order
std_matrix_by_setting = [
    (5, normalized_distance_matrix_std_5_35),
    (10, normalized_distance_matrix_std_10_35),
    (20, normalized_distance_matrix_std_20_35),
    (30, normalized_distance_matrix_std_30_35),
    (50, normalized_distance_matrix_std_50_35),
    (100, normalized_distance_matrix_std_100_35),
]

for panel_index, (panel_ax, (setting, panel_matrix)) in enumerate(zip(axes.flat, std_matrix_by_setting)):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(f"Normalized Std. Dev. Dist. Matrix (k=10, n_neighbors={setting})")
    panel_ax.set_xlabel("Cluster")
    # Only the first panel carries a y-axis label
    panel_ax.set_ylabel("Cluster" if panel_index == 0 else "")

# Adjusted layout
plt.tight_layout()
plt.show()

**Smaller n_neighbors (5, 10):**

- Higher normalized std. deviation distances for several cluster pairs.
- Indicates more variability within clusters and greater sensitivity to noise or small changes in the data.


**Larger n_neighbors (50, 100):**

- Lower std. deviation distances across most cluster pairs, reflecting greater stability and reduced variability.
- This indicates that larger n_neighbors values smooth out small-scale variations and lead to more consistent cluster shapes across runs.

**Insights:**

- Smaller n_neighbors: Capture more local variability, which may be useful for detecting finer structural differences but might introduce instability.
- Larger n_neighbors: Produce more stable clusters with lower variability, ideal for applications requiring consistent global structure.

In [None]:
# # Define a dictionary with mean distance matrices and their corresponding n_neighbors
# mean_distance_matrices = {
#     "n_neighbors=5": mean_distance_matrix_5_35,
#     "n_neighbors=10": mean_distance_matrix_10_35,
#     "n_neighbors=20": mean_distance_matrix_20_35,
#     "n_neighbors=30": mean_distance_matrix_30_35,
#     "n_neighbors=50": mean_distance_matrix_50_35,
#     "n_neighbors=100": mean_distance_matrix_100_35,
# }

# # Normalize each matrix and store them in a new dictionary
# normalized_matrices = {
#     key: (matrix - np.min(matrix)) / (np.max(matrix) - np.min(matrix))
#     for key, matrix in mean_distance_matrices.items()
# }

# # Set up the figure with 2 rows and 3 columns
# fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# # Flatten the axes array for easier iteration
# axes = axes.flatten()

# # Plot each normalized matrix
# for ax, (key, matrix) in zip(axes, normalized_matrices.items()):
#     sns.heatmap(matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=ax)
#     ax.set_title(f"Normalized Mean Dist. Matrix ({key})")
#     ax.set_xlabel("Cluster")
#     ax.set_ylabel("Cluster" if "n_neighbors=5" in key else "")

# # Adjust layout
# plt.tight_layout()
# plt.show()

**n_neighbors = 5**
- High local connectivity: with a small n_neighbors value, UMAP focuses more on local structure. Clusters that are close in the original high-dimensional space remain close.
- Broader range of values (from around 0.00 to nearly 1.00), indicating that some clusters are very close while others are more distant. UMAP captures detailed local variations, creating a more distinct separation between clusters with less overlap.

**n_neighbors = 10**
- Mix of both local and global relationships. Moderate distance range, with slightly less contrast compared to n_neighbors=5.
- More similarity in distances across clusters, some of the values in the heatmap become more uniform.

**n_neighbors = 20**
- Emphasis on Global Structure: With n_neighbors set to 20, UMAP emphasizes global relationships between clusters. Fewer very close or very distant clusters.
- More Homogeneous Distances: less variation, indicating that UMAP is pulling clusters closer together in the reduced space. This setting can blur local distinctions and give a more general structure.

-----------------

In [None]:
import matplotlib.image as mpimg

# Saved neighbor-count plots, one PNG per n_neighbors setting
file_names = [
    f'neighbor_counts_plot_n_{k}_35.png' for k in (5, 10, 20, 30, 50, 100)
]

# Grid shape used to lay the six images out
n_rows, n_cols = 2, 3

# One axis per image, with a shared headline above the grid
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 10))
fig.suptitle('Neighbor Counts Across Runs for Different n_neighbors Values', fontsize=16)

# Pair each axis (row-major order) with the image it should show
for ax, file_name in zip(axes.ravel(), file_names):
    img = mpimg.imread(file_name)
    ax.imshow(img)
    # The panel title is the number sitting between '_n_' and '_35.png' in the file name
    ax.set_title(file_name.split('_n_')[1].split('_35.png')[0], fontsize=14)
    # Images carry their own axes, so hide the matplotlib ones
    ax.axis('off')

# Leave the top 5% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])

plt.show()

**Insights from Neighbor Count Graphs**

**General Stability Across Runs:**

- For all n_neighbors values, the mean neighbor count remains relatively stable across the 35 runs, indicating that UMAP preserves the overall neighborhood structure across multiple projections.
- The max neighbor count, however, shows more variability for lower n_neighbors values (e.g., n=5 and n=10), which reduces significantly as n_neighbors increases (n=50 and n=100).

**Effect of n_neighbors:**

- As n_neighbors increases, both the mean and max neighbor counts increase. This is expected because a higher n_neighbors value results in a broader neighborhood being captured in the UMAP embedding.
- For smaller n_neighbors (e.g., n=5): clusters are tighter, and the max neighbor count fluctuates significantly, reflecting that some clusters are smaller or more isolated.
- For larger n_neighbors (e.g., n=100): the neighborhood structure stabilizes with high max neighbor counts, suggesting that clusters are more broadly connected in the embedding.

**Trend Lines:**

The trend lines for both mean and max neighbor counts show slight increases for all n_neighbors, but the slope diminishes as n_neighbors grows, reflecting diminishing returns in terms of increasing neighbor counts at higher values.

**Insights from Dynamic Radius and Minimum Distance**

**Dynamic Radius Behavior:**

- The dynamic radius decreases as n_neighbors increases. This happens because higher n_neighbors values result in clusters being more spread out, reducing the necessity for smaller, tightly-defined cluster boundaries.
- At lower n_neighbors values, the clusters are compact, requiring smaller dynamic radii to define meaningful inter-cluster relationships.

**Minimum Distance Across Runs:**

- Minimum distances (used to compute the dynamic radius) are larger for lower n_neighbors values (e.g., n=5 and n=10) because clusters are tighter and more isolated, leading to clearer separations.
- As n_neighbors increases, the minimum distances shrink slightly, indicating that clusters start to overlap more, making them harder to distinguish.

**Clusters Contributing to Minimum Distance:**

- The pairs of clusters contributing to the minimum distance vary across n_neighbors. This suggests that the interaction between clusters is highly sensitive to the neighborhood size. For example:

n=5: Clusters (3, 8) have the smallest separation.

n=100: Clusters (4, 6) show the smallest distance, possibly due to broader neighborhood connections.

-------------

In [None]:
# Mean-distance MSTs to draw, keyed by the n_neighbors value they were built with.
# NOTE(review): the n_neighbors=30 panel now draws mst_30_35 instead of repeating
# mst_50_35; mst_30_35 is assumed to be built like its siblings — confirm.
msts_35 = {
    5: mst_5_35,
    10: mst_10_35,
    20: mst_20_35,
    30: mst_30_35,
    50: mst_50_35,
    100: mst_100_35,
}

# One fixed-seed spring layout per tree so node positions are reproducible
layouts_35 = {n: nx.spring_layout(tree, seed=42) for n, tree in msts_35.items()}

# Edge weights are used as the edge labels of each panel
edge_weights_35 = {n: nx.get_edge_attributes(tree, 'weight') for n, tree in msts_35.items()}

# Keep the per-setting names this cell has always left in the notebook namespace
pos_5, pos_10, pos_20, pos_30, pos_50, pos_100 = layouts_35.values()
(edge_labels_5, edge_labels_10, edge_labels_20,
 edge_labels_30, edge_labels_50, edge_labels_100) = edge_weights_35.values()

# Set up the figure as a 2 x 3 grid, one panel per n_neighbors value
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Panels are filled row by row in the order of msts_35
for ax, n in zip(axes.flat, msts_35):
    tree = msts_35[n]
    pos = layouts_35[n]
    nx.draw(tree, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=600, font_size=9, width=2, ax=ax)
    nx.draw_networkx_edge_labels(tree, pos, edge_labels=edge_weights_35[n], font_size=7, label_pos=0.4, ax=ax)
    ax.set_title(f"MST - n_neighbors={n}")

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

**Cluster Connectivity:**

**Low n_neighbors values (5, 10):**
The MST shows sparse connections, with most clusters forming direct links to only one or two other clusters. This indicates strong local relationships but limited global integration. E.g., clusters 0, 1, and 9 have fewer connections in n_neighbors=5 compared to higher n_neighbors values.

**High n_neighbors values (50, 100):**

The MST becomes more integrated, with clusters forming a more interconnected network. This reflects stronger global relationships and less separation among clusters. E.g., cluster 0 acts as a hub in n_neighbors=100.

**Edge Weights (Distances)**

**Low n_neighbors values (5, 10):**

Edge weights tend to be higher, indicating greater separation between clusters. Clusters are linked only when the distance is minimal due to the focus on local density.
E.g., in n_neighbors=5, edges like between clusters 3 and 8 show higher weights compared to higher n_neighbors.

**High n_neighbors values (50, 100):**

Edge weights decrease as the global structure becomes more cohesive, reflecting closer relationships between clusters.
E.g., weights such as between clusters 0 and 5 or 7 and 9 are smaller in n_neighbors=100.

**Hub Clusters:**

Clusters like 1 or 0 tend to act as central hubs at lower n_neighbors values. As n_neighbors increases, central roles shift slightly to clusters like 5 or 7, depending on their proximity and relationship with other clusters.
This shift highlights how the parameter impacts the representation of central clusters, with larger neighborhoods prioritizing global consistency over local detail.

**Structural Changes:**

- For low n_neighbors, the MST prioritizes local connections, resulting in a more tree-like and segmented structure.
- For higher n_neighbors, the tree grows denser, with more branches and inter-cluster connections. This reflects a trade-off: higher n_neighbors captures more global information but risks losing fine-grained local details.

**Key Insights:**

*Effect of n_neighbors:*

- Small n_neighbors values (5, 10): Emphasize local relationships, causing clusters to be more isolated in the MST. This approach might be useful for identifying finer-grained local structure.
- Large n_neighbors values (50, 100): Emphasize global structure, leading to tighter integration and highlighting broad patterns in the data.

*Application:*

- For local structure analysis, choose small n_neighbors (e.g., 5–10).
- For global relationships, large n_neighbors (e.g., 50–100) captures broader trends.

*Cluster Stability:*

- Comparing the MST across different n_neighbors highlights how certain clusters maintain consistent connections (e.g., clusters 0 and 7 often connect), reflecting their stability across runs.

**Main Insights:**

- Parameter Tuning: Adjusting n_neighbors influences the balance between local and global structures in the UMAP projection. For applications requiring fine-grained cluster separation, lower values of n_neighbors are better. For broader connectivity and higher-level structures, larger values are more suitable.
- Cluster Stability: The consistent appearance of certain connections (e.g., 7-8, 0-1) suggests that some relationships are robust to parameter changes, reinforcing their reliability in the dataset's structure.
- Hierarchical Patterns: As n_neighbors increases, the graph evolves into a more hierarchical structure, highlighting relationships across broader scales.

In [None]:
# Std-dev MSTs in display order, paired with the n_neighbors value shown in each title
std_trees = [
    (5, mst_std_5_35),
    (10, mst_std_10_35),
    (20, mst_std_20_35),
    (30, mst_std_30_35),
    (50, mst_std_50_35),
    (100, mst_std_100_35),
]

# A separate fixed-seed spring layout for every tree
pos_5, pos_10, pos_20, pos_30, pos_50, pos_100 = [
    nx.spring_layout(tree, seed=42) for _, tree in std_trees
]

# Edge weights, drawn as edge labels below
(edge_labels_5, edge_labels_10, edge_labels_20,
 edge_labels_30, edge_labels_50, edge_labels_100) = [
    nx.get_edge_attributes(tree, 'weight') for _, tree in std_trees
]

# 2 x 3 grid: one panel per n_neighbors value, filled row by row
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

panels = zip(
    axes.ravel(),
    std_trees,
    [pos_5, pos_10, pos_20, pos_30, pos_50, pos_100],
    [edge_labels_5, edge_labels_10, edge_labels_20, edge_labels_30, edge_labels_50, edge_labels_100],
)
for ax, (n, tree), pos, labels in panels:
    # Tree first, then its weight labels on top, then the panel title
    nx.draw(tree, pos, ax=ax, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)
    nx.draw_networkx_edge_labels(tree, pos, edge_labels=labels, ax=ax, font_size=8, label_pos=0.3)
    ax.set_title(f"MST Std. Dev - n_neighbors={n}")

# Tidy the spacing between panels and render
plt.tight_layout()
plt.show()


MST for means shows the average distances between cluster centroids, the MST for standard deviation highlights how consistent these distances are.

The MST for standard deviation reflects the variability or consistency of distances between clusters across multiple UMAP runs.

Lower edge weights indicate less variability, implying consistent distances between clusters in repeated runs.

Higher edge weights suggest greater variability, indicating clusters that are less stable in their relationship across runs.

**Clusters that are consistently closer in the MST for means should also have low edge weights in the MST for standard deviation**, as consistent closeness implies stability.

Conversely, if some clusters are close in the MST for means but show high edge weights in the MST for standard deviation, it suggests that their proximity varies across runs.

*As n_neighbors increases:*
The MST for standard deviation tends to become more balanced, as larger neighborhoods incorporate more data points, reducing variability in distances between clusters.

The relationship between clusters becomes more stable due to the smoother structure of the UMAP embeddings with higher n_neighbors.

**Specific Observations:**

- **For smaller n_neighbors** (5 or 10), there are noticeable high-variability edges, especially between clusters that are far apart in the MST for means.

- **For larger n_neighbors** (50 or 100), the edges in the MST for standard deviation tend to show lower weights overall, reflecting greater stability.

- Clusters that are peripheral in the MST for means (e.g., clusters 6, 8, and 9) often have higher variability, as their positions can fluctuate more significantly between runs.

**Cross-Comparison Insights:**

*Stability vs. Proximity:*

- Clusters connected by short edges in the MST for means should ideally have short edges in the MST for standard deviation. Deviations from this pattern can indicate unstable proximities. For example:
  - If cluster pairs (3, 8) or (0, 9) have short edges in the means MST but long edges in the std deviation MST, it suggests their relationship fluctuates across runs.
  - In contrast, pairs with short edges in both MSTs are robust and consistent.

*Trend with Increasing n_neighbors:*

- The MST for standard deviation generally shows fewer high-weight edges as n_neighbors increases. This aligns with the observation that higher n_neighbors smooth out the embedding space, leading to more stable relationships between clusters.

In [None]:
# n_neighbors settings whose distance matrices get converted
n_neighbors_values = [5, 10, 20, 30, 50, 100]


def convert_and_save_msts():
    """Build and pickle the mean and std-dev MSTs for every n_neighbors value.

    For each ``n`` in the module-level ``n_neighbors_values`` this loads
    ``normalized_mean_distance_matrix_{n}_35.npy`` and
    ``normalized_distance_matrix_std_{n}_35.npy``, turns each matrix into a
    graph, takes its minimum spanning tree and pickles the trees to
    ``mst_means_{n}_35.pkl`` and ``mst_std_{n}_35.pkl``.
    """
    for n in n_neighbors_values:
        # Read both matrices up front, so a missing input stops this iteration
        # before anything is written for it
        mean_matrix, std_matrix = (
            np.load(path)
            for path in (
                f'normalized_mean_distance_matrix_{n}_35.npy',
                f'normalized_distance_matrix_std_{n}_35.npy',
            )
        )

        # (matrix, destination pickle): means first, then std deviations
        outputs = (
            (mean_matrix, f'mst_means_{n}_35.pkl'),
            (std_matrix, f'mst_std_{n}_35.pkl'),
        )
        for matrix, pickle_path in outputs:
            tree = nx.minimum_spanning_tree(nx.from_numpy_array(matrix))
            with open(pickle_path, 'wb') as f:
                pickle.dump(tree, f)

        print(f"MSTs for n_neighbors={n} have been converted and saved.")


# Convert every configured n_neighbors value
convert_and_save_msts()

In [None]:
# Define a function to compute correlation between two MSTs
def compute_mst_edge_correlation(mst_means, mst_std):
    """Correlate the weights of the edges that two MSTs have in common.

    Parameters
    ----------
    mst_means, mst_std : networkx.Graph
        Undirected trees whose edges carry a ``'weight'`` attribute.

    Returns
    -------
    tuple
        ``(correlation, p_value)`` from ``scipy.stats.pearsonr`` over the
        shared edges, or ``(None, None)`` when fewer than two edges are
        shared. With exactly two shared edges Pearson's r is trivially
        +1 or -1, so such results carry no real information.
    """
    # Imported locally: pearsonr is not among the notebook's top-level imports
    from scipy.stats import pearsonr

    # Key each undirected edge by its unordered endpoint pair, so that an edge
    # reported as (u, v) in one tree still matches (v, u) in the other
    weights_by_edge_means = {
        frozenset(edge): weight
        for edge, weight in nx.get_edge_attributes(mst_means, 'weight').items()
    }
    weights_by_edge_std = {
        frozenset(edge): weight
        for edge, weight in nx.get_edge_attributes(mst_std, 'weight').items()
    }

    # Collect the paired weights of edges present in both trees
    shared_edges = [edge for edge in weights_by_edge_means if edge in weights_by_edge_std]
    weights_means = [weights_by_edge_means[edge] for edge in shared_edges]
    weights_std = [weights_by_edge_std[edge] for edge in shared_edges]

    # Pearson correlation needs at least two paired observations
    if len(shared_edges) < 2:
        print(f"Insufficient overlapping edges: {len(weights_means)} edges.")
        return None, None  # Return None if insufficient data for correlation

    # Compute Pearson correlation
    correlation, p_value = pearsonr(weights_means, weights_std)
    return correlation, p_value

# n_neighbors settings to evaluate; results are stored as n -> (correlation, p_value)
n_neighbors_values = [5, 10, 20, 30, 50, 100]
correlations = {}

for n in n_neighbors_values:
    try:
        # Both pickled trees must exist for this setting
        with open(f'mst_means_{n}_35.pkl', 'rb') as f:
            mst_means = pickle.load(f)
        with open(f'mst_std_{n}_35.pkl', 'rb') as f:
            mst_std = pickle.load(f)

        correlation, p_value = compute_mst_edge_correlation(mst_means, mst_std)
        correlations[n] = (correlation, p_value)

        # A None correlation means too few shared edges
        if correlation is None:
            print(f"n_neighbors={n}: Insufficient overlapping edges for correlation.")
        else:
            print(f"n_neighbors={n}: Correlation = {correlation:.3f}, p-value = {p_value:.3f}")
    except FileNotFoundError:
        print(f"n_neighbors={n}: MST files not found.")

# Keep only the settings that produced a correlation, in insertion order
valid_correlations = {
    setting: result[0]
    for setting, result in correlations.items()
    if result[0] is not None
}
n_neighbors_valid = list(valid_correlations)
correlation_values = list(valid_correlations.values())

# Line plot of correlation against n_neighbors, with a zero reference line
plt.figure(figsize=(10, 6))
plt.plot(n_neighbors_valid, correlation_values, marker='o', color='blue', label='Pearson Correlation')
plt.axhline(0, color='gray', linestyle='--', linewidth=0.8)
plt.title('Correlation Between MST Edge Weights (Mean vs Std Deviation)', fontsize=14)
plt.xlabel('n_neighbors', fontsize=12)
plt.ylabel('Correlation', fontsize=12)
plt.xticks(n_neighbors_valid)
plt.grid(True, linestyle='--', linewidth=0.7, alpha=0.7)
plt.legend()
plt.tight_layout()
plt.show()


**Lack of Overlapping Edges:**

MSTs depend heavily on the edge weights derived from the distance matrices. As n_neighbors changes, the distances between clusters in the data manifold change, leading to different MST structures. When there are insufficient overlapping edges between the two graphs, the correlation computation cannot proceed.

**Perfect Correlations:**

For n_neighbors=30, there is perfect positive correlation. This suggests that the MSTs for means and std deviation share the same edges, and the weights are proportional across both graphs. This scenario might occur if the clusters stabilize in structure and the relationship between means and std deviation distances becomes consistent.

**Edge Weight Variability:**

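For n_neighbors=5 and n_neighbors=100, the negative correlations are artifacts of limited overlapping edges. The code computes a correlation as soon as two edges overlap, and with only two edges to compare a Pearson correlation is always exactly +1 or −1, so a perfect correlation (positive or negative) here reflects insufficient data rather than a real relationship.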

In [None]:
# To open the MST graphs
# import pickle
# n_neighbors = 5  # Example
# # Load the MSTs
# with open(f'mst_means_{n_neighbors}_35.pkl', 'rb') as f:
#     mst_means = pickle.load(f)

# with open(f'mst_std_{n_neighbors}_35.pkl', 'rb') as f:
#     mst_std = pickle.load(f)

# # Now mst_means and mst_std are NetworkX graphs
# print(mst_means.nodes, mst_means.edges)

In [None]:
# Define clusters
clusters = np.arange(10)  # Clusters from 0 to 9

# Define colors for each n_neighbors
colors = {5: "orange", 10: "blue", 20: "yellow", 30: "grey", 50: "green", 100: "red"}

# (mean, lower confidence limit, upper confidence limit) distance matrices
# for every n_neighbors setting
interval_matrices = {
    5: (mean_distance_matrix_5_35, lower_limit_intconf_matrix_5_35, upper_limit_intconf_matrix_5_35),
    10: (mean_distance_matrix_10_35, lower_limit_intconf_matrix_10_35, upper_limit_intconf_matrix_10_35),
    20: (mean_distance_matrix_20_35, lower_limit_intconf_matrix_20_35, upper_limit_intconf_matrix_20_35),
    30: (mean_distance_matrix_30_35, lower_limit_intconf_matrix_30_35, upper_limit_intconf_matrix_30_35),
    50: (mean_distance_matrix_50_35, lower_limit_intconf_matrix_50_35, upper_limit_intconf_matrix_50_35),
    100: (mean_distance_matrix_100_35, lower_limit_intconf_matrix_100_35, upper_limit_intconf_matrix_100_35),
}

width = 0.15  # Bar width

# Iterate over each cluster as the base cluster: one figure per base cluster
for base_cluster in clusters:
    # Row `base_cluster` of every matrix, with the base cluster's own entry removed
    data = {
        n_neighbors: {
            "mean": np.delete(mean_matrix[base_cluster], base_cluster),    # Distances from base cluster
            "lower": np.delete(lower_matrix[base_cluster], base_cluster),  # Lower bounds
            "upper": np.delete(upper_matrix[base_cluster], base_cluster),  # Upper bounds
        }
        for n_neighbors, (mean_matrix, lower_matrix, upper_matrix) in interval_matrices.items()
    }

    # Define clusters to be compared against (excluding the base cluster)
    compare_clusters = np.delete(clusters, base_cluster)

    # Plotting
    fig, ax = plt.subplots(figsize=(16, 8))

    x = np.arange(len(compare_clusters))  # X positions for clusters

    for idx, (n_neighbors, values) in enumerate(data.items()):
        # Offsets are symmetric around zero so each group of bars is centred on
        # its tick; (idx - len(data) / 2) would sit half a bar to the left
        x_positions = x + (idx - (len(data) - 1) / 2) * width

        # Bars show the mean distance; error bars span the confidence interval
        ax.bar(
            x_positions,
            values["mean"],  # Mean distances
            yerr=[
                values["mean"] - values["lower"],  # Lower error
                values["upper"] - values["mean"]   # Upper error
            ],
            width=width,
            color=colors[n_neighbors],
            alpha=0.7,
            label=f"n={n_neighbors}",
            capsize=5
        )

    # Add labels, title, and legend
    ax.set_xlabel("Clusters", fontsize=14)
    ax.set_ylabel("Distance", fontsize=14)
    ax.set_title(f"Confidence Intervals of Distances from Cluster {base_cluster} to Other Clusters", fontsize=16)
    ax.set_xticks(x)
    ax.set_xticklabels([f"{i}" for i in compare_clusters], fontsize=12)
    ax.legend(title="n_neighbors", fontsize=10)
    ax.grid(axis="y", linestyle="--", alpha=0.7)

    plt.tight_layout()
    plt.show()

In [None]:
# Draw the MST of a (normalized) distance matrix on a given axis
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of ``matrix`` on ``ax``.

    The matrix is rounded to 3 decimals before the graph is built, the tree
    is laid out with a fixed-seed spring layout, and every edge is labelled
    with its weight. ``color`` sets the edge colour; ``title`` becomes the
    axis title.
    """
    # Rounded distance matrix -> graph -> minimum spanning tree
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed keeps node positions reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    # Nodes and edges first, weight labels on top
    nx.draw(tree, layout, ax=ax, with_labels=True, node_color='lightblue',
            edge_color=color, node_size=500, font_size=10, width=2)
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Distance matrices to plot, keyed by n_neighbors value. Each entry holds the
# "mean" matrix plus the "lower" and "upper" limit matrices for that setting.
# NOTE(review): the `_01_35` suffix presumably denotes min_dist=0.1 and 35 runs — confirm.
matrices = {
    5: {
        "mean": normalized_mean_distance_matrix_5_01_35,
        "lower": norm_lower_limit_intconf_matrix_5_01_35,
        "upper": norm_upper_limit_intconf_matrix_5_01_35
    },
    10: {
        "mean": normalized_mean_distance_matrix_10_01_35,
        "lower": norm_lower_limit_intconf_matrix_10_01_35,
        "upper": norm_upper_limit_intconf_matrix_10_01_35
    },
    20: {
        "mean": normalized_mean_distance_matrix_20_01_35,
        "lower": norm_lower_limit_intconf_matrix_20_01_35,
        "upper": norm_upper_limit_intconf_matrix_20_01_35
    },
    30: {
        "mean": normalized_mean_distance_matrix_30_01_35,
        "lower": norm_lower_limit_intconf_matrix_30_01_35,
        "upper": norm_upper_limit_intconf_matrix_30_01_35
    },
    50: {
        "mean": normalized_mean_distance_matrix_50_01_35,
        "lower": norm_lower_limit_intconf_matrix_50_01_35,
        "upper": norm_upper_limit_intconf_matrix_50_01_35
    },
    100: {
        "mean": normalized_mean_distance_matrix_100_01_35,
        "lower": norm_lower_limit_intconf_matrix_100_01_35,
        "upper": norm_upper_limit_intconf_matrix_100_01_35
    }
}

# PdfPages is not among the notebook's top-level imports, so import it here
from matplotlib.backends.backend_pdf import PdfPages

# (key in each matrix set, title prefix, edge colour) for the three panels of a page
panel_specs = [
    ("mean", "MST - Mean Distances", 'red'),
    ("lower", "MST - Lower Limit", 'blue'),
    ("upper", "MST - Upper Limit", 'green'),
]

# Open a PDF to save the plots: one page per n_neighbors value
with PdfPages('MST_Comparisons min_dist=0.1.pdf') as pdf:
    for n_neighbors, matrix_set in matrices.items():
        # Set up the figure with three subplots
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))

        # Plot MSTs for mean, lower, and upper matrices
        for ax, (key, label, color) in zip(axes, panel_specs):
            plot_mst(matrix_set[key], f"{label} (n_neighbors={n_neighbors}, min_dist= 0.1)", ax, color=color)

        # Lay out this figure explicitly rather than pyplot's implicit current figure
        fig.tight_layout()

        # Save the figure to the PDF, then close it so figures do not pile up in memory
        pdf.savefig(fig)
        plt.close(fig)

print("PDF with MST Comparisons has been successfully created.")

-----------------

### OLD VERSIONS

In [None]:
# Read the saved UMAP projections from 'umap_projections_neighbors_20.npy'
umap_projections_20_35 = np.load('umap_projections_neighbors_20.npy')

# Print the array so its contents can be inspected
print(umap_projections_20_35)

In [None]:
# Load the saved k-means centroids
kmeans_centroids_20 = np.load("kmeans_centroids_neighbors_20.npy")

# Average the centroids over the first axis, then store and show the result
centroid_mean_neighbors_20 = kmeans_centroids_20.mean(axis=0)
np.save('centroid_mean_neighbors_20.npy', centroid_mean_neighbors_20)
print(centroid_mean_neighbors_20)

In [None]:
# Standard deviation of the mean centroids along each axis (column 0 = x,
# column 1 = y), rounded to 2 decimals.
# NOTE(review): this is the spread across the mean centroids, not the
# run-to-run spread of each individual cluster — confirm that is the scale
# intended for the 90% range check.
std_dev_x_20, std_dev_y_20 = (
    round(np.std(centroid_mean_neighbors_20[:, column]), 2) for column in (0, 1)
)

# Report both values
print(f'Standard deviation for x-axis: {std_dev_x_20}')
print(f'Standard deviation for y-axis: {std_dev_y_20}')

In [43]:
# Test whether a centroid lies inside the 90% band around a reference point
def is_within_90_percent(centroid, centroid_mean_neighbors_20, std_dev_x_20, std_dev_y_20):
    """Return whether ``centroid`` is within 1.645 std devs of the mean on both axes.

    ``centroid`` and ``centroid_mean_neighbors_20`` are indexed as
    ``[0]`` (x) and ``[1]`` (y). The band on each axis is
    ``mean ± 1.645 * std_dev`` and both ends are inclusive.
    """
    mean_x, mean_y = centroid_mean_neighbors_20[0], centroid_mean_neighbors_20[1]

    # Half-width of the band on each axis
    margin_x = 1.645 * std_dev_x_20
    margin_y = 1.645 * std_dev_y_20

    # Evaluate both axes before combining them
    x_ok = mean_x - margin_x <= centroid[0] <= mean_x + margin_x
    y_ok = mean_y - margin_y <= centroid[1] <= mean_y + margin_y

    return x_ok and y_ok

In [None]:
# Results table: Trial | Cluster | Centroid Coord | Inside the 90%

n_runs = 35
n_clusters = 10

# One row per (run, cluster) pair; trial and cluster numbers are stored 1-based
result_table_20_35 = [
    [
        run + 1,
        i + 1,
        kmeans_centroids_20[run][i],
        is_within_90_percent(
            kmeans_centroids_20[run][i],
            centroid_mean_neighbors_20[i],
            std_dev_x_20,
            std_dev_y_20,
        ),
    ]
    for run in range(n_runs)
    for i in range(n_clusters)
]

# Wrap the rows in a DataFrame for readability and show it
result_columns = ['Trial', 'Cluster', 'Centroid Coord', 'Inside the 90%']
df_results_20_35 = pd.DataFrame(result_table_20_35, columns=result_columns)

print(df_results_20_35)

In [45]:
# Write the results table to CSV, leaving out the DataFrame index
df_results_20_35.to_csv('result_table_neighbors_20_35.csv', index=False)

In [122]:
# Read the results table back from its CSV file
df_results_20_35 = pd.read_csv('result_table_neighbors_20_35.csv')

In [None]:
# Trials in which at least one centroid fell outside the 90% range
outside_range = ~df_results_20_35['Inside the 90%']
runs_to_remove = df_results_20_35['Trial'][outside_range].unique()

# Keep only the rows belonging to trials that were not flagged
kept_rows = ~df_results_20_35['Trial'].isin(runs_to_remove)
df_filtered_results_20_35 = df_results_20_35.loc[kept_rows]

# Report what was dropped and what remains
print(f"Runs removed: {runs_to_remove}")
print(f"Remaining runs after filtering: {df_filtered_results_20_35['Trial'].unique()}")

In [None]:
# Plot how each cluster's centroid coordinate changes over all runs:
# first every cluster's X-coordinate, then every cluster's Y-coordinate.
# NOTE(review): reads kmeans_centroids_20, the centroid array loaded for this
# n_neighbors=20 analysis, indexed as [run, cluster, coordinate] — confirm no
# other centroid array was intended here.
coordinate_plots = [
    (0, 'X', 'b'),  # (coordinate index, axis name, line colour)
    (1, 'Y', 'g'),
]
run_numbers = range(1, n_runs + 1)

for coord_idx, coord_name, line_color in coordinate_plots:
    for cluster in range(n_clusters):
        # One standalone figure per (coordinate, cluster) pair
        plt.figure(figsize=(10, 6))
        plt.plot(run_numbers, kmeans_centroids_20[:, cluster, coord_idx], marker='o', linestyle='-', color=line_color)
        plt.title(f'Cluster {cluster} {coord_name}-coordinate Change Across Runs')
        plt.xlabel('Run')
        plt.ylabel(f'{coord_name} Centroid Coordinate')
        plt.grid(True)
        plt.show()

In [None]:
import ast

# Step: Convert the NumPy array of mean centroids into a DataFrame with
# 'x_mean' and 'y_mean' columns (one row per cluster).
centroid_mean_neighbors_20_df = pd.DataFrame(centroid_mean_neighbors_20, columns=['x_mean', 'y_mean'])

# Step: Label each mean with the cluster id used in the result table.
# The result table was built with 1-based cluster ids (i + 1), so labelling the
# means with np.arange(10) would merge every cluster with the wrong mean and
# leave the highest id without a match. Taking the sorted ids from the table
# itself works for both 0-based and 1-based tables: row k of the mean array is
# paired with the k-th smallest id.
cluster_ids_20 = np.sort(df_results_20_35['Cluster'].unique())
if len(cluster_ids_20) != len(centroid_mean_neighbors_20_df):
    raise ValueError(
        f"Result table has {len(cluster_ids_20)} distinct clusters but "
        f"{len(centroid_mean_neighbors_20_df)} mean centroids were provided"
    )
centroid_mean_neighbors_20_df['Cluster'] = cluster_ids_20

# Step: 'Centroid Coord' comes back from the CSV as text like "[1.23 4.56]".
# Insert the missing comma and parse it into a list. The check makes the cell
# safe to re-run after the column has already been converted.
if df_results_20_35['Centroid Coord'].map(lambda value: isinstance(value, str)).all():
    df_results_20_35['Centroid Coord'] = df_results_20_35['Centroid Coord'].str.replace(
        r'(\-?\d+\.?\d*)\s+(\-?\d+\.?\d*)', r'\1, \2', regex=True
    )
    df_results_20_35['Centroid Coord'] = df_results_20_35['Centroid Coord'].apply(ast.literal_eval)

# Step: Extract x and y coordinates from 'Centroid Coord' in `df_results_20_35`
df_results_20_35[['x', 'y']] = pd.DataFrame(df_results_20_35['Centroid Coord'].tolist(), index=df_results_20_35.index)

# Step: Merge the mean centroids dataframe with the results dataframe on 'Cluster'
df_merged_20 = pd.merge(df_results_20_35, centroid_mean_neighbors_20_df, on='Cluster', how='left')

# Step: Calculate the Euclidean distance between each centroid and the corresponding cluster mean
df_merged_20['Distance_to_Mean'] = np.sqrt((df_merged_20['x'] - df_merged_20['x_mean'])**2 +
                                           (df_merged_20['y'] - df_merged_20['y_mean'])**2)

# Step: Apply an outlier threshold (e.g., 90th percentile of the distance per cluster)
def filter_outliers(df, percentile=90):
    """Keep the rows whose 'Distance_to_Mean' is at or below a percentile cut.

    Args:
        df: DataFrame with a numeric 'Distance_to_Mean' column (typically the
            rows of a single cluster when used with ``groupby(...).apply``).
        percentile: Percentile (0-100) of 'Distance_to_Mean' used as the upper
            bound. Defaults to 90, the value that was previously hard-coded.

    Returns:
        The subset of ``df`` with 'Distance_to_Mean' <= the percentile value.
        An empty ``df`` is returned unchanged.
    """
    # np.percentile raises on an empty input, so there is nothing to filter.
    if df.empty:
        return df
    threshold = np.percentile(df['Distance_to_Mean'], percentile)
    return df[df['Distance_to_Mean'] <= threshold]

# Filter every cluster against its own distance threshold and flatten the result
df_no_outliers_20 = (
    df_merged_20
    .groupby('Cluster')
    .apply(filter_outliers)
    .reset_index(drop=True)
)

# The coordinate helper columns are not needed once the distance is computed
helper_columns_20 = ['x', 'y', 'x_mean', 'y_mean']
df_no_outliers_cleaned_20 = df_no_outliers_20.drop(columns=helper_columns_20)

# Compare the table size before and after the filtering
print(f"Original DataFrame size: {df_merged_20.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned_20.shape}")

# Group the remaining rows by cluster
clusters_grouped_20 = df_no_outliers_cleaned_20.groupby('Cluster')

# cluster id -> array of the centroids that survived the filtering
clusters_centroids_20 = {}
for cluster, group in clusters_grouped_20:
    centroids_array_20 = np.array(group['Centroid Coord'].tolist())  # 'Centroid Coord' holds [x, y] pairs
    clusters_centroids_20[cluster] = centroids_array_20

# For every cluster draw Trial vs. X-coordinate, then Trial vs. Y-coordinate
for cluster, group in clusters_grouped_20:
    trials = group['Trial'].values
    centroids_x_20 = np.array([coord[0] for coord in group['Centroid Coord']])
    centroids_y_20 = np.array([coord[1] for coord in group['Centroid Coord']])

    for axis_name, axis_values in (('X', centroids_x_20), ('Y', centroids_y_20)):
        plt.figure(figsize=(10, 5))
        plt.plot(trials, axis_values, marker='o', linestyle='-',
                 label=f'Cluster {cluster} {axis_name}-coordinate')
        plt.title(f'Cluster {cluster}: Trial vs. {axis_name}-coordinate')
        plt.xlabel('Trial')
        plt.ylabel(f'{axis_name}-coordinate')
        plt.grid(True)
        plt.legend()
        plt.show()

# Number of centroids kept per cluster
cluster_sizes_20 = {label: len(kept) for label, kept in clusters_centroids_20.items()}

for cluster, size in cluster_sizes_20.items():
    print(f"Cluster {cluster} has {size} centroids considered.")


**Distance matrix**: element $d_{ij}$ is the distance between the centers of cluster $i$ and cluster $j$.

In [None]:
# Pairwise Euclidean distances between the cluster centroids of each run,
# collected into one array with a distance matrix per run
distance_matrices_20_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in kmeans_centroids_20
])

# Persist the matrices of all runs
np.save(f'distance_matrices_neighbors_20_all_runs.npy', distance_matrices_20_35)

# Quick look at the first run's matrix
print(f"Distance matrix for the first run:\n{distance_matrices_20_35[0]}")

In [None]:
# Print the distance matrix of every run, numbering the runs from 1
for run_number, distance_matrix in enumerate(distance_matrices_20_35, start=1):
    print(f"Distance matrix for run {run_number}:\n{distance_matrix}\n")

In [66]:
# Average the per-run distance matrices over axis 0 for each of the three
# configurations (names suggest n_neighbors = 5, 10 and 20).
avg_dist_n5 = np.mean(distance_matrices_5_35, axis=0)
avg_dist_n10 = np.mean(distance_matrices_10_35, axis=0)
avg_dist_n20 = np.mean(distance_matrices_20_35, axis=0)

In [None]:
# Averaged distance matrices and one heatmap title per matrix
matrices = [avg_dist_n5, avg_dist_n10, avg_dist_n20]
titles = [
    f'Heatmap of Mean Distance Matrix ({matrix_name})'
    for matrix_name in ('avg_dist_n5', 'avg_dist_n10', 'avg_dist_n20')
]

# Draw each matrix as an annotated heatmap in its own figure
for matrix, title in zip(matrices, titles):
    plt.figure(figsize=(10, 8))
    sns.heatmap(matrix, annot=True, fmt=".2f", cmap='viridis', cbar=True)
    plt.title(title, fontsize=16)
    plt.xlabel('Cluster')
    plt.ylabel('Cluster')
    plt.show()

--------

-----------

In [None]:
# Load the UMAP projections stored in 'umap_projections_neighbors_20.npy'
umap_projections_20 = np.load(f'umap_projections_neighbors_20.npy')

# Print the loaded array for inspection
print(umap_projections_20)


In [None]:
# Take the mean centroids from the in-memory `centroid_mean` variable.
# NOTE(review): `centroid_mean` is whatever an earlier cell left in the kernel;
# confirm it belongs to this configuration, or load it from disk with the
# commented-out line below.
centroid_mean_neighbors_20 = centroid_mean
# centroid_mean_neighbors_20= np.load(f'centroid_mean_neighbors_20.npy')
# Per-run KMeans centroids loaded from 'kmeans_centroids_neighbors_20.npy'
kmeans_centroids_20 = np.load(f"kmeans_centroids_neighbors_20.npy")
print(centroid_mean_neighbors_20)

In [None]:
# Calculate the standard deviation of the mean centroids along each axis.
# NOTE(review): np.std over a column of the mean-centroid array measures how far
# the cluster means are spread from each other, not how much one cluster's
# centroid varies between runs. Confirm this is the spread intended for the
# 90% range check.
std_dev_x_20 = np.std(centroid_mean_neighbors_20[:, 0])  # Standard deviation for the x-axis
std_dev_y_20 = np.std(centroid_mean_neighbors_20[:, 1])  # Standard deviation for the y-axis

# Print the results
print(f'Standard deviation for x-axis: {std_dev_x_20}')
print(f'Standard deviation for y-axis: {std_dev_y_20}')

In [39]:
# Check if the centroid is within the 90% range
def is_within_90_percent(centroid, centroid_mean_neighbors_20, std_dev_x_20, std_dev_y_20=None):
    """Return whether ``centroid`` lies within mean ± 1.645·std on both axes.

    1.645 is the z-value bounding the central 90% of a normal distribution.
    The check is an axis-aligned box: x and y are tested independently.

    Args:
        centroid: Indexable with the x coordinate at [0] and y at [1].
        centroid_mean_neighbors_20: Mean centroid of the same cluster,
            indexable the same way.
        std_dev_x_20: Standard deviation along x. When ``std_dev_y_20`` is
            omitted this must instead be an ``(std_x, std_y)`` pair, which
            supports the three-argument call style
            ``is_within_90_percent(centroid, mean, std)``.
        std_dev_y_20: Standard deviation along y, or None for the pair form.

    Returns:
        Truthy when both coordinates are inside their range (bounds inclusive),
        falsy otherwise.
    """
    if std_dev_y_20 is None:
        # Pair form: the third argument carries both deviations.
        std_dev_x_20, std_dev_y_20 = std_dev_x_20[0], std_dev_x_20[1]

    # Calculate the 90% range for x-axis
    lower_bound_x_20 = centroid_mean_neighbors_20[0] - 1.645 * std_dev_x_20
    upper_bound_x_20 = centroid_mean_neighbors_20[0] + 1.645 * std_dev_x_20

    # Calculate the 90% range for y-axis
    lower_bound_y_20 = centroid_mean_neighbors_20[1] - 1.645 * std_dev_y_20
    upper_bound_y_20 = centroid_mean_neighbors_20[1] + 1.645 * std_dev_y_20

    # Check if the centroid is within both x and y ranges
    x_in_range_20 = lower_bound_x_20 <= centroid[0] <= upper_bound_x_20
    y_in_range_20 = lower_bound_y_20 <= centroid[1] <= upper_bound_y_20

    # Return True if both x and y are within the range, False otherwise
    return x_in_range_20 and y_in_range_20

In [None]:
# Result table columns: Trial | Cluster | Centroid Coord | Inside the 90%
n_runs = 35
n_clusters = 10
result_table_20 = []

for run in range(n_runs):
    run_centroids_20 = kmeans_centroids_20[run]
    for i in range(n_clusters):
        centroid_20 = run_centroids_20[i]
        centroid_mean_20 = centroid_mean_neighbors_20[i]

        # Is this run's centroid inside the 90% range around the cluster mean?
        inside_90_20 = is_within_90_percent(centroid_20, centroid_mean_20, std_dev_x_20, std_dev_y_20)

        # Trial and cluster numbers are stored 1-based
        result_table_20.append([run + 1, i + 1, centroid_20, inside_90_20])

# Turn the rows into a DataFrame for readability and show it
df_results_20 = pd.DataFrame(
    result_table_20,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside the 90%'],
)
print(df_results_20)

In [None]:
# Function to plot an ellipse representing the 90% confidence interval
def plot_confidence_ellipse_25(ax, centroid_mean_25, std_dev_x_25, std_dev_y_25, edgecolor='blue'):
    """Add an unfilled, axis-aligned ellipse centred on ``centroid_mean_25`` to ``ax``.

    Args:
        ax: Matplotlib Axes the ellipse patch is added to.
        centroid_mean_25: (x, y) centre of the ellipse.
        std_dev_x_25: Standard deviation along x; the full width is 2·1.645·std.
        std_dev_y_25: Standard deviation along y; the full height is 2·1.645·std.
        edgecolor: Outline colour of the ellipse.
    """
    # Ellipse is not part of the notebook's import cell, so import it here to
    # keep this cell runnable on its own.
    from matplotlib.patches import Ellipse

    ellipse = Ellipse(xy=centroid_mean_25, width=2*1.645*std_dev_x_25, height=2*1.645*std_dev_y_25,
                      edgecolor=edgecolor, fc='None', lw=2)
    ax.add_patch(ellipse)

# Create a plot
fig, ax = plt.subplots(figsize=(8, 6))

# Plot the centroids
for i in range(n_clusters):
    # Mean centroid of the current cluster. It is used both for the range check
    # and for the ellipse, so the two always refer to the same centre.
    centroid_mean_25 = centroid_mean_neighbors_25[i]

    for run in range(n_runs):
        centroid_25 = kmeans_centroids_25[run][i]
        inside_90_25 = is_within_90_percent(centroid_25, centroid_mean_25, std_dev_x_25, std_dev_y_25)

        # Colour by the result computed for THIS centroid (previously the
        # unrelated, stale `inside_90` variable decided the colour)
        color = 'green' if inside_90_25 else 'red'

        # Plot the centroid; only the first run of each cluster gets a legend entry
        ax.scatter(centroid_25[0], centroid_25[1], color=color, label=f'Cluster {i + 1}' if run == 0 else "")

    # Plot the 90% confidence ellipse for each cluster's mean centroid
    plot_confidence_ellipse_25(ax, centroid_mean_25, std_dev_x_25, std_dev_y_25, edgecolor='blue')

# Labeling the plot
ax.set_title('Centroids and 90% Confidence Range')
ax.set_xlabel('UMAP X-axis')
ax.set_ylabel('UMAP Y-axis')

# Add legend
plt.legend(loc='upper right')

# Show the plot
plt.show()

In [None]:
# Create a table to store the results: Trial | Cluster | Centroid Coord | Inside the 90%
# Trial is stored 1-based (run + 1); Cluster is stored 0-based (i).
result_table = []

for run in range(n_runs):
    for i in range(n_clusters):
        centroid = kmeans_centroids[run][i]
        mean = centroid_mean[i]
        std = centroid_std[i]
        # NOTE(review): this passes three arguments (std as one value per
        # cluster); make sure the is_within_90_percent currently defined in the
        # kernel accepts that form.
        inside_90 = is_within_90_percent(centroid, mean, std)
        result_table.append([run + 1, i, centroid, inside_90])

# Convert result_table to a DataFrame for better readability
df_results = pd.DataFrame(result_table, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside the 90%'])

# Display the DataFrame using standard Pandas functions
print(df_results)

# Print mean and standard deviation for clarity
print("Centroid Means (per cluster):\n", centroid_mean)
print("Centroid Standard Deviations (per cluster):\n", centroid_std)

# Save the result table to a CSV file (file name is keyed by n_neighbors3)
df_results.to_csv(f'result_table_neighbors_{n_neighbors3}.csv', index=False)

# Save the centroid means and standard deviations
np.save(f'centroid_mean_neighbors_{n_neighbors3}.npy', centroid_mean)
np.save(f'centroid_std_neighbors_{n_neighbors3}.npy', centroid_std)

--------------------

--------

## min_dist Trials

### min_dist = 0, n_neighbors= 5 

In [118]:
# Reload the saved artifacts of the n_neighbors=5, min_dist=0 experiment
umap_projections_5_0_2 = np.load(f'umap_projections_neighbors_5_0_2.npy')
centroid_mean_5_0_2= np.load(f'centroid_mean_5_0_2.npy')
centroid_std_5_0_2= np.load(f'centroid_std_5_0_2.npy')
kmeans_centroids_5_0_2 = np.load(f"kmeans_centroids_neighbors_5_0_2.npy")
# The result table of this experiment is written to
# 'result_table_neighbors_v2_5_0_2.csv'; the path used before
# ('result_table_neighbors_v2_20_35.csv') belongs to a different experiment.
df_results_5_0_2=pd.read_csv('result_table_neighbors_v2_5_0_2.csv')
mean_distance_matrix_5_0_2= np.load(f'mean_distance_matrix_neighbors_5_0_2.npy')
normalized_mean_distance_matrix_5_0_2= np.load(f'normalized_mean_distance_matrix_5_0_2.npy')
distance_matrix_std_5_0_2= np.load(f"distance_matrix_std_5_0_2.npy")
normalized_distance_matrix_std_5_0_2= np.load(f"normalized_distance_matrix_std_5_0_2.npy")
# NOTE(review): np.load returns arrays, not networkx graphs. The cells that use
# mst_std_5_0_2 / mst_5_0_2 as graphs need them rebuilt (or the MST cells re-run).
mst_std_5_0_2= np.load(f'mst_std_5_0_2.npy')
mst_5_0_2= np.load(f'mst_5_0_2.npy')

In [None]:
# Configuration for the repeated UMAP + KMeans experiment
n_runs = 35  # number of UMAP/KMeans repetitions
n_clusters = 10  # Set the number of clusters (for KMeans)
min_dist = 0  # UMAP min_dist
n_neighbors = 5  # UMAP n_neighbors
n_components = 2  # dimensionality of the UMAP embedding

# Per-run results are accumulated in these lists
umap_projections_5_0_2 = []
kmeans_centroids_list_5_0_2 = []  # Use this to store centroids for each run

# Define a helper function to calculate the centroid of each cluster
def calculate_centroids(kmeans, x_umap):
    """Return the mean position of the points assigned to each cluster.

    The number of clusters is read from ``kmeans.n_clusters`` rather than from
    a notebook-level variable, so the result always matches the fitted model.

    Args:
        kmeans: Fitted clustering model exposing ``labels_`` (one label per row
            of ``x_umap``) and ``n_clusters``.
        x_umap: Array of points, one row per sample.

    Returns:
        Array with one row per cluster label ``0 .. n_clusters - 1`` holding
        the per-column mean of that cluster's points. A label with no assigned
        points yields NaN entries (NumPy emits a RuntimeWarning).
    """
    labels = kmeans.labels_
    return np.array([
        np.mean(x_umap[labels == cluster_id], axis=0)
        for cluster_id in range(kmeans.n_clusters)
    ])

# Run UMAP and KMeans multiple times.
# NOTE(review): every run clusters a different embedding, so KMeans label i in
# one run is not guaranteed to describe the same group as label i in another.
# Confirm that clusters are comparable by index before averaging them across runs.
for run in range(n_runs):
    # Apply UMAP with the same parameters for each run
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)  # random_state=None: each run gets a different embedding
    x_train_umap_5_0_2 = umap_model.fit_transform(x_train_flattened)
    
    # Apply KMeans clustering on the UMAP projection (fixed KMeans seed)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_train_umap_5_0_2)

    # Calculate centroids for this run
    centroids_5_0_2 = calculate_centroids(kmeans, x_train_umap_5_0_2)
    
    # Store the UMAP projection and the centroids of this run
    umap_projections_5_0_2.append(x_train_umap_5_0_2)
    kmeans_centroids_list_5_0_2.append(centroids_5_0_2)

# Stack the per-run centroid arrays; the first axis indexes the runs
kmeans_centroids_5_0_2 = np.array(kmeans_centroids_list_5_0_2)  

# Mean and standard deviation of the centroid coordinates across runs (axis 0)
centroid_mean_5_0_2 = np.mean(kmeans_centroids_5_0_2, axis=0)
centroid_std_5_0_2 = np.std(kmeans_centroids_5_0_2, axis=0)

# Save the UMAP projections and KMeans centroids; file names are keyed by
# n_neighbors, min_dist and n_components
np.save(f'umap_projections_neighbors_{n_neighbors}_{min_dist}_{n_components}.npy', np.array(umap_projections_5_0_2))
np.save(f'kmeans_centroids_neighbors_{n_neighbors}_{min_dist}_{n_components}.npy', np.array(kmeans_centroids_list_5_0_2))

In [None]:
# Load the UMAP projections stored in 'umap_projections_neighbors_5_0_2.npy'
umap_projections_5_0_2 = np.load(f'umap_projections_neighbors_5_0_2.npy')

# Print the loaded array for inspection
print(umap_projections_5_0_2)

In [115]:
### NO NEED TO RE RUN ###

# Save the centroid_mean and centroid_std.
# The std file must be written from centroid_std_5_0_2; it was previously
# written from centroid_mean_5_0_2, so both files held the means.
np.save(f'centroid_mean_{n_neighbors}_0_2.npy', np.array(centroid_mean_5_0_2))
np.save(f'centroid_std_{n_neighbors}_0_2.npy', np.array(centroid_std_5_0_2))

In [116]:
# Reload the centroid mean and standard deviation arrays from disk.
# NOTE(review): confirm that 'centroid_std_5_0_2.npy' on disk really holds
# standard deviations (it should differ from the mean file); regenerate it if not.
centroid_mean_5_0_2= np.load(f'centroid_mean_5_0_2.npy')
centroid_std_5_0_2= np.load(f'centroid_std_5_0_2.npy')

In [117]:
# Per-run KMeans centroids; indexed below as [run, cluster, coordinate]
kmeans_centroids_5_0_2 = np.load(f"kmeans_centroids_neighbors_5_0_2.npy")  # Load the saved centroids data

------------------------

Standard deviation calculation

In [None]:
### NO NEED TO RE RUN ###
# Standard deviation of every cluster's centroid across all runs, per axis.
# kmeans_centroids_5_0_2[:, i, 0] / [:, i, 1] are the x / y coordinates of
# cluster i over the runs; results are stored as float64 arrays of length 10.
std_dev_x_5_0_2 = np.array(
    [np.std(kmeans_centroids_5_0_2[:, i, 0]) for i in range(10)],
    dtype=np.float64,
)
std_dev_y_5_0_2 = np.array(
    [np.std(kmeans_centroids_5_0_2[:, i, 1]) for i in range(10)],
    dtype=np.float64,
)

# Show the per-cluster spreads
print("Standard deviation of x coordinates per cluster:", std_dev_x_5_0_2)
print("Standard deviation of y coordinates per cluster:", std_dev_y_5_0_2)

In [119]:
### NO NEED TO RE RUN ###
# Rows of the result table: [trial (1-based), cluster (0-based), centroid, flag]
data_5_0_2 = []

for trial in range(35):
    for cluster in range(10):
        # Centroid of this cluster in this trial and the cluster's mean centroid
        centroid_coord = kmeans_centroids_5_0_2[trial, cluster]
        mean_x, mean_y = centroid_mean_5_0_2[cluster]

        # Mean ± 2 standard deviations on each axis
        lower_bound_x = mean_x - 2 * std_dev_x_5_0_2[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x_5_0_2[cluster]
        lower_bound_y = mean_y - 2 * std_dev_y_5_0_2[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y_5_0_2[cluster]

        # The centroid counts as inside only when both coordinates are in range
        x_inside = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        y_inside = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = x_inside and y_inside

        data_5_0_2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Assemble the rows into a DataFrame
df_results_5_0_2 = pd.DataFrame(
    data_5_0_2,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)

In [120]:
### NO NEED TO RE RUN ###
# Group the DataFrame by Trial and check if all clusters in each trial are True for 'Inside 2 std'.
# The result is a boolean Series indexed by trial number.
trials_all_true_5_0_2 = df_results_5_0_2.groupby('Trial')['Inside 2 std'].all()

In [121]:
### NO NEED TO RE RUN ###
# Filter the trials where all clusters were True.
# Boolean indexing with the Series itself keeps only the True entries; the
# remaining index values are the trial numbers.
trials_with_all_true_5_0_2 = trials_all_true_5_0_2[trials_all_true_5_0_2].index.tolist()

In [None]:
### NO NEED TO RE RUN ###
# Output the list of trials whose clusters were all inside the 2-std range
print("Trials where all clusters were True:", trials_with_all_true_5_0_2)

In [None]:
### NO NEED TO RE RUN ###
# Filter the trials where not all clusters were True (inverse of the mask above)
trials_with_some_false_5_0_2 = trials_all_true_5_0_2[~trials_all_true_5_0_2].index.tolist()

# Output the list of trials where some clusters were False
print("Trials where some clusters were False:", trials_with_some_false_5_0_2)

In [124]:
### NO NEED TO RE RUN ###
# Save the result table to a CSV file.
# The f-string resolves to 'result_table_neighbors_v2_5_0_2.csv'.
df_results_5_0_2.to_csv(f'result_table_neighbors_v2_{5}_0_2.csv', index=False)

-----------------

Removal outliers process

In [125]:
# Reload this experiment's result table. It is saved as
# 'result_table_neighbors_v2_5_0_2.csv'; the previously used
# 'result_table_neighbors_v2_20_35.csv' belongs to a different experiment.
df_results_5_0_2=pd.read_csv('result_table_neighbors_v2_5_0_2.csv')

In [126]:
# Convert the NumPy array into a DataFrame with 'Cluster', 'x_mean', and 'y_mean'.
# Clusters are labelled 0..9, matching the 0-based 'Cluster' values written by
# the 2-std result-table cell of this section.
centroid_mean_5_0_2_df = pd.DataFrame(centroid_mean_5_0_2, columns=['x_mean', 'y_mean'])
centroid_mean_5_0_2_df['Cluster'] = np.arange(10)

In [128]:
# Step 1: 'Centroid Coord' is read from the CSV as text such as "[1.23 4.56]".
# Insert a comma between two whitespace-separated decimal numbers.
coord_gap_pattern = r'(\-?\d+\.\d+)\s+(\-?\d+\.\d+)'
coord_text_5_0_2 = df_results_5_0_2['Centroid Coord'].str.replace(
    coord_gap_pattern, r'\1, \2', regex=True
)

# Step 2: Parse the comma-separated text into Python lists
df_results_5_0_2['Centroid Coord'] = coord_text_5_0_2.apply(ast.literal_eval)


# Step 3: Collect the rows whose parsed value is not a list of length 2
def _is_xy_pair(value):
    return isinstance(value, list) and len(value) == 2


invalid_rows = df_results_5_0_2[
    df_results_5_0_2['Centroid Coord'].apply(lambda coord: not _is_xy_pair(coord))
]


In [129]:
# Split the [x, y] pairs into separate 'x' and 'y' columns
coord_frame_5_0_2 = pd.DataFrame(
    df_results_5_0_2['Centroid Coord'].tolist(),
    index=df_results_5_0_2.index,
)
df_results_5_0_2[['x', 'y']] = coord_frame_5_0_2

# Attach each cluster's mean centroid (x_mean, y_mean) to every result row
df_merged_5_0_2 = df_results_5_0_2.merge(centroid_mean_5_0_2_df, on='Cluster', how='left')

In [None]:
# Settings for the per-cluster coordinate plots
n_runs = 35
n_clusters = 10
n_neighbors = 5

# One figure per cluster and coordinate showing how the centroid coordinate
# changes across runs: all X-coordinate figures first, then all Y-coordinate ones.
run_numbers = range(1, n_runs + 1)
for coord_idx, coord_name, line_color in ((0, 'X', 'b'), (1, 'Y', 'g')):
    for cluster in range(n_clusters):
        plt.figure(figsize=(10, 6))
        plt.plot(run_numbers, kmeans_centroids_5_0_2[:, cluster, coord_idx],
                 marker='o', linestyle='-', color=line_color)
        plt.title(f'Cluster {cluster} {coord_name}-coordinate Change Across Runs')
        plt.xlabel('Run')
        plt.ylabel(f'{coord_name} Centroid Coordinate')
        plt.grid(True)
        plt.show()

In [None]:
# Calculate Euclidean distance from each centroid to its cluster's mean:
# sqrt((x - x_mean)^2 + (y - y_mean)^2), evaluated row by row.
df_merged_5_0_2['Distance_to_Mean'] = np.sqrt((df_merged_5_0_2['x'] - df_merged_5_0_2['x_mean'])**2 + (df_merged_5_0_2['y'] - df_merged_5_0_2['y_mean'])**2)

# Apply an outlier threshold (e.g., 90th percentile of the distance per cluster)
def filter_outliers(df, percentile=90):
    """Keep the rows whose 'Distance_to_Mean' is at or below a percentile cut.

    Args:
        df: DataFrame with a numeric 'Distance_to_Mean' column (typically the
            rows of a single cluster when used with ``groupby(...).apply``).
        percentile: Percentile (0-100) of 'Distance_to_Mean' used as the upper
            bound. Defaults to 90, the value that was previously hard-coded.

    Returns:
        The subset of ``df`` with 'Distance_to_Mean' <= the percentile value.
        An empty ``df`` is returned unchanged.
    """
    # np.percentile raises on an empty input, so there is nothing to filter.
    if df.empty:
        return df
    threshold = np.percentile(df['Distance_to_Mean'], percentile)
    return df[df['Distance_to_Mean'] <= threshold]

# Filter every cluster against its own distance threshold and flatten the result
df_no_outliers_5_0_2 = (
    df_merged_5_0_2
    .groupby('Cluster')
    .apply(filter_outliers)
    .reset_index(drop=True)
)

# The coordinate helper columns are not needed once the distance is computed
helper_columns_5_0_2 = ['x', 'y', 'x_mean', 'y_mean']
df_no_outliers_cleaned_5_0_2 = df_no_outliers_5_0_2.drop(columns=helper_columns_5_0_2)

# Compare the table size before and after the filtering
print(f"Original DataFrame size: {df_merged_5_0_2.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned_5_0_2.shape}")

# Leave the cleaned table as the cell's displayed value
df_no_outliers_cleaned_5_0_2

In [132]:
# Rows of the cleaned table grouped by cluster id
clusters_grouped_5_0_2 = df_no_outliers_cleaned_5_0_2.groupby('Cluster')

# cluster id -> NumPy array of that cluster's remaining centroids
# ('Centroid Coord' is expected to hold [x, y] pairs)
clusters_centroids_5_0_2 = {
    cluster_id: np.array(cluster_rows['Centroid Coord'].tolist())
    for cluster_id, cluster_rows in clusters_grouped_5_0_2
}

In [None]:
# Number of centroids kept for each cluster after the outlier filtering
cluster_sizes_5_0_2 = {}
for cluster, centroids in clusters_centroids_5_0_2.items():
    cluster_sizes_5_0_2[cluster] = len(centroids)

# Report the counts
for cluster, size in cluster_sizes_5_0_2.items():
    print(f"Cluster {cluster} has {size} centroids considered.")

--------

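Check whether it is plausible for every cluster to keep the same number of centroids after the outliers are filtered out. Possible explanations:
- The distance distributions are likely very similar
- The data structure is uniform

Note that each cluster contributes one centroid per run, so a per-cluster percentile cut keeps (barring ties) the same number of centroids in every cluster by construction.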

In [None]:
# Histogram of the distance-to-mean values, one figure per cluster
for cluster, group in clusters_grouped_5_0_2:
    cluster_distances = group['Distance_to_Mean']
    plt.figure(figsize=(10, 5))
    plt.hist(cluster_distances, bins=10, edgecolor='black')
    plt.title(f'Cluster {cluster}: Distance to Mean Distribution')
    plt.xlabel('Distance to Mean')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
# Percentile threshold per cluster check
def check_percentiles(df):
    """Return the 90th percentile of the 'Distance_to_Mean' column of ``df``."""
    return np.percentile(df['Distance_to_Mean'], 90)

# Apply check_percentiles to each cluster and print the resulting threshold.
# The groups come from the already-filtered table, so these thresholds describe
# the filtered data.
for cluster, group in clusters_grouped_5_0_2:
    threshold = check_percentiles(group)
    print(f"Cluster {cluster}: 90th percentile threshold = {threshold}")

In [None]:
# Repeat the size check with a stricter, 70th-percentile cut per cluster
for cluster, group in clusters_grouped_5_0_2:
    cluster_distances = group['Distance_to_Mean']

    # 70th percentile of this cluster's distances
    threshold = np.percentile(cluster_distances, 70)

    # Rows at or below the threshold
    filtered_group = group[cluster_distances <= threshold]

    # Group size before and after the cut
    print(f"Cluster {cluster}: Original size = {len(group)}, Filtered size = {len(filtered_group)}")

--------

#### Distance matrix n=5, min_dist= 0

##### Distance Mean matrix

**Distance matrix**: element $d_{ij}$ is the distance between the centers of cluster $i$ and cluster $j$.

In [None]:
# Store distance matrices for each run
distance_matrices_5_0_2 = []

# Iterate over all runs and calculate the distance matrix for each run
for run_centroids in kmeans_centroids_5_0_2:
    # Calculate the pairwise Euclidean distance between centroids for this run
    dist_matrix = cdist(run_centroids, run_centroids, metric='euclidean')
    distance_matrices_5_0_2.append(dist_matrix)

# Convert the list of distance matrices to a numpy array (one matrix per run)
distance_matrices_5_0_2 = np.array(distance_matrices_5_0_2)

# Calculate the mean distance matrix across all runs
mean_distance_matrix_5_0_2 = np.mean(distance_matrices_5_0_2, axis=0)

# Min-max normalize the mean distance matrix to [0, 1].
# Both extremes must come from THIS matrix; the maximum was previously taken
# from mean_distance_matrix_20_35, i.e. from a different experiment.
mean_dist_min_5_0_2 = np.min(mean_distance_matrix_5_0_2)
mean_dist_max_5_0_2 = np.max(mean_distance_matrix_5_0_2)
normalized_mean_distance_matrix_5_0_2 = (
    (mean_distance_matrix_5_0_2 - mean_dist_min_5_0_2)
    / (mean_dist_max_5_0_2 - mean_dist_min_5_0_2)
)

# Plot of the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_5_0_2, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5, min_dist = 0)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
np.save('distance_matrices_neighbors_5_0_2_all_runs.npy', distance_matrices_5_0_2)
np.save('mean_distance_matrix_neighbors_5_0_2.npy', mean_distance_matrix_5_0_2)

# Mean distance matrix
print(f"Mean distance matrix across all runs:\n{mean_distance_matrix_5_0_2}")

**MST Analysis**

In [None]:
# Create a weighted graph from the (rounded) normalized mean distance matrix
G_5_0_2 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_5_0_2,3))
# np.save expects array-like data and a networkx Graph is not array-like, so
# passing the graph directly does not store its edges and weights. Save the
# weighted adjacency matrix instead; nx.from_numpy_array(np.load(...)) rebuilds
# the graph.
np.save('G_5_0_2.npy', nx.to_numpy_array(G_5_0_2))

# Draw the graph
pos = nx.spring_layout(G_5_0_2, seed=42)  # positions for all nodes
nx.draw(G_5_0_2, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_5_0_2, 'weight')
nx.draw_networkx_edge_labels(G_5_0_2, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_5_0_2 = nx.minimum_spanning_tree(G_5_0_2)
# Save the tree as its weighted adjacency matrix: np.save expects array-like
# data and does not store a Graph object's edges and weights.
# nx.from_numpy_array(np.load('mst_5_0_2.npy')) rebuilds the tree.
np.save('mst_5_0_2.npy', nx.to_numpy_array(mst_5_0_2))

# Define positions for all nodes
pos = nx.spring_layout(mst_5_0_2, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_5_0_2, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_5_0_2, 'weight')
nx.draw_networkx_edge_labels(mst_5_0_2, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST - min_dist= 0")
plt.show()

##### Distance Std. dev. Matrix

In [None]:
# Pairwise Euclidean distances between the clusters' standard-deviation vectors
distance_matrix_std_5_0_2 = cdist(centroid_std_5_0_2, centroid_std_5_0_2, metric='euclidean')

# Min-max normalization of the distance matrix
std_dist_min_5_0_2 = np.min(distance_matrix_std_5_0_2)
std_dist_max_5_0_2 = np.max(distance_matrix_std_5_0_2)
normalized_distance_matrix_std_5_0_2 = (
    (distance_matrix_std_5_0_2 - std_dist_min_5_0_2)
    / (std_dist_max_5_0_2 - std_dist_min_5_0_2)
)

# Annotated heatmap of the normalized matrix
plt.figure(figsize=(8, 6))
sns.heatmap(normalized_distance_matrix_std_5_0_2, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Distance Matrix for Centroid Std Deviations (min_dist= 0)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.tight_layout()
plt.show()

# Keep both the raw and the normalized matrix for later analysis
np.save("distance_matrix_std_5_0_2.npy", distance_matrix_std_5_0_2)
np.save("normalized_distance_matrix_std_5_0_2.npy", normalized_distance_matrix_std_5_0_2)


In [None]:
# Create a weighted graph from the (rounded) normalized std distance matrix
G_std_5_0_2 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_5_0_2,3))
# np.save expects array-like data and a networkx Graph is not array-like, so
# passing the graph directly does not store its edges and weights. Save the
# weighted adjacency matrix instead; nx.from_numpy_array(np.load(...)) rebuilds
# the graph.
np.save('G_std_5_0_2.npy', nx.to_numpy_array(G_std_5_0_2))

# Draw the graph
pos = nx.spring_layout(G_std_5_0_2, seed=42)  # positions for all nodes
nx.draw(G_std_5_0_2, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_std_5_0_2, 'weight')
nx.draw_networkx_edge_labels(G_std_5_0_2, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_std_5_0_2 = nx.minimum_spanning_tree(G_std_5_0_2)
# Save the tree as its weighted adjacency matrix: np.save expects array-like
# data and does not store a Graph object's edges and weights.
# nx.from_numpy_array(np.load('mst_std_5_0_2.npy')) rebuilds the tree.
np.save('mst_std_5_0_2.npy', nx.to_numpy_array(mst_std_5_0_2))

# Define positions for all nodes
pos_std_5_0_2 = nx.spring_layout(mst_std_5_0_2, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_std_5_0_2, pos_std_5_0_2, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels_std_5_0_2 = nx.get_edge_attributes(mst_std_5_0_2, 'weight')
nx.draw_networkx_edge_labels(mst_std_5_0_2, pos_std_5_0_2, edge_labels=edge_labels_std_5_0_2, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - min_dist= 0")
plt.show()

In [None]:
# List every edge of the full std-distance graph together with its attributes
print("Edges in the original graph (G_std_5_0_2):")
print(G_std_5_0_2.edges(data=True))

In [None]:
# Weighted path between clusters 6 and 1 along the std-deviation MST
source_node, target_node = 6, 1
path = nx.shortest_path(
    mst_std_5_0_2, source=source_node, target=target_node, weight='weight'
)
path_length = nx.shortest_path_length(
    mst_std_5_0_2, source=source_node, target=target_node, weight='weight'
)
print("Path between nodes 6 and 1:", path)
print("Path length:", path_length)

In [None]:
# Define a function to create heatmaps
def plot_heatmaps_side_by_side(matrices, titles, figsize=(16, 8), cmap="viridis", annot=True):
    """
    Plots multiple heatmaps side by side for given matrices and titles.

    Matrices and titles are paired in order; if the lists differ in length the
    extra entries of the longer one are ignored.

    Args:
        matrices (list): List of 2D matrices to plot as heatmaps.
        titles (list): List of titles corresponding to each matrix.
        figsize (tuple): Size of the entire figure (default: (16, 8)).
        cmap (str): Color map to use for all heatmaps (default: "viridis").
        annot (bool): Whether to annotate cells with their values (default: True).
    """
    n = len(matrices)  # Number of heatmaps
    # squeeze=False makes plt.subplots return a 2-D array of Axes even for a
    # single heatmap; without it a lone matrix yields a bare Axes object and
    # indexing it fails.
    fig, axes_grid = plt.subplots(1, n, figsize=figsize, squeeze=False)
    axes = axes_grid[0]

    for i, (matrix, title) in enumerate(zip(matrices, titles)):
        sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=axes[i])
        axes[i].set_title(title)
        axes[i].set_xlabel("Cluster")
        axes[i].set_ylabel("Cluster" if i == 0 else "")  # Only label y-axis for the first plot

    plt.tight_layout()
    plt.show()

# Call the function with the two heatmaps: the normalized std-deviation
# distance matrix on the left and the normalized mean distance matrix on the right
plot_heatmaps_side_by_side(
    matrices=[
        normalized_distance_matrix_std_5_0_2,
        normalized_mean_distance_matrix_5_0_2
    ],
    titles=[
        "Normalized Distance Matrix for Centroid Std Deviations (min_dist=0)",
        "Normalized Mean Distance Matrix (k=10, min_dist=0)"
    ]
)

In [None]:
# Values below this threshold count as "low"
threshold = 0.65

# Ordered cluster pairs (i, j), i != j, whose normalized mean distance AND
# normalized std distance are both below the threshold. Each unordered pair
# appears twice, as (i, j) and (j, i).
n_rows, n_cols = normalized_mean_distance_matrix_5_0_2.shape
low_low_pairs = [
    (
        i,
        j,
        normalized_mean_distance_matrix_5_0_2[i, j],
        normalized_distance_matrix_std_5_0_2[i, j],
    )
    for i in range(n_rows)
    for j in range(n_cols)
    if i != j
    and normalized_mean_distance_matrix_5_0_2[i, j] < threshold
    and normalized_distance_matrix_std_5_0_2[i, j] < threshold
]

# Show the pairs that were found
for pair in low_low_pairs:
    print(f"Clusters {pair[0]} and {pair[1]}: Mean Distance = {pair[2]:.2f}, Std Distance = {pair[3]:.2f}")

**0.65 is the lowest threshold so far.**

Depending on the goal of the analysis we can think of it as:
- If the aim is to identify the strongest relationships between clusters, a lower threshold would make more sense.
- If we want to explore the broader connections, then it is fine.

In [None]:
# Load previously saved arrays for this configuration: maximum intra-cluster
# distances, neighbor counts within the dynamic radius, and the KMeans labels
# (contents are inferred from the file names).
max_distances_5_0_2_d= np.load(f'max_intra_cluster_distances_dynamic_5_0_2.npy')
neighbor_counts_5_0_2_d= np.load(f'neighbor_counts_within_dynamic_radius_5_0_2.npy')
kmeans_labels_list_5_0_2_d= np.load(f'kmeans_labels_list_5_0_2.npy')

In [None]:
# Hard-coded cluster pairs to inspect, as (cluster_a, cluster_b, mean distance,
# std distance). NOTE: this overwrites any low_low_pairs computed earlier.
low_low_pairs = [(1, 8, 0.65, 0.19), (3, 8, 0.65, 0.22)]

# UMAP projections and KMeans labels of all runs; both are indexed by run below
umap_projections = np.load("umap_projections_neighbors_5_0_2.npy")
kmeans_labels = np.load("kmeans_labels_list_5_0_2.npy")  # Shape: (n_runs, n_samples)

# Function to plot clusters
def plot_clusters(umap_projection, labels, cluster_pair, run_idx):
    """Scatter-plot the points of two clusters of one UMAP projection.

    Args:
        umap_projection: Array of projected points; columns 0 and 1 are plotted.
        labels: Cluster label of every row of ``umap_projection``.
        cluster_pair: The two cluster labels to draw, as ``(a, b)``.
        run_idx: Run index, shown in the figure title.
    """
    cluster_a, cluster_b = cluster_pair

    plt.figure(figsize=(8, 6))
    # First cluster in blue, second in orange
    for cluster_id, point_color in ((cluster_a, "blue"), (cluster_b, "orange")):
        cluster_points = umap_projection[labels == cluster_id]
        plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
                    color=point_color, label=f"Cluster {cluster_id}", alpha=0.6)

    plt.title(f"Run {run_idx}: Cluster {cluster_a} vs. Cluster {cluster_b}")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Plot and summarise every selected cluster pair
for cluster_pair in low_low_pairs:
    cluster_a, cluster_b = cluster_pair[:2]
    print(f"Analyzing Cluster Pair: {cluster_a} and {cluster_b}")

    # Only the first run is inspected
    run_idx = 0
    run_projection = umap_projections[run_idx]
    run_labels = kmeans_labels[run_idx]
    plot_clusters(run_projection, run_labels, (cluster_a, cluster_b), run_idx)

    # Points belonging to each cluster in that run
    members_a = run_projection[run_labels == cluster_a]
    members_b = run_projection[run_labels == cluster_b]

    # Euclidean distance between the two cluster means
    distances_a_to_b = np.linalg.norm(members_a.mean(axis=0) - members_b.mean(axis=0))
    print(f"Mean Centroid Distance (Run {run_idx}): {distances_a_to_b:.2f}")

    # Per-dimension spread of each cluster
    cluster_a_std = np.std(members_a, axis=0)
    cluster_b_std = np.std(members_b, axis=0)
    print(f"Cluster {cluster_a} Std Dev: {cluster_a_std}")
    print(f"Cluster {cluster_b} Std Dev: {cluster_b_std}")
    print("\n")

**Cluster 1 and Cluster 8** have a moderate spatial relationship with visible overlap in the UMAP space. Their differing variability patterns suggest distinct structures, but the overlap points might represent shared features or transitions between the clusters.
The large spatial separation between their centroids suggests they represent distinct structures or classes in the data.

**Cluster 0 and Cluster 9**: Cluster 9 appears more compact and stable, while Cluster 0 is larger and more variable.
Their distinct regions in the UMAP space and differing standard deviations reinforce their meaningful separation.

Insights from variability: the variability of Cluster 0 could indicate sensitivity to UMAP parameters or noise in the data.

#### Intra class evaluation

In [None]:
# Running record of the closest pair of centroids seen in any run
overall_min_distance_5_0_2 = float('inf')
min_distance_clusters_5_0_2 = None
min_distance_run_idx_5_0_2 = None

for run_idx, run_centroids in enumerate(kmeans_centroids_5_0_2):
    # Centroid-to-centroid Euclidean distances for this run
    pairwise_distances_5_0_2 = cdist(run_centroids, run_centroids, metric='euclidean')

    # Mask the diagonal so a centroid is never matched with itself
    np.fill_diagonal(pairwise_distances_5_0_2, np.inf)
    min_distance = pairwise_distances_5_0_2.min()
    if not (min_distance < overall_min_distance_5_0_2):
        continue

    # New overall minimum: remember its value, the cluster pair and the run
    overall_min_distance_5_0_2 = min_distance
    cluster_indices = np.unravel_index(pairwise_distances_5_0_2.argmin(), pairwise_distances_5_0_2.shape)
    min_distance_clusters_5_0_2 = cluster_indices
    min_distance_run_idx_5_0_2 = run_idx

# The dynamic radius is half of the smallest centroid-to-centroid distance
dynamic_radius_5_0_2 = overall_min_distance_5_0_2 / 2
print(f"Dynamic radius: {dynamic_radius_5_0_2}")
print(f"Minimum distance: {overall_min_distance_5_0_2}")
print(f"Clusters contributing to minimum distance: {min_distance_clusters_5_0_2}")
print(f"Run index: {min_distance_run_idx_5_0_2}")

# Persist the radius for later cells
np.save('dynamic_radius_results_5_0_2.npy', dynamic_radius_5_0_2)


In [None]:
# Intra-cluster evaluation for the 5_0_2 projections: for every run and every
# KMeans cluster, record the largest pairwise distance inside the cluster and
# the number of points lying within `radius` of the cluster mean.

# Define parameters
n_clusters = 10

# Dynamic radius, previously calculated
radius = dynamic_radius_5_0_2

# Lists to store max distances, neighbor counts, and KMeans labels for each cluster in each run
max_distances_5_0_2_d = []
neighbor_counts_5_0_2_d = []
kmeans_labels_list_5_0_2_d = []

# Calculate intra-cluster metrics for each run and each cluster
for run_idx, x_umap in enumerate(umap_projections_5_0_2):
    # Re-run KMeans to get labels for each projection
    # (fixed random_state, so the KMeans seed is the same for every run)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_5_0_2_d = kmeans.labels_
    
    # Store the labels for this run
    kmeans_labels_list_5_0_2_d.append(kmeans_labels_5_0_2_d)
    
    run_max_distances_5_0_2_d = []
    run_neighbor_counts_5_0_2_d = []
    
    # For each cluster, calculate max intra-cluster distance and neighbor count around centroid
    for cluster_idx in range(n_clusters):
        # Get all points in the current cluster
        cluster_points = x_umap[kmeans_labels_5_0_2_d == cluster_idx]
        
        # Calculate pairwise distances within the cluster
        # (full points-by-points matrix, so memory grows quadratically with cluster size)
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')
        
        # Max distance within the cluster (0 when the cluster has fewer than two points)
        max_distance_5_0_2_d = np.max(intra_distances) if len(cluster_points) > 1 else 0
        run_max_distances_5_0_2_d.append(max_distance_5_0_2_d)
        
        # Calculate number of neighbors within the dynamic radius around the centroid
        # (the centroid is recomputed as the mean of the cluster's points)
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_5_0_2_d = np.sum(distances_to_centroid <= radius)
        
        run_neighbor_counts_5_0_2_d.append(neighbors_within_radius_5_0_2_d)
    
    # Append results for this run
    max_distances_5_0_2_d.append(run_max_distances_5_0_2_d)
    neighbor_counts_5_0_2_d.append(run_neighbor_counts_5_0_2_d)

# Convert lists to numpy arrays for easier analysis if needed
max_distances_5_0_2_d = np.array(max_distances_5_0_2_d)  # Shape: (n_runs, n_clusters)
neighbor_counts_5_0_2_d = np.array(neighbor_counts_5_0_2_d)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_5_0_2_d = np.array(kmeans_labels_list_5_0_2_d)  # Shape: (n_runs, n_samples)

# Save the max distances, neighbor counts, and KMeans labels arrays
np.save('max_intra_cluster_distances_dynamic_5_0_2.npy', max_distances_5_0_2_d)
np.save('neighbor_counts_within_dynamic_radius_5_0_2.npy', neighbor_counts_5_0_2_d)
np.save('kmeans_labels_list_5_0_2.npy', kmeans_labels_list_5_0_2_d)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_5_0_2_d)
print("\nNeighbor counts within dynamic radius for each run and each cluster:\n", neighbor_counts_5_0_2_d)
print("\nKMeans labels saved successfully.")

In [None]:
# One subplot per cluster: neighbor count as a function of the run number
plt.figure(figsize=(16, 12))

n_runs_plotted_5_0_2, n_clusters_plotted_5_0_2 = neighbor_counts_5_0_2_d.shape
run_numbers_5_0_2 = range(1, n_runs_plotted_5_0_2 + 1)  # runs are numbered from 1

for cluster_idx in range(n_clusters_plotted_5_0_2):
    plt.subplot(2, 5, cluster_idx + 1)  # 2 x 5 grid, i.e. room for 10 clusters
    plt.plot(run_numbers_5_0_2, neighbor_counts_5_0_2_d[:, cluster_idx], marker='o')
    plt.title(f'Cluster {cluster_idx}')
    plt.xlabel('Run')
    plt.ylabel('Neighbor Count')
    plt.xticks(run_numbers_5_0_2[::5])  # tick on every 5th run to keep the axis readable
    plt.grid(True)

# Tighten the layout, then add the global title above the grid
plt.tight_layout()
plt.suptitle('min_dist=0 Neighbor Counts per Cluster Across Runs', y=1.02, fontsize=16)
plt.show()

In [None]:
# Summarise the neighbor counts of each run by their mean and max across
# clusters, fit a linear trend to each summary, and plot all four curves.
mean_neighbors_5_0_2 = np.mean(neighbor_counts_5_0_2_d, axis=1)  # Shape: (n_runs,)
max_neighbors_5_0_2 = np.max(neighbor_counts_5_0_2_d, axis=1)    # Shape: (n_runs,)

# Least-squares trend lines over the run number (runs are numbered from 1)
runs = np.arange(1, len(mean_neighbors_5_0_2) + 1)
mean_slope, mean_intercept, _, _, _ = linregress(runs, mean_neighbors_5_0_2)
max_slope, max_intercept, _, _, _ = linregress(runs, max_neighbors_5_0_2)

# Evaluate the fitted lines at every run
mean_trend_5_0_2 = mean_slope * runs + mean_intercept
max_trend_5_0_2 = max_slope * runs + max_intercept

# Plot the results
plt.figure(figsize=(12, 6))

# Mean neighbor counts
plt.plot(runs, mean_neighbors_5_0_2, label='Mean Neighbor Count', marker='o', color='blue')

# Max neighbor counts
plt.plot(runs, max_neighbors_5_0_2, label='Max Neighbor Count', marker='s', color='orange')

# Trend lines: each one reuses the colour of the series it was fitted to, so the
# two dashed lines can be told apart (both were previously drawn in green).
plt.plot(runs, mean_trend_5_0_2, linestyle='--', color='blue', label='Mean Trend Line')
plt.plot(runs, max_trend_5_0_2, linestyle='--', color='orange', label='Max Trend Line')

# Add labels, legend, and title
plt.title('min_dist=0 Neighbor Counts Across Runs (Mean vs. Max with Trend Lines)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.xticks(range(1, len(mean_neighbors_5_0_2) + 1, 5))  # Show every 5th run for readability
plt.legend()
plt.grid(True)
plt.tight_layout()

# Save the plot to a file (before plt.show())
plt.savefig('neighbor_counts_plot_n_5_0_2.png', dpi=300)

# Show the plot
plt.show()

-------------

### min_dist= 0.5

In [121]:
# Reload every saved artifact of the n_neighbors=5, min_dist=0.5 experiment
umap_projections_5_05_2 = np.load('umap_projections_neighbors_5_0.5_2.npy')
centroid_mean_5_05_2 = np.load('centroid_mean_5_05_2.npy')
centroid_std_5_05_2 = np.load('centroid_std_5_05_2.npy')
kmeans_centroids_5_05_2 = np.load("kmeans_centroids_neighbors_5_0.5_2.npy")
# Read the result table written by this section (result_table_neighbors_v2_5_05_2.csv);
# the previous file name pointed at the table of the 20_35 experiment.
df_results_5_05_2 = pd.read_csv('result_table_neighbors_v2_5_05_2.csv')
mean_distance_matrix_5_05_2 = np.load('mean_distance_matrix_neighbors_5_05_2.npy')
normalized_mean_distance_matrix_5_05_2 = np.load('normalized_mean_distance_matrix_5_05_2.npy')
distance_matrix_std_5_05_2 = np.load("distance_matrix_std_5_05_2.npy")
normalized_distance_matrix_std_5_05_2 = np.load("normalized_distance_matrix_std_5_05_2.npy")
# NOTE(review): the two MST files are written by np.save from networkx graphs;
# confirm they load as intended here (np.load may need allow_pickle=True).
mst_std_5_05_2 = np.load('mst_std_5_05_2.npy')
mst_5_05_2 = np.load('mst_5_05_2.npy')

In [None]:
# Experiment configuration
n_runs, n_clusters = 35, 10  # UMAP+KMeans repetitions, clusters per KMeans fit
n_neighbors, min_dist, n_components = 5, 0.5, 2  # UMAP hyper-parameters

# Per-run accumulators, filled by the loop below
umap_projections_5_05_2 = []       # one UMAP projection per run
kmeans_centroids_list_5_05_2 = []  # one array of cluster centroids per run

# Define a helper function to calculate the centroid of each cluster
def calculate_centroids(kmeans, x_umap):
    """Return the mean point of every cluster of a fitted clustering model.

    The number of clusters is taken from ``kmeans.n_clusters`` so the result
    always matches the model that produced the labels, instead of depending on
    whatever value a notebook-level ``n_clusters`` variable currently holds.

    Args:
        kmeans: Fitted model exposing ``labels_`` (one label per row of
            ``x_umap``) and ``n_clusters``.
        x_umap: Array of points; rows are masked with the model's labels.

    Returns:
        Array with one row per cluster id ``0 .. n_clusters - 1`` holding the
        mean of that cluster's points (computed along axis 0).
    """
    labels = kmeans.labels_
    return np.array([
        np.mean(x_umap[labels == cluster_id], axis=0)
        for cluster_id in range(kmeans.n_clusters)
    ])

# Run UMAP and KMeans multiple times
for run in range(n_runs):
    # Same UMAP hyper-parameters every run; random_state=None leaves the
    # embedding unseeded so that runs differ from each other.
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)
    x_train_umap_5_05_2 = umap_model.fit_transform(x_train_flattened)

    # Apply KMeans clustering on the UMAP projection
    kmeans_5_05_2 = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans_5_05_2.fit(x_train_umap_5_05_2)

    # Calculate centroids for this run from the model fitted just above.
    # (Previously the unrelated global `kmeans` was passed, so labels from a
    # different fit were applied to this run's projection.)
    centroids_5_05_2 = calculate_centroids(kmeans_5_05_2, x_train_umap_5_05_2)

    # Store the UMAP projection and the centroids of this run
    umap_projections_5_05_2.append(x_train_umap_5_05_2)
    kmeans_centroids_list_5_05_2.append(centroids_5_05_2)

# Stack the per-run centroids
kmeans_centroids_5_05_2 = np.array(kmeans_centroids_list_5_05_2)  # Shape: (n_runs, n_clusters, 2)

# Mean and standard deviation of each cluster centroid across runs
centroid_mean_5_05_2 = np.mean(kmeans_centroids_5_05_2, axis=0)
centroid_std_5_05_2 = np.std(kmeans_centroids_5_05_2, axis=0)

# Save the UMAP projections and KMeans centroids
np.save(f'umap_projections_neighbors_{n_neighbors}_{min_dist}_{n_components}.npy', np.array(umap_projections_5_05_2))
np.save(f'kmeans_centroids_neighbors_{n_neighbors}_{min_dist}_{n_components}.npy', np.array(kmeans_centroids_list_5_05_2))

In [None]:
# Read the saved UMAP projections back from disk
umap_projections_5_05_2 = np.load('umap_projections_neighbors_5_0.5_2.npy')

# Dump the array to inspect its contents
print(umap_projections_5_05_2)

In [57]:
### NO NEED TO RE RUN ###

# Save the centroid_mean and centroid_std.
# The std file now receives the std array; it was previously written with the
# mean array, so both files held identical data.
np.save(f'centroid_mean_{n_neighbors}_05_2.npy', np.array(centroid_mean_5_05_2))
np.save(f'centroid_std_{n_neighbors}_05_2.npy', np.array(centroid_std_5_05_2))

In [58]:
# Reload the per-cluster centroid mean and standard deviation
centroid_mean_5_05_2 = np.load('centroid_mean_5_05_2.npy')
centroid_std_5_05_2 = np.load('centroid_std_5_05_2.npy')

In [59]:
# Reload the saved per-run KMeans centroids
kmeans_centroids_5_05_2 = np.load('kmeans_centroids_neighbors_5_0.5_2.npy')

------------------------

Standard deviation calculation

In [None]:
### NO NEED TO RE RUN ###
# Standard deviation, across runs, of each cluster centroid's x and y coordinate.
# kmeans_centroids_5_05_2 is indexed as [run, cluster, coordinate]; the results
# are stored as float arrays of length 10 (one entry per cluster).
std_dev_x_5_05_2 = np.array(
    [np.std(kmeans_centroids_5_05_2[:, i, 0]) for i in range(10)], dtype=float
)
std_dev_y_5_05_2 = np.array(
    [np.std(kmeans_centroids_5_05_2[:, i, 1]) for i in range(10)], dtype=float
)

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_5_05_2)
print("Standard deviation of y coordinates per cluster:", std_dev_y_5_05_2)

In [61]:
### NO NEED TO RE RUN ###
# Build one table row per (trial, cluster): the centroid of that cluster in that
# trial and whether it lies within mean +/- 2 std on both axes.
data_5_05_2 = []

for trial in range(35):
    for cluster in range(10):
        # Centroid of this cluster in this trial
        centroid_coord = kmeans_centroids_5_05_2[trial, cluster]

        # Mean +/- 2 std interval on each axis
        mean_x, mean_y = centroid_mean_5_05_2[cluster]
        lower_bound_x = mean_x - 2 * std_dev_x_5_05_2[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x_5_05_2[cluster]
        lower_bound_y = mean_y - 2 * std_dev_y_5_05_2[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y_5_05_2[cluster]

        # The centroid must fall inside the interval on x AND on y
        within_x = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        within_y = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = within_x and within_y

        # Trials are reported 1-based
        data_5_05_2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Turn the collected rows into a DataFrame
df_results_5_05_2 = pd.DataFrame(
    data_5_05_2,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)

In [62]:
### NO NEED TO RE RUN ###
# Per trial: True only when every cluster of that trial is inside the 2-std range
trials_all_true_5_05_2 = (
    df_results_5_05_2
    .groupby('Trial')['Inside 2 std']
    .all()
)

In [63]:
### NO NEED TO RE RUN ###
# Trial numbers for which the per-trial flag is True
trials_with_all_true_5_05_2 = trials_all_true_5_05_2.loc[trials_all_true_5_05_2].index.tolist()

In [None]:
### NO NEED TO RE RUN ###
# Show the trials in which every cluster passed the check
print(f"Trials where all clusters were True: {trials_with_all_true_5_05_2}")

In [None]:
### NO NEED TO RE RUN ###
# Trial numbers for which at least one cluster failed the check
trials_with_some_false_5_05_2 = trials_all_true_5_05_2.loc[~trials_all_true_5_05_2].index.tolist()

# Show those trials
print(f"Trials where some clusters were False: {trials_with_some_false_5_05_2}")

In [66]:
### NO NEED TO RE RUN ###
# Write the result table to CSV, without the index column
df_results_5_05_2.to_csv('result_table_neighbors_v2_5_05_2.csv', index=False)

-----------------

**BELOW To edit**

In [95]:
import ast

In [100]:
# Wrap the mean centroids in a DataFrame with columns 'x_mean', 'y_mean' and a
# trailing 'Cluster' id column (0..9)
centroid_mean_5_05_2_df = pd.DataFrame(
    centroid_mean_5_05_2, columns=['x_mean', 'y_mean']
).assign(Cluster=np.arange(10))

In [None]:
# Steps 1-2: Convert every 'Centroid Coord' entry into a list of floats.
def _parse_centroid_coord(value):
    """Return one 'Centroid Coord' entry as a list of floats.

    Strings are split on whitespace and/or commas after stripping the
    surrounding brackets, so both "[1.0, 2.0]" and numpy-style renderings such
    as "[ 3.  -1.2e-05]" are accepted. Non-string values (lists, arrays) are
    converted element by element, which makes the cell safe to re-run and
    usable on a table that was built in memory rather than read from CSV.
    """
    if isinstance(value, str):
        tokens = value.strip().strip('[]').replace(',', ' ').split()
        return [float(token) for token in tokens]
    return [float(component) for component in value]


df_results_5_05_2['Centroid Coord'] = df_results_5_05_2['Centroid Coord'].apply(_parse_centroid_coord)

# Step 3: Verify if each entry in 'Centroid Coord' is a list of length 2
invalid_rows = df_results_5_05_2[df_results_5_05_2['Centroid Coord'].apply(lambda x: not (isinstance(x, list) and len(x) == 2))]

In [102]:
# Split 'Centroid Coord' into separate 'x' and 'y' columns
coord_columns_5_05_2 = pd.DataFrame(
    df_results_5_05_2['Centroid Coord'].tolist(),
    index=df_results_5_05_2.index,
)
df_results_5_05_2[['x', 'y']] = coord_columns_5_05_2

# Attach each cluster's mean centroid to its rows (left join on 'Cluster')
df_merged_5_05_2 = df_results_5_05_2.merge(centroid_mean_5_05_2_df, on='Cluster', how='left')

In [None]:
# Euclidean distance between each centroid and the mean centroid of its cluster
offset_x_5_05_2 = df_merged_5_05_2['x'] - df_merged_5_05_2['x_mean']
offset_y_5_05_2 = df_merged_5_05_2['y'] - df_merged_5_05_2['y_mean']
df_merged_5_05_2['Distance_to_Mean'] = np.sqrt(offset_x_5_05_2**2 + offset_y_5_05_2**2)

# Outlier rule: keep rows up to the 90th percentile of 'Distance_to_Mean'
def filter_outliers(df):
    """Return the rows of ``df`` whose 'Distance_to_Mean' is at most the 90th
    percentile of that column (computed on ``df`` itself)."""
    distances = df['Distance_to_Mean']
    cutoff = np.percentile(distances, 90)
    keep = distances <= cutoff
    return df.loc[keep]

# Run the outlier filter separately inside each cluster, then flatten the index
df_no_outliers_5_05_2 = (
    df_merged_5_05_2
    .groupby('Cluster')
    .apply(filter_outliers)
    .reset_index(drop=True)
)

# Step 7: Remove the helper coordinate columns; the distance column is kept
df_no_outliers_cleaned_5_05_2 = df_no_outliers_5_05_2.drop(['x', 'y', 'x_mean', 'y_mean'], axis=1)

# Step 8: Compare the table shape before and after filtering
print(f"Original DataFrame size: {df_merged_5_05_2.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned_5_05_2.shape}")

# Display the final dataframe as the cell output
df_no_outliers_cleaned_5_05_2

In [104]:
# Split the filtered table by cluster id
clusters_grouped_5_05_2 = df_no_outliers_cleaned_5_05_2.groupby('Cluster')

# Map each cluster id to an array built from its 'Centroid Coord' entries
# (assumed to be [x, y] pairs)
clusters_centroids_5_05_2 = {
    cluster: np.array(group['Centroid Coord'].tolist())
    for cluster, group in clusters_grouped_5_05_2
}

In [None]:
# Count how many centroids each cluster kept after filtering
cluster_sizes_5_05_2 = {}
for cluster, centroids in clusters_centroids_5_05_2.items():
    cluster_sizes_5_05_2[cluster] = len(centroids)

# Report the count of every cluster
for cluster, size in cluster_sizes_5_05_2.items():
    print(f"Cluster {cluster} has {size} centroids considered.")

--------

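Check whether it is reasonable for every cluster to keep the same number of centroids after the outliers are filtered out. Note that every cluster starts with one centroid per run and the same percentile cut-off is applied within each cluster, so equal counts are expected largely by construction. Other possible contributing factors:
- The distance distributions are likely very similar across clusters
- The data structure is uniform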

In [None]:
# One histogram per cluster of the centroid-to-mean distances
for cluster, group in clusters_grouped_5_05_2:
    cluster_distances = group['Distance_to_Mean']

    plt.figure(figsize=(10, 5))
    plt.hist(cluster_distances, bins=10, edgecolor='black')
    plt.title(f'Cluster {cluster}: Distance to Mean Distribution')
    plt.xlabel('Distance to Mean')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
# Per-cluster percentile threshold check
def check_percentiles(df):
    """Return the 90th percentile of the 'Distance_to_Mean' column of ``df``."""
    return np.percentile(df['Distance_to_Mean'], 90)

# Print the 90th-percentile threshold of every cluster
for cluster, group in clusters_grouped_5_05_2:
    threshold = check_percentiles(group)
    print(f"Cluster {cluster}: 90th percentile threshold = {threshold}")

In [None]:
# Repeat the filtering with a stricter, 70th-percentile cut-off and compare sizes
for cluster, group in clusters_grouped_5_05_2:
    cluster_distances = group['Distance_to_Mean']

    # 70th-percentile threshold of this cluster
    threshold = np.percentile(cluster_distances, 70)

    # Rows at or below the threshold
    filtered_group = group.loc[cluster_distances <= threshold]

    # Group size before and after filtering
    print(f"Cluster {cluster}: Original size = {len(group)}, Filtered size = {len(filtered_group)}")

--------

#### Distance matrix n=5, min_dist= 0.5

##### Distance Mean matrix

**Distance matrix**: element $d_{ij}$ holds the distance between the centroid of cluster $i$ and the centroid of cluster $j$.

In [None]:
# Centroid-to-centroid distance matrix of every run, their mean across runs,
# and a min-max normalized version of that mean.
distance_matrices_5_05_2 = []

# Iterate over all runs and calculate the distance matrix for each run
for run_centroids in kmeans_centroids_5_05_2:
    # Pairwise Euclidean distance between the centroids of this run
    dist_matrix = cdist(run_centroids, run_centroids, metric='euclidean')
    distance_matrices_5_05_2.append(dist_matrix)

# Stack the per-run matrices into one array (one matrix per run)
distance_matrices_5_05_2 = np.array(distance_matrices_5_05_2)

# Calculate the mean distance matrix across all runs
mean_distance_matrix_5_05_2 = np.mean(distance_matrices_5_05_2, axis=0)

# Min-max normalize the mean distance matrix using ITS OWN minimum and maximum.
# (The maximum was previously taken from mean_distance_matrix_20_35, a matrix
# of a different experiment, so the result was not scaled to [0, 1].)
mean_distance_min_5_05_2 = np.min(mean_distance_matrix_5_05_2)
mean_distance_max_5_05_2 = np.max(mean_distance_matrix_5_05_2)
normalized_mean_distance_matrix_5_05_2 = (
    (mean_distance_matrix_5_05_2 - mean_distance_min_5_05_2)
    / (mean_distance_max_5_05_2 - mean_distance_min_5_05_2)
)

# Plot of the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_5_05_2, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5, min_dist = 0.5)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
np.save('distance_matrices_neighbors_5_05_2_all_runs.npy', distance_matrices_5_05_2)
np.save('mean_distance_matrix_neighbors_5_05_2.npy', mean_distance_matrix_5_05_2)
np.save('normalized_mean_distance_matrix_5_05_2', normalized_mean_distance_matrix_5_05_2)

# Mean distance matrix
print(f"Mean distance matrix across all runs:\n{mean_distance_matrix_5_05_2}")

**MST Analysis**

In [None]:
# Build a weighted graph from the normalized mean distance matrix
# (entries are rounded to 3 decimals before being handed to networkx).
G_5_05_2 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_5_05_2,3))
# NOTE(review): np.save is given a networkx Graph rather than an array; confirm
# the resulting file can be read back as intended (np.load may need allow_pickle=True).
np.save('G_5_05_2.npy',G_5_05_2)

# Draw the graph (seeded spring layout, so node positions are reproducible)
pos = nx.spring_layout(G_5_05_2, seed=42)  # positions for all nodes
nx.draw(G_5_05_2, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances) taken from the 'weight' edge attribute
edge_labels = nx.get_edge_attributes(G_5_05_2, 'weight')
nx.draw_networkx_edge_labels(G_5_05_2, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the mean-distance graph and draw it
mst_5_05_2 = nx.minimum_spanning_tree(G_5_05_2)
# NOTE(review): np.save is given a networkx Graph rather than an array; confirm
# the resulting file can be read back as intended (np.load may need allow_pickle=True).
np.save('mst_5_05_2.npy', mst_5_05_2)

# Define positions for all nodes (seeded spring layout)
pos = nx.spring_layout(mst_5_05_2, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_5_05_2, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST, taken from the 'weight' edge attribute
edge_labels = nx.get_edge_attributes(mst_5_05_2, 'weight')
nx.draw_networkx_edge_labels(mst_5_05_2, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST - min_dist= 0.5")
plt.show()

##### Distance Std. dev. Matrix

In [None]:
# Pairwise Euclidean distances between the clusters' centroid-std vectors
distance_matrix_std_5_05_2 = cdist(centroid_std_5_05_2, centroid_std_5_05_2, metric='euclidean')

# Min-max normalization with the matrix's own extrema
std_distance_min_5_05_2 = np.min(distance_matrix_std_5_05_2)
std_distance_max_5_05_2 = np.max(distance_matrix_std_5_05_2)
normalized_distance_matrix_std_5_05_2 = (
    (distance_matrix_std_5_05_2 - std_distance_min_5_05_2)
    / (std_distance_max_5_05_2 - std_distance_min_5_05_2)
)

# Heatmap of the normalized matrix
plt.figure(figsize=(8, 6))
sns.heatmap(normalized_distance_matrix_std_5_05_2, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Distance Matrix for Centroid Std Deviations (min_dist= 0.5)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.tight_layout()
plt.show()

# Persist both the raw and the normalized matrix for later analysis
np.save("distance_matrix_std_5_05_2.npy", distance_matrix_std_5_05_2)
np.save("normalized_distance_matrix_std_5_05_2.npy", normalized_distance_matrix_std_5_05_2)


In [None]:
# Build a weighted graph from the normalized std-distance matrix
# (entries are rounded to 3 decimals before being handed to networkx).
G_std_5_05_2 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_5_05_2,3))
# NOTE(review): np.save is given a networkx Graph rather than an array; confirm
# the resulting file can be read back as intended (np.load may need allow_pickle=True).
np.save('G_std_5_05_2.npy',G_std_5_05_2)

# Draw the graph (seeded spring layout, so node positions are reproducible)
pos = nx.spring_layout(G_std_5_05_2, seed=42)  # positions for all nodes
nx.draw(G_std_5_05_2, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances) taken from the 'weight' edge attribute
edge_labels = nx.get_edge_attributes(G_std_5_05_2, 'weight')
nx.draw_networkx_edge_labels(G_std_5_05_2, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the std-distance graph and draw it
mst_std_5_05_2 = nx.minimum_spanning_tree(G_std_5_05_2)
# NOTE(review): np.save is given a networkx Graph rather than an array; confirm
# the resulting file can be read back as intended (np.load may need allow_pickle=True).
np.save('mst_std_5_05_2.npy',mst_std_5_05_2)

# Define positions for all nodes (seeded spring layout)
pos_std_5_05_2 = nx.spring_layout(mst_std_5_05_2, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_std_5_05_2, pos_std_5_05_2, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST, taken from the 'weight' edge attribute
edge_labels_std_5_05_2 = nx.get_edge_attributes(mst_std_5_05_2, 'weight')
nx.draw_networkx_edge_labels(mst_std_5_05_2, pos_std_5_05_2, edge_labels=edge_labels_std_5_05_2, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - min_dist= 0.5")
plt.show()

In [None]:
# Define a function to create heatmaps
def plot_heatmaps_side_by_side(matrices, titles, figsize=(16, 8), cmap="viridis", annot=True):
    """
    Plots multiple heatmaps side by side for given matrices and titles.

    Works for any number of matrices >= 1, including a single one.

    Args:
        matrices (list): List of 2D matrices to plot as heatmaps.
        titles (list): List of titles corresponding to each matrix.
        figsize (tuple): Size of the entire figure (default: (16, 8)).
        cmap (str): Color map to use for all heatmaps (default: "viridis").
        annot (bool): Whether to annotate cells with their values (default: True).
    """
    n = len(matrices)  # Number of heatmaps

    # squeeze=False keeps `axes` a 2-D array of shape (1, n) even when n == 1;
    # with the default squeeze a single subplot comes back as a bare Axes
    # object and indexing it fails.
    fig, axes = plt.subplots(1, n, figsize=figsize, squeeze=False)

    for i, (ax, matrix, title) in enumerate(zip(axes[0], matrices, titles)):
        sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Cluster")
        ax.set_ylabel("Cluster" if i == 0 else "")  # Only label y-axis for the first plot

    plt.tight_layout()
    plt.show()

# Std-distance heatmap (left) next to the mean-distance heatmap (right), min_dist=0.5
plot_heatmaps_side_by_side(
    [normalized_distance_matrix_std_5_05_2, normalized_mean_distance_matrix_5_05_2],
    [
        "Normalized Distance Matrix for Centroid Std Deviations (min_dist=0.5)",
        "Normalized Mean Distance Matrix (k=10, min_dist=0.5)",
    ],
)

In [None]:
# Cut-off below which a normalized matrix entry counts as "low"
threshold = 0.4

# Collect every off-diagonal (i, j) whose entry is below the cut-off in BOTH
# the mean-distance matrix and the std-distance matrix.
low_low_pairs = []
n_rows_5_05_2, n_cols_5_05_2 = normalized_mean_distance_matrix_5_05_2.shape
for i in range(n_rows_5_05_2):
    for j in range(n_cols_5_05_2):
        if i == j:
            continue  # the diagonal compares a cluster with itself
        mean_value = normalized_mean_distance_matrix_5_05_2[i, j]
        std_value = normalized_distance_matrix_std_5_05_2[i, j]
        if mean_value < threshold and std_value < threshold:
            low_low_pairs.append((i, j, mean_value, std_value))

# Report one line per selected pair
for first_id, second_id, pair_mean, pair_std in low_low_pairs:
    print(f"Clusters {first_id} and {second_id}: Mean Distance = {pair_mean:.2f}, Std Distance = {pair_std:.2f}")

**0.4 is the lowest threshold so far. The limit is 0.35, but at that value only one pair matches.**

Depending on the goal of the analysis we can think of it as:
- If the aim is to identify the strongest relationships between clusters, a lower threshold would make more sense.
- If we want to explore the broader connections, then it is fine.

In [None]:
# Intra-cluster evaluation for the 5_05_2 projections: for every run and every
# KMeans cluster, record the largest pairwise distance inside the cluster and
# the number of points lying within `radius` of the cluster mean.

# Define parameters
n_clusters = 10

# Dynamic radius, previously calculated
# NOTE(review): dynamic_radius_5_05_2 is not assigned in the cells shown in this
# section; confirm it is computed before this cell runs.
radius = dynamic_radius_5_05_2

# Lists to store max distances, neighbor counts, and KMeans labels for each cluster in each run
max_distances_5_05_2_d = []
neighbor_counts_5_05_2_d = []
kmeans_labels_list_5_05_2_d = []

# Calculate intra-cluster metrics for each run and each cluster
for run_idx, x_umap in enumerate(umap_projections_5_05_2):
    # Re-run KMeans to get labels for each projection
    # (fixed random_state, so the KMeans seed is the same for every run)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_5_05_2_d = kmeans.labels_
    
    # Store the labels for this run
    kmeans_labels_list_5_05_2_d.append(kmeans_labels_5_05_2_d)
    
    run_max_distances_5_05_2_d = []
    run_neighbor_counts_5_05_2_d = []
    
    # For each cluster, calculate max intra-cluster distance and neighbor count around centroid
    for cluster_idx in range(n_clusters):
        # Get all points in the current cluster
        cluster_points = x_umap[kmeans_labels_5_05_2_d == cluster_idx]
        
        # Calculate pairwise distances within the cluster
        # (full points-by-points matrix, so memory grows quadratically with cluster size)
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')
        
        # Max distance within the cluster (0 when the cluster has fewer than two points)
        max_distance_5_05_2_d = np.max(intra_distances) if len(cluster_points) > 1 else 0
        run_max_distances_5_05_2_d.append(max_distance_5_05_2_d)
        
        # Calculate number of neighbors within the dynamic radius around the centroid
        # (the centroid is recomputed as the mean of the cluster's points)
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_5_05_2_d = np.sum(distances_to_centroid <= radius)
        
        run_neighbor_counts_5_05_2_d.append(neighbors_within_radius_5_05_2_d)
    
    # Append results for this run
    max_distances_5_05_2_d.append(run_max_distances_5_05_2_d)
    neighbor_counts_5_05_2_d.append(run_neighbor_counts_5_05_2_d)

# Convert lists to numpy arrays for easier analysis if needed
max_distances_5_05_2_d = np.array(max_distances_5_05_2_d)  # Shape: (n_runs, n_clusters)
neighbor_counts_5_05_2_d = np.array(neighbor_counts_5_05_2_d)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_5_05_2_d = np.array(kmeans_labels_list_5_05_2_d)  # Shape: (n_runs, n_samples)

# Save the max distances, neighbor counts, and KMeans labels arrays
np.save('max_intra_cluster_distances_dynamic_5_05_2.npy', max_distances_5_05_2_d)
np.save('neighbor_counts_within_dynamic_radius_5_05_2.npy', neighbor_counts_5_05_2_d)
np.save('kmeans_labels_list_5_05_2.npy', kmeans_labels_list_5_05_2_d)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_5_05_2_d)
print("\nNeighbor counts within dynamic radius for each run and each cluster:\n", neighbor_counts_5_05_2_d)
print("\nKMeans labels saved successfully.")

In [133]:
# Reload the previously saved intra-cluster results for the 5_05_2 configuration
max_distances_5_05_2_d = np.load('max_intra_cluster_distances_dynamic_5_05_2.npy')
neighbor_counts_5_05_2_d = np.load('neighbor_counts_within_dynamic_radius_5_05_2.npy')
kmeans_labels_list_5_05_2_d = np.load('kmeans_labels_list_5_05_2.npy')

In [None]:
# Cluster pairs to inspect, as (cluster_a, cluster_b, mean_value, std_value);
# hard-coded here rather than taken from the threshold scan.
low_low_pairs = [
    (0, 8, 0.4, 0.21),
    (0, 7, 0.3, 0.27),
    (0, 6, 0.4, 0.2),
]

# Saved UMAP projections and the KMeans labels for each run
umap_projections = np.load('umap_projections_neighbors_5_0.5_2.npy')
kmeans_labels = np.load('kmeans_labels_list_5_05_2.npy')  # Shape: (n_runs, n_samples)

# Scatter plot of two clusters taken from a single projection
def plot_clusters(umap_projection, labels, cluster_pair, run_idx):
    """Draw the points of two clusters of one projection in a single figure.

    Args:
        umap_projection: 2-D array; columns 0 and 1 are plotted as x and y.
        labels: Per-point cluster labels used to mask ``umap_projection`` rows.
        cluster_pair: Two cluster ids ``(cluster_a, cluster_b)``.
        run_idx: Run number; only used in the figure title.
    """
    cluster_a, cluster_b = cluster_pair

    # Select both point sets up front, then draw them in order (a, then b)
    selections = [
        (cluster_id, umap_projection[labels == cluster_id], colour)
        for cluster_id, colour in ((cluster_a, "blue"), (cluster_b, "orange"))
    ]

    plt.figure(figsize=(8, 6))
    for cluster_id, members, colour in selections:
        plt.scatter(members[:, 0], members[:, 1], color=colour, label=f"Cluster {cluster_id}", alpha=0.6)
    plt.title(f"Run {run_idx}: Cluster {cluster_a} vs. Cluster {cluster_b}")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Inspect every selected pair: plot it, then report centroid distance and spread
for cluster_pair in low_low_pairs:
    cluster_a, cluster_b = cluster_pair[:2]
    print(f"Analyzing Cluster Pair: {cluster_a} and {cluster_b}")

    # Only the first run is visualised
    run_idx = 0
    run_projection = umap_projections[run_idx]
    run_labels = kmeans_labels[run_idx]
    plot_clusters(run_projection, run_labels, (cluster_a, cluster_b), run_idx)

    # Members of each cluster in this run (selected once and reused below)
    members_a = run_projection[run_labels == cluster_a]
    members_b = run_projection[run_labels == cluster_b]

    # Euclidean distance between the two cluster means
    distances_a_to_b = np.linalg.norm(members_a.mean(axis=0) - members_b.mean(axis=0))
    print(f"Mean Centroid Distance (Run {run_idx}): {distances_a_to_b:.2f}")

    # Per-dimension standard deviation of each cluster
    cluster_a_std = np.std(members_a, axis=0)
    cluster_b_std = np.std(members_b, axis=0)
    print(f"Cluster {cluster_a} Std Dev: {cluster_a_std}")
    print(f"Cluster {cluster_b} Std Dev: {cluster_b_std}")
    print("\n")

**Cluster 1 and Cluster 8** have a moderate spatial relationship with visible overlap in the UMAP space. Their differing variability patterns suggest distinct structures, but the overlap points might represent shared features or transitions between the clusters.
The large spatial separation between their centroids suggests they represent distinct structures or classes in the data.

**Cluster 0 and Cluster 9**: Cluster 9 appears more compact and stable, while Cluster 0 is larger and more variable.
Their distinct regions in the UMAP space and differing standard deviations reinforce their meaningful separation.
Insights from Variability:

The variability of Cluster 0 could indicate sensitivity to UMAP parameters or noise in the data.

#### Intra class evaluation

In [None]:
# Smallest centroid-to-centroid distance seen in any run, and where it occurred
overall_min_distance_5_05_2 = float('inf')
min_distance_clusters_5_05_2 = None
min_distance_run_idx_5_05_2 = None

for run_idx, run_centroids in enumerate(kmeans_centroids_5_05_2):
    # All centroid-to-centroid distances of this run
    pairwise_distances_5_05_2 = cdist(run_centroids, run_centroids, metric='euclidean')

    # Mask the diagonal so a centroid is never paired with itself
    np.fill_diagonal(pairwise_distances_5_05_2, np.inf)

    min_distance = np.min(pairwise_distances_5_05_2)
    if not (min_distance < overall_min_distance_5_05_2):
        continue  # this run does not improve on the best pair found so far

    # New overall minimum: remember the distance, the (row, column) pair and the run
    overall_min_distance_5_05_2 = min_distance
    cluster_indices = np.unravel_index(
        np.argmin(pairwise_distances_5_05_2), pairwise_distances_5_05_2.shape
    )
    min_distance_clusters_5_05_2 = cluster_indices
    min_distance_run_idx_5_05_2 = run_idx

# The dynamic radius is half of the closest centroid pair's distance
dynamic_radius_5_05_2 = overall_min_distance_5_05_2 / 2
print(f"Dynamic radius: {dynamic_radius_5_05_2}")
print(f"Minimum distance: {overall_min_distance_5_05_2}")
print(f"Clusters contributing to minimum distance: {min_distance_clusters_5_05_2}")
print(f"Run index: {min_distance_run_idx_5_05_2}")

# Persist the radius for later cells
np.save('dynamic_radius_results_5_05_2.npy', dynamic_radius_5_05_2)


In [None]:
# One small line plot per cluster: neighbor count as a function of the run number
plt.figure(figsize=(16, 12))

# Runs are numbered from 1 on the x-axis
run_numbers_5_05_2 = range(1, neighbor_counts_5_05_2_d.shape[0] + 1)

# Transposing yields one row per cluster, i.e. that cluster's counts over all runs
for cluster_idx, counts_over_runs in enumerate(neighbor_counts_5_05_2_d.T):
    plt.subplot(2, 5, cluster_idx + 1)  # fixed 2 x 5 grid, i.e. room for 10 clusters
    plt.plot(run_numbers_5_05_2, counts_over_runs, marker='o')
    plt.title(f'Cluster {cluster_idx}')
    plt.xlabel('Run')
    plt.ylabel('Neighbor Count')
    plt.xticks(run_numbers_5_05_2[::5])  # tick every 5th run to keep the axis readable
    plt.grid(True)

# Tighten the layout, then place the global title slightly above the grid
plt.tight_layout()
plt.suptitle('min_dist=0.5 Neighbor Counts per Cluster Across Runs', y=1.02, fontsize=16)
plt.show()

In [None]:
# Per-run summary of the neighbor counts: average and maximum over the clusters
mean_neighbors_5_05_2 = np.mean(neighbor_counts_5_05_2_d, axis=1)  # one value per run
max_neighbors_5_05_2 = np.max(neighbor_counts_5_05_2_d, axis=1)    # one value per run

# Least-squares trend of each summary over the run number (runs are numbered from 1)
runs = np.arange(1, len(mean_neighbors_5_05_2) + 1)
mean_slope, mean_intercept, _, _, _ = linregress(runs, mean_neighbors_5_05_2)
max_slope, max_intercept, _, _, _ = linregress(runs, max_neighbors_5_05_2)

# Fitted line values at every run
mean_trend_5_05_2 = mean_slope * runs + mean_intercept
max_trend_5_05_2 = max_slope * runs + max_intercept

plt.figure(figsize=(12, 6))

# Observed series
plt.plot(runs, mean_neighbors_5_05_2, label='Mean Neighbor Count', marker='o', color='blue')
plt.plot(runs, max_neighbors_5_05_2, label='Max Neighbor Count', marker='s', color='orange')

# Trend lines: each gets its own colour, otherwise the two dashed lines
# cannot be told apart in the legend
plt.plot(runs, mean_trend_5_05_2, linestyle='--', color='green', label='Mean Trend Line')
plt.plot(runs, max_trend_5_05_2, linestyle='--', color='red', label='Max Trend Line')

# Labels, legend and title
plt.title('min_dist=0.5 Neighbor Counts Across Runs (Mean vs. Max with Trend Lines)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.xticks(range(1, len(mean_neighbors_5_05_2) + 1, 5))  # tick every 5th run for readability
plt.legend()
plt.grid(True)
plt.tight_layout()

# Write the figure to disk before showing it
plt.savefig('neighbor_counts_plot_n_5_05_2.png', dpi=300)

plt.show()

-------------

### min_dist= 0, n=10

In [187]:
# Saved artefacts of the n_neighbors=10, min_dist=0, n_components=2 experiment
# (file names are fixed, so plain string literals are used)
umap_projections_10_0_2 = np.load('umap_projections_n_neighbors_10_min_dist_0_n_comp_2.npy')
centroid_mean_10_0_2 = np.load('centroid_mean_n_neighbors_10_min_dist_0_n_comp_2.npy')
centroid_std_10_0_2 = np.load('centroid_std_n_neighbors_10_min_dist_0_n_comp_2.npy')
kmeans_centroids_10_0_2 = np.load('kmeans_centroids_n_neighbors_10_min_dist_0_n_comp_2.npy')
# Per-trial / per-cluster table written by the "Inside 2 std" cell
df_results_10_0_2 = pd.read_csv('df_results_10_0_2.csv')
# mean_distance_matrix_5_05_2= np.load(f'mean_distance_matrix_neighbors_5_05_2.npy')
# normalized_mean_distance_matrix_5_05_2= np.load(f'normalized_mean_distance_matrix_5_05_2.npy')
# distance_matrix_std_5_05_2= np.load(f"distance_matrix_std_5_05_2.npy")
# normalized_distance_matrix_std_5_05_2= np.load(f"normalized_distance_matrix_std_5_05_2.npy")
# mst_std_5_05_2= np.load(f'mst_std_5_05_2.npy')
# mst_5_05_2= np.load(f'mst_5_05_2.npy')


In [None]:
# UMAP settings of this experiment
n_neighbors = 10
min_dist = 0
n_components = 2

# Clustering / repetition settings
n_clusters = 10  # k for KMeans
n_runs = 35      # number of independent UMAP + KMeans repetitions

# Per-run accumulators, filled by the run loop
kmeans_centroids_list = []  # cluster centroids of each run
umap_projections_list = []  # UMAP embedding of each run

# Helper: mean position of every cluster in the embedding
def calculate_centroids(labels, x_umap, n_clusters):
    """Return the centroid of each cluster as a stacked array.

    Args:
        labels: cluster id of every row of ``x_umap``.
        x_umap: embedded points; rows are selected with a boolean mask on ``labels``.
        n_clusters: number of cluster ids to evaluate (``0 .. n_clusters - 1``).

    Returns:
        Array whose i-th entry is the mean of the rows labelled ``i``.
    """
    per_cluster_means = []
    for cluster_id in range(n_clusters):
        members = x_umap[labels == cluster_id]
        per_cluster_means.append(np.mean(members, axis=0))
    return np.array(per_cluster_means)

# Repeat the UMAP embedding + KMeans clustering n_runs times
for run in range(n_runs):
    # Embed the flattened training images; random_state=None leaves UMAP unseeded
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    x_umap = umap_model.fit_transform(x_train_flattened)

    # Cluster the embedding (fit returns the fitted estimator itself)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(x_umap)

    # Mean position of every cluster in this run's embedding
    centroids = calculate_centroids(kmeans.labels_, x_umap, n_clusters)

    umap_projections_list.append(x_umap)
    kmeans_centroids_list.append(centroids)

# Stack the per-run results: first axis is the run
umap_projections_array = np.array(umap_projections_list)
kmeans_centroids_array = np.array(kmeans_centroids_list)

# Mean and spread of every cluster centroid over the runs
centroid_mean_array = np.mean(kmeans_centroids_array, axis=0)
centroid_std_array = np.std(kmeans_centroids_array, axis=0)

# Save everything under a shared, parameter-derived file-name suffix
run_tag = f'n_neighbors_{n_neighbors}_min_dist_{min_dist}_n_comp_{n_components}'
for file_prefix, payload in (
    ('umap_projections', umap_projections_array),
    ('kmeans_centroids', kmeans_centroids_array),
    ('centroid_mean', centroid_mean_array),
    ('centroid_std', centroid_std_array),
):
    np.save(f'{file_prefix}_{run_tag}.npy', payload)


In [None]:
# # Load the UMAP projections
# umap_projections_10_0_2 = np.load(f'umap_projections_n_neighbors_10_min_dist_0_n_comp_2.npy')

# # To see the contents of the UMAP projections
# print(umap_projections_10_0_2)

In [None]:
### NO NEED TO RE RUN ###

# Save the centroid_mean and centroid_std
# np.save(f'centroid_mean_10_0_2.npy', np.array(centroid_mean_10_0_2))
# np.save(f'centroid_std_10_0_2.npy', np.array(centroid_std_10_0_2))

In [None]:
# kmeans_centroids_10_0_2 = np.load(f"kmeans_centroids_neighbors_10_0_2.npy")  # Load the saved centroids data

In [None]:
# Spread of each cluster's centroid across all runs, separately for x and y.
# kmeans_centroids_10_0_2 is indexed as [run, cluster, coordinate].
std_dev_x_neighbors_10_min_dist_0_n_components_2 = np.array(
    [np.std(kmeans_centroids_10_0_2[:, cluster_idx, 0]) for cluster_idx in range(10)],
    dtype=np.float64,
)
std_dev_y_neighbors_10_min_dist_0_n_components_2 = np.array(
    [np.std(kmeans_centroids_10_0_2[:, cluster_idx, 1]) for cluster_idx in range(10)],
    dtype=np.float64,
)

# Report both vectors (one entry per cluster)
print("Standard deviation of x coordinates per cluster:",
      std_dev_x_neighbors_10_min_dist_0_n_components_2)
print("Standard deviation of y coordinates per cluster:",
      std_dev_y_neighbors_10_min_dist_0_n_components_2)


In [191]:
# Rows of the result table: [trial number, cluster id, centroid, inside-2-std flag]
data_neighbors_10_min_dist_0_n_components_2 = []

for trial in range(35):
    for cluster in range(10):
        # Centroid of this cluster in this trial
        centroid_coord = kmeans_centroids_10_0_2[trial, cluster]

        # Axis-aligned box of mean +/- 2 standard deviations for this cluster
        mean_x, mean_y = centroid_mean_10_0_2[cluster]
        two_std_x = 2 * std_dev_x_neighbors_10_min_dist_0_n_components_2[cluster]
        two_std_y = 2 * std_dev_y_neighbors_10_min_dist_0_n_components_2[cluster]
        lower_bound_x, upper_bound_x = mean_x - two_std_x, mean_x + two_std_x
        lower_bound_y, upper_bound_y = mean_y - two_std_y, mean_y + two_std_y

        # The centroid counts as "inside" only when both coordinates are within the box
        x_within_bounds = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        y_within_bounds = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std_neighbors_10_min_dist_0_n_components_2 = x_within_bounds and y_within_bounds

        # Trials are reported 1-based
        data_neighbors_10_min_dist_0_n_components_2.append(
            [trial + 1, cluster, centroid_coord, inside_2_std_neighbors_10_min_dist_0_n_components_2]
        )

# Assemble the rows into a table
df_results_neighbors_10_min_dist_0_n_components_2 = pd.DataFrame(
    data_neighbors_10_min_dist_0_n_components_2,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)


In [192]:
# For every trial: True only when each of its clusters has 'Inside 2 std' set
inside_flags_by_trial_10_0_2 = df_results_neighbors_10_min_dist_0_n_components_2.groupby('Trial')['Inside 2 std']
trials_all_true_10_0_2 = inside_flags_by_trial_10_0_2.all()

In [193]:
# Trial numbers whose per-trial flag is True (the Series is used as its own boolean mask)
fully_inside_trials_10_0_2 = trials_all_true_10_0_2.loc[trials_all_true_10_0_2]
trials_with_all_true_10_0_2 = fully_inside_trials_10_0_2.index.tolist()

In [None]:
# Print the trial numbers in which every cluster centroid stayed inside its 2-std box
print("Trials where all clusters were True:", trials_with_all_true_10_0_2)

In [None]:
# Trials in which at least one cluster centroid fell outside its 2-std box
some_false_mask_10_0_2 = ~trials_all_true_10_0_2
trials_with_some_false_10_0_2 = trials_all_true_10_0_2[some_false_mask_10_0_2].index.tolist()

# Report them
print("Trials where some clusters were False:", trials_with_some_false_10_0_2)

In [196]:
# Write the per-trial / per-cluster table to disk without the row index
df_results_neighbors_10_min_dist_0_n_components_2.to_csv('df_results_10_0_2.csv', index=False)

-------------

Removal outliers process

In [197]:
# Table of mean centroids: columns x_mean, y_mean, plus the cluster id (0..9) as 'Cluster'
centroid_mean_10_0_2_df = pd.DataFrame(
    centroid_mean_10_0_2, columns=['x_mean', 'y_mean']
).assign(Cluster=np.arange(10))

In [None]:
# Display the mean-centroid table (x_mean, y_mean, Cluster) as the cell output
centroid_mean_10_0_2_df

In [None]:
def _parse_centroid_coord(value):
    """Return a centroid as a list of floats.

    Accepts either an array-like pair or its text form (e.g. "[ 1.23 -4.56]"),
    which is what the 'Centroid Coord' column contains once the table has been
    written with ``to_csv`` and read back with ``read_csv``.
    """
    if isinstance(value, str):
        return [float(token) for token in value.replace(',', ' ').strip('[] ').split()]
    return [float(component) for component in value]


# df_results_10_0_2 is loaded from CSV, so 'Centroid Coord' holds strings; convert
# every entry to a numeric [x, y] pair before splitting it into columns
centroid_pairs_10_0_2 = [
    _parse_centroid_coord(value) for value in df_results_10_0_2['Centroid Coord']
]

# Keep the numeric pairs in the table so later cells can treat them as [x, y]
df_results_10_0_2['Centroid Coord'] = pd.Series(centroid_pairs_10_0_2, index=df_results_10_0_2.index)

# Extract x and y coordinates
df_results_10_0_2[['x', 'y']] = pd.DataFrame(centroid_pairs_10_0_2, index=df_results_10_0_2.index)

# Attach each cluster's mean centroid (x_mean, y_mean) to every row of that cluster
df_merged = pd.merge(df_results_10_0_2, centroid_mean_10_0_2_df, on='Cluster', how='left')

In [None]:
# Plot how each cluster centroid moves across runs, first all X plots, then all Y plots.
# The centroids of THIS configuration (n_neighbors=10, min_dist=0) are used, and the run /
# cluster counts are read from the array itself so the cell does not depend on the
# experiment cell having been re-run.
n_runs_10_0_2, n_clusters_10_0_2 = kmeans_centroids_10_0_2.shape[:2]
run_numbers_10_0_2 = range(1, n_runs_10_0_2 + 1)

for coord_idx, coord_name, line_color in ((0, 'X', 'b'), (1, 'Y', 'g')):
    for cluster in range(n_clusters_10_0_2):
        plt.figure(figsize=(10, 6))
        plt.plot(
            run_numbers_10_0_2,
            kmeans_centroids_10_0_2[:, cluster, coord_idx],
            marker='o',
            linestyle='-',
            color=line_color,
        )
        plt.title(f'Cluster {cluster} {coord_name}-coordinate Change Across Runs')
        plt.xlabel('Run')
        plt.ylabel(f'{coord_name} Centroid Coordinate')
        plt.grid(True)
        plt.show()

In [None]:
# Euclidean distance of every per-run centroid from the mean centroid of its cluster
offset_x_10_0_2 = df_merged['x'] - df_merged['x_mean']
offset_y_10_0_2 = df_merged['y'] - df_merged['y_mean']
df_merged['Distance_to_Mean'] = np.sqrt(offset_x_10_0_2**2 + offset_y_10_0_2**2)

# Outlier rule: drop the rows whose distance exceeds the 90th percentile of the frame
def filter_outliers(df):
    """Return the rows of ``df`` whose 'Distance_to_Mean' is at most its 90th percentile."""
    distances = df['Distance_to_Mean']
    cutoff = np.percentile(distances, 90)
    keep_mask = distances <= cutoff
    return df[keep_mask]

# Apply the percentile filter separately within every cluster
df_no_outliers = df_merged.groupby('Cluster').apply(filter_outliers).reset_index(drop=True)

# Drop the helper coordinate columns; 'Distance_to_Mean' is kept for the later checks
df_no_outliers_cleaned_10_0_2 = df_no_outliers.drop(columns=['x', 'y', 'x_mean', 'y_mean'])

# Compare sizes before and after filtering (reports the frame built just above)
print(f"Original DataFrame size: {df_merged.shape}")
print(f"DataFrame size after removing outliers: {df_no_outliers_cleaned_10_0_2.shape}")

# Display the final dataframe
df_no_outliers_cleaned_10_0_2

In [None]:
# Split the outlier-free table by cluster
clusters_grouped_10_0_2 = df_no_outliers_cleaned_10_0_2.groupby('Cluster')

# Map every cluster id to an array built from that cluster's 'Centroid Coord' entries
# (assumes the column holds [x, y] pairs)
clusters_centroids_10_0_2 = {
    cluster: np.array(group['Centroid Coord'].tolist())
    for cluster, group in clusters_grouped_10_0_2
}

In [None]:
# Count how many centroids each cluster kept after filtering, reporting as we go
cluster_sizes_10_0_2 = {}
for cluster, centroids in clusters_centroids_10_0_2.items():
    size = len(centroids)
    cluster_sizes_10_0_2[cluster] = size
    print(f"Cluster {cluster} has {size} centroids considered.")

--------

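Check whether it is expected that every cluster keeps the same number of centroids after the outliers are filtered out. Note that the threshold is a per-cluster percentile and every cluster starts with one centroid per run, so the filter keeps roughly the same fraction of each cluster by construction. Other possible reasons:
- The distance distributions are likely very similar across clusters
- The data structure is uniform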

In [None]:
# Histogram of the distance-to-mean values of every cluster.
# Iterates the grouping built for this configuration (clusters_grouped_10_0_2).
for cluster, group in clusters_grouped_10_0_2:
    plt.figure(figsize=(10, 5))
    plt.hist(group['Distance_to_Mean'], bins=10, edgecolor='black')
    plt.title(f'Cluster {cluster}: Distance to Mean Distribution')
    plt.xlabel('Distance to Mean')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
# Helper used to inspect the outlier cut-off of a single cluster
def check_percentiles(df):
    """Return the 90th percentile of the 'Distance_to_Mean' column of ``df``."""
    return np.percentile(df['Distance_to_Mean'], 90)

# Print the 90th-percentile threshold of every cluster of this configuration
# (uses clusters_grouped_10_0_2, the grouping built from the outlier-free table)
for cluster, group in clusters_grouped_10_0_2:
    threshold = check_percentiles(group)
    print(f"Cluster {cluster}: 90th percentile threshold = {threshold}")

In [None]:
# For every cluster of this configuration, apply a stricter 70th-percentile cut and
# report how many centroids survive (uses clusters_grouped_10_0_2)
for cluster, group in clusters_grouped_10_0_2:
    # 70th percentile of the distances within the current cluster
    threshold = np.percentile(group['Distance_to_Mean'], 70)

    # Keep the centroids at or below that distance
    filtered_group = group[group['Distance_to_Mean'] <= threshold]

    # Size before and after filtering
    print(f"Cluster {cluster}: Original size = {len(group)}, Filtered size = {len(filtered_group)}")

------

#### Distance matrix n=10

##### Distance Mean Matrix

**Distance matrix**: element $d_{ij}$ is the distance between the centroids of cluster $i$ and cluster $j$.

In [None]:
# Centroid-to-centroid distance matrix of every run
distance_matrices_10_0_2 = []

# Iterate over the runs of THIS configuration (n_neighbors=10, min_dist=0)
for run_centroids in kmeans_centroids_10_0_2:
    # Pairwise Euclidean distances between the centroids of this run
    distance_matrix_10_0_2 = cdist(run_centroids, run_centroids, metric='euclidean')
    distance_matrices_10_0_2.append(distance_matrix_10_0_2)

# Stack into one array: first axis is the run
distance_matrices_10_0_2 = np.array(distance_matrices_10_0_2)

# Average distance matrix over all runs
mean_distance_matrix_10_0_2 = np.mean(distance_matrices_10_0_2, axis=0)

# Min-max normalisation; minimum and maximum must both come from this same matrix
mean_distance_min_10_0_2 = np.min(mean_distance_matrix_10_0_2)
mean_distance_max_10_0_2 = np.max(mean_distance_matrix_10_0_2)
normalized_mean_distance_matrix_10_0_2 = (
    (mean_distance_matrix_10_0_2 - mean_distance_min_10_0_2)
    / (mean_distance_max_10_0_2 - mean_distance_min_10_0_2)
)

# Heatmap of the normalised mean distance matrix
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_10_0_2, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=10)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the per-run matrices and their mean
np.save('distance_matrices_neighbors_10_all_runs.npy', distance_matrices_10_0_2)
np.save('mean_distance_matrix_neighbors_10_0_2.npy', mean_distance_matrix_10_0_2)

# Mean distance matrix
print(f"Mean distance matrix across all runs:\n{mean_distance_matrix_10_0_2}")

In [None]:
# Reload the saved mean distance matrix
mean_distance_matrix_10_0_2 = np.load('mean_distance_matrix_neighbors_10_0_2.npy')
# mean_distance_matrix_10_0_2=np.round(mean_distance_matrix_10_0_2,3)

# Min-max normalisation of the mean distance matrix
smallest_mean_distance_10_0_2 = np.min(mean_distance_matrix_10_0_2)
largest_mean_distance_10_0_2 = np.max(mean_distance_matrix_10_0_2)
normalized_mean_distance_matrix_10_0_2 = (
    (mean_distance_matrix_10_0_2 - smallest_mean_distance_10_0_2)
    / (largest_mean_distance_10_0_2 - smallest_mean_distance_10_0_2)
)

In [None]:
# Build a weighted graph whose edge weights are the rounded normalised mean distances
G_10_0_2 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_10_0_2, 3))

# Persist the graph as its weighted adjacency matrix. Passing the Graph object itself
# to np.save goes through NumPy's array coercion, which does not keep edges or weights;
# the adjacency matrix can be reloaded with np.load and turned back into a graph with
# nx.from_numpy_array.
np.save('G_10_0_2.npy', nx.to_numpy_array(G_10_0_2))

# Draw the graph with a reproducible spring layout
pos = nx.spring_layout(G_10_0_2, seed=42)  # positions for all nodes
nx.draw(G_10_0_2, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Annotate every edge with its weight (distance)
edge_labels = nx.get_edge_attributes(G_10_0_2, 'weight')
nx.draw_networkx_edge_labels(G_10_0_2, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Minimum spanning tree of the mean-distance graph
mst_10_0_2 = nx.minimum_spanning_tree(G_10_0_2)

# Persist the tree as its weighted adjacency matrix. Passing the Graph object itself
# to np.save goes through NumPy's array coercion, which does not keep edges or weights;
# the matrix can be reloaded with np.load and rebuilt with nx.from_numpy_array.
np.save('mst_10_0_2.npy', nx.to_numpy_array(mst_10_0_2))

# Reproducible layout for the tree's nodes
pos = nx.spring_layout(mst_10_0_2, seed=42)

# Draw only the tree edges
nx.draw(mst_10_0_2, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Annotate every tree edge with its weight (distance)
edge_labels = nx.get_edge_attributes(mst_10_0_2, 'weight')
nx.draw_networkx_edge_labels(mst_10_0_2, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST - n_neighbors=10")
plt.show()

##### Distance Std. dev. Matrix

In [None]:
# Pairwise Euclidean distance between the clusters' centroid standard-deviation vectors
distance_matrix_std_10_0_2 = cdist(centroid_std_10_0_2, centroid_std_10_0_2, metric='euclidean')

# Min-max normalisation of that matrix
smallest_std_distance_10_0_2 = np.min(distance_matrix_std_10_0_2)
largest_std_distance_10_0_2 = np.max(distance_matrix_std_10_0_2)
normalized_distance_matrix_std_10_0_2 = (
    (distance_matrix_std_10_0_2 - smallest_std_distance_10_0_2)
    / (largest_std_distance_10_0_2 - smallest_std_distance_10_0_2)
)

# Heatmap of the normalised matrix
plt.figure(figsize=(8, 6))
sns.heatmap(normalized_distance_matrix_std_10_0_2, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=10)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.tight_layout()
plt.show()

# Keep both the raw and the normalised matrix for later analysis
for npy_name, npy_payload in (
    ("distance_matrix_std_10_0_2.npy", distance_matrix_std_10_0_2),
    ("normalized_distance_matrix_std_10_0_2.npy", normalized_distance_matrix_std_10_0_2),
):
    np.save(npy_name, npy_payload)


In [None]:
# Build a weighted graph whose edge weights are the rounded normalised std distances
G_std_10_0_2 = nx.from_numpy_array(np.round(normalized_distance_matrix_std_10_0_2, 3))

# Persist the graph as its weighted adjacency matrix. Passing the Graph object itself
# to np.save goes through NumPy's array coercion, which does not keep edges or weights;
# the matrix can be reloaded with np.load and rebuilt with nx.from_numpy_array.
np.save('G_std_10_0_2.npy', nx.to_numpy_array(G_std_10_0_2))

# Draw the graph with a reproducible spring layout
pos = nx.spring_layout(G_std_10_0_2, seed=42)  # positions for all nodes
nx.draw(G_std_10_0_2, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Annotate every edge with its weight (distance)
edge_labels = nx.get_edge_attributes(G_std_10_0_2, 'weight')
nx.draw_networkx_edge_labels(G_std_10_0_2, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Minimum spanning tree of the std-distance graph
mst_std_10_0_2 = nx.minimum_spanning_tree(G_std_10_0_2)

# Persist the tree as its weighted adjacency matrix. Passing the Graph object itself
# to np.save goes through NumPy's array coercion, which does not keep edges or weights;
# the matrix can be reloaded with np.load and rebuilt with nx.from_numpy_array.
np.save('mst_std_10_0_2.npy', nx.to_numpy_array(mst_std_10_0_2))

# Reproducible layout for the tree's nodes
pos_std_10_0_2 = nx.spring_layout(mst_std_10_0_2, seed=42)

# Draw only the tree edges
nx.draw(mst_std_10_0_2, pos_std_10_0_2, with_labels=True, node_color='lightyellow', edge_color='green', node_size=500, font_size=10, width=2)

# Annotate every tree edge with its weight (distance)
edge_labels_std_10_0_2 = nx.get_edge_attributes(mst_std_10_0_2, 'weight')
nx.draw_networkx_edge_labels(mst_std_10_0_2, pos_std_10_0_2, edge_labels=edge_labels_std_10_0_2, font_size=8, label_pos=0.3)

plt.title("MST Std. Deviation - n_neighbors=10")
plt.show()

In [None]:
def plot_heatmap(matrix, title, xlabel, ylabel, figsize=(10, 8), cmap="viridis", annot=True):
    """
    Draw one annotated heatmap in a new figure and show it.

    Args:
        matrix (ndarray): 2D matrix rendered as the heatmap.
        title (str): Text placed above the heatmap.
        xlabel (str): Label of the horizontal axis.
        ylabel (str): Label of the vertical axis.
        figsize (tuple): Figure size passed to ``plt.figure`` (default: (10, 8)).
        cmap (str): Colour map name (default: "viridis").
        annot (bool): Write each cell's value into the cell (default: True).
    """
    plt.figure(figsize=figsize)
    # sns.heatmap draws into the current axes and returns them
    heatmap_ax = sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5)
    heatmap_ax.set_title(title)
    heatmap_ax.set_xlabel(xlabel)
    heatmap_ax.set_ylabel(ylabel)
    plt.tight_layout()
    plt.show()

# Draw the std-distance heatmap first, then the mean-distance heatmap
for heatmap_matrix, heatmap_title in (
    (
        normalized_distance_matrix_std_10_0_2,
        "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=10)",
    ),
    (
        normalized_mean_distance_matrix_10_0_2,
        "Normalized Mean Distance Matrix (k=10, n_neighbors=10)",
    ),
):
    plot_heatmap(
        heatmap_matrix,
        title=heatmap_title,
        xlabel="Cluster",
        ylabel="Cluster",
        figsize=(8, 6),
    )

In [None]:
# Define a function to create heatmaps
def plot_heatmaps_side_by_side(matrices, titles, figsize=(16, 8), cmap="viridis", annot=True):
    """
    Plots multiple heatmaps side by side for given matrices and titles.

    Works for any number of matrices >= 1, including a single one.

    Args:
        matrices (list): List of 2D matrices to plot as heatmaps.
        titles (list): List of titles corresponding to each matrix.
        figsize (tuple): Size of the entire figure (default: (16, 8)).
        cmap (str): Color map to use for all heatmaps (default: "viridis").
        annot (bool): Whether to annotate cells with their values (default: True).
    """
    n = len(matrices)  # Number of heatmaps

    # squeeze=False always returns a 2-D array of axes; without it a single
    # heatmap would yield a bare Axes object that cannot be indexed
    fig, axes_grid = plt.subplots(1, n, figsize=figsize, squeeze=False)
    axes = axes_grid[0]

    for i, (ax, matrix, title) in enumerate(zip(axes, matrices, titles)):
        sns.heatmap(matrix, annot=annot, cmap=cmap, fmt=".2f", linewidths=0.5, ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Cluster")
        ax.set_ylabel("Cluster" if i == 0 else "")  # Only label y-axis for the first plot

    plt.tight_layout()
    plt.show()

# Show the std-distance and mean-distance heatmaps next to each other.
# Titles state n_neighbors=10, the configuration these matrices were computed for.
plot_heatmaps_side_by_side(
    matrices=[
        normalized_distance_matrix_std_10_0_2,
        normalized_mean_distance_matrix_10_0_2
    ],
    titles=[
        "Normalized Distance Matrix for Centroid Std Deviations (n_neighbors=10)",
        "Normalized Mean Distance Matrix (k=10, n_neighbors=10)"
    ]
)

In [None]:
# Cut-off below which a normalised distance counts as "low"
threshold = 0.65

# Collect every ordered off-diagonal pair (i, j) whose mean distance AND std distance
# are both below the cut-off, together with the two values
low_low_pairs = [
    (
        i,
        j,
        normalized_mean_distance_matrix_10_0_2[i, j],
        normalized_distance_matrix_std_10_0_2[i, j],
    )
    for i in range(normalized_mean_distance_matrix_10_0_2.shape[0])
    for j in range(normalized_mean_distance_matrix_10_0_2.shape[1])
    if i != j
    and normalized_mean_distance_matrix_10_0_2[i, j] < threshold
    and normalized_distance_matrix_std_10_0_2[i, j] < threshold
]

# Report the selected pairs
for pair in low_low_pairs:
    print(f"Clusters {pair[0]} and {pair[1]}: Mean Distance = {pair[2]:.2f}, Std Distance = {pair[3]:.2f}")

0.65 can seem like a high value, since it is on the upper-mid range.

Depending on the goal of the analysis we can think of it as:
- If the aim is to identify the strongest relationships between clusters, a lower threshold would make more sense.
- If we want to explore the broader connections, then it is fine.

In [None]:
# Hand-picked cluster pairs to inspect; each tuple is
# (cluster_a, cluster_b, normalized mean distance, normalized std distance)
low_low_pairs = [
    (0, 9, 0.64, 0.18),
    (0, 6, 0.64, 0.23),
    (7, 9, 0.62, 0.26),
]

# Per-run UMAP embeddings and the matching per-run KMeans labels
# NOTE(review): this file name differs from the one saved by the experiment cell
# (umap_projections_n_neighbors_10_min_dist_0_n_comp_2.npy) - confirm it is the intended file
umap_projections = np.load("umap_projections_neighbors_10.npy")
kmeans_labels = np.load("kmeans_labels_list_10_0_2.npy")  # one row of labels per run

# Scatter plot of two clusters taken from a single UMAP run
def plot_clusters(umap_projection, labels, cluster_pair, run_idx):
    """Draw the members of two clusters on one scatter plot.

    Args:
        umap_projection: 2-D embedding of one run; rows are selected with a
            boolean mask built from ``labels``.
        labels: cluster label of every row of ``umap_projection``.
        cluster_pair: ``(cluster_a, cluster_b)`` ids to compare.
        run_idx: run number, used only in the figure title.
    """
    cluster_a, cluster_b = cluster_pair

    # Select the members first, then draw them in a fixed colour per cluster
    members_by_cluster = [
        (cluster_a, "blue", umap_projection[labels == cluster_a]),
        (cluster_b, "orange", umap_projection[labels == cluster_b]),
    ]

    plt.figure(figsize=(8, 6))
    for cluster_id, colour, members in members_by_cluster:
        plt.scatter(members[:, 0], members[:, 1], color=colour, label=f"Cluster {cluster_id}", alpha=0.6)
    plt.title(f"Run {run_idx}: Cluster {cluster_a} vs. Cluster {cluster_b}")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Inspect every selected pair: plot it, then report centroid distance and spread
for cluster_pair in low_low_pairs:
    cluster_a, cluster_b = cluster_pair[:2]
    print(f"Analyzing Cluster Pair: {cluster_a} and {cluster_b}")

    # Only the first run is visualised
    run_idx = 0
    run_projection = umap_projections[run_idx]
    run_labels = kmeans_labels[run_idx]
    plot_clusters(run_projection, run_labels, (cluster_a, cluster_b), run_idx)

    # Members of each cluster in this run (selected once and reused below)
    members_a = run_projection[run_labels == cluster_a]
    members_b = run_projection[run_labels == cluster_b]

    # Euclidean distance between the two cluster means
    distances_a_to_b = np.linalg.norm(members_a.mean(axis=0) - members_b.mean(axis=0))
    print(f"Mean Centroid Distance (Run {run_idx}): {distances_a_to_b:.2f}")

    # Per-dimension standard deviation of each cluster
    cluster_a_std = np.std(members_a, axis=0)
    cluster_b_std = np.std(members_b, axis=0)
    print(f"Cluster {cluster_a} Std Dev: {cluster_a_std}")
    print(f"Cluster {cluster_b} Std Dev: {cluster_b_std}")
    print("\n")

**Cluster 1 and Cluster 8** have a moderate spatial relationship with visible overlap in the UMAP space. Their differing variability patterns suggest distinct structures, but the overlap points might represent shared features or transitions between the clusters.
The large spatial separation between their centroids suggests they represent distinct structures or classes in the data.

**Cluster 0 and Cluster 9**: Cluster 9 appears more compact and stable, while Cluster 0 is larger and more variable.
Their distinct regions in the UMAP space and differing standard deviations reinforce their meaningful separation.
Insights from Variability:

The variability of Cluster 0 could indicate sensitivity to UMAP parameters or noise in the data.

In [None]:
def examine_mnist_overlap(umap_projection, kmeans_labels, mnist_labels, cluster_pair, run_idx):
    """Report and plot the points shared by two clusters of one projection.

    Args:
        umap_projection: 2-column coordinates indexed as ``[rows, 0]`` / ``[rows, 1]``.
        kmeans_labels: Per-point cluster labels, compared elementwise with the cluster ids.
        mnist_labels: Original labels, indexed by point position.
        cluster_pair: Two cluster ids ``(cluster_a, cluster_b)``.
        run_idx: Run number, used only in the printed text and the plot title.

    NOTE(review): both index sets are derived from the same label array, so for
    two different cluster ids they cannot share an index and the "overlap" is
    always empty. A geometric overlap criterion may have been intended - confirm.
    """
    cluster_a, cluster_b = cluster_pair

    # Row indices of the members of each cluster
    points_a_indices = np.nonzero(kmeans_labels == cluster_a)[0]
    points_b_indices = np.nonzero(kmeans_labels == cluster_b)[0]

    # Indices present in both clusters, and the original labels at those indices
    overlap_indices = np.intersect1d(points_a_indices, points_b_indices)
    overlap_label_counts = pd.Series(np.asarray(mnist_labels)[overlap_indices]).value_counts()

    # Text summary
    print(f"Overlap between Cluster {cluster_a} and Cluster {cluster_b} (Run {run_idx}):")
    print(f"Number of overlapping points: {len(overlap_indices)}")
    print(f"Original label distribution of overlapping points:\n{overlap_label_counts}")

    # Scatter both clusters, then highlight the shared points (if any) on top
    plt.figure(figsize=(8, 6))
    layers = (
        (points_a_indices, "blue", f"Cluster {cluster_a}"),
        (points_b_indices, "orange", f"Cluster {cluster_b}"),
    )
    for member_indices, colour, caption in layers:
        plt.scatter(
            umap_projection[member_indices, 0],
            umap_projection[member_indices, 1],
            color=colour,
            label=caption,
            alpha=0.5,
        )
    if len(overlap_indices) > 0:
        plt.scatter(
            umap_projection[overlap_indices, 0],
            umap_projection[overlap_indices, 1],
            color="red",
            label="Overlap",
            alpha=0.7,
        )
    plt.title(f"Cluster Overlap: Cluster {cluster_a} vs. Cluster {cluster_b} (Run {run_idx})")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.legend()
    plt.tight_layout()
    plt.show()

# Example usage:
# Load the MNIST data so the original digit labels are available
dataloader = MnistDataloader(
    training_images_filepath="train-images.idx3-ubyte",
    training_labels_filepath="train-labels.idx1-ubyte",
    test_images_filepath="t10k-images.idx3-ubyte",
    test_labels_filepath="t10k-labels.idx1-ubyte"
)

# Load data
(x_train, y_train), (x_test, y_test) = dataloader.load_data()

# Flatten the training images for UMAP (if needed for alignment with projections)
x_train_flattened = np.array([np.array(img).flatten() for img in x_train])

# Example variables (replace these with your actual data)
run_idx = 0  # Analyze the first UMAP run
cluster_pair = (1, 8)  # Compare Cluster 1 and Cluster 8
umap_projection = umap_projections[run_idx]  # UMAP projection for the given run
# Bind the selected run's labels to a separate name: assigning back to
# `kmeans_labels` would replace the per-run collection with a single run, so
# re-running this cell (or any cell indexing `kmeans_labels[run_idx]`) would
# then index into one run's labels instead of selecting a run.
kmeans_labels_run = kmeans_labels[run_idx]  # KMeans labels for the given run

# Examine overlap
examine_mnist_overlap(umap_projection, kmeans_labels_run, y_train, cluster_pair, run_idx)

#### Intra class evaluation

In [None]:
# Intra-cluster metrics with a FIXED radius, for every run in `umap_projections_10`
n_clusters = 10
radius = 0.5  # neighbourhood radius around each cluster mean, in projection units

# Per-run accumulators: max pairwise distance, neighbour count and KMeans labels
max_distances_10_0_2 = []
neighbor_counts_10_0_2 = []
kmeans_labels_list_10_0_2 = []

for run_idx, x_umap in enumerate(umap_projections_10):
    # Re-run KMeans on this projection to recover per-point labels
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_10_0_2 = kmeans.labels_

    # Store the labels for this run
    kmeans_labels_list_10_0_2.append(kmeans_labels_10_0_2)

    run_max_distances_10_0_2 = []
    run_neighbor_counts_10_0_2 = []

    for cluster_idx in range(n_clusters):
        # All points assigned to the current cluster
        cluster_points = x_umap[kmeans_labels_10_0_2 == cluster_idx]

        # Full pairwise distance matrix inside the cluster
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')

        # Largest pairwise distance; defined as 0 for clusters with fewer than two points
        max_distance_10_0_2 = np.max(intra_distances) if len(cluster_points) > 1 else 0
        run_max_distances_10_0_2.append(max_distance_10_0_2)

        # Number of members lying within `radius` of the cluster mean
        centroid = np.mean(cluster_points, axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_10_0_2 = np.sum(distances_to_centroid <= radius)

        run_neighbor_counts_10_0_2.append(neighbors_within_radius_10_0_2)

    # Append results for this run
    max_distances_10_0_2.append(run_max_distances_10_0_2)
    neighbor_counts_10_0_2.append(run_neighbor_counts_10_0_2)

# Convert lists to numpy arrays for easier analysis if needed
max_distances_10_0_2 = np.array(max_distances_10_0_2)  # Shape: (n_runs, n_clusters)
neighbor_counts_10_0_2 = np.array(neighbor_counts_10_0_2)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_10_0_2 = np.array(kmeans_labels_list_10_0_2)  # Shape: (n_runs, n_samples)

# Save the max distances, neighbor counts, and KMeans labels arrays.
# The labels filename previously contained a stray space before ".npy", which
# did not match the name the notebook later loads.
np.save('max_intra_cluster_distances_10_0_2.npy', max_distances_10_0_2)
np.save('neighbor_counts_within_radius_10_0_2.npy', neighbor_counts_10_0_2)
np.save('kmeans_labels_list_10_0_2.npy', kmeans_labels_list_10_0_2)

# Output the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_10_0_2)
print("\nNeighbor counts within radius for each run and each cluster:\n", neighbor_counts_10_0_2)
print("\nKMeans labels saved successfully.")

In [None]:
# Reload the saved dynamic-radius metrics and KMeans labels from disk
(
    max_distances_10_0_2_d,
    neighbor_counts_10_0_2_d,
    kmeans_labels_list_10_0_2_d,
) = (
    np.load(saved_file)
    for saved_file in (
        'max_intra_cluster_distances_dynamic_10_0_2.npy',
        'neighbor_counts_within_dynamic_radius_10_0_2.npy',
        'kmeans_labels_list_10_0_2.npy',
    )
)

In [None]:
# Running record of the closest centroid pair seen over all runs
overall_min_distance_10 = float('inf')
min_distance_clusters_10 = None
min_distance_run_idx_10 = None

for run_idx, run_centroids in enumerate(kmeans_centroids_10):
    # Centroid-to-centroid distances for this run
    pairwise_distances_10 = cdist(run_centroids, run_centroids, metric='euclidean')

    # Mask the diagonal so a centroid is never compared with itself
    np.fill_diagonal(pairwise_distances_10, np.inf)
    min_distance = pairwise_distances_10.min()

    if min_distance < overall_min_distance_10:
        # New overall minimum: remember its value, the cluster pair and the run
        overall_min_distance_10 = min_distance
        cluster_indices = np.unravel_index(pairwise_distances_10.argmin(), pairwise_distances_10.shape)
        min_distance_clusters_10 = cluster_indices
        min_distance_run_idx_10 = run_idx

# The dynamic radius is half of the smallest centroid separation
dynamic_radius_10_0_2 = overall_min_distance_10 / 2
for report_line in (
    f"Dynamic radius: {dynamic_radius_10_0_2}",
    f"Minimum distance: {overall_min_distance_10}",
    f"Clusters contributing to minimum distance: {min_distance_clusters_10}",
    f"Run index: {min_distance_run_idx_10}",
):
    print(report_line)

# Persist the radius for later cells
np.save('dynamic_radius_results_10_0_2.npy', dynamic_radius_10_0_2)


In [None]:
# Intra-cluster metrics using the previously computed dynamic radius
n_clusters = 10
radius = dynamic_radius_10_0_2

# Per-run accumulators: max pairwise distance, neighbour count and KMeans labels
max_distances_10_0_2_d = []
neighbor_counts_10_0_2_d = []
kmeans_labels_list_10_0_2_d = []

for run_idx, x_umap in enumerate(umap_projections_10):
    # Cluster this projection again to recover per-point labels
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(x_umap)
    kmeans_labels_10_0_2_d = kmeans.labels_
    kmeans_labels_list_10_0_2_d.append(kmeans_labels_10_0_2_d)

    run_max_distances_10_0_2_d = []
    run_neighbor_counts_10_0_2_d = []

    for cluster_idx in range(n_clusters):
        # Members of the current cluster
        cluster_points = x_umap[kmeans_labels_10_0_2_d == cluster_idx]

        # Pairwise distances between all members
        intra_distances = cdist(cluster_points, cluster_points, metric='euclidean')

        # Largest pairwise distance, or 0 when the cluster has fewer than two points
        if len(cluster_points) > 1:
            max_distance_10_0_2_d = intra_distances.max()
        else:
            max_distance_10_0_2_d = 0
        run_max_distances_10_0_2_d.append(max_distance_10_0_2_d)

        # How many members fall within `radius` of the cluster mean
        centroid = cluster_points.mean(axis=0)
        distances_to_centroid = np.linalg.norm(cluster_points - centroid, axis=1)
        neighbors_within_radius_10_0_2_d = (distances_to_centroid <= radius).sum()
        run_neighbor_counts_10_0_2_d.append(neighbors_within_radius_10_0_2_d)

    # Record this run's per-cluster results
    max_distances_10_0_2_d.append(run_max_distances_10_0_2_d)
    neighbor_counts_10_0_2_d.append(run_neighbor_counts_10_0_2_d)

# Stack the per-run lists into arrays
max_distances_10_0_2_d = np.array(max_distances_10_0_2_d)  # Shape: (n_runs, n_clusters)
neighbor_counts_10_0_2_d = np.array(neighbor_counts_10_0_2_d)  # Shape: (n_runs, n_clusters)
kmeans_labels_list_10_0_2_d = np.array(kmeans_labels_list_10_0_2_d)  # Shape: (n_runs, n_samples)

# Write the three arrays to disk
np.save('max_intra_cluster_distances_dynamic_10_0_2.npy', max_distances_10_0_2_d)
np.save('neighbor_counts_within_dynamic_radius_10_0_2.npy', neighbor_counts_10_0_2_d)
np.save('kmeans_labels_list_10_0_2.npy', kmeans_labels_list_10_0_2_d)

# Echo the results
print("Max intra-cluster distances for each run and each cluster:\n", max_distances_10_0_2_d)
print("\nNeighbor counts within dynamic radius for each run and each cluster:\n", neighbor_counts_10_0_2_d)
print("\nKMeans labels saved successfully.")

In [None]:
# One panel per cluster showing its neighbour count for every run
plt.figure(figsize=(16, 12))

run_numbers_10_d = range(1, neighbor_counts_10_0_2_d.shape[0] + 1)
run_ticks_10_d = range(1, neighbor_counts_10_0_2_d.shape[0] + 1, 5)  # every 5th run, for legibility

# Transposing yields one row of per-run counts per cluster
for cluster_idx, counts_for_cluster in enumerate(neighbor_counts_10_0_2_d.T):
    plt.subplot(2, 5, cluster_idx + 1)  # 2 x 5 grid, i.e. room for 10 clusters
    plt.plot(run_numbers_10_d, counts_for_cluster, marker='o')
    plt.title(f'Cluster {cluster_idx}')
    plt.xlabel('Run')
    plt.ylabel('Neighbor Count')
    plt.xticks(run_ticks_10_d)
    plt.grid(True)

# Tighten the layout, add the global title, then render
plt.tight_layout()
plt.suptitle('N=10 Neighbor Counts per Cluster Across Runs', y=1.02, fontsize=16)
plt.show()

In [None]:
# Summarise neighbour counts over clusters for each run
mean_neighbors_10 = np.mean(neighbor_counts_10_0_2_d, axis=1)  # Shape: (n_runs,)
max_neighbors_10 = np.max(neighbor_counts_10_0_2_d, axis=1)    # Shape: (n_runs,)

# Least-squares trend lines for the mean and max series (runs numbered from 1)
runs = np.arange(1, len(mean_neighbors_10) + 1)
mean_fit_10 = linregress(runs, mean_neighbors_10)
max_fit_10 = linregress(runs, max_neighbors_10)
mean_slope, mean_intercept = mean_fit_10.slope, mean_fit_10.intercept
max_slope, max_intercept = max_fit_10.slope, max_fit_10.intercept

# Evaluate the fitted lines at every run
mean_trend_10 = mean_slope * runs + mean_intercept
max_trend_10 = max_slope * runs + max_intercept

# Plot the results
plt.figure(figsize=(12, 6))

# Mean neighbor counts
plt.plot(runs, mean_neighbors_10, label='Mean Neighbor Count', marker='o', color='blue')

# Max neighbor counts
plt.plot(runs, max_neighbors_10, label='Max Neighbor Count', marker='s', color='orange')

# Trend lines: distinct colours, because both were previously dashed green and
# could not be told apart in the plot or its legend
plt.plot(runs, mean_trend_10, linestyle='--', color='green', label='Mean Trend Line')
plt.plot(runs, max_trend_10, linestyle='--', color='red', label='Max Trend Line')

# Add labels, legend, and title
plt.title('N=10 Neighbor Counts Across Runs (Mean vs. Max with Trend Lines)', fontsize=16)
plt.xlabel('Run', fontsize=12)
plt.ylabel('Neighbor Count', fontsize=12)
plt.xticks(range(1, len(mean_neighbors_10) + 1, 5))  # Show every 5th run for readability
plt.legend()
plt.grid(True)
plt.tight_layout()

# Save the plot to a file (before show(), while the figure is still current)
plt.savefig('neighbor_counts_plot_n_10_0_2.png', dpi=300)

# Show the plot
plt.show()

-----------

-------

## Epochs Trials

- n_epochs=200: Faster but potentially less precise.
- n_epochs=500: Close to default, balanced performance (used for the previous analyses).
- n_epochs=1000: Slower but potentially more precise.

#### n_epochs= 200

In [None]:
# Experiment configuration: repeated UMAP + KMeans runs with n_epochs=200
n_runs = 35
n_clusters = 10  # Set the number of clusters (for KMeans)
min_dist = 0.1
n_neighbors = 5
n_components = 2
n_epochs = 200

# Store UMAP and KMeans results for each run
umap_projections_5_01_2_200 = []
kmeans_centroids_list_5_01_2_200 = []  # Use this to store centroids for each run

# Define a helper function to calculate the centroid of each cluster
def calculate_centroids(kmeans, x_umap):
    """Return the mean coordinate of every cluster as an array.

    Args:
        kmeans: Fitted model exposing ``labels_`` (one label per row of ``x_umap``).
        x_umap: Projected points, indexable with a boolean mask.

    Returns:
        Array with one row per cluster id in ``range(n_clusters)``; ``n_clusters``
        is read from the enclosing module scope.
    """
    return np.array([
        np.mean(x_umap[kmeans.labels_ == i], axis=0)
        for i in range(n_clusters)
    ])

# Run UMAP and KMeans multiple times
for run in range(n_runs):
    # Same parameters every run; random_state=None keeps each embedding stochastic
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, n_epochs=n_epochs, random_state=None)
    x_train_umap_5_01_2_200 = umap_model.fit_transform(x_train_flattened)

    # Apply KMeans clustering on the UMAP projection
    kmeans_5_01_2_200 = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans_5_01_2_200.fit(x_train_umap_5_01_2_200)

    # Centroids must come from the model fitted in THIS iteration; the global
    # `kmeans` used before belonged to an unrelated, earlier fit
    centroids_5_01_2_200 = calculate_centroids(kmeans_5_01_2_200, x_train_umap_5_01_2_200)

    # Store results in this experiment's own lists; appending to the shared
    # `umap_projections` left the `_200` list empty and saved an empty array
    umap_projections_5_01_2_200.append(x_train_umap_5_01_2_200)
    kmeans_centroids_list_5_01_2_200.append(centroids_5_01_2_200)

# Now we calculate the mean and standard deviation of the centroids across all runs
kmeans_centroids_5_01_2_200 = np.array(kmeans_centroids_list_5_01_2_200)  # Shape: (n_runs, n_clusters, 2)

# Calculate mean and std deviation for centroids' coordinates
centroid_mean_5_01_2_200 = np.mean(kmeans_centroids_5_01_2_200, axis=0)
centroid_std_5_01_2_200 = np.std(kmeans_centroids_5_01_2_200, axis=0)

# Save the UMAP projections and KMeans centroids (both filenames now share the
# same `..._{n_components}_{n_epochs}` pattern; the underscore was missing)
np.save(f'umap_projections_neighbors_{n_neighbors}_{min_dist}_{n_components}_{n_epochs}.npy', np.array(umap_projections_5_01_2_200))
np.save(f'kmeans_centroids_neighbors_{n_neighbors}_{min_dist}_{n_components}_{n_epochs}.npy', np.array(kmeans_centroids_list_5_01_2_200))

In [None]:
# Create a table to store the results: Trial | Cluster | Centroid Coord | Inside the 90%
# All statistics are taken from the 200-epoch experiment (`*_5_01_2_200`); the
# generic `kmeans_centroids` / `centroid_mean` / `centroid_std` names used
# before belong to other experiments and may not match these runs.
result_table_200epochs = []

for run, run_centroids in enumerate(kmeans_centroids_5_01_2_200):
    for i, centroid in enumerate(run_centroids):
        mean = centroid_mean_5_01_2_200[i]
        std = centroid_std_5_01_2_200[i]
        inside_90 = is_within_90_percent(centroid, mean, std)
        # Trials are numbered from 1 in the table
        result_table_200epochs.append([run + 1, i, centroid, inside_90])

# Convert result_table to a DataFrame for better readability
df_results_200epochs = pd.DataFrame(result_table_200epochs, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside the 90%'])

# Display the DataFrame using standard Pandas functions
print(df_results_200epochs)

# Print mean and standard deviation for clarity
print("Centroid Means (per cluster):\n", centroid_mean_5_01_2_200)
print("Centroid Standard Deviations (per cluster):\n", centroid_std_5_01_2_200)

#### n_epochs=1000

In [None]:
# Experiment configuration: repeated UMAP + KMeans runs with n_epochs=1000
n_runs = 10
n_clusters = 10  # number of KMeans clusters

# Per-run collections of projections and cluster centroids
umap_projections = []
kmeans_centroids_list = []


def calculate_centroids(kmeans, x_umap):
    """Return one mean coordinate per cluster id in ``range(n_clusters)``.

    ``kmeans`` must expose ``labels_`` aligned with the rows of ``x_umap``;
    ``n_clusters`` is read from module scope.
    """
    per_cluster_means = [
        np.mean(x_umap[kmeans.labels_ == i], axis=0)
        for i in range(n_clusters)
    ]
    return np.array(per_cluster_means)


for run in range(n_runs):
    # Identical parameters each time; random_state=None leaves UMAP stochastic
    umap_model = umap.UMAP(n_neighbors=5, min_dist=0.1, n_components=2, n_epochs=1000, random_state=None)
    x_train_umap = umap_model.fit_transform(x_train_flattened)

    # Cluster the projection (fit returns the fitted estimator itself)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(x_train_umap)

    # Cluster means for this run
    centroids = calculate_centroids(kmeans, x_train_umap)

    # Keep the projection and its centroids
    umap_projections.append(x_train_umap)
    kmeans_centroids_list.append(centroids)

# Stack to (n_runs, n_clusters, 2) and take per-cluster statistics over the runs
kmeans_centroids = np.array(kmeans_centroids_list)
centroid_mean = kmeans_centroids.mean(axis=0)
centroid_std = kmeans_centroids.std(axis=0)

# Membership test for the mean +/- 1.645 * std band (the 90% interval)
def is_within_90_percent(centroid, mean, std):
    """Return whether every coordinate of ``centroid`` lies in ``mean +/- 1.645 * std``.

    The bounds are inclusive and are checked coordinate by coordinate; the
    result is the conjunction over all coordinates.
    """
    margin = 1.645 * std
    inside = (mean - margin <= centroid) & (centroid <= mean + margin)
    return np.all(inside)

In [None]:
# Build the rows Trial | Cluster | Centroid Coord | Inside the 90% in one pass
result_table_1000epochs = [
    [
        run + 1,  # trials are numbered from 1
        i,
        kmeans_centroids[run][i],
        is_within_90_percent(kmeans_centroids[run][i], centroid_mean[i], centroid_std[i]),
    ]
    for run in range(n_runs)
    for i in range(n_clusters)
]

# Tabulate for readability
result_columns_1000epochs = ['Trial', 'Cluster', 'Centroid Coord', 'Inside the 90%']
df_results_1000epochs = pd.DataFrame(result_table_1000epochs, columns=result_columns_1000epochs)

# Show the table followed by the per-cluster statistics it was built from
print(df_results_1000epochs)
print("Centroid Means (per cluster):\n", centroid_mean)
print("Centroid Standard Deviations (per cluster):\n", centroid_std)

In [None]:
# Trials in which at least one centroid falls outside the 90% band
outside_band_mask = ~df_results_1000epochs['Inside the 90%']
runs_to_remove = df_results_1000epochs.loc[outside_band_mask, 'Trial'].unique()

# Keep only the rows belonging to trials that were not flagged
kept_rows_mask = ~df_results_1000epochs['Trial'].isin(runs_to_remove)
df_filtered_results = df_results_1000epochs[kept_rows_mask]

# Report which trials were dropped and which remain
print(f"Runs removed: {runs_to_remove}")
print(f"Remaining runs after filtering: {df_filtered_results['Trial'].unique()}")

--------------

### Silhouette score analysis

Evaluation of how well UMAP clusters the digits. The silhouette score measures how similar each point is to its own cluster compared to other clusters. 
Higher values indicate better clustering.
 - Different 'n_neighbors' and 'min_dist' can maximize the score.

In [None]:
# # Silhouette score analysis
# umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
# x_train_umap = umap_model.fit_transform(x_train_flattened)

# # Calculate silhouette score (labels should be the ground truth digit labels)
# score = silhouette_score(x_train_umap, y_train)
# print(f'Silhouette Score: {score}')

-----------

### UMAP - PCA - TSNE Comparison

In [9]:
# # Apply PCA
# pca_model = PCA(n_components=2)
# x_train_pca = pca_model.fit_transform(x_train_flattened)

In [10]:
# # Apply t-SNE
# tsne_model = TSNE(n_components=2, random_state=42)
# x_train_tsne = tsne_model.fit_transform(x_train_flattened)

In [None]:
# # Plot PCA
# plt.figure(figsize=(10, 5))
# plt.subplot(1, 2, 1)
# plt.scatter(x_train_pca[:, 0], x_train_pca[:, 1], c=y_train, cmap='Spectral', s=0.1)
# plt.title('PCA projection of MNIST data')

# # Plot t-SNE
# plt.subplot(1, 2, 2)
# plt.scatter(x_train_tsne[:, 0], x_train_tsne[:, 1], c=y_train, cmap='Spectral', s=0.1)
# plt.title('t-SNE projection of MNIST data')

# plt.show()

In [None]:
# # PCA silhouette score (labels should be the ground truth digit labels)
# score = silhouette_score(x_train_pca, y_train)
# print(f'Silhouette Score: {score}')

In [None]:
# # TSNE silhouette score (labels should be the ground truth digit labels)
# score = silhouette_score(x_train_tsne, y_train)
# print(f'Silhouette Score: {score}')

In [None]:
# # Silhouette score analysis
# umap_model = umap.UMAP(n_neighbors=15, min_dist=0.2, n_components=2, random_state=42)
# x_train_umap = umap_model.fit_transform(x_train_flattened)

# # Calculate silhouette score (labels should be the ground truth digit labels)
# score = silhouette_score(x_train_umap, y_train)
# print(f'Silhouette Score: {score}')

### Kmeans: Cluster centers

In [None]:
# Number of clusters (digits in MNIST)
n_clusters = 10

# KMeans clustering on the UMAP projection.
# NOTE(review): `x_train_umap` is not assigned in the (commented-out) cells just
# above, so this relies on a projection left over from an earlier cell - confirm
# which one before interpreting the clusters.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(x_train_umap)

In [13]:
# Pull the cluster centres and the per-point cluster assignments off the fitted model
centroids, cluster_labels = kmeans.cluster_centers_, kmeans.labels_

In [14]:
# Per-cluster spread: standard deviation of the member coordinates along each axis
cluster_spreads = []
for cluster_idx in range(n_clusters):
    # Points assigned to this cluster
    cluster_points = x_train_umap[cluster_labels == cluster_idx]

    # Standard deviation of the first and second projection coordinate
    spread_x, spread_y = np.std(cluster_points[:, 0]), np.std(cluster_points[:, 1])

    cluster_spreads.append(dict(cluster=cluster_idx, spread_x=spread_x, spread_y=spread_y))

In [None]:
# Tabulate the per-cluster spreads
spread_df = pd.DataFrame(cluster_spreads)

# Print the centroids first, then the spread table
for heading, payload in (
    ("Cluster centroids:\n", centroids),
    ("\nCluster spreads:\n", spread_df),
):
    print(heading, payload)

In [29]:
from scipy.spatial.distance import cdist

# Euclidean distance between every pair of centroids, rounded to 3 decimals
distances = cdist(centroids, centroids, metric='euclidean')
distances_rounded = distances.round(3)

# Labelled table of the rounded distances (one column per centroid)
centroid_column_names = ['Centroid {}'.format(i) for i in range(n_clusters)]
distance_df = pd.DataFrame(distances_rounded, columns=centroid_column_names)

# Persist the rounded matrix in npy format
np.save('centroid_distance_matrix.npy', distances_rounded)


In [None]:
from tabulate import tabulate

# Render the centroid distance table as text, then print it
distance_table_text = tabulate(distance_df, headers='keys', tablefmt='pretty')
print(distance_table_text)

In [None]:
# Expects `x_train_umap` (the projection) and `kmeans` (the fitted model) to exist
plt.figure(figsize=(10, 8))

# Projected points, coloured by their KMeans cluster label
projected_x = x_train_umap[:, 0]
projected_y = x_train_umap[:, 1]
plt.scatter(projected_x, projected_y, c=kmeans.labels_, cmap='Spectral', s=0.1, alpha=0.5)

# Cluster centres drawn on top as large red crosses
centroids = kmeans.cluster_centers_
centroid_style = dict(c='red', s=200, marker='x', label='Centroids')
plt.scatter(centroids[:, 0], centroids[:, 1], **centroid_style)

# Title, axis labels and legend
plt.title('UMAP Clusters with Centroids')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.legend()

# Render
plt.show()

In [None]:
# Number of repeated UMAP + KMeans runs (n_clusters is reused from an earlier cell)
n_runs = 5
#n_clusters = 10  # Set the number of clusters (for KMeans)

# Per-run projections and fitted KMeans models
umap_projections = []
kmeans_models = []

for run in range(n_runs):
    # Identical parameters each time; random_state=None leaves UMAP stochastic
    umap_model = umap.UMAP(n_neighbors=5, min_dist=0.1, n_components=2, random_state=None)
    x_train_umap = umap_model.fit_transform(x_train_flattened)

    # Cluster the projection (fit returns the fitted estimator itself)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(x_train_umap)

    umap_projections.append(x_train_umap)
    kmeans_models.append(kmeans)

# One panel per run: points coloured by cluster label, centres as red crosses
fig, axes = plt.subplots(1, n_runs, figsize=(20, 5))

for i, (panel, run_projection, run_model) in enumerate(zip(axes, umap_projections, kmeans_models)):
    panel.scatter(run_projection[:, 0], run_projection[:, 1], c=run_model.labels_, cmap='Spectral', s=0.1, alpha=0.5)
    run_centres = run_model.cluster_centers_
    panel.scatter(run_centres[:, 0], run_centres[:, 1], c='red', s=200, marker='x')
    panel.set_title(f'Run {i+1}')
    panel.axis('off')

plt.suptitle('UMAP Projections and KMeans Centroids (5 Runs)', fontsize=16)
plt.show()


----------

### Min Dist Variation

In [None]:
### min_dist Variation
# Controls how tightly UMAP packs points together. Lower values lead to more
# compact clusters, while higher values spread the clusters apart.
# (This explanation used to be a bare, uncommented line inside the code cell,
# which is a SyntaxError and stopped the cell from running.)
min_dist_values = [0.01, 0.1, 0.2]

# One panel per min_dist value
fig, axes = plt.subplots(1, len(min_dist_values), figsize=(20, 3))

for idx, min_dist in enumerate(min_dist_values):
    # Fixed seed so that only min_dist differs between the panels
    umap_model = umap.UMAP(n_neighbors=15, min_dist=min_dist, n_components=2, random_state=42)
    x_train_umap = umap_model.fit_transform(x_train_flattened)
    scatter = axes[idx].scatter(x_train_umap[:, 0], x_train_umap[:, 1], c=y_train, cmap='Spectral', s=0.1)
    axes[idx].set_title(f'min_dist = {min_dist}')
    axes[idx].axis('off')

# Shared colour bar, built from the last scatter (all panels use the same labels and cmap)
fig.colorbar(scatter, ax=axes, orientation='vertical', fraction=0.02, pad=0.04)
plt.suptitle('UMAP with Different min_dist Values', fontsize=16)
plt.show()

## Procrustes analysis

In [None]:
from scipy.spatial import procrustes

In [None]:
# Load the saved UMAP projections and KMeans centroids for n_neighbors = 5, 10 and 25.
# NOTE: allow_pickle=True unpickles file contents; only use it on files produced
# by this notebook, never on untrusted .npy files.
umap_projections_5 = np.load('umap_projections_neighbors_5.npy', allow_pickle=True)
kmeans_centroids_5 = np.load('kmeans_centroids_neighbors_5.npy', allow_pickle=True)

umap_projections_10 = np.load('umap_projections_neighbors_10.npy', allow_pickle=True)
kmeans_centroids_10 = np.load('kmeans_centroids_neighbors_10.npy', allow_pickle=True)

umap_projections_25 = np.load('umap_projections_neighbors_25.npy', allow_pickle=True)
# The centroid file must match the n_neighbors=25 projections loaded above; the
# previous name ('..._neighbors_103.npy') paired them with a different file.
kmeans_centroids_25 = np.load('kmeans_centroids_neighbors_25.npy', allow_pickle=True)

In [None]:
# Procrustes disparity between two embeddings
def procrustes_analysis(embedding1, embedding2):
    """Return the disparity reported by ``scipy.spatial.procrustes`` for the two inputs.

    Only the third element of the result (the disparity) is kept; the two
    transformed matrices are discarded.
    """
    return procrustes(embedding1, embedding2)[2]

# Compare the first saved run of each n_neighbors setting, pair by pair
first_run_by_neighbors = {
    5: umap_projections_5[0],
    10: umap_projections_10[0],
    25: umap_projections_25[0],
}
disparity_5_10 = procrustes_analysis(first_run_by_neighbors[5], first_run_by_neighbors[10])
disparity_10_25 = procrustes_analysis(first_run_by_neighbors[10], first_run_by_neighbors[25])
disparity_5_25 = procrustes_analysis(first_run_by_neighbors[5], first_run_by_neighbors[25])

for (neighbors_a, neighbors_b), pair_disparity in (
    ((5, 10), disparity_5_10),
    ((10, 25), disparity_10_25),
    ((5, 25), disparity_5_25),
):
    print(f"Procrustes distance between n_neighbors={neighbors_a} and n_neighbors={neighbors_b}: {pair_disparity}")


# UMAP for Emotions

In [3]:
import os
import zipfile
from PIL import Image
from sklearn.preprocessing import LabelEncoder

Load images and labels

In [4]:
### YES ###

class EmotionsDataloader(object):
    """Load aligned face images and their labels from a partition/label list.

    Each line of the label file is expected to hold two whitespace-separated
    tokens, ``<image_name> <integer label>``; other lines are ignored. The
    image actually read is ``<stem>_aligned.jpg`` inside ``image_dir``, where
    ``<stem>`` is the part of ``image_name`` before its first dot.
    """

    def __init__(self, image_dir, label_file):
        # Directory holding the *_aligned.jpg files
        self.image_dir = image_dir
        # Text file with one "<image_name> <label>" pair per line
        self.label_file = label_file

    def read_images_labels(self):
        """Read every listed image and split the result into train and test sets.

        Images are converted to grayscale and resized to 48x48. An entry goes to
        the training set when its name contains "train", otherwise to the test
        set when it contains "test"; entries matching neither are dropped.
        Missing image files are reported on stdout and skipped.

        Returns:
            ``((train_images, train_labels), (test_images, test_labels))`` as
            NumPy arrays.

        Raises:
            ValueError: If the second token of a two-token line is not an integer.
        """
        train_images, train_labels = [], []
        test_images, test_labels = [], []

        # Read the label file and match labels to images
        with open(self.label_file, 'r') as file:
            for line in file:
                parts = line.strip().split()
                if len(parts) != 2:
                    continue

                image_name, label = parts[0], int(parts[1])
                aligned_image_name = f"{image_name.split('.')[0]}_aligned.jpg"
                image_path = os.path.join(self.image_dir, aligned_image_name)

                if not os.path.exists(image_path):
                    print(f"Image not found: {aligned_image_name}")
                    continue

                # The context manager closes the underlying file once the pixels
                # are copied out; the handle previously stayed open until
                # garbage collection, for every image in the dataset.
                with Image.open(image_path) as source_image:
                    pixels = np.array(source_image.convert('L').resize((48, 48)))

                # Route the sample by the partition encoded in its name
                if "train" in image_name:
                    train_images.append(pixels)
                    train_labels.append(label)
                elif "test" in image_name:
                    test_images.append(pixels)
                    test_labels.append(label)

        print(f"Loaded {len(train_images)} training images and {len(test_images)} test images.")
        print(f"Loaded {len(train_labels)} training labels and {len(test_labels)} test labels.")

        return (
            (np.array(train_images), np.array(train_labels)),
            (np.array(test_images), np.array(test_labels))
        )

    def load_data(self):
        """Return the result of :meth:`read_images_labels`."""
        return self.read_images_labels()

In [None]:
### YES ###

# Locations of the aligned face images and of the partition/label list
input_path = 'C:/Users/Lorenzo/OneDrive/Documents/DTU/Python/2024 Fall/MSc Thesis'
image_dir = os.path.join(input_path, 'extracted_data/Image/aligned')
label_file = os.path.join(input_path, 'extracted_data/EmoLabel/list_patition_label.txt')

# Build the loader and read both partitions
emotions_dataloader = EmotionsDataloader(image_dir, label_file)
(x_train, y_train), (x_test, y_test) = emotions_dataloader.load_data()

# Report the array shapes of each partition
for partition_title, partition_x, partition_y in (
    ("Training", x_train, y_train),
    ("Testing", x_test, y_test),
):
    print(f"{partition_title} set shape: {partition_x.shape}, {partition_y.shape}")

# Display a grid of images with a title above each
def show_images(images, title_texts):
    """Draw ``images`` in a 5-column grid, titled with the matching ``title_texts``.

    Images and titles are paired positionally with ``zip``; surplus entries on
    either side are ignored. Each image is rendered in grayscale with its axes
    hidden.
    """
    cols = 5
    # Ceiling division: exactly as many rows as needed. The previous
    # int(len / cols) + 1 added an empty row whenever the count was a multiple
    # of `cols` (e.g. 15 images -> 4 rows), shrinking every image.
    rows = -(-len(images) // cols)
    plt.figure(figsize=(15, 10))
    for i, (image, title) in enumerate(zip(images, title_texts)):
        plt.subplot(rows, cols, i + 1)
        plt.imshow(image, cmap=plt.cm.gray)
        plt.title(title, fontsize=12)
        plt.axis('off')

# Draw 10 random training samples followed by 5 random test samples and show them
images_to_show = []
titles_to_show = []

for partition_tag, partition_images, partition_labels, sample_count in (
    ("Train", x_train, y_train, 10),
    ("Test", x_test, y_test, 5),
):
    for sample_number in range(sample_count):
        idx = np.random.randint(0, len(partition_images))
        images_to_show.append(partition_images[idx])
        titles_to_show.append(f"{partition_tag}[{idx}] = {partition_labels[idx]}")

show_images(images_to_show, titles_to_show)

In [6]:
### YES ###

# Every image must have exactly one label, in both partitions
for checked_images, checked_labels, mismatch_message in (
    (x_train, y_train, "Mismatch in training images and labels!"),
    (x_test, y_test, "Mismatch in test images and labels!"),
):
    assert len(checked_images) == len(checked_labels), mismatch_message

In [None]:
### YES ###

from sklearn.preprocessing import StandardScaler

# Step 1: Flatten the Images into 1D Vectors (one row per image)
x_train_flattened = x_train.reshape(x_train.shape[0], -1)  # Flatten to (num_samples, 2304)
x_test_flattened = x_test.reshape(x_test.shape[0], -1)    # Flatten to (num_samples, 2304)

# Verify shapes
print("Shape of x_train_flattened:", x_train_flattened.shape)  # (num_train_samples, 2304)
print("Shape of x_test_flattened:", x_test_flattened.shape)    # (num_test_samples, 2304)

# Step 2: Normalize the Flattened Data
# The scaler is fitted on the training data only and then applied to the test
# data, so no test statistics leak into the transformation.
scaler = StandardScaler()
x_train_emotion_norm = scaler.fit_transform(x_train_flattened)
x_test_emotion_norm = scaler.transform(x_test_flattened)

# Verify normalization on the TRAINING matrix. The std check previously read
# the test matrix, so it did not verify what its label claims.
print("x_train_norma mean:", x_train_emotion_norm.mean(axis=0).mean())  # ~0
print("x_train_norma std:", x_train_emotion_norm.std(axis=0).mean())    # ~1

# Print final shapes
print("Final shape of x_train_norm:", x_train_emotion_norm.shape)
print("Final shape of x_test_norm:", x_test_emotion_norm.shape)

In [None]:
### YES ###

# Count NaN entries in each normalised matrix
for matrix_name, matrix_values in (
    ("x_train_emotion_norm", x_train_emotion_norm),
    ("x_test_emotion_norm", x_test_emotion_norm),
):
    print(f"Missing values in {matrix_name}: {np.isnan(matrix_values).sum()}")

-----------------

## Only UMAP

### Emotion as labels

In [None]:
### YES ###

# UMAP configuration for the emotion embedding (seeded)
emotion_umap_params = dict(n_neighbors=15, min_dist=0.01, n_components=2, random_state=42)
reducer = umap.UMAP(**emotion_umap_params)

# Learn the embedding on the training data
umap_emotion_train_embedding = reducer.fit_transform(x_train_emotion_norm)

# Project the test data with the already-fitted reducer
umap_emotion_test_embedding = reducer.transform(x_test_emotion_norm)

In [2]:
# Reload the cached test embedding from disk; this replaces any value of
# umap_emotion_test_embedding currently in memory.
# To refresh the cache, re-enable the save below.
# np.save("umap_15_01_emotion_test_embedding.npy", umap_emotion_test_embedding)
umap_emotion_test_embedding = np.load("umap_15_01_emotion_test_embedding.npy")


In [None]:
# Scatter plot of the 2D training embedding, coloured by emotion label.
plt.scatter(
    umap_emotion_train_embedding[:, 0],
    umap_emotion_train_embedding[:, 1],
    c=y_train,
    cmap='Spectral',
    s=5,
)
plt.colorbar(label="Emotion Label")
# Fix: the title used to say m_dis=0.1, but the reducer that produced this
# embedding is created with min_dist=0.01.
plt.title("UMAP n=15, m_dis=0.01 Projection of Training Data (2D)")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.show()

In [16]:
from sklearn.metrics import davies_bouldin_score

In [None]:
# Silhouette of the training embedding, grouped by the true emotion labels
sil_score = silhouette_score(X=umap_emotion_train_embedding, labels=y_train)
print(f"Silhouette Score: {sil_score:.2f}")

# Davies-Bouldin index on the same grouping (lower is better)
db_score = davies_bouldin_score(X=umap_emotion_train_embedding, labels=y_train)
print(f"Davies-Bouldin Index: {db_score:.2f}")

In [None]:
# Overlay the train and test embeddings in one figure.
# NOTE(review): this cell colours by y_train_emotion / y_test_emotion while
# the neighbouring cells use y_train / y_test — confirm both names exist.
plt.scatter(
    umap_emotion_train_embedding[:, 0],
    umap_emotion_train_embedding[:, 1],
    c=y_train_emotion,
    cmap='Spectral',
    s=5,
    alpha=0.7,
    label="Train",
)
# Test points are drawn larger with a black outline so they stand out.
plt.scatter(
    umap_emotion_test_embedding[:, 0],
    umap_emotion_test_embedding[:, 1],
    c=y_test_emotion,
    cmap='Spectral',
    s=20,
    edgecolor='k',
    label="Test",
)
plt.legend()
plt.title("UMAP Projection: Train vs Test")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.show()

-----------------

### Gender as labels

In [11]:
# Scale the gender-split image data by dividing every value by 255.0.
# NOTE(review): presumably the inputs are 8-bit pixel intensities, which
# would give a [0, 1] range — confirm. No StandardScaler is applied here,
# unlike the emotion features.
x_train_gender_norm = x_train_gender / 255.0
x_test_gender_norm = x_test_gender / 255.0

In [None]:
# Report the NaN count in the scaled gender data for each split.
print(f"Missing values in x_train_gender_norm: {np.isnan(x_train_gender_norm).sum()}")
print(f"Missing values in x_test_gender_norm: {np.isnan(x_test_gender_norm).sum()}")

In [None]:
# 2D UMAP with a fixed seed; note this rebinds the global `reducer`.
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.01,
    n_components=2,
    random_state=42,
)

# Learn the embedding on the scaled gender training data ...
# NOTE(review): assumes x_train_gender_norm is already 2D
# (samples x features) — confirm.
umap_gender_train_embedding = reducer.fit_transform(x_train_gender_norm)

# ... and project the test data through the fitted model.
umap_gender_test_embedding = reducer.transform(x_test_gender_norm)

In [None]:
# Scatter plot of the 2D gender training embedding, coloured by gender label.
plt.scatter(
    umap_gender_train_embedding[:, 0],
    umap_gender_train_embedding[:, 1],
    c=y_train_gender,
    cmap='Spectral',
    s=5,
)
plt.colorbar(label="gender Label")
plt.title("UMAP Gender Projection of Training Data (2D)")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.show()

In [15]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [None]:
# Silhouette of the training embedding, grouped by the true gender labels
sil_score = silhouette_score(X=umap_gender_train_embedding, labels=y_train_gender)
print(f"Silhouette Score: {sil_score:.2f}")

# Davies-Bouldin index on the same grouping (lower is better)
db_score = davies_bouldin_score(X=umap_gender_train_embedding, labels=y_train_gender)
print(f"Davies-Bouldin Index: {db_score:.2f}")

In [None]:
# Print how many training samples carry each gender label (class balance).
# NOTE: .value_counts() is called on y_train_gender, so it must be a
# pandas object.
print(y_train_gender.value_counts())

### UMAP 10 runs

#### Unsupervised

In [2]:
# Restore the cached unsupervised-UMAP results (training split):
# all per-run projections, plus their mean and standard deviation.
raf_unsup_umap_projections_train_10_01 = np.load("raf_unsup_umap_projections_train_10_01.npy")
raf_mean_unsup_umap_projection_train_10_01 = np.load("raf_mean_unsup_umap_projection_train_10_01.npy")
raf_std_unsup_umap_projection_train_10_01 = np.load("raf_std_unsup_umap_projection_train_10_01.npy")

# Same three arrays for the test split.
raf_unsup_umap_projections_test_10_01 = np.load("raf_unsup_umap_projections_test_10_01.npy")
raf_mean_unsup_umap_projection_test_10_01 = np.load("raf_mean_unsup_umap_projection_test_10_01.npy")
raf_std_unsup_umap_projection_test_10_01 = np.load("raf_std_unsup_umap_projection_test_10_01.npy")

In [None]:
# Hyper-parameters shared by every repetition
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 10  # number of differently-seeded embeddings to compute

# Per-run embeddings are collected here and stacked after the loop
raf_unsup_umap_projections_train_10_01 = []
raf_unsup_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # The run index doubles as the seed, so each repetition is reproducible
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Learn the embedding on the training features
    projection_train = umap_model.fit_transform(x_train_emotion_norm)
    raf_unsup_umap_projections_train_10_01.append(projection_train)

    # Map the test features through the model fitted just above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_emotion_norm)
    raf_unsup_umap_projections_test_10_01.append(projection_test)

# Stack the per-run results along a new leading (run) axis
raf_unsup_umap_projections_train_10_01 = np.stack(raf_unsup_umap_projections_train_10_01)
raf_unsup_umap_projections_test_10_01 = np.stack(raf_unsup_umap_projections_test_10_01)

# Element-wise mean / std over the run axis.
# NOTE(review): this assumes coordinates from differently-seeded runs are
# directly comparable — confirm.
raf_mean_unsup_umap_projection_train_10_01 = raf_unsup_umap_projections_train_10_01.mean(axis=0)
raf_std_unsup_umap_projection_train_10_01 = raf_unsup_umap_projections_train_10_01.std(axis=0)

raf_mean_unsup_umap_projection_test_10_01 = raf_unsup_umap_projections_test_10_01.mean(axis=0)
raf_std_unsup_umap_projection_test_10_01 = raf_unsup_umap_projections_test_10_01.std(axis=0)

# Persist the training-split results
np.save('raf_unsup_umap_projections_train_10_01.npy', raf_unsup_umap_projections_train_10_01)
np.save('raf_mean_unsup_umap_projection_train_10_01.npy', raf_mean_unsup_umap_projection_train_10_01)
np.save('raf_std_unsup_umap_projection_train_10_01.npy', raf_std_unsup_umap_projection_train_10_01)

# Persist the test-split results
np.save('raf_unsup_umap_projections_test_10_01.npy', raf_unsup_umap_projections_test_10_01)
np.save('raf_mean_unsup_umap_projection_test_10_01.npy', raf_mean_unsup_umap_projection_test_10_01)
np.save('raf_std_unsup_umap_projection_test_10_01.npy', raf_std_unsup_umap_projection_test_10_01)

print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [107]:
# Restore the cached unsupervised-UMAP results (training split):
# all per-run projections, plus their mean and standard deviation.
raf_unsup_umap_projections_train_10_01 = np.load("raf_unsup_umap_projections_train_10_01.npy")
raf_mean_unsup_umap_projection_train_10_01 = np.load("raf_mean_unsup_umap_projection_train_10_01.npy")
raf_std_unsup_umap_projection_train_10_01 = np.load("raf_std_unsup_umap_projection_train_10_01.npy")

# Same three arrays for the test split.
raf_unsup_umap_projections_test_10_01 = np.load("raf_unsup_umap_projections_test_10_01.npy")
raf_mean_unsup_umap_projection_test_10_01 = np.load("raf_mean_unsup_umap_projection_test_10_01.npy")
raf_std_unsup_umap_projection_test_10_01 = np.load("raf_std_unsup_umap_projection_test_10_01.npy")

In [None]:
# Discrete colormap with exactly 7 colours (one per emotion tick below).
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean (across runs) training projection, coloured by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_unsup_umap_projection_train_10_01[:, 0],
    raf_mean_unsup_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("Unsupervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar with one tick per label value 1..7
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Evaluate the mean unsupervised-UMAP projection.
# A 1-nearest-neighbour classifier is fitted once on the mean training
# projection and its test predictions are reused for both ARI and silhouette
# (previously the identical fit/predict was executed twice).
y_test_pred_raf_umap_unsup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_unsup_umap_projection_train_10_01, y_train)
    .predict(raf_mean_unsup_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_umap_unsup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_umap_unsup_10_01)
print(f"ARI: {ari_raf_umap_unsup_10_01:.4f}")

# Silhouette of the test projection, grouped by the predicted labels
silhouette_raf_umap_unsup_10_01 = silhouette_score(
    raf_mean_unsup_umap_projection_test_10_01,
    y_test_pred_raf_umap_unsup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_umap_unsup_10_01:.2f}")

# KMeans with as many clusters as there are distinct training labels
n_clusters = len(np.unique(y_train))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Cluster the mean training projection
predicted_labels = kmeans.fit_predict(raf_mean_unsup_umap_projection_train_10_01)

# Davies-Bouldin index of the KMeans partition (lower is better)
raf_mean_unsup_10_01_db_score = davies_bouldin_score(
    raf_mean_unsup_umap_projection_train_10_01,
    predicted_labels,
)

print(f"Davies-Bouldin Index: {raf_mean_unsup_10_01_db_score:.2f}")

In [None]:
# # Step 1: Load the saved mean UMAP projections
# mean_projection = np.load('raf_mean_unsup_umap_projection_train_10_01.npy')  # Shape: (n_samples, 2)

# # Step 2: Separate the mean projection by class
# classes = np.unique(y_train)
# class_gaussians = {}

# # Calculate the mean and covariance for each class
# for c in classes:
#     class_points = mean_projection[y_train == c]  # Filter by class
#     mean = np.mean(class_points, axis=0)
#     cov = np.cov(class_points, rowvar=False)
#     class_gaussians[c] = {"mean": mean, "cov": cov}

# # Step 3: Visualize Gaussian distributions
# plt.figure(figsize=(10, 8))

# # Plot UMAP embeddings for each class
# for c in classes:
#     class_points = mean_projection[y_train == c]
#     plt.scatter(class_points[:, 0], class_points[:, 1], label=f"Class {c}", alpha=0.5, s=10)

#     # Plot Gaussian contours
#     mean = class_gaussians[c]["mean"]
#     cov = class_gaussians[c]["cov"]
#     x, y = np.meshgrid(
#         np.linspace(mean[0] - 3, mean[0] + 3, 100), 
#         np.linspace(mean[1] - 3, mean[1] + 3, 100)
#     )
#     pos = np.dstack((x, y))
#     rv = multivariate_normal(mean, cov)
#     plt.contour(x, y, rv.pdf(pos), levels=5, alpha=0.8)

# plt.title("UMAP Mean Projections with Gaussian Distributions per Class")
# plt.xlabel("UMAP Component 1")
# plt.ylabel("UMAP Component 2")
# plt.legend()
# plt.show()

# # Step 4: Evaluate likelihood for a random point
# random_point = np.array([0, 0])  # Example point in UMAP space
# likelihoods = {c: multivariate_normal(class_gaussians[c]["mean"], class_gaussians[c]["cov"]).pdf(random_point)
#                for c in classes}

# print("Likelihoods for Random Point:", likelihoods)

#### Supervised

In [11]:
# Restore the cached supervised-UMAP results (training split):
# all per-run projections, plus their mean and standard deviation.
raf_sup_umap_projections_train_10_01 = np.load("raf_sup_umap_projections_train_10_01.npy")
raf_mean_sup_umap_projection_train_10_01 = np.load("raf_mean_sup_umap_projection_train_10_01.npy")
raf_std_sup_umap_projection_train_10_01 = np.load("raf_std_sup_umap_projection_train_10_01.npy")

# Same three arrays for the test split.
raf_sup_umap_projections_test_10_01 = np.load("raf_sup_umap_projections_test_10_01.npy")
raf_mean_sup_umap_projection_test_10_01 = np.load("raf_mean_sup_umap_projection_test_10_01.npy")
raf_std_sup_umap_projection_test_10_01 = np.load("raf_std_sup_umap_projection_test_10_01.npy")

In [None]:
# Hyper-parameters shared by every repetition
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 10  # number of differently-seeded embeddings to compute

# Per-run embeddings are collected here and stacked after the loop
raf_sup_umap_projections_train_10_01 = []
raf_sup_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running Supervised UMAP - Iteration {run + 1}/{n_runs}...")

    # The run index doubles as the seed, so each repetition is reproducible
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=run)

    # Supervised fit: the training labels are passed alongside the features
    projection_train = umap_model.fit_transform(x_train_emotion_norm, y_train)
    raf_sup_umap_projections_train_10_01.append(projection_train)

    # Test features are mapped without labels through the fitted model
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_emotion_norm)
    raf_sup_umap_projections_test_10_01.append(projection_test)

# Stack the per-run results along a new leading (run) axis
raf_sup_umap_projections_train_10_01 = np.stack(raf_sup_umap_projections_train_10_01)
raf_sup_umap_projections_test_10_01 = np.stack(raf_sup_umap_projections_test_10_01)

# Element-wise mean / std over the run axis.
# NOTE(review): this assumes coordinates from differently-seeded runs are
# directly comparable — confirm.
raf_mean_sup_umap_projection_train_10_01 = raf_sup_umap_projections_train_10_01.mean(axis=0)
raf_std_sup_umap_projection_train_10_01 = raf_sup_umap_projections_train_10_01.std(axis=0)

raf_mean_sup_umap_projection_test_10_01 = raf_sup_umap_projections_test_10_01.mean(axis=0)
raf_std_sup_umap_projection_test_10_01 = raf_sup_umap_projections_test_10_01.std(axis=0)

# Persist the training-split results
np.save('raf_sup_umap_projections_train_10_01.npy', raf_sup_umap_projections_train_10_01)
np.save('raf_mean_sup_umap_projection_train_10_01.npy', raf_mean_sup_umap_projection_train_10_01)
np.save('raf_std_sup_umap_projection_train_10_01.npy', raf_std_sup_umap_projection_train_10_01)

# Persist the test-split results
np.save('raf_sup_umap_projections_test_10_01.npy', raf_sup_umap_projections_test_10_01)
np.save('raf_mean_sup_umap_projection_test_10_01.npy', raf_mean_sup_umap_projection_test_10_01)
np.save('raf_std_sup_umap_projection_test_10_01.npy', raf_std_sup_umap_projection_test_10_01)

print("Supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with exactly 7 colours (one per emotion tick below).
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean (across runs) training projection, coloured by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_sup_umap_projection_train_10_01[:, 0],
    raf_mean_sup_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("Supervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar with one tick per label value 1..7
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Evaluate the mean supervised-UMAP projection.
# A 1-nearest-neighbour classifier is fitted once on the mean training
# projection and its test predictions are reused for both ARI and silhouette
# (previously the identical fit/predict was executed twice).
y_test_pred_raf_umap_sup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_sup_umap_projection_train_10_01, y_train)
    .predict(raf_mean_sup_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_umap_sup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_umap_sup_10_01)
print(f"ARI: {ari_raf_umap_sup_10_01:.4f}")

# Silhouette of the test projection, grouped by the predicted labels
silhouette_raf_umap_sup_10_01 = silhouette_score(
    raf_mean_sup_umap_projection_test_10_01,
    y_test_pred_raf_umap_sup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_umap_sup_10_01:.2f}")

# KMeans with as many clusters as there are distinct training labels
n_clusters = len(np.unique(y_train))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Cluster the mean training projection
predicted_labels = kmeans.fit_predict(raf_mean_sup_umap_projection_train_10_01)

# Davies-Bouldin index of the KMeans partition (lower is better)
raf_mean_sup_10_01_db_score = davies_bouldin_score(
    raf_mean_sup_umap_projection_train_10_01,
    predicted_labels,
)

print(f"Davies-Bouldin Index: {raf_mean_sup_10_01_db_score:.2f}")

In [None]:
# # Step 1: Load the saved mean UMAP projections
# mean_projection = np.load('raf_mean_sup_umap_projection_train_10_01.npy')  # Shape: (n_samples, 2)

# # Step 2: Separate the mean projection by class
# classes = np.unique(y_train)
# class_gaussians = {}

# # Calculate the mean and covariance for each class
# for c in classes:
#     class_points = mean_projection[y_train == c]  # Filter by class
#     mean = np.mean(class_points, axis=0)
#     cov = np.cov(class_points, rowvar=False)
#     class_gaussians[c] = {"mean": mean, "cov": cov}

# # Step 3: Visualize Gaussian distributions
# plt.figure(figsize=(10, 8))

# # Plot UMAP embeddings for each class
# for c in classes:
#     class_points = mean_projection[y_train == c]
#     plt.scatter(class_points[:, 0], class_points[:, 1], label=f"Class {c}", alpha=0.5, s=10)

#     # Plot Gaussian contours
#     mean = class_gaussians[c]["mean"]
#     cov = class_gaussians[c]["cov"]
#     x, y = np.meshgrid(
#         np.linspace(mean[0] - 3, mean[0] + 3, 100), 
#         np.linspace(mean[1] - 3, mean[1] + 3, 100)
#     )
#     pos = np.dstack((x, y))
#     rv = multivariate_normal(mean, cov)
#     plt.contour(x, y, rv.pdf(pos), levels=5, alpha=0.8)

# plt.title("UMAP Mean Projections with Gaussian Distributions per Class")
# plt.xlabel("UMAP Component 1")
# plt.ylabel("UMAP Component 2")
# plt.legend()
# plt.show()

# # Step 4: Evaluate likelihood for a random point
# random_point = np.array([0, 0])  # Example point in UMAP space
# likelihoods = {c: multivariate_normal(class_gaussians[c]["mean"], class_gaussians[c]["cov"]).pdf(random_point)
#                for c in classes}

# print("Likelihoods for Random Point:", likelihoods)

------------------

## PCA Before UMAP

In [None]:
# Step 2: Apply PCA.
# A float n_components in (0, 1) keeps as many components as are needed to
# explain that fraction of the variance (here 95%).
pca = PCA(n_components=0.95)
# Fit on the training features only, then reuse the fitted basis for test
x_train_pca_emotions = pca.fit_transform(x_train_emotion_norm)
x_test_pca_emotions = pca.transform(x_test_emotion_norm)

# print(f"Original number of features: {x_train_emotion_norm.shape[1]}")
# print(f"Reduced number of features: {x_train_pca_emotions.shape[1]}")

In [None]:
# Number of features per sample before PCA (columns of the standardized
# training matrix).
print(f"Original number of features: {x_train_emotion_norm.shape[1]}")

In [35]:
# Save the PCA-transformed training and test features for later reuse
np.save('x_train_raf_pca_emotions.npy', x_train_pca_emotions)
np.save('x_test_raf_pca_emotions.npy', x_test_pca_emotions)

### PCA + UMAP Unsupervised 10 runs

In [13]:
# Restore the cached PCA + unsupervised-UMAP results (training split):
# all per-run projections, plus their mean and standard deviation.
raf_unsup_pca_umap_projections_train_10_01 = np.load("raf_unsup_pca_umap_projections_train_10_01.npy")
raf_mean_unsup_pca_umap_projection_train_10_01 = np.load("raf_mean_unsup_pca_umap_projection_train_10_01.npy")
raf_std_unsup_pca_umap_projection_train_10_01 = np.load("raf_std_unsup_pca_umap_projection_train_10_01.npy")

# Same three arrays for the test split.
raf_unsup_pca_umap_projections_test_10_01 = np.load("raf_unsup_pca_umap_projections_test_10_01.npy")
raf_mean_unsup_pca_umap_projection_test_10_01 = np.load("raf_mean_unsup_pca_umap_projection_test_10_01.npy")
raf_std_unsup_pca_umap_projection_test_10_01 = np.load("raf_std_unsup_pca_umap_projection_test_10_01.npy")

In [None]:
# Hyper-parameters shared by every repetition
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 10  # number of differently-seeded embeddings to compute

# Per-run embeddings are collected here and stacked after the loop
raf_unsup_pca_umap_projections_train_10_01 = []
raf_unsup_pca_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # The run index doubles as the seed, so each repetition is reproducible
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Learn the embedding on the PCA-reduced training features
    projection_train = umap_model.fit_transform(x_train_pca_emotions)
    raf_unsup_pca_umap_projections_train_10_01.append(projection_train)

    # Map the PCA-reduced test features through the model fitted just above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_pca_emotions)
    raf_unsup_pca_umap_projections_test_10_01.append(projection_test)

# Stack the per-run results along a new leading (run) axis
raf_unsup_pca_umap_projections_train_10_01 = np.stack(raf_unsup_pca_umap_projections_train_10_01)
raf_unsup_pca_umap_projections_test_10_01 = np.stack(raf_unsup_pca_umap_projections_test_10_01)

# Element-wise mean / std over the run axis.
# NOTE(review): this assumes coordinates from differently-seeded runs are
# directly comparable — confirm.
raf_mean_unsup_pca_umap_projection_train_10_01 = raf_unsup_pca_umap_projections_train_10_01.mean(axis=0)
raf_std_unsup_pca_umap_projection_train_10_01 = raf_unsup_pca_umap_projections_train_10_01.std(axis=0)

raf_mean_unsup_pca_umap_projection_test_10_01 = raf_unsup_pca_umap_projections_test_10_01.mean(axis=0)
raf_std_unsup_pca_umap_projection_test_10_01 = raf_unsup_pca_umap_projections_test_10_01.std(axis=0)

# Persist the training-split results
np.save('raf_unsup_pca_umap_projections_train_10_01.npy', raf_unsup_pca_umap_projections_train_10_01)
np.save('raf_mean_unsup_pca_umap_projection_train_10_01.npy', raf_mean_unsup_pca_umap_projection_train_10_01)
np.save('raf_std_unsup_pca_umap_projection_train_10_01.npy', raf_std_unsup_pca_umap_projection_train_10_01)

# Persist the test-split results
np.save('raf_unsup_pca_umap_projections_test_10_01.npy', raf_unsup_pca_umap_projections_test_10_01)
np.save('raf_mean_unsup_pca_umap_projection_test_10_01.npy', raf_mean_unsup_pca_umap_projection_test_10_01)
np.save('raf_std_unsup_pca_umap_projection_test_10_01.npy', raf_std_unsup_pca_umap_projection_test_10_01)

print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with exactly 7 colours (one per emotion tick below).
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean (across runs) training projection, coloured by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_unsup_pca_umap_projection_train_10_01[:, 0],
    raf_mean_unsup_pca_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("PCA + Unsupervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar with one tick per label value 1..7
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Evaluate the mean PCA + unsupervised-UMAP projection.
# A 1-nearest-neighbour classifier is fitted once on the mean training
# projection and its test predictions are reused for both ARI and silhouette
# (previously the identical fit/predict was executed twice).
y_test_pred_raf_pca_umap_unsup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_unsup_pca_umap_projection_train_10_01, y_train)
    .predict(raf_mean_unsup_pca_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_pca_umap_unsup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_pca_umap_unsup_10_01)
print(f"ARI: {ari_raf_pca_umap_unsup_10_01:.2f}")

# Silhouette of the test projection, grouped by the predicted labels
silhouette_raf_pca_umap_unsup_10_01 = silhouette_score(
    raf_mean_unsup_pca_umap_projection_test_10_01,
    y_test_pred_raf_pca_umap_unsup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_pca_umap_unsup_10_01:.2f}")

# KMeans with as many clusters as there are distinct training labels
n_clusters = len(np.unique(y_train))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Cluster the mean training projection
predicted_labels = kmeans.fit_predict(raf_mean_unsup_pca_umap_projection_train_10_01)

# Davies-Bouldin index of the KMeans partition (lower is better)
raf_mean_unsup_pca_umap_10_01_db_score = davies_bouldin_score(
    raf_mean_unsup_pca_umap_projection_train_10_01,
    predicted_labels,
)

print(f"Davies-Bouldin Index: {raf_mean_unsup_pca_umap_10_01_db_score:.2f}")

In [None]:
# # Define parameters
# n_neighbors = 10
# min_dist = 0.1
# n_components = 2
# n_runs = 10  # Number of runs

# # Store UMAP projections for each run
# raf_pca_unsup_umap_projections_10_01 = []

# # Run UMAP multiple times
# for run in range(n_runs):
#     # Create UMAP model
#     umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=None)
    
#     # Fit and transform the data
#     raf_pca_unsup_umap_projection = umap_model.fit_transform(x_train_pca_emotions)
    
#     # Store the projection
#     raf_pca_unsup_umap_projections_10_01.append(raf_pca_unsup_umap_projection)

# # Convert the list of projections to a numpy array
# raf_pca_unsup_umap_projections_10_01 = np.array(raf_pca_unsup_umap_projections_10_01)

# # Calculate mean and standard deviation of projections across runs
# raf_mean_pca_unsup_umap_projection_10_01 = np.mean(raf_pca_unsup_umap_projections_10_01, axis=0)
# raf_std_pca_unsup_umap_projection_10_01 = np.std(raf_pca_unsup_umap_projections_10_01, axis=0)

# # Save the projections, mean, and standard deviation
# np.save('raf_pca_unsup_umap_projections_10_01.npy', raf_pca_unsup_umap_projections_10_01)
# np.save('raf_mean_pca_unsup_umap_projection_10_01.npy', raf_mean_pca_unsup_umap_projection_10_01)
# np.save('raf_std_pca_unsup_umap_projection_10_01.npy', raf_std_pca_unsup_umap_projection_10_01)

# # Output confirmation
# print("UMAP projections, mean, and standard deviation have been saved with identifiers '_10_01'.")

In [None]:
# # Calculate Silhouette Score
# sil_score_raf_std_pca_unsup_umap = silhouette_score(raf_mean_pca_unsup_umap_projection_10_01, y_train)
# print(f"Silhouette Score: {sil_score_raf_std_pca_unsup_umap:.2f}")

# # Calculate Davies-Bouldin Index (lower is better)
# db_score_raf_std_pca_unsup_umap = davies_bouldin_score(raf_mean_pca_unsup_umap_projection_10_01, y_train)
# print(f"Davies-Bouldin Index: {db_score_raf_std_pca_unsup_umap:.2f}")

In [None]:
# Check with SEBA maybe it is interesting to share it like this

# # Iterate over each UMAP projection run
# sil_scores_pca_umap = []
# db_scores_pca_umap = []

# for i, projection in enumerate(raf_pca_unsup_umap_projections_10_01):
#     sil_score_pca_umap = silhouette_score(projection, y_train)
#     db_score_pca_umap = davies_bouldin_score(projection, y_train)
#     sil_scores_pca_umap.append(sil_score_pca_umap)
#     db_scores_pca_umap.append(db_score_pca_umap)
#     print(f"Run {i+1}: Silhouette Score = {sil_score_pca_umap:.2f}, Davies-Bouldin Index = {db_score_pca_umap:.2f}")

# # Calculate mean and standard deviation of scores across runs
# mean_sil_score = np.mean(sil_scores_pca_umap)
# std_sil_score = np.std(sil_scores_pca_umap)
# mean_db_score = np.mean(db_scores_pca_umap)
# std_db_score = np.std(db_scores_pca_umap)

# print(f"Mean Silhouette Score: {mean_sil_score:.2f} ± {std_sil_score:.2f}")
# print(f"Mean Davies-Bouldin Index: {mean_db_score:.2f} ± {std_db_score:.2f}")

### PCA + UMAP Supervised 10 runs

In [None]:
# # Step 2: Apply PCA
# pca = PCA(0.95)
# x_train_pca_emotions = pca.fit_transform(x_train_emotion_norm, y_train)
# #x_test_pca_emotions = pca.transform(x_test_emotion_norm, y_test)

# #print(f"Original number of features: {x_test_emotion_norm.shape[1]}")
# print(f"Reduced number of features: {x_train_pca_emotions.shape[1]}")

In [15]:
# Restore the cached PCA + supervised-UMAP results (training split):
# all per-run projections, plus their mean and standard deviation.
raf_sup_pca_umap_projections_train_10_01 = np.load("raf_sup_pca_umap_projections_train_10_01.npy")
raf_mean_sup_pca_umap_projection_train_10_01 = np.load("raf_mean_sup_pca_umap_projection_train_10_01.npy")
raf_std_sup_pca_umap_projection_train_10_01 = np.load("raf_std_sup_pca_umap_projection_train_10_01.npy")

# Same three arrays for the test split.
raf_sup_pca_umap_projections_test_10_01 = np.load("raf_sup_pca_umap_projections_test_10_01.npy")
raf_mean_sup_pca_umap_projection_test_10_01 = np.load("raf_mean_sup_pca_umap_projection_test_10_01.npy")
raf_std_sup_pca_umap_projection_test_10_01 = np.load("raf_std_sup_pca_umap_projection_test_10_01.npy")

In [None]:
# Hyper-parameters shared by every repetition
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 10  # number of differently-seeded embeddings to compute

# Per-run embeddings are collected here and stacked after the loop
raf_sup_pca_umap_projections_train_10_01 = []
raf_sup_pca_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running Supervised UMAP - Iteration {run + 1}/{n_runs}...")

    # The run index doubles as the seed, so each repetition is reproducible
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=run)

    # Supervised fit on the PCA-reduced training features with their labels
    projection_train = umap_model.fit_transform(x_train_pca_emotions, y_train)
    raf_sup_pca_umap_projections_train_10_01.append(projection_train)

    # Test features are mapped without labels through the fitted model
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_pca_emotions)
    raf_sup_pca_umap_projections_test_10_01.append(projection_test)

# Stack the per-run results along a new leading (run) axis
raf_sup_pca_umap_projections_train_10_01 = np.stack(raf_sup_pca_umap_projections_train_10_01)
raf_sup_pca_umap_projections_test_10_01 = np.stack(raf_sup_pca_umap_projections_test_10_01)

# Element-wise mean / std over the run axis.
# NOTE(review): this assumes coordinates from differently-seeded runs are
# directly comparable — confirm.
raf_mean_sup_pca_umap_projection_train_10_01 = raf_sup_pca_umap_projections_train_10_01.mean(axis=0)
raf_std_sup_pca_umap_projection_train_10_01 = raf_sup_pca_umap_projections_train_10_01.std(axis=0)

raf_mean_sup_pca_umap_projection_test_10_01 = raf_sup_pca_umap_projections_test_10_01.mean(axis=0)
raf_std_sup_pca_umap_projection_test_10_01 = raf_sup_pca_umap_projections_test_10_01.std(axis=0)

# Persist the training-split results
np.save('raf_sup_pca_umap_projections_train_10_01.npy', raf_sup_pca_umap_projections_train_10_01)
np.save('raf_mean_sup_pca_umap_projection_train_10_01.npy', raf_mean_sup_pca_umap_projection_train_10_01)
np.save('raf_std_sup_pca_umap_projection_train_10_01.npy', raf_std_sup_pca_umap_projection_train_10_01)

# Persist the test-split results
np.save('raf_sup_pca_umap_projections_test_10_01.npy', raf_sup_pca_umap_projections_test_10_01)
np.save('raf_mean_sup_pca_umap_projection_test_10_01.npy', raf_mean_sup_pca_umap_projection_test_10_01)
np.save('raf_std_sup_pca_umap_projection_test_10_01.npy', raf_std_sup_pca_umap_projection_test_10_01)

print("Supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with exactly 7 colours (one per emotion tick below).
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean (across runs) training projection, coloured by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_sup_pca_umap_projection_train_10_01[:, 0],
    raf_mean_sup_pca_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("PCA + Supervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar with one tick per label value 1..7
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Evaluate the mean PCA + supervised-UMAP projection.
# A 1-nearest-neighbour classifier is fitted once on the mean training
# projection and its test predictions are reused for both ARI and silhouette
# (previously the identical fit/predict was executed twice).
y_test_pred_raf_pca_umap_sup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_sup_pca_umap_projection_train_10_01, y_train)
    .predict(raf_mean_sup_pca_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_pca_umap_sup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_pca_umap_sup_10_01)
print(f"ARI: {ari_raf_pca_umap_sup_10_01:.2f}")

# Silhouette of the test projection, grouped by the predicted labels
silhouette_raf_pca_umap_sup_10_01 = silhouette_score(
    raf_mean_sup_pca_umap_projection_test_10_01,
    y_test_pred_raf_pca_umap_sup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_pca_umap_sup_10_01:.2f}")

# KMeans with as many clusters as there are distinct training labels
n_clusters = len(np.unique(y_train))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Cluster the mean training projection
predicted_labels = kmeans.fit_predict(raf_mean_sup_pca_umap_projection_train_10_01)

# Davies-Bouldin index of the KMeans partition (lower is better)
raf_mean_sup_pca_umap_10_01_db_score = davies_bouldin_score(
    raf_mean_sup_pca_umap_projection_train_10_01,
    predicted_labels,
)

print(f"Davies-Bouldin Index: {raf_mean_sup_pca_umap_10_01_db_score:.2f}")

------------

## LLE  Before UMAP

In [None]:
# LLE hyper-parameters
n_neighbors, n_components = 10, 147

# Standard LLE with a fixed seed
lle = LocallyLinearEmbedding(
    n_neighbors=n_neighbors,
    n_components=n_components,
    method='standard',
    random_state=42,
)

# Learn the embedding on the training split
print("Running LLE on the training set...")
x_train_raf_lle = lle.fit_transform(x_train_emotion_norm)
print("LLE transformation on training set completed.")

# Map the test split with the already-fitted model (no refit)
print("Running LLE on the test set...")
x_test_raf_lle = lle.transform(x_test_emotion_norm)
print("LLE transformation on test set completed.")

# Report the resulting shapes
print(f"Shape of LLE-transformed training data: {x_train_raf_lle.shape}")
print(f"Shape of LLE-transformed test data: {x_test_raf_lle.shape}")

# Persist both embeddings for later use
for out_path, embedded in (
    ('x_train_lle.npy', x_train_raf_lle),
    ('x_test_lle.npy', x_test_raf_lle),
):
    np.save(out_path, embedded)

# Output confirmation
print("LLE-transformed data has been saved.")

### LLE + UMAP Unsupervised 10 runs

In [17]:
# Load the cached LLE + unsupervised UMAP results for the training set:
# all per-run projections, their mean and their standard deviation.
raf_unsup_lle_umap_projections_train_10_01 = np.load(
    'raf_unsup_lle_umap_projections_train_10_01.npy')
raf_mean_unsup_lle_umap_projection_train_10_01 = np.load(
    'raf_mean_unsup_lle_umap_projection_train_10_01.npy')
raf_std_unsup_lle_umap_projection_train_10_01 = np.load(
    'raf_std_unsup_lle_umap_projection_train_10_01.npy')

# Load the same three arrays for the test set.
raf_unsup_lle_umap_projections_test_10_01 = np.load(
    'raf_unsup_lle_umap_projections_test_10_01.npy')
raf_mean_unsup_lle_umap_projection_test_10_01 = np.load(
    'raf_mean_unsup_lle_umap_projection_test_10_01.npy')
raf_std_unsup_lle_umap_projection_test_10_01 = np.load(
    'raf_std_unsup_lle_umap_projection_test_10_01.npy')

In [None]:
# UMAP hyper-parameters shared by every run
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 10  # Number of runs (the run index is used as the random seed)

# Per-run embeddings, collected as lists and stacked afterwards
_train_runs, _test_runs = [], []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Unsupervised fit on the LLE-transformed training data
    projection_train = umap_model.fit_transform(x_train_raf_lle)
    _train_runs.append(projection_train)

    # Project the test data with the model fitted in this iteration
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_raf_lle)
    _test_runs.append(projection_test)

# Stack the per-run embeddings along a new leading "run" axis
raf_unsup_lle_umap_projections_train_10_01 = np.stack(_train_runs)
raf_unsup_lle_umap_projections_test_10_01 = np.stack(_test_runs)

# Per-point mean and standard deviation across runs.
# NOTE(review): each seed produces its own embedding; averaging coordinates
# across runs assumes the runs are comparable/aligned — confirm this is intended.
raf_mean_unsup_lle_umap_projection_train_10_01 = raf_unsup_lle_umap_projections_train_10_01.mean(axis=0)
raf_std_unsup_lle_umap_projection_train_10_01 = raf_unsup_lle_umap_projections_train_10_01.std(axis=0)

raf_mean_unsup_lle_umap_projection_test_10_01 = raf_unsup_lle_umap_projections_test_10_01.mean(axis=0)
raf_std_unsup_lle_umap_projection_test_10_01 = raf_unsup_lle_umap_projections_test_10_01.std(axis=0)

# Save projections, mean and standard deviation (training set first, then test set)
for out_path, values in (
    ('raf_unsup_lle_umap_projections_train_10_01.npy', raf_unsup_lle_umap_projections_train_10_01),
    ('raf_mean_unsup_lle_umap_projection_train_10_01.npy', raf_mean_unsup_lle_umap_projection_train_10_01),
    ('raf_std_unsup_lle_umap_projection_train_10_01.npy', raf_std_unsup_lle_umap_projection_train_10_01),
    ('raf_unsup_lle_umap_projections_test_10_01.npy', raf_unsup_lle_umap_projections_test_10_01),
    ('raf_mean_unsup_lle_umap_projection_test_10_01.npy', raf_mean_unsup_lle_umap_projection_test_10_01),
    ('raf_std_unsup_lle_umap_projection_test_10_01.npy', raf_std_unsup_lle_umap_projection_test_10_01),
):
    np.save(out_path, values)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with exactly 7 colours (one per emotion class).
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments and is still supported.
cmap = plt.get_cmap("tab10", 7)

# Scatter the run-averaged training embedding, coloured by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_unsup_lle_umap_projection_train_10_01[:, 0],
    raf_mean_unsup_lle_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("LLE + Unsupervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
# NOTE(review): the tick positions assume y_train holds the integers 1-7 — confirm.
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # One tick per label value
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Fit the 1-NN classifier on the mean training embedding ONCE and reuse its
# test predictions for both metrics (previously the identical model was
# fitted and evaluated twice).
knn_raf_lle_umap_unsup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    raf_mean_unsup_lle_umap_projection_train_10_01, y_train
)
y_test_pred_raf_lle_umap_unsup_10_01 = knn_raf_lle_umap_unsup_10_01.predict(
    raf_mean_unsup_lle_umap_projection_test_10_01
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_lle_umap_unsup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_lle_umap_unsup_10_01)
print(f"ARI: {ari_raf_lle_umap_unsup_10_01:.2f}")

# Silhouette Score of the test embedding, grouped by the PREDICTED labels
silhouette_raf_lle_umap_unsup_10_01 = silhouette_score(
    raf_mean_unsup_lle_umap_projection_test_10_01, y_test_pred_raf_lle_umap_unsup_10_01
)
print(f"Silhouette Score: {silhouette_raf_lle_umap_unsup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_mean_unsup_lle_umap_projection_train_10_01)

# Compute Davies-Bouldin Index on the training embedding with the KMeans labels
raf_lle_umap_unsup_10_01_db_score = davies_bouldin_score(
    raf_mean_unsup_lle_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_lle_umap_unsup_10_01_db_score:.2f}")

### LLE + UMAP Supervised 10 runs

In [19]:
# Cached LLE + supervised UMAP results for the training set:
# all per-run projections, their mean and their standard deviation.
raf_sup_lle_umap_projections_train_10_01 = np.load(
    'raf_sup_lle_umap_projections_train_10_01.npy')
raf_mean_sup_lle_umap_projection_train_10_01 = np.load(
    'raf_mean_sup_lle_umap_projection_train_10_01.npy')
raf_std_sup_lle_umap_projection_train_10_01 = np.load(
    'raf_std_sup_lle_umap_projection_train_10_01.npy')

# The same three arrays for the test set.
raf_sup_lle_umap_projections_test_10_01 = np.load(
    'raf_sup_lle_umap_projections_test_10_01.npy')
raf_mean_sup_lle_umap_projection_test_10_01 = np.load(
    'raf_mean_sup_lle_umap_projection_test_10_01.npy')
raf_std_sup_lle_umap_projection_test_10_01 = np.load(
    'raf_std_sup_lle_umap_projection_test_10_01.npy')

In [None]:
# UMAP hyper-parameters shared by every run
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 10  # Number of runs (the run index is used as the random seed)

# Per-run embeddings, collected as lists and stacked afterwards
_train_runs, _test_runs = [], []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Supervised fit: the training labels are passed as the target
    projection_train = umap_model.fit_transform(x_train_raf_lle, y_train)
    _train_runs.append(projection_train)

    # Project the test data with the model fitted in this iteration
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_raf_lle)
    _test_runs.append(projection_test)

# Stack the per-run embeddings along a new leading "run" axis
raf_sup_lle_umap_projections_train_10_01 = np.stack(_train_runs)
raf_sup_lle_umap_projections_test_10_01 = np.stack(_test_runs)

# Per-point mean and standard deviation across runs.
# NOTE(review): each seed produces its own embedding; averaging coordinates
# across runs assumes the runs are comparable/aligned — confirm this is intended.
raf_mean_sup_lle_umap_projection_train_10_01 = raf_sup_lle_umap_projections_train_10_01.mean(axis=0)
raf_std_sup_lle_umap_projection_train_10_01 = raf_sup_lle_umap_projections_train_10_01.std(axis=0)

raf_mean_sup_lle_umap_projection_test_10_01 = raf_sup_lle_umap_projections_test_10_01.mean(axis=0)
raf_std_sup_lle_umap_projection_test_10_01 = raf_sup_lle_umap_projections_test_10_01.std(axis=0)

# Save projections, mean and standard deviation (training set first, then test set)
for out_path, values in (
    ('raf_sup_lle_umap_projections_train_10_01.npy', raf_sup_lle_umap_projections_train_10_01),
    ('raf_mean_sup_lle_umap_projection_train_10_01.npy', raf_mean_sup_lle_umap_projection_train_10_01),
    ('raf_std_sup_lle_umap_projection_train_10_01.npy', raf_std_sup_lle_umap_projection_train_10_01),
    ('raf_sup_lle_umap_projections_test_10_01.npy', raf_sup_lle_umap_projections_test_10_01),
    ('raf_mean_sup_lle_umap_projection_test_10_01.npy', raf_mean_sup_lle_umap_projection_test_10_01),
    ('raf_std_sup_lle_umap_projection_test_10_01.npy', raf_std_sup_lle_umap_projection_test_10_01),
):
    np.save(out_path, values)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with exactly 7 colours (one per emotion class).
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap accepts the same (name, lut) arguments and is still supported.
cmap = plt.get_cmap("tab10", 7)

# Scatter the run-averaged training embedding, coloured by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_sup_lle_umap_projection_train_10_01[:, 0],
    raf_mean_sup_lle_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("LLE + Supervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
# NOTE(review): the tick positions assume y_train holds the integers 1-7 — confirm.
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # One tick per label value
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Fit the 1-NN classifier on the mean training embedding ONCE and reuse its
# test predictions for both metrics (previously the identical model was
# fitted and evaluated twice).
knn_raf_lle_umap_sup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    raf_mean_sup_lle_umap_projection_train_10_01, y_train
)
y_test_pred_raf_lle_umap_sup_10_01 = knn_raf_lle_umap_sup_10_01.predict(
    raf_mean_sup_lle_umap_projection_test_10_01
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_lle_umap_sup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_lle_umap_sup_10_01)
print(f"ARI: {ari_raf_lle_umap_sup_10_01:.2f}")

# Silhouette Score of the test embedding, grouped by the PREDICTED labels
silhouette_raf_lle_umap_sup_10_01 = silhouette_score(
    raf_mean_sup_lle_umap_projection_test_10_01, y_test_pred_raf_lle_umap_sup_10_01
)
print(f"Silhouette Score: {silhouette_raf_lle_umap_sup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_mean_sup_lle_umap_projection_train_10_01)

# Compute Davies-Bouldin Index on the training embedding with the KMeans labels
raf_lle_umap_sup_10_01_db_score = davies_bouldin_score(
    raf_mean_sup_lle_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_lle_umap_sup_10_01_db_score:.2f}")

--------

The training set is very imbalanced.

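**SMOTE** (Synthetic Minority Oversampling Technique) is a method for addressing class imbalance by generating synthetic samples for minority classes. It works by selecting a minority-class sample, finding its k nearest neighbours within the same class, and creating new points by interpolating between the sample and randomly chosen neighbours.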

In [None]:
from collections import Counter

import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# The data is already split: only the training portion
# (x_train_emotion_norm / y_train_emotion) is oversampled here.

# Oversample with SMOTE; fixed seed for reproducibility
smote = SMOTE(random_state=42)
x_train_balanced, y_train_balanced = smote.fit_resample(
    x_train_emotion_norm,
    y_train_emotion,
)

# Compare label counts before and after resampling
print(f"Original class distribution: {Counter(y_train_emotion)}")
print(f"Balanced class distribution: {Counter(y_train_balanced)}")


In [None]:
# Step 1: PCA on the SMOTE-balanced training data, keeping 95% of the variance
pca = PCA(n_components=0.95)
x_train_emotion2_pca = pca.fit_transform(x_train_balanced)

# Step 2: 2-D UMAP embedding of the PCA-transformed data
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)
umap_train_embedding = reducer.fit_transform(x_train_emotion2_pca)

-----

## Isomap before UMAP

In [None]:
# Step 1: Downsample the Dataset Consistently
def downsample_consistent(x_train, y_train, sample_fraction=0.35, random_state=None):
    """
    Draw a label-stratified subsample and return the selected indices.

    For every distinct label in ``y_train``, ``int(count * sample_fraction)``
    positions of that label are drawn without replacement, so the label
    distribution is approximately preserved.

    Parameters
    ----------
    x_train : unused
        Accepted only so existing call sites keep working.
    y_train : array-like
        Labels; converted with ``np.asarray`` so plain lists work too.
    sample_fraction : float
        Fraction of each label's samples to keep.
    random_state : int, numpy RandomState or None
        Seed or generator for the sampling. ``None`` (default) keeps the
        previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Integer indices into ``y_train``, grouped by label in ``np.unique`` order.
    """
    y_train = np.asarray(y_train)

    # Build ONE generator from an integer seed so each label draws from a
    # continuing stream instead of restarting from the same seed every time.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    sampled_indices = []
    for label in np.unique(y_train):
        # Positions of the current label
        label_indices = np.where(y_train == label)[0]
        # Sample a fraction of points for this label
        sampled_indices_label = resample(
            label_indices,
            n_samples=int(len(label_indices) * sample_fraction),
            replace=False,
            random_state=random_state,
        )
        sampled_indices.extend(sampled_indices_label)

    # Explicit integer dtype: an empty result would otherwise be a float array,
    # which cannot be used for indexing.
    return np.array(sampled_indices, dtype=np.intp)

# Pick a stratified 35% subset of the training data
sample_fraction = 0.35
sampled_indices = downsample_consistent(
    x_train,
    y_train,
    sample_fraction=sample_fraction,
)

# Pull out the selected rows and their labels
x_train_sampled, y_train_sampled = x_train[sampled_indices], y_train[sampled_indices]

print("Downsampled x_train shape:", x_train_sampled.shape)
print("Downsampled y_train shape:", y_train_sampled.shape)

In [None]:
# Step 1: Downsample the Dataset Consistently
def downsample_consistent(x_test, y_test, sample_fraction=0.35, random_state=None):
    """
    Draw a label-stratified subsample and return the selected indices.

    For every distinct label in ``y_test``, ``int(count * sample_fraction)``
    positions of that label are drawn without replacement, so the label
    distribution is approximately preserved.

    Parameters
    ----------
    x_test : unused
        Accepted only so existing call sites keep working.
    y_test : array-like
        Labels; converted with ``np.asarray`` so plain lists work too.
    sample_fraction : float
        Fraction of each label's samples to keep.
    random_state : int, numpy RandomState or None
        Seed or generator for the sampling. ``None`` (default) keeps the
        previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Integer indices into ``y_test``, grouped by label in ``np.unique`` order.
    """
    y_test = np.asarray(y_test)

    # Build ONE generator from an integer seed so each label draws from a
    # continuing stream instead of restarting from the same seed every time.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    sampled_indices = []
    for label in np.unique(y_test):
        # Positions of the current label
        label_indices = np.where(y_test == label)[0]
        # Sample a fraction of points for this label
        sampled_indices_label = resample(
            label_indices,
            n_samples=int(len(label_indices) * sample_fraction),
            replace=False,
            random_state=random_state,
        )
        sampled_indices.extend(sampled_indices_label)

    # Explicit integer dtype: an empty result would otherwise be a float array,
    # which cannot be used for indexing.
    return np.array(sampled_indices, dtype=np.intp)

# Pick a stratified 35% subset of the test data
sample_fraction = 0.35
sampled_indices = downsample_consistent(
    x_test,
    y_test,
    sample_fraction=sample_fraction,
)

# Pull out the selected rows and their labels
x_test_sampled, y_test_sampled = x_test[sampled_indices], y_test[sampled_indices]

print("Downsampled x_test shape:", x_test_sampled.shape)
print("Downsampled y_test shape:", y_test_sampled.shape)

In [None]:
# The notebook's top import cell does not bring StandardScaler into scope
# (it is only imported in later cells), so import it where it is first used.
from sklearn.preprocessing import StandardScaler

# Step 1: Flatten the Images into 1D Vectors (one row per sample)
x_train_flattened = x_train_sampled.reshape(x_train_sampled.shape[0], -1)
x_test_flattened = x_test_sampled.reshape(x_test_sampled.shape[0], -1)

# Verify shapes
print("Shape of x_train_flattened:", x_train_flattened.shape)  # (num_train_samples, num_pixels)
print("Shape of x_test_flattened:", x_test_flattened.shape)    # (num_test_samples, num_pixels)

# Step 2: Normalize the Sampled Data
# Statistics are learned on the training split only and then applied to the
# test split, which was previously flattened but never scaled.
scaler = StandardScaler()
x_train_sampled_scaled = scaler.fit_transform(x_train_flattened)
x_test_sampled_scaled = scaler.transform(x_test_flattened)

In [None]:
# Step 3: Isomap as an intermediate reduction before UMAP
n_neighbors_isomap, n_components_isomap = 10, 50  # neighbours / output dimensions

isomap = Isomap(
    n_neighbors=n_neighbors_isomap,
    n_components=n_components_isomap,
)
x_train_isomap = isomap.fit_transform(x_train_sampled_scaled)

print("Isomap reduced shape:", x_train_isomap.shape)

In [None]:
# Step 4: Reduce the Isomap output to 2-D with UMAP (fixed seed)
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)
x_train_umap = reducer.fit_transform(x_train_isomap)

In [None]:
# Step 5: Scatter plot of the 2-D embedding, coloured by ground-truth label
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(
    x_train_umap[:, 0],
    x_train_umap[:, 1],
    c=y_train_sampled,
    cmap="tab10",
    s=5,
    alpha=0.8,
)
ax.set_title("Isomap + UMAP Projection: Downsampled Training Data")
ax.set_xlabel("UMAP Component 1")
ax.set_ylabel("UMAP Component 2")
fig.colorbar(points, ax=ax, label="Labels")
plt.show()

In [None]:
# Cluster the 2-D UMAP embedding with KMeans, one cluster per distinct label
n_clusters = np.unique(y_train_sampled).size
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
predicted_labels = kmeans.fit_predict(x_train_umap)

# Internal cluster-quality scores, both computed from the KMeans assignments
sil_score = silhouette_score(x_train_umap, predicted_labels)
db_score = davies_bouldin_score(x_train_umap, predicted_labels)

print(f"Silhouette Score (with KMeans): {sil_score:.2f}")
print(f"Davies-Bouldin Index (with KMeans): {db_score:.2f}")


In [None]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Agreement between the cluster assignments and the ground-truth labels
ari = adjusted_rand_score(y_train_sampled, predicted_labels)
nmi = normalized_mutual_info_score(y_train_sampled, predicted_labels)

print(f"Adjusted Rand Index (ARI): {ari:.2f}")
print(f"Normalized Mutual Information (NMI): {nmi:.2f}")

Using HDBSCAN instead of KMeans

In [None]:
import hdbscan

# Density-based clustering of the UMAP embedding (overwrites predicted_labels)
clusterer = hdbscan.HDBSCAN(metric='euclidean', min_cluster_size=10)
predicted_labels = clusterer.fit_predict(x_train_umap)

# Agreement with the ground-truth labels.
# NOTE(review): if HDBSCAN marks noise points with their own label value, they
# are scored here like any other cluster — confirm this is intended.
ari_score = adjusted_rand_score(y_train_sampled, predicted_labels)
print(f"Adjusted Rand Index (ARI): {ari_score:.2f}")

nmi_score = normalized_mutual_info_score(y_train_sampled, predicted_labels)
print(f"Normalized Mutual Information (NMI): {nmi_score:.2f}")

------

## ResNet before UMAP

In [134]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input

In [135]:
# Shape the images as (N, 64, 64, 1), then copy the single channel three
# times along the last axis because ResNet requires 3-channel input.
_single_channel = x_train_emotion.reshape(-1, 64, 64, 1)
x_train_reshaped = np.repeat(_single_channel, 3, axis=-1)

# Apply the ResNet50 input preprocessing
x_train_preprocessed = preprocess_input(x_train_reshaped)

In [None]:
# ImageNet-pretrained ResNet50 without its classification head, used as a
# fixed feature extractor for 64x64x3 inputs
resnet = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(64, 64, 3),
)
x_train_features = resnet.predict(x_train_preprocessed)

# One flat feature vector per image
x_train_features_flat = x_train_features.reshape(len(x_train_features), -1)

# 2-D UMAP embedding of the extracted features
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)
umap_train_embedding_cnn = reducer.fit_transform(x_train_features_flat)


In [None]:
# Scatter the ResNet-feature embedding, coloured by emotion label
_resnet_points = plt.scatter(
    umap_train_embedding_cnn[:, 0],
    umap_train_embedding_cnn[:, 1],
    c=y_train_emotion,
    cmap='Spectral',
    s=5,
)
plt.colorbar(_resnet_points, label="Emotion Label")
plt.title("UMAP Projection of ResNet Features")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.show()

In [None]:
# Cluster-quality scores of the embedding, grouped by the true emotion labels
sil_score = silhouette_score(umap_train_embedding_cnn, y_train_emotion)
db_score = davies_bouldin_score(umap_train_embedding_cnn, y_train_emotion)  # lower is better

print(f"Silhouette Score: {sil_score:.2f}")
print(f"Davies-Bouldin Index: {db_score:.2f}")

## VGG before UMAP

In [139]:
from tensorflow.keras.applications import VGG16

In [140]:
import tensorflow as tf
from tensorflow.keras.applications.vgg16 import preprocess_input as vgg16_preprocess_input

# Preprocess and reshape data for VGG16
# x_train_reshaped is the 3-channel array built in the ResNet section
# (x_train_emotion reshaped to (-1, 64, 64, 1) and repeated to 3 channels).
#
# np.resize must NOT be used to scale images: it does no interpolation, it only
# repeats or truncates the flattened data to fill the new shape, which scrambles
# the pixels. tf.image.resize performs real (bilinear by default) resampling of
# a (batch, height, width, channels) array.
x_train_resized = tf.image.resize(x_train_reshaped, (224, 224)).numpy()

# Preprocess input for VGG16 using VGG16's own preprocessing function
# NOTE(review): assumes the pixel scale of x_train_emotion is what the Keras
# preprocessing expects — confirm against how x_train_emotion was loaded.
x_train_preprocessed = vgg16_preprocess_input(x_train_resized)

In [2]:
# ImageNet-pretrained VGG16 without its classification head, used as a fixed
# feature extractor for 224x224x3 inputs
vgg = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)
x_train_features = vgg.predict(x_train_preprocessed)

# One flat feature vector per image, ready for UMAP
x_train_features_flat = x_train_features.reshape(len(x_train_features), -1)

In [None]:
# 2-D UMAP embedding of the flattened VGG16 features (fixed seed)
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    random_state=42,
)
umap_train_embedding_vgg = reducer.fit_transform(x_train_features_flat)

In [None]:
# Scatter the VGG16-feature embedding, coloured by emotion label
import matplotlib.pyplot as plt

_vgg_points = plt.scatter(
    umap_train_embedding_vgg[:, 0],
    umap_train_embedding_vgg[:, 1],
    c=y_train_emotion,
    cmap='Spectral',
    s=5,
)
plt.colorbar(_vgg_points, label="Emotion Label")
plt.title("UMAP Projection of VGG16 Features")
plt.show()

-------------

In [None]:
# Step 2: Apply PCA
# (These statements were indented although they are top-level code, which
# raises "IndentationError: unexpected indent"; they are dedented here.)

# Keep as many components as needed to explain 95% of the variance.
# The projection is fitted on the training data only and reused for the test data.
pca = PCA(0.95)
x_train_pca_emotions = pca.fit_transform(x_train_emotion_norm)
x_test_pca_emotions = pca.transform(x_test_emotion_norm)

print(f"Original number of features: {x_test_emotion_norm.shape[1]}")
print(f"Reduced number of features: {x_train_pca_emotions.shape[1]}")

In [None]:
# Cumulative explained variance as a function of the number of PCA components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cumulative_variance)
ax.set_title('Explained Variance vs Number of Components')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.grid(True)
plt.show()


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Logistic regression on the PCA-reduced emotion features
clf = LogisticRegression(max_iter=10000)
clf.fit(x_train_pca_emotions, y_train_emotion)

# Predict on the held-out test features
y_pred = clf.predict(x_test_pca_emotions)

# Report accuracy, per-class metrics and the confusion matrix
test_accuracy = accuracy_score(y_test_emotion, y_pred)
print("Accuracy:", test_accuracy)

print("Classification Report:")
print(classification_report(y_test_emotion, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test_emotion, y_pred))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrix of the classifier predictions on the test set
cm = confusion_matrix(y_test_emotion, y_pred)

# Axis tick labels taken from emotion_map's values.
# NOTE(review): assumes emotion_map's value order matches the sorted numeric
# labels used by confusion_matrix — confirm.
emotion_labels = list(emotion_map.values())

# Annotated heat map: rows are true labels, columns are predicted labels
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=emotion_labels,
    yticklabels=emotion_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_ylabel("True Labels")
ax.set_xlabel("Predicted Labels")
plt.show()


In [None]:
# First two principal components of the training data, coloured by emotion label
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    x=x_train_pca_emotions[:, 0],
    y=x_train_pca_emotions[:, 1],
    hue=y_train_emotion,
    palette='tab10',
    s=10,
    legend='full',
    ax=ax,
)
ax.set_title("2D Scatter Plot of PCA-reduced Data with Cluster Labels")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
# Anchor the legend just outside the axes, at the upper-left of the anchor point
ax.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [39]:
from sklearn.preprocessing import StandardScaler

In [40]:
# One flat feature row per image
x_train_flat = x_train.reshape(x_train.shape[0], -1)
x_test_flat = x_test.reshape(x_test.shape[0], -1)

# Standardise features: statistics come from the training split only and are
# then applied unchanged to the test split
scaler = StandardScaler().fit(x_train_flat)
x_train_scaled = scaler.transform(x_train_flat)
x_test_scaled = scaler.transform(x_test_flat)

In [None]:
# PCA with a fixed number of retained components
n_components = 100

pca = PCA(n_components=n_components)
pca.fit(x_train_scaled)

# Project both splits with the projection learned on the training split
x_train_pca = pca.transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

# Total variance captured by the retained components
print(f"Explained variance by the first {n_components} components: {sum(pca.explained_variance_ratio_):.2f}")

In [None]:
# Training data in the plane of the first two principal components,
# coloured by the true label
fig, ax = plt.subplots(figsize=(10, 8))
pc_points = ax.scatter(
    x_train_pca[:, 0],
    x_train_pca[:, 1],
    c=y_train,
    cmap='viridis',
    s=2,
    alpha=0.5,
)
fig.colorbar(pc_points, ax=ax)
ax.set_title('First Two Principal Components (Train Set)')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()

In [None]:
# Cumulative explained variance of the fitted PCA, one marker per component
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cumulative_variance, marker='o')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Explained Variance vs. Number of Components')
ax.grid()
plt.show()


In [24]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, silhouette_score

In [None]:
# One cluster per emotion class (7 emotions assumed for this dataset)
n_clusters = 7

# KMeans on the PCA-transformed training data; keep the cluster index of every sample
kmeans = KMeans(
    n_clusters=n_clusters,
    random_state=42,
)
y_train_pred = kmeans.fit_predict(x_train_pca)

In [None]:
# Internal quality of the KMeans clustering in PCA space
silhouette_avg = silhouette_score(x_train_pca, y_train_pred)
print(f"Silhouette Score: {silhouette_avg:.2f}")

# External agreement between cluster assignments and ground-truth labels
ari = adjusted_rand_score(y_train, y_train_pred)
print(f"Adjusted Rand Index: {ari:.2f}")

# Cluster assignments shown in the plane of the first two principal components
fig, ax = plt.subplots(figsize=(10, 8))
cluster_points = ax.scatter(
    x_train_pca[:, 0],
    x_train_pca[:, 1],
    c=y_train_pred,
    cmap='viridis',
    s=5,
    alpha=0.5,
)
fig.colorbar(cluster_points, ax=ax, label="Cluster ID")
ax.set_title(f"KMeans Clustering on PCA-reduced Data ({n_clusters} Clusters)")
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
plt.show()

In [None]:
# 2-D UMAP embedding of the standardised pixel features.
# The class is reached through the imported `umap` module: a bare `UMAP` name is
# never imported, and binding the model to the name `umap` would shadow the
# module and break every later `umap.UMAP(...)` call in this notebook.
reducer = umap.UMAP(n_neighbors=15, n_components=2, random_state=42)
x_train_umap = reducer.fit_transform(x_train_scaled)


In [None]:
# UMAP embedding of the training data, coloured by the true label
fig, ax = plt.subplots(figsize=(10, 8))
umap_points = ax.scatter(
    x_train_umap[:, 0],
    x_train_umap[:, 1],
    c=y_train,
    cmap='viridis',
    s=5,
    alpha=0.5,
)
fig.colorbar(umap_points, ax=ax, label="True Label")
ax.set_title('UMAP Projection of Training Data')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
plt.show()

## Gabor + PCA + UMAP

### Gabor + PCA + Unsupervised UMAP

In [9]:
# Cached Gabor + PCA + unsupervised UMAP results for the training set:
# all per-run projections, their mean and their standard deviation.
raf_unsup_gabor_pca_umap_projections_train_10_01 = np.load(
    'raf_unsup_gabor_pca_umap_projections_train_10_01.npy')
raf_mean_unsup_gabor_pca_umap_projection_train_10_01 = np.load(
    'raf_mean_unsup_gabor_pca_umap_projection_train_10_01.npy')
raf_std_unsup_gabor_pca_umap_projection_train_10_01 = np.load(
    'raf_std_unsup_gabor_pca_umap_projection_train_10_01.npy')

# The same three arrays for the test set.
raf_unsup_gabor_pca_umap_projections_test_10_01 = np.load(
    'raf_unsup_gabor_pca_umap_projections_test_10_01.npy')
raf_mean_unsup_gabor_pca_umap_projection_test_10_01 = np.load(
    'raf_mean_unsup_gabor_pca_umap_projection_test_10_01.npy')
raf_std_unsup_gabor_pca_umap_projection_test_10_01 = np.load(
    'raf_std_unsup_gabor_pca_umap_projection_test_10_01.npy')

In [None]:
import cv2

In [51]:
# Create Gabor Kernels
def create_gabor_kernels(n_orientations=4):
    """Generate a bank of Gabor kernels that differ only in orientation.

    Orientations are evenly spaced over [0, pi): theta_k = k * pi / n_orientations.
    Kernel size, envelope width, wavelength and aspect ratio are fixed, so the
    bank covers a single spatial frequency.

    Parameters
    ----------
    n_orientations : int
        Number of orientations. The default of 4 (0, pi/4, pi/2, 3*pi/4)
        reproduces the kernels this function has always returned; the earlier
        "8 orientations" comment did not match the code.

    Returns
    -------
    list
        ``n_orientations`` float32 kernels of shape (31, 31).

    Raises
    ------
    ValueError
        If ``n_orientations`` is smaller than 1.
    """
    if n_orientations < 1:
        raise ValueError(f"n_orientations must be >= 1, got {n_orientations}")

    ksize = 31  # Kernel size
    sigma = 4.0  # Standard deviation of the Gaussian envelope
    lambd = 10.0  # Wavelength of the sinusoidal factor
    gamma = 0.5  # Spatial aspect ratio

    # Integer multiples of the step avoid any floating-point end-point
    # ambiguity of np.arange(0, pi, step).
    thetas = np.arange(n_orientations) * (np.pi / n_orientations)
    return [
        cv2.getGaborKernel((ksize, ksize), sigma, theta, lambd, gamma, psi=0, ktype=cv2.CV_32F)
        for theta in thetas
    ]

# Apply Gabor Filters
def apply_gabor_filters(images, kernels, image_shape=(48, 48)):
    """Filter every image with every kernel and concatenate the responses.

    Parameters
    ----------
    images : iterable of array-like
        Images; each one must contain exactly ``image_shape[0] * image_shape[1]``
        values, because it is reshaped to ``image_shape`` before filtering.
    kernels : sequence of 2-D arrays
        Filter kernels, e.g. from ``create_gabor_kernels``.
    image_shape : tuple of int
        (height, width) of a single image. Defaults to (48, 48), the size that
        was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        One row per image: the flattened float32 responses of all kernels,
        concatenated in kernel order.
    """
    gabor_features = []
    for image in images:
        # Cast to float32 so source and requested output depth agree.
        # NOTE(review): believed necessary for float64 input, which
        # cv2.filter2D is not expected to accept with a CV_32F output — confirm.
        image_2d = np.asarray(image, dtype=np.float32).reshape(image_shape)
        responses = [
            cv2.filter2D(image_2d, cv2.CV_32F, kernel).ravel()  # flattened response
            for kernel in kernels
        ]
        gabor_features.append(np.concatenate(responses))  # all responses in one vector
    return np.array(gabor_features)

In [None]:
# Build the Gabor filter bank and report how many kernels it contains
gabor_kernels = create_gabor_kernels()

print(f"Generated {len(gabor_kernels)} Gabor kernels.")

In [None]:
# Extract Gabor responses for both splits with the same filter bank
x_train_gabor = apply_gabor_filters(x_train_emotion_norm, gabor_kernels)
print(f"Train Gabor feature shape: {x_train_gabor.shape}")

x_test_gabor = apply_gabor_filters(x_test_emotion_norm, gabor_kernels)
print(f"Test Gabor feature shape: {x_test_gabor.shape}")

In [56]:
from sklearn.preprocessing import StandardScaler

# Standardise the Gabor features: statistics are learned on the training
# split and then applied unchanged to the test split
scaler = StandardScaler().fit(x_train_gabor)
x_raf_train_gabor = scaler.transform(x_train_gabor)
x_raf_test_gabor = scaler.transform(x_test_gabor)

Applying PCA to Gabor

In [None]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) keeps as many components as are needed to
# explain that fraction of the variance (here 95%).
# PCA is fitted on the training features and reused unchanged for the test features.
pca = PCA(0.95)
x_raf_train_pca_gabor = pca.fit_transform(x_raf_train_gabor)
x_raf_test_pca_gabor = pca.transform(x_raf_test_gabor)

print(f"Reduced train shape: {x_raf_train_pca_gabor.shape}")
print(f"Reduced test shape: {x_raf_test_pca_gabor.shape}")

In [58]:
# Persist the PCA-reduced Gabor features (train and test) as .npy files
np.save('x_raf_train_gabor_pca.npy', x_raf_train_pca_gabor)
np.save('x_raf_test_gabor_pca.npy', x_raf_test_pca_gabor)

In [None]:
# Unsupervised UMAP on the PCA-reduced Gabor features, repeated with n_runs seeds.
# Define parameters
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # Number of runs

# Store UMAP projections for each run (train and test)
raf_unsup_gabor_pca_umap_projections_train_10_01 = []
raf_unsup_gabor_pca_umap_projections_test_10_01 = []

# Run UMAP multiple times; each run fits on the training set and then
# transforms the test set with that same fitted model.
for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # Create UMAP model (the run index doubles as the random seed)
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=run)
    
    # Fit and transform the training data (no labels passed -> unsupervised)
    projection_train = umap_model.fit_transform(x_raf_train_pca_gabor)
    raf_unsup_gabor_pca_umap_projections_train_10_01.append(projection_train)
    
    # Transform the test data using the same fitted model
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_raf_test_pca_gabor)
    raf_unsup_gabor_pca_umap_projections_test_10_01.append(projection_test)

# Convert the list of projections to numpy arrays (runs are stacked along axis 0)
raf_unsup_gabor_pca_umap_projections_train_10_01 = np.array(raf_unsup_gabor_pca_umap_projections_train_10_01)
raf_unsup_gabor_pca_umap_projections_test_10_01 = np.array(raf_unsup_gabor_pca_umap_projections_test_10_01)

# Calculate mean and standard deviation of projections across runs (train and test)
# NOTE(review): embeddings fitted with different seeds are not guaranteed to share
# orientation or scale, so an element-wise mean over runs may blur structure --
# confirm that averaging raw coordinates is intended.
raf_mean_unsup_gabor_pca_umap_projection_train_10_01 = np.mean(raf_unsup_gabor_pca_umap_projections_train_10_01, axis=0)
raf_std_unsup_gabor_pca_umap_projection_train_10_01 = np.std(raf_unsup_gabor_pca_umap_projections_train_10_01, axis=0)

raf_mean_unsup_gabor_pca_umap_projection_test_10_01 = np.mean(raf_unsup_gabor_pca_umap_projections_test_10_01, axis=0)
raf_std_unsup_gabor_pca_umap_projection_test_10_01 = np.std(raf_unsup_gabor_pca_umap_projections_test_10_01, axis=0)

# Save the projections, mean, and standard deviation for the training set
np.save('raf_unsup_gabor_pca_umap_projections_train_10_01.npy', raf_unsup_gabor_pca_umap_projections_train_10_01)
np.save('raf_mean_unsup_gabor_pca_umap_projection_train_10_01.npy', raf_mean_unsup_gabor_pca_umap_projection_train_10_01)
np.save('raf_std_unsup_gabor_pca_umap_projection_train_10_01.npy', raf_std_unsup_gabor_pca_umap_projection_train_10_01)

# Save the projections, mean, and standard deviation for the test set
np.save('raf_unsup_gabor_pca_umap_projections_test_10_01.npy', raf_unsup_gabor_pca_umap_projections_test_10_01)
np.save('raf_mean_unsup_gabor_pca_umap_projection_test_10_01.npy', raf_mean_unsup_gabor_pca_umap_projection_test_10_01)
np.save('raf_std_unsup_gabor_pca_umap_projection_test_10_01.npy', raf_std_unsup_gabor_pca_umap_projection_test_10_01)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Adjust colormap to have exactly 7 colors.
# plt.get_cmap is used instead of plt.cm.get_cmap: the latter was deprecated in
# Matplotlib 3.7 and removed in 3.9. Both resample "tab10" to 7 entries.
cmap = plt.get_cmap("tab10", 7)

# Create the scatter plot of the run-averaged training projection, colored by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_unsup_gabor_pca_umap_projection_train_10_01[:, 0],
    raf_mean_unsup_gabor_pca_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("Gabor + PCA + Unsupervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
# NOTE(review): the ticks assume y_train holds the integer labels 1..7 -- confirm.
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # Ensure ticks align with labels
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Predict test labels once with a 1-NN classifier fitted on the mean training
# projection. Both metrics below reuse these predictions; the original fitted
# and ran the identical classifier twice.
y_test_pred_raf_gabor_pca_umap_unsup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    raf_mean_unsup_gabor_pca_umap_projection_train_10_01, y_train
).predict(raf_mean_unsup_gabor_pca_umap_projection_test_10_01)

# ARI between the true test labels and the 1-NN predictions
ari_raf_gabor_pca_umap_unsup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_gabor_pca_umap_unsup_10_01)
print(f"ARI: {ari_raf_gabor_pca_umap_unsup_10_01:.2f}")
# Silhouette Score of the test projection, grouped by the 1-NN predicted labels
silhouette_raf_gabor_pca_umap_unsup_10_01 = silhouette_score(
    raf_mean_unsup_gabor_pca_umap_projection_test_10_01,
    y_test_pred_raf_gabor_pca_umap_unsup_10_01
)
print(f"Silhouette Score: {silhouette_raf_gabor_pca_umap_unsup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_mean_unsup_gabor_pca_umap_projection_train_10_01)

# Compute Davies-Bouldin Index (computed on the KMeans partition, not on true labels)
raf_unsup_gabor_pca_10_01_db_score = davies_bouldin_score(
    raf_mean_unsup_gabor_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_unsup_gabor_pca_10_01_db_score:.2f}")

### Gabor + PCA + UMAP Supervised 10 runs

In [14]:
# Load previously saved supervised-UMAP results from .npy files.
# load the projections, mean, and standard deviation for the training set
raf_sup_gabor_pca_umap_projections_train_10_01= np.load('raf_sup_gabor_pca_umap_projections_train_10_01.npy')
raf_mean_sup_gabor_pca_umap_projection_train_10_01= np.load('raf_mean_sup_gabor_pca_umap_projection_train_10_01.npy')
raf_std_sup_gabor_pca_umap_projection_train_10_01= np.load('raf_std_sup_gabor_pca_umap_projection_train_10_01.npy')

# load the projections, mean, and standard deviation for the test set
raf_sup_gabor_pca_umap_projections_test_10_01= np.load('raf_sup_gabor_pca_umap_projections_test_10_01.npy')
raf_mean_sup_gabor_pca_umap_projection_test_10_01= np.load('raf_mean_sup_gabor_pca_umap_projection_test_10_01.npy')
raf_std_sup_gabor_pca_umap_projection_test_10_01= np.load('raf_std_sup_gabor_pca_umap_projection_test_10_01.npy')


In [None]:
# Supervised UMAP on the PCA-reduced Gabor features, repeated with n_runs seeds.
# Define parameters
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # Number of runs

# Store UMAP projections for training and test sets
raf_sup_gabor_pca_umap_projections_train_10_01 = []
raf_sup_gabor_pca_umap_projections_test_10_01 = []

# Run UMAP multiple times
for run in range(n_runs):
    print(f"Running Supervised UMAP - Iteration {run + 1}/{n_runs}...")

    # Create UMAP model (the run index doubles as the random seed)
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors, 
        min_dist=min_dist, 
        n_components=n_components, 
        random_state=run
    )

    # Fit and transform the training data with labels
    # (passing y_train as the target is what makes this run supervised)
    projection_train = umap_model.fit_transform(x_raf_train_pca_gabor, y_train)
    raf_sup_gabor_pca_umap_projections_train_10_01.append(projection_train)
    
    # Transform the test data using the fitted model (test labels are not used)
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_raf_test_pca_gabor)
    raf_sup_gabor_pca_umap_projections_test_10_01.append(projection_test)

# Convert the list of projections to numpy arrays (runs are stacked along axis 0)
raf_sup_gabor_pca_umap_projections_train_10_01 = np.array(raf_sup_gabor_pca_umap_projections_train_10_01)
raf_sup_gabor_pca_umap_projections_test_10_01 = np.array(raf_sup_gabor_pca_umap_projections_test_10_01)

# Calculate mean and standard deviation of projections across runs (train and test)
# NOTE(review): embeddings fitted with different seeds are not guaranteed to share
# orientation or scale -- confirm that averaging raw coordinates is intended.
raf_mean_sup_gabor_pca_umap_projection_train_10_01 = np.mean(raf_sup_gabor_pca_umap_projections_train_10_01, axis=0)
raf_std_sup_gabor_pca_umap_projection_train_10_01 = np.std(raf_sup_gabor_pca_umap_projections_train_10_01, axis=0)

raf_mean_sup_gabor_pca_umap_projection_test_10_01 = np.mean(raf_sup_gabor_pca_umap_projections_test_10_01, axis=0)
raf_std_sup_gabor_pca_umap_projection_test_10_01 = np.std(raf_sup_gabor_pca_umap_projections_test_10_01, axis=0)

# Save the projections, mean, and standard deviation for the training set
np.save('raf_sup_gabor_pca_umap_projections_train_10_01.npy', raf_sup_gabor_pca_umap_projections_train_10_01)
np.save('raf_mean_sup_gabor_pca_umap_projection_train_10_01.npy', raf_mean_sup_gabor_pca_umap_projection_train_10_01)
np.save('raf_std_sup_gabor_pca_umap_projection_train_10_01.npy', raf_std_sup_gabor_pca_umap_projection_train_10_01)

# Save the projections, mean, and standard deviation for the test set
np.save('raf_sup_gabor_pca_umap_projections_test_10_01.npy', raf_sup_gabor_pca_umap_projections_test_10_01)
np.save('raf_mean_sup_gabor_pca_umap_projection_test_10_01.npy', raf_mean_sup_gabor_pca_umap_projection_test_10_01)
np.save('raf_std_sup_gabor_pca_umap_projection_test_10_01.npy', raf_std_sup_gabor_pca_umap_projection_test_10_01)

# Output confirmation
print("Supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Adjust colormap to have exactly 7 colors.
# plt.get_cmap is used instead of plt.cm.get_cmap: the latter was deprecated in
# Matplotlib 3.7 and removed in 3.9. Both resample "tab10" to 7 entries.
cmap = plt.get_cmap("tab10", 7)

# Create the scatter plot of the run-averaged training projection, colored by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_sup_gabor_pca_umap_projection_train_10_01[:, 0],
    raf_mean_sup_gabor_pca_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("Gabor + PCA + Supervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
# NOTE(review): the ticks assume y_train holds the integer labels 1..7 -- confirm.
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # Ensure ticks align with labels
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Predict test labels once with a 1-NN classifier fitted on the mean training
# projection. Both metrics below reuse these predictions; the original fitted
# and ran the identical classifier twice.
y_test_pred_raf_gabor_pca_umap_sup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    raf_mean_sup_gabor_pca_umap_projection_train_10_01, y_train
).predict(raf_mean_sup_gabor_pca_umap_projection_test_10_01)

# ARI between the true test labels and the 1-NN predictions
ari_raf_gabor_pca_umap_sup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_gabor_pca_umap_sup_10_01)
print(f"ARI: {ari_raf_gabor_pca_umap_sup_10_01:.2f}")
# Silhouette Score of the test projection, grouped by the 1-NN predicted labels
silhouette_raf_gabor_pca_umap_sup_10_01 = silhouette_score(
    raf_mean_sup_gabor_pca_umap_projection_test_10_01,
    y_test_pred_raf_gabor_pca_umap_sup_10_01
)
print(f"Silhouette Score: {silhouette_raf_gabor_pca_umap_sup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_mean_sup_gabor_pca_umap_projection_train_10_01)

# Compute Davies-Bouldin Index (computed on the KMeans partition, not on true labels)
raf_sup_gabor_pca_10_01_db_score = davies_bouldin_score(
    raf_mean_sup_gabor_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_sup_gabor_pca_10_01_db_score:.2f}")

-----

Exploring using Supervised UMAP

In [None]:
# Single supervised UMAP fit: y_train is passed as the target alongside the features.
# NOTE(review): x_train_pca_gabor is not assigned in the nearby cells (they use
# x_raf_train_pca_gabor) -- confirm it is defined earlier in the notebook.
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42, metric="euclidean")
x_train_pca_gabor_supumap = reducer.fit_transform(x_train_pca_gabor, y_train)

In [43]:
# Persist the supervised-UMAP training embedding as a .npy file
np.save('x_train_pca_gabor_supumap.npy',x_train_pca_gabor_supumap)

In [None]:
# Visualize training set UMAP embeddings, colored by the true labels in y_train
plt.figure(figsize=(10, 8))
plt.scatter(x_train_pca_gabor_supumap[:, 0], x_train_pca_gabor_supumap[:, 1], c=y_train, cmap='Spectral', s=5)
plt.colorbar()
plt.title("Supervised UMAP Clustering of Training Data")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Compute the Silhouette Score of the embedding, grouping points by their TRUE
# labels. The same y_train was used to fit the supervised UMAP above, so this is
# an in-sample measure.
sil_score = silhouette_score(x_train_pca_gabor_supumap, y_train)
print(f"Silhouette Score: {sil_score:.2f}")

Applying UMAP + KMeans

In [None]:
# from sklearn.cluster import KMeans

# Apply KMeans to the UMAP-reduced data
n_clusters = len(np.unique(y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
predicted_labels = kmeans.fit_predict(x_train_pca_gabor_supumap)

# Calculate Silhouette Score (here grouped by the KMeans assignments, not the true labels)
sil_score = silhouette_score(x_train_pca_gabor_supumap, predicted_labels)
print(f"Silhouette Score (with KMeans): {sil_score:.2f}")

# Calculate Davies-Bouldin Index on the same KMeans partition
db_score = davies_bouldin_score(x_train_pca_gabor_supumap, predicted_labels)
print(f"Davies-Bouldin Index (with KMeans): {db_score:.2f}")


In [None]:
# normalized_mutual_info_score is not among the notebook's top-level imports
# (it is first imported in a later cell), so import it here to keep this cell
# runnable on its own.
from sklearn.metrics import normalized_mutual_info_score

# Agreement between the true training labels and the KMeans cluster assignments
ari = adjusted_rand_score(y_train, predicted_labels)
print(f"Adjusted Rand Index (ARI): {ari:.2f}")

nmi = normalized_mutual_info_score(y_train, predicted_labels)
print(f"Normalized Mutual Information (NMI): {nmi:.2f}")

In [44]:
# Step 1: Create Gabor Kernels (Reused from Training)
gabor_kernels = create_gabor_kernels()

# Step 2: Apply Gabor Filters to x_test
# This rebinds x_test_gabor, which an earlier cell computed from x_test_emotion_norm.
# NOTE(review): no StandardScaler step follows in this test path -- confirm that
# matches how x_train_pca_gabor was produced.
x_test_gabor = apply_gabor_filters(x_test, gabor_kernels)

In [45]:
# Step 3: Apply the Pre-Trained PCA Model
# NOTE(review): `pca` is rebound by several cells in this notebook -- confirm it is
# the model that was fitted on the data behind x_train_pca_gabor.
x_test_pca_gabor = pca.transform(x_test_gabor)  # Use the already fitted PCA model

In [46]:
# Step 4: Apply the Pre-Trained UMAP Model
# `reducer` is the supervised UMAP model; transform() embeds new points without labels.
x_test_umap = reducer.transform(x_test_pca_gabor)  # Use the already fitted UMAP model

In [None]:
# Step 5: Predict Clusters with the Pre-Trained KMeans Model
# (assigns each test point to the nearest centroid of the already fitted `kmeans`)
y_test_predicted = kmeans.predict(x_test_umap)

# Step 6: Evaluate the Results
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score

# Adjusted Rand Index (ARI)
# ARI and NMI compare partitions, so the arbitrary numbering of KMeans clusters does not matter.
ari_score = adjusted_rand_score(y_test, y_test_predicted)
print(f"Adjusted Rand Index (ARI) on Test Data: {ari_score:.2f}")

# Normalized Mutual Information (NMI)
nmi_score = normalized_mutual_info_score(y_test, y_test_predicted)
print(f"Normalized Mutual Information (NMI) on Test Data: {nmi_score:.2f}")

# Silhouette Score (grouped by the predicted clusters, not the true labels)
silhouette = silhouette_score(x_test_umap, y_test_predicted)
print(f"Silhouette Score on Test Data: {silhouette:.2f}")

In [None]:
# Step 7: Visualize the UMAP Results on Test Data
import matplotlib.pyplot as plt

# Points are colored by the predicted KMeans cluster id, not by the true label
plt.figure(figsize=(10, 8))
plt.scatter(
    x_test_umap[:, 0], 
    x_test_umap[:, 1], 
    c=y_test_predicted, 
    cmap="tab10", 
    s=5, 
    alpha=0.8
)
plt.title("Supervised UMAP Projection of Test Data with Predicted Clusters")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.colorbar(label="Predicted Clusters")
plt.show()

-----------

Adding SMOTE to the steps

In [30]:
from imblearn.over_sampling import SMOTE

In [None]:
# The Gabor features are used as-is: this is only an alias, no reshaping happens
# here (SMOTE requires 2D input).
x_train_flat = x_train_gabor

# Apply SMOTE to the Gabor features and labels
# fit_resample returns the resampled feature matrix together with matching labels.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train_flat, y_train)

print(f"Original train shape: {x_train_gabor.shape}")
print(f"SMOTE train shape: {x_train_smote.shape}, {y_train_smote.shape}")

In [None]:
# Standardize the data after SMOTE
# This refits the shared `scaler` object on the SMOTE-resampled data, and `pca`
# is rebound below; later scaler.transform / pca.transform calls use these models.
x_train_smote = scaler.fit_transform(x_train_smote)

# Apply PCA to the SMOTE-balanced data (keep enough components for 85% of the variance)
pca = PCA(n_components=0.85, random_state=42)
x_train_pca_smote = pca.fit_transform(x_train_smote)

print(f"Reduced train shape after PCA: {x_train_pca_smote.shape}")

In [None]:
# Apply UMAP (unsupervised: no labels are passed) to the PCA-reduced SMOTE data
umap_model = umap.UMAP(n_neighbors=50, min_dist=0.1, n_components=2, random_state=42)
x_train_umap_smote = umap_model.fit_transform(x_train_pca_smote)

print(f"UMAP reduced train shape: {x_train_umap_smote.shape}")

In [None]:
# Compute the Silhouette Score of the embedding, grouped by the SMOTE-resampled labels
sil_score = silhouette_score(x_train_umap_smote, y_train_smote)
print(f"Silhouette Score: {sil_score:.2f}")

---------

# MNIST Silhouette Score + ARI + Accuracy + knn cross validation

In [56]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

## PCA

---------

### n_components=2

In [80]:
# Load cached PCA (n_components=2) artifacts from .npy files
x_train_pca_c2= np.load("x_train_pca_c2.npy")
x_test_pca_c2= np.load("x_test_pca_c2.npy")
y_test_pred_pca_c2= np.load("y_test_pred_pca_c2.npy")  # Cached SVM predictions
cv_scores_pca= np.load("cv_scores_pca.npy")

In [59]:
# Apply PCA with 2 components: fitted on the standardized training data,
# then applied unchanged to the standardized test data
pca = PCA(n_components=2)
x_train_pca_c2 = pca.fit_transform(x_train_standardized1)
x_test_pca_c2 = pca.transform(x_test_standardized1)

In [60]:
# Reload the cached 2-component PCA projections (alternative to recomputing them)
x_train_pca_c2= np.load("x_train_pca_c2.npy")
x_test_pca_c2= np.load("x_test_pca_c2.npy")

In [81]:
# Stack train rows followed by test rows into one matrix.
# NOTE(review): y_full (defined elsewhere) must use the same train-then-test order -- verify.
x_full_pca_c2 = np.vstack([x_train_pca_c2, x_test_pca_c2])

In [62]:
# Perform K-Means clustering on the PCA-reduced FULL data (train + test stacked)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_pca_c2 = kmeans.fit_predict(x_full_pca_c2)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
ari_pca_c2 = adjusted_rand_score(y_full, cluster_labels_pca_c2)
print(f"Adjusted Rand Index (ARI): {ari_pca_c2}")

# Silhouette Score of the full projection, grouped by the KMeans assignments
silhouette_pca_c2 = silhouette_score(x_full_pca_c2, cluster_labels_pca_c2)
print(silhouette_pca_c2)

In [82]:
# k-NN Accuracy for varying k
# Maps k -> test accuracy of a k-NN classifier fitted on the 2-component PCA training data
knn_accuracies_pca_c2 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_pca_c2, y_train)
    knn_accuracy = knn.score(x_test_pca_c2, y_test)  # mean accuracy on the test set
    knn_accuracies_pca_c2[k] = knn_accuracy

In [65]:
# SVM Accuracy
# RBF-kernel SVM fitted on the training projection and scored on the test projection
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_pca_c2, y_train)
y_test_pred_pca_c2= svm_clf.predict(x_test_pca_c2)
svm_accuracy_pca_c2 = accuracy_score(y_test, y_test_pred_pca_c2)

In [67]:
### RUN from HERE

# 10-Fold Cross-Validation Accuracy
# Uses the training projection only; cv=10 yields one accuracy per fold.
cv_scores_pca_c2 = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_pca_c2, y_train, cv=10)
cv_accuracy_pca_c2 = cv_scores_pca_c2.mean()
cv_std_pca_c2 = cv_scores_pca_c2.std()

In [None]:
# Results for PCA (n_components=2): collect all metrics in one dict for printing
results_pca_c2 = {
    'ARI': ari_pca_c2,
    'Silhouette Score': silhouette_pca_c2,
    'SVM Accuracy': svm_accuracy_pca_c2,
    'k-NN Accuracy': knn_accuracies_pca_c2,
    '10-Fold CV Accuracy': (cv_accuracy_pca_c2, cv_std_pca_c2)  # (mean, std) over folds
}

print("PCA Results:")
print(results_pca_c2)

In [None]:
# Save intermediate data
np.save("x_train_pca_c2.npy", x_train_pca_c2)  # pca-reduced training data
np.save("x_test_pca_c2.npy", x_test_pca_c2)    # pca-reduced test data
np.save("y_test_pred_pca_c2.npy", y_test_pred_pca_c2)  # SVM predictions
np.save("cv_scores_pca_c2.npy", cv_scores_pca_c2)      # Cross-validation scores

# Save k-NN accuracies to JSON (the integer k keys are written as JSON strings)
with open("knn_accuracies_pca_c2.json", "w") as file:
    json.dump(knn_accuracies_pca_c2, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively convert NumPy values inside *obj* into plain Python types.

    Conversion rules:
      * dict: values are converted recursively; NumPy scalar keys become
        Python scalars.
      * list / tuple: converted element-wise and returned as a list (JSON has
        no tuple type, and tuples were previously passed through unconverted).
      * np.ndarray: converted with ``tolist()``.
      * any NumPy scalar (``np.generic``): converted with ``item()``; this
        covers float32/float64/int32/int64 as before plus every other width.
      * anything else is returned unchanged.

    Args:
        obj: Arbitrarily nested structure of dicts, lists, tuples and scalars.

    Returns:
        An equivalent structure built from plain Python containers and scalars.
    """
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    return obj

# Convert results to JSON-serializable format
# The (mean, std) tuple is written by json as a two-element array.
results_pca_c2_serializable = convert_to_serializable({
    'ARI': ari_pca_c2,
    'Silhouette Score': silhouette_pca_c2,
    'SVM Accuracy': svm_accuracy_pca_c2,
    'k-NN Accuracy': knn_accuracies_pca_c2,
    '10-Fold CV Accuracy': (cv_accuracy_pca_c2, cv_std_pca_c2)
})

# Save results summary to JSON
with open("pca_c2_results.json", "w") as file:
    json.dump(results_pca_c2_serializable, file, indent=4)

print("PCA results and intermediate data saved successfully!")

---------

In [11]:
# Predict the test labels once with a 1-NN classifier fitted on the training
# projection. Both metrics reuse these predictions; the original fitted and ran
# the identical classifier twice.
y_test_pred_knn1_pca_c2 = KNeighborsClassifier(n_neighbors=1).fit(x_train_pca_c2, y_train).predict(x_test_pca_c2)

# ARI between the true test labels and the 1-NN predictions
ari_pca_c2 = adjusted_rand_score(y_test, y_test_pred_knn1_pca_c2)

# Silhouette Score of the test projection, grouped by the 1-NN predicted labels
silhouette_pca_c2 = silhouette_score(x_test_pca_c2, y_test_pred_knn1_pca_c2)

**Silhouette and ARI with KMeans**

In [46]:
# Perform clustering on the 2-component PCA training projection
kmeans = KMeans(n_clusters=10, random_state=42)
cluster_labels = kmeans.fit_predict(x_train_pca_c2)

In [48]:
# ARI
ari_pca_c2_kmeans = adjusted_rand_score(y_train, cluster_labels) # second argument is the KMeans cluster assignment

# Silhouette Score (grouped by the KMeans assignments)
silhouette_pca_c2_kmeans = silhouette_score(x_train_pca_c2,cluster_labels)

In [8]:
# SVM Accuracy
# RBF-kernel SVM fitted on the training projection and scored on the test projection.
# The result goes to the shared name svm_accuracy_pca, which other sections rebind.
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_pca_c2, y_train)
y_test_pred_pca_c2 = svm_clf.predict(x_test_pca_c2)
svm_accuracy_pca = accuracy_score(y_test, y_test_pred_pca_c2)

In [9]:
# k-NN Accuracy for varying k
# Maps k -> test accuracy of a k-NN classifier fitted on the 2-component PCA training data
knn_accuracies_pca = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_pca_c2, y_train)
    knn_accuracy = knn.score(x_test_pca_c2, y_test)
    knn_accuracies_pca[k] = knn_accuracy

# 10-Fold Cross-Validation Accuracy (training projection only; one accuracy per fold)
cv_scores_pca = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_pca_c2, y_train, cv=10)
cv_accuracy_pca = cv_scores_pca.mean()
cv_std_pca = cv_scores_pca.std()

In [None]:
# Results for PCA (n_components=2): collect all metrics in one dict for printing
results_pca = {
    'ARI': ari_pca_c2,
    'Silhouette Score': silhouette_pca_c2,
    'SVM Accuracy': svm_accuracy_pca,
    'k-NN Accuracy': knn_accuracies_pca,
    '10-Fold CV Accuracy': (cv_accuracy_pca, cv_std_pca)  # (mean, std) over folds
}

print("PCA Results:")
print(results_pca)

In [None]:
# Plot the 2D projection; points are colored by the true labels in y_train
# (the legend title below says "Cluster", but no clustering result is used here)
plt.figure(figsize=(10, 8))
sns.scatterplot(x=x_train_pca_c2[:, 0], y=x_train_pca_c2[:, 1], hue=y_train, palette='tab10', s=10, legend='full')
plt.title("2D Scatter Plot of PCA-reduced Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
import json

# Save intermediate data (PCA embeddings and other computationally expensive results)
np.save("x_train_pca_c2.npy", x_train_pca_c2)
np.save("x_test_pca_c2.npy", x_test_pca_c2)
np.save("y_test_pred_pca_c2.npy", y_test_pred_pca_c2)  # Save SVM predictions
np.save("cv_scores_pca.npy", cv_scores_pca)      # Save cross-validation scores

# Save k-NN accuracies (the integer k keys are written as JSON strings)
with open("knn_accuracies_pca.json", "w") as file:
    json.dump(knn_accuracies_pca, file, indent=4)

# Save PCA Results
# Scalar metrics are coerced to plain Python floats: json cannot serialize
# np.float32 values or 0-d arrays (e.g. a metric reloaded with np.load).
results_pca_c2 = {
    'ARI': float(ari_pca_c2),
    'Silhouette Score': float(silhouette_pca_c2),
    'SVM Accuracy': float(svm_accuracy_pca),
    'k-NN Accuracy': knn_accuracies_pca,
    '10-Fold CV Accuracy': {
        'Mean': float(cv_accuracy_pca),
        'StdDev': float(cv_std_pca)
    },
    # Paths must match the files written above exactly. The test-projection
    # entry previously read "x_test_pca_C2.npy", a file that is never written.
    'Filepaths': {
        'x_train_pca_c2': "x_train_pca_c2.npy",
        'x_test_pca_c2': "x_test_pca_c2.npy",
        'y_test_pred_pca_c2': "y_test_pred_pca_c2.npy",
        'cv_scores_pca': "cv_scores_pca.npy",
        'knn_accuracies_pca': "knn_accuracies_pca.json"
    }
}

# Save results to a JSON file
with open("pca_c2_results.json", "w") as file:
    json.dump(results_pca_c2, file, indent=4)

print("PCA components=2 results and all intermediate data saved successfully!")

In [None]:
# Results for PCA (n_components=2), printed without the file-path section
results_pca_c2 = {
    'ARI': ari_pca_c2,
    'Silhouette Score': silhouette_pca_c2,
    'SVM Accuracy': svm_accuracy_pca,
    'k-NN Accuracy': knn_accuracies_pca,
    '10-Fold CV Accuracy': {
        'Mean': cv_accuracy_pca,
        'StdDev': cv_std_pca
}}
print("PCA Results:")
print(results_pca_c2)

In [None]:
# Step 2: Apply PCA
# A float n_components in (0, 1) keeps enough components to explain that
# fraction (95%) of the variance. Fitted on train, applied unchanged to test.
pca = PCA(0.95)
x_train_pca_95 = pca.fit_transform(x_train_standardized1)
x_test_pca_95 = pca.transform(x_test_standardized1)

# Cache both projections as .npy files
np.save("x_train_pca_95.npy", x_train_pca_95)
np.save("x_test_pca_95.npy", x_test_pca_95)

print(f"Original number of features: {x_train_standardized1.shape[1]}")
print(f"Reduced number of features: {x_train_pca_95.shape[1]}")

In [None]:
# Reload the cached 95%-variance PCA projections (alternative to recomputing them)
x_train_pca_95= np.load("x_train_pca_95.npy")
x_test_pca_95= np.load("x_test_pca_95.npy")

In [None]:
# Perform K-Means clustering on the PCA-reduced test data
# (KMeans is fitted directly on the test projection; the training data is not used here)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_pca95 = kmeans.fit_predict(x_test_pca_95)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
ari_pca_95 = adjusted_rand_score(y_test, cluster_labels_pca95)
print(f"Adjusted Rand Index (ARI): {ari_pca_95}")
# Alternative kept for reference: ARI of 1-NN predictions instead of KMeans clusters
# ari_pca_95 = adjusted_rand_score(y_test, KNeighborsClassifier(n_neighbors=1).fit(x_train_pca_95, y_train).predict(x_test_pca_95))

# Silhouette Score (grouped by the KMeans assignments)
silhouette_pca_95 = silhouette_score(x_test_pca_95, cluster_labels_pca95)
print(silhouette_pca_95)
# Alternative kept for reference: silhouette grouped by 1-NN predictions
# silhouette_pca_95 = silhouette_score(x_test_pca_95, KNeighborsClassifier(n_neighbors=1).fit(x_train_pca_95, y_train).predict(x_test_pca_95))

In [None]:
# SVM Accuracy
# RBF-kernel SVM fitted on the 95%-variance training projection and scored on the test projection.
# The result goes to the shared name svm_accuracy_pca, which the n_components=2 section also uses.
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_pca_95, y_train)
y_test_pred_pca_95= svm_clf.predict(x_test_pca_95)
svm_accuracy_pca = accuracy_score(y_test, y_test_pred_pca_95)

In [None]:
# Cache the scalar SVM accuracy as a .npy file
np.save("svm_accuracy_pca.npy", svm_accuracy_pca)

In [None]:
# Reload the cached SVM accuracy; np.load returns it as an ndarray, not a Python float
svm_accuracy_pca= np.load('svm_accuracy_pca.npy')

In [None]:
# k-NN Accuracy for varying k
# Maps k -> test accuracy of a k-NN classifier fitted on the 95%-variance PCA training data
knn_accuracies_pca = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_pca_95, y_train)
    knn_accuracy = knn.score(x_test_pca_95, y_test)
    knn_accuracies_pca[k] = knn_accuracy

# Recorded result of an earlier run with larger k values:
# for k in [100, 200, 400]:
# 'k-NN Accuracy': {100: 0.9143, 200: 0.8993, 400: 0.8809}

# # 10-Fold Cross-Validation Accuracy
# cv_scores_pca = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_pca_95, y_train, cv=10)
# NOTE(review): the cross-validation above is commented out, so cv_scores_pca must
# already exist in the session (e.g. loaded from "cv_scores_pca.npy"). That file is
# also written by the n_components=2 section -- confirm these scores really belong
# to the 95%-variance features before reporting them.
cv_accuracy_pca = cv_scores_pca.mean()
cv_std_pca = cv_scores_pca.std()


In [None]:
# Load cached cross-validation scores.
# NOTE(review): "cv_scores_pca.npy" is written by more than one section -- confirm which run produced it.
cv_scores_pca= np.load("cv_scores_pca.npy")

In [None]:
# Results for PCA (95% variance): collect all metrics in one dict for printing
results_pca = {
    'ARI': ari_pca_95,
    'Silhouette Score': silhouette_pca_95,
    'SVM Accuracy': svm_accuracy_pca,
    'k-NN Accuracy': knn_accuracies_pca,
    '10-Fold CV Accuracy': (cv_accuracy_pca, cv_std_pca)  # (mean, std) over folds
}

print("PCA Results:")
print(results_pca)

### PCA n_components=50

In [None]:
# Step 2: Apply PCA with 50 components: fitted on the standardized training data,
# then applied unchanged to the standardized test data
pca = PCA(n_components=50)
x_train_pca_c50 = pca.fit_transform(x_train_standardized1)
x_test_pca_c50 = pca.transform(x_test_standardized1)

# Cache both projections as .npy files
np.save("x_train_pca_c50.npy", x_train_pca_c50)
np.save("x_test_pca_c50.npy", x_test_pca_c50)

print(f"Original number of features: {x_train_standardized1.shape[1]}")
print(f"Reduced number of features: {x_train_pca_c50.shape[1]}")

In [None]:
# Reload the cached 50-component PCA projections (alternative to recomputing them)
x_train_pca_c50= np.load("x_train_pca_c50.npy")
x_test_pca_c50= np.load("x_test_pca_c50.npy")

In [27]:
# Stack train rows followed by test rows into one matrix.
# NOTE(review): y_full (defined elsewhere) must use the same train-then-test order -- verify.
x_full_pca_c50 = np.vstack([x_train_pca_c50, x_test_pca_c50])

In [28]:
# Perform K-Means clustering on the PCA-reduced FULL data (train + test stacked)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_pca50 = kmeans.fit_predict(x_full_pca_c50)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
ari_pca_c50 = adjusted_rand_score(y_full, cluster_labels_pca50)
print(f"Adjusted Rand Index (ARI): {ari_pca_c50}")

# Silhouette Score of the full projection, grouped by the KMeans assignments
silhouette_pca_c50 = silhouette_score(x_full_pca_c50, cluster_labels_pca50)
print(silhouette_pca_c50)

In [30]:
# k-NN Accuracy for varying k
# Maps k -> test accuracy of a k-NN classifier fitted on the 50-component PCA training data
knn_accuracies_pca_c50 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_pca_c50, y_train)
    knn_accuracy = knn.score(x_test_pca_c50, y_test)
    knn_accuracies_pca_c50[k] = knn_accuracy

# Recorded result of a run with these k values:
# for k in [100, 200, 400]:
# 'k-NN Accuracy': {100: 0.9143, 200: 0.8993, 400: 0.8809}

In [31]:
# SVM Accuracy
# RBF-kernel SVM fitted on the 50-component training projection and scored on the test projection
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_pca_c50, y_train)
y_test_pred_pca_c50= svm_clf.predict(x_test_pca_c50)
svm_accuracy_pca_c50 = accuracy_score(y_test, y_test_pred_pca_c50)

In [32]:
# 10-Fold Cross-Validation Accuracy
# Uses the training projection only; cv=10 yields one accuracy per fold.
cv_scores_pca_c50 = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_pca_c50, y_train, cv=10)
cv_accuracy_pca_c50 = cv_scores_pca_c50.mean()
cv_std_pca_c50 = cv_scores_pca_c50.std()

In [None]:
# Results for PCA (n_components=50): collect all metrics in one dict for printing
results_pca_c50 = {
    'ARI': ari_pca_c50,
    'Silhouette Score': silhouette_pca_c50,
    'SVM Accuracy': svm_accuracy_pca_c50,
    'k-NN Accuracy': knn_accuracies_pca_c50,
    '10-Fold CV Accuracy': (cv_accuracy_pca_c50, cv_std_pca_c50)  # (mean, std) over folds
}

print("PCA Results:")
print(results_pca_c50)

In [None]:
# Save intermediate data
np.save("x_train_pca_c50.npy", x_train_pca_c50)  # pca-reduced training data
np.save("x_test_pca_c50.npy", x_test_pca_c50)    # pca-reduced test data
np.save("y_test_pred_pca_c50.npy", y_test_pred_pca_c50)  # SVM predictions
np.save("cv_scores_pca_c50.npy", cv_scores_pca_c50)      # Cross-validation scores

# Save k-NN accuracies to JSON (the integer k keys are written as JSON strings)
with open("knn_accuracies_pca_c50.json", "w") as file:
    json.dump(knn_accuracies_pca_c50, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively convert NumPy values inside *obj* into plain Python types.

    Conversion rules:
      * dict: values are converted recursively; NumPy scalar keys become
        Python scalars.
      * list / tuple: converted element-wise and returned as a list (JSON has
        no tuple type, and tuples were previously passed through unconverted).
      * np.ndarray: converted with ``tolist()``.
      * any NumPy scalar (``np.generic``): converted with ``item()``; this
        covers float32/float64/int32/int64 as before plus every other width.
      * anything else is returned unchanged.

    Args:
        obj: Arbitrarily nested structure of dicts, lists, tuples and scalars.

    Returns:
        An equivalent structure built from plain Python containers and scalars.
    """
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    return obj

# Convert results to JSON-serializable format
# The (mean, std) tuple is written by json as a two-element array.
results_pca_c50_serializable = convert_to_serializable({
    'ARI': ari_pca_c50,
    'Silhouette Score': silhouette_pca_c50,
    'SVM Accuracy': svm_accuracy_pca_c50,
    'k-NN Accuracy': knn_accuracies_pca_c50,
    '10-Fold CV Accuracy': (cv_accuracy_pca_c50, cv_std_pca_c50)
})

# Save results summary to JSON
with open("pca_c50_results.json", "w") as file:
    json.dump(results_pca_c50_serializable, file, indent=4)

print("PCA results and intermediate data saved successfully!")

-------

### PCA (95)

In [None]:
# Step 2: Apply PCA
# A float n_components in (0, 1) keeps enough components to explain that
# fraction (95%) of the variance. Fitted on train, applied unchanged to test.
pca = PCA(0.95)
x_train_pca_95 = pca.fit_transform(x_train_standardized1)
x_test_pca_95 = pca.transform(x_test_standardized1)

# Cache both projections as .npy files
np.save("x_train_pca_95.npy", x_train_pca_95)
np.save("x_test_pca_95.npy", x_test_pca_95)

print(f"Original number of features: {x_train_standardized1.shape[1]}")
print(f"Reduced number of features: {x_train_pca_95.shape[1]}")

In [51]:
# Reload the cached 95%-variance PCA projections (alternative to recomputing them)
x_train_pca_95= np.load("x_train_pca_95.npy")
x_test_pca_95= np.load("x_test_pca_95.npy")

In [None]:
# Perform K-Means clustering on the PCA-reduced test data
# (KMeans is fitted directly on the test projection; the training data is not used here)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_pca95 = kmeans.fit_predict(x_test_pca_95)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
ari_pca_95 = adjusted_rand_score(y_test, cluster_labels_pca95)
print(f"Adjusted Rand Index (ARI): {ari_pca_95}")
# Alternative kept for reference: ARI of 1-NN predictions instead of KMeans clusters
# ari_pca_95 = adjusted_rand_score(y_test, KNeighborsClassifier(n_neighbors=1).fit(x_train_pca_95, y_train).predict(x_test_pca_95))

# Silhouette Score (grouped by the KMeans assignments)
silhouette_pca_95 = silhouette_score(x_test_pca_95, cluster_labels_pca95)
print(silhouette_pca_95)
# Alternative kept for reference: silhouette grouped by 1-NN predictions
# silhouette_pca_95 = silhouette_score(x_test_pca_95, KNeighborsClassifier(n_neighbors=1).fit(x_train_pca_95, y_train).predict(x_test_pca_95))

In [11]:
# SVM Accuracy
# RBF-kernel SVM fitted on the 95%-variance training projection and scored on the test projection.
# The result goes to the shared name svm_accuracy_pca, which the n_components=2 section also uses.
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_pca_95, y_train)
y_test_pred_pca_95= svm_clf.predict(x_test_pca_95)
svm_accuracy_pca = accuracy_score(y_test, y_test_pred_pca_95)

In [None]:
# Cache the scalar SVM accuracy as a .npy file
np.save("svm_accuracy_pca.npy", svm_accuracy_pca)

In [63]:
# Reload the cached SVM accuracy; np.load returns it as an ndarray, not a Python float
svm_accuracy_pca= np.load('svm_accuracy_pca.npy')

In [73]:
# k-NN Accuracy for varying k
# Maps k -> test accuracy of a k-NN classifier fitted on the 95%-variance PCA training data
knn_accuracies_pca = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_pca_95, y_train)
    knn_accuracy = knn.score(x_test_pca_95, y_test)
    knn_accuracies_pca[k] = knn_accuracy

# Recorded result of an earlier run with larger k values:
# for k in [100, 200, 400]:
# 'k-NN Accuracy': {100: 0.9143, 200: 0.8993, 400: 0.8809}

# # 10-Fold Cross-Validation Accuracy
# cv_scores_pca = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_pca_95, y_train, cv=10)
# NOTE(review): the cross-validation above is commented out, so cv_scores_pca must
# already exist in the session (e.g. loaded from "cv_scores_pca.npy"). That file is
# also written by the n_components=2 section -- confirm these scores really belong
# to the 95%-variance features before reporting them.
cv_accuracy_pca = cv_scores_pca.mean()
cv_std_pca = cv_scores_pca.std()


In [67]:
# Load cached cross-validation scores.
# NOTE(review): "cv_scores_pca.npy" is written by more than one section -- confirm which run produced it.
cv_scores_pca= np.load("cv_scores_pca.npy")

In [None]:
# Results for PCA (95% variance): collect all metrics in one dict for printing
results_pca = {
    'ARI': ari_pca_95,
    'Silhouette Score': silhouette_pca_95,
    'SVM Accuracy': svm_accuracy_pca,
    'k-NN Accuracy': knn_accuracies_pca,
    '10-Fold CV Accuracy': (cv_accuracy_pca, cv_std_pca)  # (mean, std) over folds
}

print("PCA Results:")
print(results_pca)

In [None]:
# Plot the first two principal components of the 95%-variance projection;
# points are colored by the true labels in y_train
# (the legend title below says "Cluster", but no clustering result is used here)
plt.figure(figsize=(10, 8))
sns.scatterplot(x=x_train_pca_95[:, 0], y=x_train_pca_95[:, 1], hue=y_train, palette='tab10', s=10, legend='full')
plt.title("2D Scatter Plot of PCA-reduced Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
import json

# Save intermediate data (PCA embeddings and other computationally expensive results)
np.save("x_train_pca_95.npy", x_train_pca_95)
np.save("x_test_pca_95.npy", x_test_pca_95)
np.save("y_test_pred_pca_95.npy", y_test_pred_pca_95)  # Save SVM predictions
np.save("cv_scores_pca.npy", cv_scores_pca)      # Save cross-validation scores

# Save k-NN accuracies
with open("knn_accuracies_pca.json", "w") as file:
    json.dump(knn_accuracies_pca, file, indent=4)

# Save PCA Results
# Every scalar is cast to a plain float first: values reloaded with np.load
# are 0-d arrays and metrics computed on float32 data are np.float32, and
# json.dump rejects both.
results_pca_95 = {
    'ARI': float(ari_pca_95),
    'Silhouette Score': float(silhouette_pca_95),
    'SVM Accuracy': float(svm_accuracy_pca),
    'k-NN Accuracy': knn_accuracies_pca,
    '10-Fold CV Accuracy': {
        'Mean': float(cv_accuracy_pca),
        'StdDev': float(cv_std_pca)
    },
    # Names of the files written above, so the summary is self-describing.
    'Filepaths': {
        'x_train_pca_95': "x_train_pca_95.npy",
        'x_test_pca_95': "x_test_pca_95.npy",
        'y_test_pred_pca_95': "y_test_pred_pca_95.npy",
        'cv_scores_pca': "cv_scores_pca.npy",
        'knn_accuracies_pca': "knn_accuracies_pca.json"
    }
}

# Save results to a JSON file
with open("pca_95_results.json", "w") as file:
    json.dump(results_pca_95, file, indent=4)

print("PCA 95% results and all intermediate data saved successfully!")

Alternative: applying K-Means to compute the ARI

In [None]:
# Perform K-Means clustering on the PCA-reduced test data
# NOTE: cluster_labels_pca95 and ari_pca_95 are read by the silhouette and
# results cells that appear earlier in the notebook, so this cell has to be
# executed before them.
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_pca95 = kmeans.fit_predict(x_test_pca_95)

# Compute ARI between true labels and cluster labels
ari_pca_95 = adjusted_rand_score(y_test, cluster_labels_pca95)
print(f"Adjusted Rand Index (ARI): {ari_pca_95}")

In [None]:
# Silhouette score of the K-Means clustering on the PCA-reduced test data.
silhouette_pca_95_k = silhouette_score(x_test_pca_95, cluster_labels_pca95)
print(silhouette_pca_95_k)

-------------

## T-SNE

### Std - train - components=2 - perp=15

In [41]:
# Apply t-SNE
# Embed the standardized training data into 2 dimensions (perplexity 15,
# fixed seed for reproducibility).
tsne = TSNE(n_components=2, random_state=42, perplexity=15)
x_train_std_tsne_c2 = tsne.fit_transform(x_train_standardized1)

**Kmeans Clustering evaluations**


In [42]:
# Perform clustering on the t-SNE embeddings
# K-Means with 10 clusters on the 2-D embedding of the training data.
kmeans = KMeans(n_clusters=10, random_state=42)
cluster_labels = kmeans.fit_predict(x_train_std_tsne_c2)

- ARI

In [None]:
# Compute ARI between true labels and cluster assignments
# (cluster_labels comes from the K-Means cell above.)
ari_tsne_std = adjusted_rand_score(y_train, cluster_labels)
print(f"Adjusted Rand Index (ARI): {ari_tsne_std}")

- Silhouette

In [None]:
# Compute Silhouette Score for the clustering
# Measured in the 2-D t-SNE space, using the K-Means assignments as labels.
silhouette_tsne_std = silhouette_score(x_train_std_tsne_c2, cluster_labels)
print(f"Silhouette Score: {silhouette_tsne_std}")

**Classification Accuracy**

- SVM Accuracy

In [None]:
# SVM Accuracy
# Fit an RBF-kernel SVC on the training embedding and score it on the test
# embedding.
# NOTE(review): x_test_std_tsne_c2 is not assigned anywhere in this section
# (only x_train_std_tsne_c2 is computed) — confirm where it comes from
# before running this cell.
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_std_tsne_c2, y_train)
y_test_pred_tsne = svm_clf.predict(x_test_std_tsne_c2)
svm_accuracy_tsne = accuracy_score(y_test, y_test_pred_tsne)

- k-NN Accuracy for varying k

In [None]:
# k-NN Accuracy for varying k
# NOTE(review): this section computes x_train_std_tsne_c2, but the loop
# reads x_train_tsne_c2 / x_test_tsne_c2 — confirm which embedding is
# intended here.
knn_accuracies_tsne = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_tsne_c2, y_train)
    knn_accuracy = knn.score(x_test_tsne_c2, y_test)
    knn_accuracies_tsne[k] = knn_accuracy

- 10-Fold Cross-Validation Accuracy

In [None]:
# 10-Fold Cross-Validation Accuracy
# Mean and standard deviation of the RBF-SVC accuracy over 10 folds.
# NOTE(review): uses x_train_tsne_c2 rather than the x_train_std_tsne_c2
# embedding computed in this section — confirm which one is intended.
cv_scores_tsne = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_tsne_c2, y_train, cv=10)
cv_accuracy_tsne = cv_scores_tsne.mean()
cv_std_tsne = cv_scores_tsne.std()

### Std - fullset - n_components=2 - perp=15

In [None]:
# Combine training and testing sets for t-SNE (unsupervised embedding)
x_full = np.vstack([x_train_standardized1, x_test_standardized1])  # Stack standardized train rows above test rows
y_full = np.hstack([y_train, y_test])  # Labels in the same train-then-test order

# Apply t-SNE (2 components, perplexity 15, fixed seed)
tsne = TSNE(n_components=2, random_state=42, perplexity=15)
x_full_tsne = tsne.fit_transform(x_full)  # Fit on the entire dataset

In [None]:
# Cache the full-dataset t-SNE embedding on disk.
np.save('x_full_tsne.npy', x_full_tsne)

In [75]:
# Apply K-Means on t-SNE embeddings
kmeans_tsne = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits
# Fit the estimator created on the line above. The cell used to call
# `kmeans.fit_predict`, i.e. whatever `kmeans` object an earlier cell had
# left behind, and never used `kmeans_tsne`.
cluster_labels_full_tsne = kmeans_tsne.fit_predict(x_full_tsne)

**Clustering Metrics: ARI and Silhouette Score**

In [None]:
# Compute Adjusted Rand Index (ARI)
# Agreement between the true labels and the K-Means assignments.
ari_full_tsne = adjusted_rand_score(y_full, cluster_labels_full_tsne)
print(f"Adjusted Rand Index (ARI): {ari_full_tsne}")

# Compute Silhouette Score
# Measured in the 2-D t-SNE space, using the K-Means assignments as labels.
silhouette_full_tsne = silhouette_score(x_full_tsne, cluster_labels_full_tsne)
print(f"Silhouette Score: {silhouette_full_tsne}")

**Classification Metrics**

In [None]:
# k-NN Accuracy for varying k
# NOTE: the classifier is scored on the same points it was fitted on, so
# these are training (resubstitution) accuracies, not held-out ones.
knn_full_accuracy_tsne = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_full_tsne, y_full)
    knn_accuracy = knn.score(x_full_tsne, y_full)
    # Store into the dictionary initialised above. The loop used to write
    # into `knn_accuracies_tsne`, leaving `knn_full_accuracy_tsne` empty.
    knn_full_accuracy_tsne[k] = knn_accuracy

# The results cell of this section reports `knn_accuracies_tsne`, so keep
# that name pointing at the full-set numbers computed here.
knn_accuracies_tsne = knn_full_accuracy_tsne

# Accuracies recorded from an earlier run with larger k:
# 100: 0.9269714285714286,
#  200: 0.9182714285714285,
#  400: 0.9113,

In [None]:
# Train SVM on t-SNE embeddings
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_full_tsne, y_full)

# Predict on the same embeddings
# NOTE: prediction uses the data the model was fitted on, so the accuracy
# below is a training accuracy.
y_pred_svm = svm_clf.predict(x_full_tsne)

# Compute SVM accuracy
svm_accuracy_tsne = accuracy_score(y_full, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy_tsne:.4f}")


In [85]:
# Keep the full-dataset SVM accuracy under its own name so later cells that
# reassign svm_accuracy_tsne do not overwrite it.
svm_accuracy_full_tsne=svm_accuracy_tsne

In [None]:
# SVM with RBF kernel
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
cv_scores_tsne = cross_val_score(svm_clf, x_full_tsne, y_full, cv=10)
cv_accuracy_tsne = cv_scores_tsne.mean()
cv_std_tsne_full = cv_scores_tsne.std()
# Keep cv_std_tsne in step with cv_scores_tsne / cv_accuracy_tsne. The print
# below and the following cells read cv_std_tsne, which this cell previously
# never assigned, so they showed a stale value from another section (or
# raised NameError).
cv_std_tsne = cv_std_tsne_full

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_tsne:.4f} ± {cv_std_tsne:.4f}")

In [93]:
# Standard deviation of the full-dataset CV scores. It is derived from
# cv_scores_tsne directly because the CV cell above stores its std under
# cv_std_tsne_full, so cv_std_tsne may still hold a value from another section.
cv_std_full_tsne = cv_scores_tsne.std()

Results

In [None]:
# Results for full t-SNE
# Collect the metrics of the full-dataset t-SNE run and print them.
# The CV entry is a (mean, standard deviation) pair.
results_full_tsne = {
    'ARI': ari_full_tsne,
    'Silhouette Score': silhouette_full_tsne,
    'SVM Accuracy': svm_accuracy_full_tsne,
    'k-NN Accuracy': knn_accuracies_tsne,
    '10-Fold CV Accuracy': (cv_accuracy_tsne, cv_std_tsne)
}

print("t-SNE Results:")
print(results_full_tsne)

In [None]:
# Convert results to JSON-serializable format
# (float() turns NumPy scalars into plain Python floats for json.dump.)
results_full_tsne = {
    'ARI': float(ari_full_tsne),
    'Silhouette Score': float(silhouette_full_tsne),
    'SVM Accuracy': float(svm_accuracy_full_tsne),
    '10-Fold CV Accuracy': {
        # cv_accuracy_tsne is the mean computed by the full-set CV cell; the
        # name cv_accuracy_full_tsne used here before is never assigned.
        'Mean': float(cv_accuracy_tsne),
        'StdDev': float(cv_std_full_tsne)
    },
    'Filepaths': {
        # The full embedding is saved as "x_full_tsne.npy" in this section.
        'x_train_tsne_c2': "x_full_tsne.npy",
        'cv_scores_tsne': "cv_scores_tsne.npy",
        'knn_accuracies_tsne': "knn_accuracies_tsne.json"
    }
}

# Save results to a JSON file
with open("full_tsne_results.json", "w") as file:
    json.dump(results_full_tsne, file, indent=4)

print("full t-SNE results and all intermediate data saved successfully!")


-----

### Norm - train - n_components=2 - perp=15

In [14]:
# Apply t-SNE
# scikit-learn's TSNE has no transform(): every fit_transform call learns a
# new, unrelated embedding. Embedding the test set with its own
# fit_transform call therefore puts it in a coordinate system that has
# nothing to do with the training embedding, and classifiers fitted on the
# training embedding cannot be scored on it meaningfully. Embed both splits
# jointly and cut the result apart instead.
tsne = TSNE(n_components=2, random_state=42, perplexity=15)
n_train_normalized = x_train_normalized.shape[0]
x_all_tsne_c2 = tsne.fit_transform(np.vstack([x_train_normalized, x_test_normalized]))
x_train_tsne_c2 = x_all_tsne_c2[:n_train_normalized]  # rows of the training samples
x_test_tsne_c2 = x_all_tsne_c2[n_train_normalized:]   # rows of the test samples

-----

In [68]:
# Cache the train and test t-SNE embeddings on disk.
np.save("x_train_tsne_c2.npy", x_train_tsne_c2)
np.save("x_test_tsne_c2.npy", x_test_tsne_c2)

In [31]:
# Reload the cached training embedding instead of recomputing t-SNE.
x_train_tsne_c2 = np.load('x_train_tsne_c2.npy')

In [None]:
# Perform clustering on the t-SNE embeddings
# K-Means with 10 clusters on the 2-D training embedding.
kmeans = KMeans(n_clusters=10, random_state=42)
cluster_labels = kmeans.fit_predict(x_train_tsne_c2)

# Compute ARI between true labels and cluster assignments
ari_tsne = adjusted_rand_score(y_train, cluster_labels)
print(f"Adjusted Rand Index (ARI): {ari_tsne}")

In [None]:
# ARI between the true test labels and the labels a 1-nearest-neighbour
# classifier (fitted on the training embedding) predicts for the test
# embedding.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train_tsne_c2, y_train)
y_test_pred_tsne = knn.predict(x_test_tsne_c2)
ari_tsne_c2 = adjusted_rand_score(y_test, y_test_pred_tsne)
print(ari_tsne_c2)

In [15]:
# 1-NN label transfer: fit on the training embedding and predict the test
# embedding once. Both metrics below use the same predictions, so the
# second, identical fit/predict pass that used to precede the silhouette
# computation is dropped.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train_tsne_c2, y_train)
y_test_pred_tsne = knn.predict(x_test_tsne_c2)

# ARI
# Agreement between the true test labels and the 1-NN predictions.
ari_tsne_c2 = adjusted_rand_score(y_test, y_test_pred_tsne)

# Silhouette Score
# Separation of the test embedding when grouped by the predicted labels.
silhouette_tsne_c2 = silhouette_score(x_test_tsne_c2, y_test_pred_tsne)

In [16]:
# SVM Accuracy
# Fit an RBF-kernel SVC on the training embedding and score it on the test
# embedding. This overwrites y_test_pred_tsne with the SVM predictions.
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_tsne_c2, y_train)
y_test_pred_tsne = svm_clf.predict(x_test_tsne_c2)
svm_accuracy_tsne = accuracy_score(y_test, y_test_pred_tsne)

In [17]:
# k-NN Accuracy for varying k
# Maps each k to the test accuracy of a k-NN classifier fitted on the
# training embedding.
knn_accuracies_tsne = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_tsne_c2, y_train)
    knn_accuracy = knn.score(x_test_tsne_c2, y_test)
    knn_accuracies_tsne[k] = knn_accuracy

In [18]:
# 10-Fold Cross-Validation Accuracy
# Mean and standard deviation of the RBF-SVC accuracy over 10 folds of the
# training embedding.
cv_scores_tsne = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_tsne_c2, y_train, cv=10)
cv_accuracy_tsne = cv_scores_tsne.mean()
cv_std_tsne = cv_scores_tsne.std()

In [None]:
# Results for t-SNE
# Collect the metrics of this t-SNE run and print them.
# The CV entry is a (mean, standard deviation) pair.
results_tsne = {
    'ARI': ari_tsne_c2,
    'Silhouette Score': silhouette_tsne_c2,
    'SVM Accuracy': svm_accuracy_tsne,
    'k-NN Accuracy': knn_accuracies_tsne,
    '10-Fold CV Accuracy': (cv_accuracy_tsne, cv_std_tsne)
}

print("t-SNE Results:")
print(results_tsne)

In [None]:
# Convert results to JSON-serializable format
# (float() turns NumPy scalars into plain Python floats for json.dump.)
# Note that the k-NN accuracies are not part of this summary; only the name
# of their JSON file is listed under 'Filepaths'.
results_tsne_c2 = {
    'ARI': float(ari_tsne_c2),
    'Silhouette Score': float(silhouette_tsne_c2),
    'SVM Accuracy': float(svm_accuracy_tsne),
    '10-Fold CV Accuracy': {
        'Mean': float(cv_accuracy_tsne),
        'StdDev': float(cv_std_tsne)
    },
    'Filepaths': {
        'x_train_tsne_c2': "x_train_tsne_c2.npy",
        'x_test_tsne_c2': "x_test_tsne_c2.npy",
        'y_test_pred_tsne_c2': "y_test_pred_tsne_c2.npy",
        'cv_scores_tsne': "cv_scores_tsne.npy",
        'knn_accuracies_tsne': "knn_accuracies_tsne.json"
    }
}

# Save results to a JSON file
with open("tsne_c2_results.json", "w") as file:
    json.dump(results_tsne_c2, file, indent=4)

print("tsne components=2 results and all intermediate data saved successfully!")


In [79]:
# Save intermediate data (tsne embeddings and other computationally expensive results)
np.save("x_train_tsne_c2.npy", x_train_tsne_c2)
np.save("x_test_tsne_c2.npy", x_test_tsne_c2)
np.save("y_test_pred_tsne_c2.npy", y_test_pred_tsne)  # Save the current test predictions (SVM, if the SVM cell ran last)
np.save("cv_scores_tsne.npy", cv_scores_tsne)      # Save cross-validation scores

In [7]:
# Reload the cached t-SNE embeddings, test predictions and CV scores.
x_train_tsne_c2=np.load("x_train_tsne_c2.npy")
x_test_tsne_c2= np.load("x_test_tsne_c2.npy")
y_test_pred_tsne= np.load("y_test_pred_tsne_c2.npy")  # Cached test predictions
cv_scores_tsne= np.load("cv_scores_tsne.npy")  # Cached cross-validation scores

### t-SNE n_components=2 (not possible with 50)

In [15]:
# Combine training and testing sets for t-SNE (unsupervised embedding)
x_full = np.vstack([x_train_standardized1, x_test_standardized1])  # Stack standardized train rows above test rows
y_full = np.hstack([y_train, y_test])  # Labels in the same train-then-test order

In [None]:
# Apply TSNE
# Embed the combined train+test matrix. The cells below split x_full_tsne
# back into a train part (first x_train_standardized1.shape[0] rows) and a
# test part (the remaining rows). Fitting on the training data alone, as
# this cell did before, leaves no rows for the test part.
tsne = TSNE(n_components=2, random_state=42, perplexity=15)
x_full_tsne = tsne.fit_transform(x_full)

In [9]:
# Reload the cached full-dataset t-SNE embedding instead of recomputing it.
x_full_tsne= np.load('x_full_tsne.npy')

In [11]:
# Split the embeddings back into train and test sets
# The first x_train_standardized1.shape[0] rows are treated as the training
# samples, everything after them as the test samples.
x_train_tsne_c2 = x_full_tsne[:x_train_standardized1.shape[0], :]  # Train embeddings
x_test_tsne_c2 = x_full_tsne[x_train_standardized1.shape[0]:, :]  # Test embeddings

In [12]:
# Save intermediate data: the train and test parts of the t-SNE embedding.
np.save("x_train_tsne_c2.npy", x_train_tsne_c2)
np.save("x_test_tsne_c2.npy", x_test_tsne_c2)

In [13]:
# Perform K-Means clustering on the full (train + test) t-SNE embedding
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_tsne_c2 = kmeans.fit_predict(x_full_tsne)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# (both cover the full train + test set, in the same order).
ari_tsne_c2 = adjusted_rand_score(y_full, cluster_labels_tsne_c2)
print(f"Adjusted Rand Index (ARI): {ari_tsne_c2}")

In [None]:
# Silhouette Score
# Measured in the 2-D t-SNE space, using the K-Means assignments as labels.
silhouette_tsne_c2 = silhouette_score(x_full_tsne, cluster_labels_tsne_c2)
print(silhouette_tsne_c2)

In [18]:
# k-NN Accuracy for varying k
# Maps each k (100, 200, 400) to the test accuracy of a k-NN classifier
# fitted on the training part of the embedding.
knn_accuracy_tsne_c2 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_tsne_c2, y_train)
    knn_accuracy = knn.score(x_test_tsne_c2, y_test)
    knn_accuracy_tsne_c2[k] = knn_accuracy

In [None]:
# Train SVM on the training part of the t-SNE embedding
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_tsne_c2, y_train)

# Predict on the held-out test part of the embedding
y_pred_svm_test = svm_clf.predict(x_test_tsne_c2)  # Predict on test embeddings

# Compute SVM accuracy
svm_accuracy_tsne_c2 = accuracy_score(y_test, y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_tsne_c2:.4f}")

In [None]:
# SVM with RBF kernel
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
# Uses only the training part of the embedding; reports mean ± std.
cv_scores_tsne_c2 = cross_val_score(svm_clf, x_train_tsne_c2, y_train, cv=10)
cv_accuracy_tsne_c2 = cv_scores_tsne_c2.mean()
cv_std_tsne_c2 = cv_scores_tsne_c2.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_tsne_c2:.4f} ± {cv_std_tsne_c2:.4f}")

In [None]:
# Results for t-SNE (n_components=2)
# Collect the metrics of this run and print them. The CV entry is a
# (mean, standard deviation) pair.
results_tsne_c2 = {
    'ARI': ari_tsne_c2,
    'Silhouette Score': silhouette_tsne_c2,
    'SVM Accuracy': svm_accuracy_tsne_c2,
    'k-NN Accuracy': knn_accuracy_tsne_c2,
    '10-Fold CV Accuracy': (cv_accuracy_tsne_c2, cv_std_tsne_c2)
}

# The heading used to say "umap Results:", although every value in the
# dictionary comes from the t-SNE embedding.
print("t-SNE Results:")
print(results_tsne_c2)

In [22]:
# Save intermediate data
np.save("x_train_tsne_c2.npy", x_train_tsne_c2)  # t-SNE-reduced training data
np.save("x_test_tsne_c2.npy", x_test_tsne_c2)    # t-SNE-reduced test data
np.save("y_test_pred_tsne_c2.npy", y_pred_svm_test)  # SVM predictions
np.save("cv_scores_tsne_c2.npy", cv_scores_tsne_c2)      # Cross-validation scores

In [None]:
# Save k-NN accuracies to JSON
# (json.dump writes the integer k keys as strings.)
with open("knn_accuracy_tsne_c2.json", "w") as file:
    json.dump(knn_accuracy_tsne_c2, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python ones.

    Dictionaries, lists and tuples are walked recursively and keep their
    container type. NumPy scalars of any dtype become the matching Python
    scalar, NumPy arrays become (nested) lists, and every other object is
    returned unchanged.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are converted too, since json only accepts
        # str/int/float/bool/None keys.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The results dictionaries store (mean, std) as a tuple; a list-only
        # check would return it with its NumPy elements unconverted.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.generic):
        # Any NumPy scalar (float16/32/64, int8..int64, bool_, ...).
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

# Convert results to JSON-serializable format
results_tsne_c2_serializable = convert_to_serializable({
    'ARI': ari_tsne_c2,
    'Silhouette Score': silhouette_tsne_c2,
    'SVM Accuracy': svm_accuracy_tsne_c2,
    'k-NN Accuracy': knn_accuracy_tsne_c2,
    '10-Fold CV Accuracy': (cv_accuracy_tsne_c2, cv_std_tsne_c2)
})

# Save results summary to JSON
# NOTE(review): the file name says "c50", while every value written here
# belongs to the n_components=2 run — confirm the intended file name.
with open("tsne_c50_results.json", "w") as file:
    json.dump(results_tsne_c2_serializable, file, indent=4)

print("tsne results and intermediate data saved successfully!")

------------

## ISOMAP

### ISOMAP n_components=2

In [None]:
def downsample_mnist_consistent(x_data, y_labels, sample_fraction=0.35):
    """
    Pick a class-stratified subset of sample positions.

    For every distinct value in ``y_labels``, ``sample_fraction`` of the
    positions carrying that label (rounded down) is drawn without
    replacement using a fixed seed of 42. The draws are concatenated in the
    order of the sorted unique labels and returned as one index array.

    Only ``y_labels`` is inspected; ``x_data`` is accepted but not read, so
    the caller applies the returned indices to whichever arrays it wants to
    keep aligned.
    """
    chosen = []
    for current_label in np.unique(y_labels):
        # Positions of all samples that carry the current label.
        positions = np.nonzero(y_labels == current_label)[0]
        n_keep = int(len(positions) * sample_fraction)
        # Same seed for every label keeps the selection reproducible.
        chosen.extend(
            resample(positions, n_samples=n_keep, replace=False, random_state=42)
        )
    return np.array(chosen)

# Downsample training data
# Keep 35% of each class; the same indices select both features and labels.
sampled_indices_train = downsample_mnist_consistent(x_train_standardized1, y_train, sample_fraction=0.35)
x_train_sampled = x_train_standardized1[sampled_indices_train]
y_train_sampled = y_train[sampled_indices_train]

# Downsample test data
# Same procedure and fraction as for the training split.
sampled_indices_test = downsample_mnist_consistent(x_test_standardized1, y_test, sample_fraction=0.35)
x_test_sampled = x_test_standardized1[sampled_indices_test]
y_test_sampled = y_test[sampled_indices_test]

print(f"Training set reduced to {len(x_train_sampled)} samples.")
print(f"Test set reduced to {len(x_test_sampled)} samples.")

In [None]:
# Save the sampled indices
# (so the same subset can be re-selected from other arrays later)
np.save("sampled_indices_train.npy", sampled_indices_train)
np.save("sampled_indices_test.npy", sampled_indices_test)

# Save the downsampled dataset
np.save("x_train_sampled.npy", x_train_sampled)
np.save("y_train_sampled.npy", y_train_sampled)
np.save("x_test_sampled.npy", x_test_sampled)
np.save("y_test_sampled.npy", y_test_sampled)

print("Downsampling saved successfully!")

In [30]:
# Load the sampled indices written by the save cell above
sampled_indices_train= np.load("sampled_indices_train.npy")
sampled_indices_test= np.load("sampled_indices_test.npy")

# Load the downsampled dataset (features and labels of both splits)
x_train_sampled= np.load("x_train_sampled.npy")
y_train_sampled= np.load("y_train_sampled.npy")
x_test_sampled= np.load("x_test_sampled.npy")
y_test_sampled= np.load("y_test_sampled.npy")

In [73]:
# Apply Isomap
# Learn a 2-D embedding (15 neighbours) from the downsampled training data,
# then project the downsampled test data with the fitted model.
isomap = Isomap(n_components=2, n_neighbors=15)
x_train_isomap_c2 = isomap.fit_transform(x_train_sampled)
x_test_isomap_c2 = isomap.transform(x_test_sampled)

In [75]:
# Cache the 2-D Isomap embeddings of both splits on disk.
np.save('x_train_isomap_c2.npy',x_train_isomap_c2)
np.save('x_test_isomap_c2.npy',x_test_isomap_c2)

In [76]:
# Perform K-Means clustering on the Isomap-reduced training data
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_isomap_c2 = kmeans.fit_predict(x_train_isomap_c2)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# (both refer to the downsampled training split).
ari_isomap_c2 = adjusted_rand_score(y_train_sampled, cluster_labels_isomap_c2)
print(f"Adjusted Rand Index (ARI): {ari_isomap_c2}")

In [None]:
# Silhouette Score
# Measured in the 2-D Isomap space, using the K-Means assignments as labels.
silhouette_isomap_c2 = silhouette_score(x_train_isomap_c2, cluster_labels_isomap_c2)
print(silhouette_isomap_c2)

In [79]:
# k-NN Accuracy for varying k
# NOTE: each classifier is scored on the same training embedding it was
# fitted on, so these are training accuracies, not test accuracies.
knn_accuracy_isomap_c2 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_isomap_c2, y_train_sampled)
    knn_accuracy = knn.score(x_train_isomap_c2, y_train_sampled)
    knn_accuracy_isomap_c2[k] = knn_accuracy

In [None]:
# Train SVM on Isomap embeddings
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_isomap_c2, y_train_sampled)

# Predict on the same embeddings
# NOTE: these are predictions for the training data, so the accuracy below
# is a training accuracy.
y_pred_svm = svm_clf.predict(x_train_isomap_c2)

# Compute SVM accuracy
svm_accuracy_isomap_c2 = accuracy_score(y_train_sampled, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy_isomap_c2:.4f}")


In [None]:
# SVM with RBF kernel
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
# Uses the downsampled training embedding; reports mean ± std.
cv_scores_isomap_c2 = cross_val_score(svm_clf, x_train_isomap_c2, y_train_sampled, cv=10)
cv_accuracy_isomap_c2 = cv_scores_isomap_c2.mean()
cv_std_isomap_c2 = cv_scores_isomap_c2.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_isomap_c2:.4f} ± {cv_std_isomap_c2:.4f}")

In [None]:
# Results for Isomap
# Collect the metrics of the 2-component Isomap run and print them.
# The CV entry is a (mean, standard deviation) pair.
results_isomap_c2 = {
    'ARI': ari_isomap_c2,
    'Silhouette Score': silhouette_isomap_c2,
    'SVM Accuracy': svm_accuracy_isomap_c2,
    'k-NN Accuracy': knn_accuracy_isomap_c2,
    '10-Fold CV Accuracy': (cv_accuracy_isomap_c2, cv_std_isomap_c2)
}

print("Isomap Results:")
print(results_isomap_c2)

In [None]:
# Save intermediate data
np.save("x_train_isomap_c2.npy", x_train_isomap_c2)  # ISOMAP-reduced training data
np.save("x_test_isomap_c2.npy", x_test_isomap_c2)    # ISOMAP-reduced test data
# NOTE(review): in the SVM cell of this section y_pred_svm holds predictions
# for the training embedding, yet the file name says "y_test_pred" — confirm.
np.save("y_test_pred_isomap_c2.npy", y_pred_svm)  # SVM predictions
np.save("cv_scores_isomap_c2.npy", cv_scores_isomap_c2)      # Cross-validation scores

# Save k-NN accuracies to JSON
with open("knn_accuracy_isomap_c2.json", "w") as file:
    json.dump(knn_accuracy_isomap_c2, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python ones.

    Dictionaries, lists and tuples are walked recursively and keep their
    container type. NumPy scalars of any dtype become the matching Python
    scalar, NumPy arrays become (nested) lists, and every other object is
    returned unchanged.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are converted too, since json only accepts
        # str/int/float/bool/None keys.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The results dictionaries store (mean, std) as a tuple; a list-only
        # check would return it with its NumPy elements unconverted.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.generic):
        # Any NumPy scalar (float16/32/64, int8..int64, bool_, ...).
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

# Convert results to JSON-serializable format
# The CV entry is passed as a (mean, standard deviation) tuple.
results_isomap_c2_serializable = convert_to_serializable({
    'ARI': ari_isomap_c2,
    'Silhouette Score': silhouette_isomap_c2,
    'SVM Accuracy': svm_accuracy_isomap_c2,
    'k-NN Accuracy': knn_accuracy_isomap_c2,
    '10-Fold CV Accuracy': (cv_accuracy_isomap_c2, cv_std_isomap_c2)
})

# Save results summary to JSON
with open("isomap_c2_results.json", "w") as file:
    json.dump(results_isomap_c2_serializable, file, indent=4)

print("ISOMAP results and intermediate data saved successfully!")

-------

In [None]:
# Evaluate the 2-D Isomap embedding on the held-out (downsampled) test split.

# 1-NN label transfer: fit on the training embedding and predict the test
# embedding once. ARI and silhouette both use these predictions, so the
# second, identical fit/predict pass is dropped.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train_isomap_c2, y_train_sampled)
y_test_pred_isomap = knn.predict(x_test_isomap_c2)

# ARI
ari_isomap_c2 = adjusted_rand_score(y_test_sampled, y_test_pred_isomap)

# Silhouette Score
silhouette_isomap_c2 = silhouette_score(x_test_isomap_c2, y_test_pred_isomap)

# SVM Accuracy
# From here on y_test_pred_isomap holds the SVM predictions.
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_isomap_c2, y_train_sampled)
y_test_pred_isomap = svm_clf.predict(x_test_isomap_c2)
svm_accuracy_isomap = accuracy_score(y_test_sampled, y_test_pred_isomap)

# k-NN Accuracy for varying k
knn_accuracies_isomap = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_isomap_c2, y_train_sampled)
    # Score on x_test_isomap_c2, the test embedding produced by the same
    # Isomap model. The name `x_test_isomap` used here before is not
    # assigned in this section.
    knn_accuracy = knn.score(x_test_isomap_c2, y_test_sampled)
    knn_accuracies_isomap[k] = knn_accuracy

# 10-Fold Cross-Validation Accuracy
cv_scores_isomap = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_isomap_c2, y_train_sampled, cv=10)
cv_accuracy_isomap = cv_scores_isomap.mean()
cv_std_isomap = cv_scores_isomap.std()

# Results for Isomap
# The CV entry is a (mean, standard deviation) pair.
results_isomap = {
    'ARI': ari_isomap_c2,
    'Silhouette Score': silhouette_isomap_c2,
    'SVM Accuracy': svm_accuracy_isomap,
    'k-NN Accuracy': knn_accuracies_isomap,
    '10-Fold CV Accuracy': (cv_accuracy_isomap, cv_std_isomap)
}

print("Isomap Results:")
print(results_isomap)

In [None]:
# Save intermediate data (Isomap embeddings and other computationally expensive results)
np.save("x_train_isomap_c2.npy", x_train_isomap_c2)
np.save("x_test_isomap_c2.npy", x_test_isomap_c2)
np.save("y_test_pred_isomap_c2.npy", y_test_pred_isomap)  # Save SVM predictions
np.save("cv_scores_isomap.npy", cv_scores_isomap)      # Save cross-validation scores

In [None]:
# Save intermediate data
# (rewrites the embedding and prediction files; the CV scores go to a
# second file name, "cv_scores_isomap_c2.npy")
np.save("x_train_isomap_c2.npy", x_train_isomap_c2)  # ISOMAP-reduced training data
np.save("x_test_isomap_c2.npy", x_test_isomap_c2)    # ISOMAP-reduced test data
np.save("y_test_pred_isomap_c2.npy", y_test_pred_isomap)  # SVM predictions
np.save("cv_scores_isomap_c2.npy", cv_scores_isomap)      # Cross-validation scores

# Save k-NN accuracies to JSON
with open("knn_accuracies_isomap_c2.json", "w") as file:
    json.dump(knn_accuracies_isomap, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python ones.

    Dictionaries, lists and tuples are walked recursively and keep their
    container type. NumPy scalars of any dtype become the matching Python
    scalar, NumPy arrays become (nested) lists, and every other object is
    returned unchanged.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are converted too, since json only accepts
        # str/int/float/bool/None keys.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # A list-only check would return tuples with their NumPy elements
        # unconverted.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.generic):
        # Any NumPy scalar (float16/32/64, int8..int64, bool_, ...).
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

# Convert results to JSON-serializable format
# Here the CV entry is a nested dictionary with 'Mean' and 'StdDev' keys.
results_isomap_c2_serializable = convert_to_serializable({
    'ARI': ari_isomap_c2,
    'Silhouette Score': silhouette_isomap_c2,
    'SVM Accuracy': svm_accuracy_isomap,
    'k-NN Accuracy': knn_accuracies_isomap,
    '10-Fold CV Accuracy': {
        'Mean': cv_accuracy_isomap,
        'StdDev': cv_std_isomap
    }
})

# Save results summary to JSON
# NOTE: "isomap_c2_results.json" is also written by another cell of this
# section, so whichever cell runs last determines the file contents.
with open("isomap_c2_results.json", "w") as file:
    json.dump(results_isomap_c2_serializable, file, indent=4)

print("ISOMAP results and intermediate data saved successfully!")

In [None]:
# Load intermediate data
# (embeddings, test predictions and CV scores cached as .npy files)
x_train_isomap_c2 = np.load("x_train_isomap_c2.npy")
x_test_isomap_c2 = np.load("x_test_isomap_c2.npy")
y_test_pred_isomap = np.load("y_test_pred_isomap_c2.npy")
cv_scores_isomap = np.load("cv_scores_isomap_c2.npy")

# Load k-NN accuracies
# NOTE: JSON object keys are strings, so the k values come back as "1",
# "5", ... rather than integers.
with open("knn_accuracies_isomap_c2.json", "r") as file:
    knn_accuracies_isomap = json.load(file)

# Load results summary
with open("isomap_c2_results.json", "r") as file:
    results_isomap_c2 = json.load(file)

print("ISOMAP Results Reloaded:")
print(results_isomap_c2)

---

### ISOMAP n_components=50

In [97]:
# Apply Isomap
# Learn a 50-D embedding (15 neighbours) from the downsampled training data,
# then project the downsampled test data with the fitted model.
isomap = Isomap(n_components=50, n_neighbors=15)
x_train_isomap_c50 = isomap.fit_transform(x_train_sampled)
x_test_isomap_c50 = isomap.transform(x_test_sampled)

In [98]:
# Perform K-Means clustering on the Isomap-reduced training data
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_isomap_c50 = kmeans.fit_predict(x_train_isomap_c50)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# (both refer to the downsampled training split).
ari_isomap_c50 = adjusted_rand_score(y_train_sampled, cluster_labels_isomap_c50)
print(f"Adjusted Rand Index (ARI): {ari_isomap_c50}")
# Disabled alternative: ARI of 1-NN predictions on the test embedding.
# ari_isomap_c50 = adjusted_rand_score(y_test, KNeighborsClassifier(n_neighbors=1).fit(x_train_isomap_c50, y_train).predict(x_test_isomap_c50))

In [None]:
# Silhouette Score
# Measured in the 50-D Isomap space, using the K-Means assignments as labels.
silhouette_isomap_c50 = silhouette_score(x_train_isomap_c50, cluster_labels_isomap_c50)
print(silhouette_isomap_c50)
# Disabled alternative: silhouette of 1-NN predictions on the test embedding.
# silhouette_isomap_c50 = silhouette_score(x_test_isomap_c50, KNeighborsClassifier(n_neighbors=1).fit(x_train_isomap_c50, y_train).predict(x_test_isomap_c50))

In [106]:
# k-NN Accuracy for varying k
# NOTE: each classifier is scored on the same training embedding it was
# fitted on, so these are training accuracies, not test accuracies.
knn_accuracy_isomap_c50 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_isomap_c50, y_train_sampled)
    knn_accuracy = knn.score(x_train_isomap_c50, y_train_sampled)
    knn_accuracy_isomap_c50[k] = knn_accuracy

# Presumably the output of an earlier run with k in [1, 5, 10]:
# {1: 1.0, 5: 0.9598475827577995, 10: 0.9524172422005239}

In [None]:
# Display the k -> accuracy mapping as the cell output.
knn_accuracy_isomap_c50

In [None]:
# Train SVM on Isomap embeddings
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_isomap_c50, y_train_sampled)

# Predict on the same embeddings
# NOTE: these are predictions for the training data, so the accuracy below
# is a training accuracy.
y_pred_svm = svm_clf.predict(x_train_isomap_c50)

# Compute SVM accuracy
svm_accuracy_isomap_c50 = accuracy_score(y_train_sampled, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy_isomap_c50:.4f}")


In [None]:
# SVM with RBF kernel
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
# Uses the downsampled training embedding; reports mean ± std.
cv_scores_isomap_c50 = cross_val_score(svm_clf, x_train_isomap_c50, y_train_sampled, cv=10)
cv_accuracy_isomap_c50 = cv_scores_isomap_c50.mean()
cv_std_isomap_c50 = cv_scores_isomap_c50.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_isomap_c50:.4f} ± {cv_std_isomap_c50:.4f}")

In [None]:
# Results for Isomap
# Collect the metrics of the 50-component Isomap run and print them.
# The CV entry is a (mean, standard deviation) pair.
results_isomap_c50 = {
    'ARI': ari_isomap_c50,
    'Silhouette Score': silhouette_isomap_c50,
    'SVM Accuracy': svm_accuracy_isomap_c50,
    'k-NN Accuracy': knn_accuracy_isomap_c50,
    '10-Fold CV Accuracy': (cv_accuracy_isomap_c50, cv_std_isomap_c50)
}

print("Isomap Results:")
print(results_isomap_c50)

In [None]:
# Save intermediate data
np.save("x_train_isomap_c50.npy", x_train_isomap_c50)  # ISOMAP-reduced training data
np.save("x_test_isomap_c50.npy", x_test_isomap_c50)    # ISOMAP-reduced test data
# NOTE(review): in the SVM cell of this section y_pred_svm holds predictions
# for the training embedding, yet the file name says "y_test_pred" — confirm.
np.save("y_test_pred_isomap_c50.npy", y_pred_svm)  # SVM predictions
np.save("cv_scores_isomap_c50.npy", cv_scores_isomap_c50)      # Cross-validation scores

# Save k-NN accuracies to JSON
with open("knn_accuracy_isomap_c50.json", "w") as file:
    json.dump(knn_accuracy_isomap_c50, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python ones.

    Dictionaries, lists and tuples are walked recursively and keep their
    container type. NumPy scalars of any dtype become the matching Python
    scalar, NumPy arrays become (nested) lists, and every other object is
    returned unchanged.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are converted too, since json only accepts
        # str/int/float/bool/None keys.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The results dictionaries store (mean, std) as a tuple; a list-only
        # check would return it with its NumPy elements unconverted.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.generic):
        # Any NumPy scalar (float16/32/64, int8..int64, bool_, ...).
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

# Convert results to JSON-serializable format
# The CV entry is passed as a (mean, standard deviation) tuple.
results_isomap_c50_serializable = convert_to_serializable({
    'ARI': ari_isomap_c50,
    'Silhouette Score': silhouette_isomap_c50,
    'SVM Accuracy': svm_accuracy_isomap_c50,
    'k-NN Accuracy': knn_accuracy_isomap_c50,
    '10-Fold CV Accuracy': (cv_accuracy_isomap_c50, cv_std_isomap_c50)
})

# Save results summary to JSON
with open("isomap_c50_results.json", "w") as file:
    json.dump(results_isomap_c50_serializable, file, indent=4)

print("ISOMAP results and intermediate data saved successfully!")

-----------

In [None]:
# Evaluate the 2-D Isomap embedding on the held-out (downsampled) test split.
# NOTE(review): this cell sits in the n_components=50 section but works on
# the *_c2 embeddings throughout — confirm that this is intended.

# 1-NN label transfer: fit on the training embedding and predict the test
# embedding once. ARI and silhouette both use these predictions, so the
# second, identical fit/predict pass is dropped.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train_isomap_c2, y_train_sampled)
y_test_pred_isomap = knn.predict(x_test_isomap_c2)

# ARI
ari_isomap_c2 = adjusted_rand_score(y_test_sampled, y_test_pred_isomap)

# Silhouette Score
silhouette_isomap_c2 = silhouette_score(x_test_isomap_c2, y_test_pred_isomap)

# SVM Accuracy
# From here on y_test_pred_isomap holds the SVM predictions.
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_isomap_c2, y_train_sampled)
y_test_pred_isomap = svm_clf.predict(x_test_isomap_c2)
svm_accuracy_isomap = accuracy_score(y_test_sampled, y_test_pred_isomap)

# k-NN Accuracy for varying k
knn_accuracies_isomap = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_isomap_c2, y_train_sampled)
    # Score on x_test_isomap_c2, the test embedding that matches the
    # training embedding used for fitting. The name `x_test_isomap` used
    # here before is not assigned in this section.
    knn_accuracy = knn.score(x_test_isomap_c2, y_test_sampled)
    knn_accuracies_isomap[k] = knn_accuracy

# 10-Fold Cross-Validation Accuracy
cv_scores_isomap = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_isomap_c2, y_train_sampled, cv=10)
cv_accuracy_isomap = cv_scores_isomap.mean()
cv_std_isomap = cv_scores_isomap.std()

# Results for Isomap
# The CV entry is a (mean, standard deviation) pair.
results_isomap = {
    'ARI': ari_isomap_c2,
    'Silhouette Score': silhouette_isomap_c2,
    'SVM Accuracy': svm_accuracy_isomap,
    'k-NN Accuracy': knn_accuracies_isomap,
    '10-Fold CV Accuracy': (cv_accuracy_isomap, cv_std_isomap)
}

print("Isomap Results:")
print(results_isomap)

In [None]:
# Save intermediate data (Isomap embeddings and other computationally expensive results)
np.save("x_train_isomap_c2.npy", x_train_isomap_c2)
np.save("x_test_isomap_c2.npy", x_test_isomap_c2)
np.save("y_test_pred_isomap_c2.npy", y_test_pred_isomap)  # Save SVM predictions
np.save("cv_scores_isomap.npy", cv_scores_isomap)      # Save cross-validation scores

In [None]:
# Save intermediate data
# (rewrites the embedding and prediction files; the CV scores go to a
# second file name, "cv_scores_isomap_c2.npy")
np.save("x_train_isomap_c2.npy", x_train_isomap_c2)  # ISOMAP-reduced training data
np.save("x_test_isomap_c2.npy", x_test_isomap_c2)    # ISOMAP-reduced test data
np.save("y_test_pred_isomap_c2.npy", y_test_pred_isomap)  # SVM predictions
np.save("cv_scores_isomap_c2.npy", cv_scores_isomap)      # Cross-validation scores

# Save k-NN accuracies to JSON
with open("knn_accuracies_isomap_c2.json", "w") as file:
    json.dump(knn_accuracies_isomap, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside ``obj`` with plain Python ones.

    Dicts, lists and tuples are walked recursively (the container type is
    kept).  Any NumPy scalar becomes the equivalent Python scalar and NumPy
    arrays become nested lists.  Every other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # Result summaries store (mean, std) tuples; convert their items too
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar type, not only float32/64 and int32/64
        return obj.item()
    return obj

# Bundle the Isomap metrics and strip NumPy scalar types so json can encode them
results_isomap_c2_serializable = convert_to_serializable(dict([
    ('ARI', ari_isomap_c2),
    ('Silhouette Score', silhouette_isomap_c2),
    ('SVM Accuracy', svm_accuracy_isomap),
    ('k-NN Accuracy', knn_accuracies_isomap),
    ('10-Fold CV Accuracy', {'Mean': cv_accuracy_isomap, 'StdDev': cv_std_isomap}),
]))

# Write the summary to disk
with open("isomap_c2_results.json", mode="w") as file:
    json.dump(obj=results_isomap_c2_serializable, fp=file, indent=4)

print("ISOMAP results and intermediate data saved successfully!")

-------

## LLE

### LLE n_components=2

In [84]:
# Fit a 2-component LLE on the sampled training set, then project the sampled test set
lle = LocallyLinearEmbedding(method='standard', n_neighbors=15, n_components=2)
x_train_lle_c2 = lle.fit_transform(X=x_train_sampled)
x_test_lle_c2 = lle.transform(X=x_test_sampled)

In [89]:
# Cluster the LLE-reduced test data into 10 groups (one per MNIST digit)
kmeans = KMeans(random_state=42, n_clusters=10)
cluster_labels_lle_c2 = kmeans.fit(x_test_lle_c2).labels_

In [85]:
# Reload the cached LLE (2 components) embeddings
x_test_lle_c2 = np.load(file='x_test_lle_c2.npy')
x_train_lle_c2 = np.load(file='x_train_lle_c2.npy')

In [None]:
# Adjusted Rand Index: agreement between the true test labels and the K-Means clusters
ari_lle_c2 = adjusted_rand_score(labels_true=y_test_sampled, labels_pred=cluster_labels_lle_c2)
print(f"Adjusted Rand Index (ARI): {ari_lle_c2}")

In [None]:
# Silhouette score of the K-Means clusters in the LLE test embedding
silhouette_lle_c2 = silhouette_score(X=x_test_lle_c2, labels=cluster_labels_lle_c2)
print(silhouette_lle_c2)

In [88]:
# k-NN accuracy for varying k.
# Fit on the LLE training embedding and score on the held-out test embedding;
# the previous version scored on the same training points it was fitted on,
# which does not measure generalisation.
knn_accuracy_lle_c2 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_lle_c2, y_train_sampled)
    knn_accuracy = knn.score(x_test_lle_c2, y_test_sampled)
    knn_accuracy_lle_c2[k] = knn_accuracy

In [None]:
# Train SVM on the LLE training embedding
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_lle_c2, y_train_sampled)

# Predict on the held-out test embedding. The previous version predicted on the
# training data, although y_pred_svm is saved below as "y_test_pred_lle_c2.npy".
y_pred_svm = svm_clf.predict(x_test_lle_c2)

# Compute SVM test accuracy
svm_accuracy_lle_c2 = accuracy_score(y_test_sampled, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy_lle_c2:.4f}")


In [None]:
# Fresh RBF-kernel SVM used as the cross-validation estimator
svm_clf = SVC(random_state=42, kernel='rbf')

# 10-fold cross-validation on the LLE training embedding
cv_scores_lle_c2 = cross_val_score(estimator=svm_clf, X=x_train_lle_c2, y=y_train_sampled, cv=10)
cv_accuracy_lle_c2 = np.mean(cv_scores_lle_c2)
cv_std_lle_c2 = np.std(cv_scores_lle_c2)

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_lle_c2:.4f} ± {cv_std_lle_c2:.4f}")

In [None]:
# Gather the LLE (2 components) metrics into one summary dict and display it
results_lle_c2 = dict([
    ('ARI', ari_lle_c2),
    ('Silhouette Score', silhouette_lle_c2),
    ('SVM Accuracy', svm_accuracy_lle_c2),
    ('k-NN Accuracy', knn_accuracy_lle_c2),
    ('10-Fold CV Accuracy', (cv_accuracy_lle_c2, cv_std_lle_c2)),
])

print("lle Results:", results_lle_c2, sep="\n")

In [None]:
# Persist the LLE (2 components) arrays to .npy files
np.save(file="x_train_lle_c2.npy", arr=x_train_lle_c2)      # reduced training data
np.save(file="x_test_lle_c2.npy", arr=x_test_lle_c2)        # reduced test data
np.save(file="y_test_pred_lle_c2.npy", arr=y_pred_svm)      # SVM predictions
np.save(file="cv_scores_lle_c2.npy", arr=cv_scores_lle_c2)  # cross-validation scores

# Write the per-k k-NN accuracies as JSON
with open("knn_accuracy_lle_c2.json", mode="w") as file:
    json.dump(obj=knn_accuracy_lle_c2, fp=file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside ``obj`` with plain Python ones.

    Dicts, lists and tuples are walked recursively (the container type is
    kept).  Any NumPy scalar becomes the equivalent Python scalar and NumPy
    arrays become nested lists.  Every other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The CV summary below is a (mean, std) tuple; convert its items too
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar type, not only float32/64 and int32/64
        return obj.item()
    return obj

# Bundle the LLE (2 components) metrics and strip NumPy scalar types for json
results_lle_c2_serializable = convert_to_serializable(dict([
    ('ARI', ari_lle_c2),
    ('Silhouette Score', silhouette_lle_c2),
    ('SVM Accuracy', svm_accuracy_lle_c2),
    ('k-NN Accuracy', knn_accuracy_lle_c2),
    ('10-Fold CV Accuracy', (cv_accuracy_lle_c2, cv_std_lle_c2)),
]))

# Write the summary to disk
with open("lle_c2_results.json", mode="w") as file:
    json.dump(obj=results_lle_c2_serializable, fp=file, indent=4)

print("lle results and intermediate data saved successfully!")

In [None]:
# Label the test embedding with a 1-NN classifier fitted on the training
# embedding. The model is fitted once and its predictions are shared by the ARI
# and silhouette computations (previously an identical model was fitted twice).
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train_lle_c2, y_train_sampled)
y_test_pred_lle = knn.predict(x_test_lle_c2)

# ARI between the true test labels and the 1-NN predicted labels
ari_lle_c2 = adjusted_rand_score(y_test_sampled, y_test_pred_lle)

# Silhouette Score of the test embedding grouped by the 1-NN predicted labels
silhouette_lle_c2 = silhouette_score(x_test_lle_c2, y_test_pred_lle)

# SVM Accuracy (fit on train embedding, evaluate on test embedding)
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_lle_c2, y_train_sampled)
y_test_pred_lle_c2 = svm_clf.predict(x_test_lle_c2)
svm_accuracy_lle_c2 = accuracy_score(y_test_sampled, y_test_pred_lle_c2)

# k-NN Accuracy for varying k
knn_accuracies_lle_c2 = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_lle_c2, y_train_sampled)
    knn_accuracy = knn.score(x_test_lle_c2, y_test_sampled)
    knn_accuracies_lle_c2[k] = knn_accuracy

# 10-Fold Cross-Validation Accuracy on the training embedding
cv_scores_lle_c2 = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_lle_c2, y_train_sampled, cv=10)
cv_accuracy_lle_c2 = cv_scores_lle_c2.mean()
cv_std_lle_c2 = cv_scores_lle_c2.std()

# Results for LLE
results_lle_c2 = {
    'ARI': ari_lle_c2,
    'Silhouette Score': silhouette_lle_c2,
    'SVM Accuracy': svm_accuracy_lle_c2,
    'k-NN Accuracy': knn_accuracies_lle_c2,
    '10-Fold CV Accuracy': (cv_accuracy_lle_c2, cv_std_lle_c2)
}

print("LLE Results:")
print(results_lle_c2)

In [None]:
# # Save intermediate data (tsne embeddings and other computationally expensive results)
# np.save("x_train_lle_c2.npy", x_train_lle_c2)
# np.save("x_test_lle_c2.npy", x_test_lle_c2)
# np.save("y_test_pred_lle_c2.npy", y_test_pred_lle)  # Save SVM predictions
# np.save("cv_scores_lle.npy", cv_scores_lle)      # Save cross-validation scores

In [35]:
# Reload the cached LLE (2 components) arrays from disk
x_train_lle_c2 = np.load(file="x_train_lle_c2.npy")
x_test_lle_c2 = np.load(file="x_test_lle_c2.npy")
y_test_pred_lle_c2 = np.load(file="y_test_pred_lle_c2.npy")
cv_scores_lle = np.load(file="cv_scores_lle.npy")

In [37]:
# Reload the cached down-sampled train/test splits
x_train_sampled = np.load(file="x_train_sampled.npy")
y_train_sampled = np.load(file="y_train_sampled.npy")
x_test_sampled = np.load(file="x_test_sampled.npy")
y_test_sampled = np.load(file="y_test_sampled.npy")

In [None]:
# Write the per-k k-NN accuracies as JSON
with open("knn_accuracies_lle_c2.json", mode="w") as file:
    json.dump(obj=knn_accuracies_lle_c2, fp=file, indent=4)

# Build the LLE results summary (CV accuracy stored as a Mean/StdDev mapping)
results_lle_c2 = dict([
    ('ARI', ari_lle_c2),
    ('Silhouette Score', silhouette_lle_c2),
    ('SVM Accuracy', svm_accuracy_lle_c2),
    ('k-NN Accuracy', knn_accuracies_lle_c2),
    ('10-Fold CV Accuracy', {'Mean': cv_accuracy_lle_c2, 'StdDev': cv_std_lle_c2}),
])

# Write the summary to disk
with open("lle_c2_results.json", mode="w") as file:
    json.dump(obj=results_lle_c2, fp=file, indent=4)

print("LLE results and intermediate data saved successfully!")

In [None]:
# Reload the cached LLE (2 components) arrays from disk
x_train_lle_c2 = np.load(file="x_train_lle_c2.npy")
x_test_lle_c2 = np.load(file="x_test_lle_c2.npy")
y_test_pred_lle_c2 = np.load(file="y_test_pred_lle_c2.npy")
cv_scores_lle = np.load(file="cv_scores_lle.npy")

# Read back the per-k k-NN accuracies
with open("knn_accuracies_lle_c2.json", mode="r") as file:
    knn_accuracies_lle_c2 = json.load(fp=file)

# Read back the results summary
with open("lle_c2_results.json", mode="r") as file:
    lle_c2_results = json.load(fp=file)

print("LLE Results Reloaded:", lle_c2_results, sep="\n")

### LLE n_components=50

In [114]:
# Fit a 50-component LLE on the sampled training set, then project the sampled test set
lle = LocallyLinearEmbedding(method='standard', n_neighbors=15, n_components=50)
x_train_lle_c50 = lle.fit_transform(X=x_train_sampled)
x_test_lle_c50 = lle.transform(X=x_test_sampled)

In [28]:
# Reload the cached LLE (50 components) embeddings
x_train_lle_c50 = np.load(file='x_train_lle_c50.npy')
x_test_lle_c50 = np.load(file='x_test_lle_c50.npy')

In [32]:
# Stack train and test embeddings (and their labels) into one full set
x_full_lle_c50 = np.vstack((x_train_lle_c50, x_test_lle_c50))
y_full_lle_c50 = np.hstack((y_train_sampled, y_test_sampled))

In [37]:
# Cluster the full (train + test) LLE embedding into 10 groups (one per MNIST digit)
kmeans = KMeans(random_state=42, n_clusters=10)
cluster_labels_lle_c50 = kmeans.fit(x_full_lle_c50).labels_

In [None]:
# Adjusted Rand Index: agreement between the true labels and the K-Means clusters
ari_lle_c50 = adjusted_rand_score(labels_true=y_full_lle_c50, labels_pred=cluster_labels_lle_c50)
print(f"Adjusted Rand Index (ARI): {ari_lle_c50}")

In [None]:
# Silhouette score of the K-Means clusters in the full LLE embedding
silhouette_lle_c50 = silhouette_score(X=x_full_lle_c50, labels=cluster_labels_lle_c50)
print(silhouette_lle_c50)

In [41]:
# k-NN accuracy for varying k.
# Fit on the LLE training embedding and score on the held-out test embedding;
# the previous version scored on the same training points it was fitted on,
# which does not measure generalisation.
knn_accuracy_lle_c50 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_lle_c50, y_train_sampled)
    knn_accuracy = knn.score(x_test_lle_c50, y_test_sampled)
    knn_accuracy_lle_c50[k] = knn_accuracy

# {1: 1.0, 5: 0.9598475827577995, 10: 0.9524172422005239}

In [None]:
# Display the k -> accuracy mapping as the cell output
knn_accuracy_lle_c50

In [None]:
# Train SVM on the LLE training embedding
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_lle_c50, y_train_sampled)

# Predict on the held-out test embedding. The previous version predicted on the
# training data, although y_pred_svm is saved below as "y_test_pred_lle_c50.npy".
y_pred_svm = svm_clf.predict(x_test_lle_c50)

# Compute SVM test accuracy
svm_accuracy_lle_c50 = accuracy_score(y_test_sampled, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy_lle_c50:.4f}")


In [None]:
# Fresh RBF-kernel SVM used as the cross-validation estimator
svm_clf = SVC(random_state=42, kernel='rbf')

# 10-fold cross-validation on the LLE training embedding
cv_scores_lle_c50 = cross_val_score(estimator=svm_clf, X=x_train_lle_c50, y=y_train_sampled, cv=10)
cv_accuracy_lle_c50 = np.mean(cv_scores_lle_c50)
cv_std_lle_c50 = np.std(cv_scores_lle_c50)

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_lle_c50:.4f} ± {cv_std_lle_c50:.4f}")

In [None]:
# Gather the LLE (50 components) metrics into one summary dict and display it
results_lle_c50 = dict([
    ('ARI', ari_lle_c50),
    ('Silhouette Score', silhouette_lle_c50),
    ('SVM Accuracy', svm_accuracy_lle_c50),
    ('k-NN Accuracy', knn_accuracy_lle_c50),
    ('10-Fold CV Accuracy', (cv_accuracy_lle_c50, cv_std_lle_c50)),
])

print("lle Results:", results_lle_c50, sep="\n")

In [None]:
# Persist the LLE (50 components) arrays to .npy files
np.save(file="x_train_lle_c50.npy", arr=x_train_lle_c50)      # reduced training data
np.save(file="x_test_lle_c50.npy", arr=x_test_lle_c50)        # reduced test data
np.save(file="y_test_pred_lle_c50.npy", arr=y_pred_svm)       # SVM predictions
np.save(file="cv_scores_lle_c50.npy", arr=cv_scores_lle_c50)  # cross-validation scores

# Write the per-k k-NN accuracies as JSON
with open("knn_accuracy_lle_c50.json", mode="w") as file:
    json.dump(obj=knn_accuracy_lle_c50, fp=file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside ``obj`` with plain Python ones.

    Dicts, lists and tuples are walked recursively (the container type is
    kept).  Any NumPy scalar becomes the equivalent Python scalar and NumPy
    arrays become nested lists.  Every other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The CV summary below is a (mean, std) tuple; convert its items too
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar type, not only float32/64 and int32/64
        return obj.item()
    return obj

# Bundle the LLE (50 components) metrics and strip NumPy scalar types for json
results_lle_c50_serializable = convert_to_serializable(dict([
    ('ARI', ari_lle_c50),
    ('Silhouette Score', silhouette_lle_c50),
    ('SVM Accuracy', svm_accuracy_lle_c50),
    ('k-NN Accuracy', knn_accuracy_lle_c50),
    ('10-Fold CV Accuracy', (cv_accuracy_lle_c50, cv_std_lle_c50)),
]))

# Write the summary to disk
with open("lle_c50_results.json", mode="w") as file:
    json.dump(obj=results_lle_c50_serializable, fp=file, indent=4)

print("lle results and intermediate data saved successfully!")

-------

## UMAP

### UMAP n_components=2

In [90]:
# Label the UMAP test embedding with a 1-NN classifier fitted on the training
# embedding. Fitted once; the predictions are shared by the ARI and silhouette
# computations (previously an identical model was fitted and applied twice).
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train_umap_c2, y_train)
y_test_pred_umap = knn.predict(x_test_umap_c2)

# ARI between the true test labels and the 1-NN predicted labels
ari_umap_c2 = adjusted_rand_score(y_test, y_test_pred_umap)

# Silhouette score of the test embedding grouped by the 1-NN predicted labels
silhouette_umap_c2 = silhouette_score(x_test_umap_c2, y_test_pred_umap)

In [42]:
# Read the saved UMAP (2 components) results summary back from disk
with open("umap_c2_results.json", mode="r") as file:
    results_umap_c2 = json.load(fp=file)

In [None]:
# Show the UMAP (2 components) results summary
print(results_umap_c2)

In [None]:
# Import the estimator here so the cell runs on its own; previously UMAP was
# only imported by a later cell (the file header only does `import umap`).
from umap import UMAP

# Apply UMAP: fit on the standardized training data, then project the test data.
# NOTE(review): the instance is bound to the name `umap`, which shadows the
# `umap` module imported at the top of the file; kept for compatibility.
umap = UMAP(n_components=2, n_neighbors=15, random_state=42)
x_train_umap_c2_std = umap.fit_transform(x_train_standardized1)
x_test_umap_c2_std = umap.transform(x_test_standardized1)

In [45]:
# Persist the UMAP (2 components, standardized input) embeddings
np.save(file="x_train_umap_c2_std.npy", arr=x_train_umap_c2_std)
np.save(file="x_test_umap_c2_std.npy", arr=x_test_umap_c2_std)

In [47]:
# Stack train and test embeddings into one full set
x_full_umap_c2_std = np.vstack((x_train_umap_c2_std, x_test_umap_c2_std))

In [48]:
# Cluster the full (train + test) UMAP embedding into 10 groups (one per MNIST digit)
kmeans = KMeans(random_state=42, n_clusters=10)
cluster_labels_umap_c2_std = kmeans.fit(x_full_umap_c2_std).labels_

In [None]:
# Adjusted Rand Index: agreement between the true labels and the K-Means clusters
ari_umap_c2_std = adjusted_rand_score(labels_true=y_full, labels_pred=cluster_labels_umap_c2_std)
print(f"Adjusted Rand Index (ARI): {ari_umap_c2_std}")

In [None]:
# Silhouette score of the K-Means clusters in the full UMAP embedding
silhouette_umap_c2_std = silhouette_score(X=x_full_umap_c2_std, labels=cluster_labels_umap_c2_std)
print(silhouette_umap_c2_std)

In [52]:
# k-NN test accuracy per neighbourhood size (fit on train embedding, score on test embedding)
knn_accuracy_umap_c2_std = {}
for k in (100, 200, 400):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train_umap_c2_std, y_train)
    knn_accuracy_umap_c2_std[k] = knn_accuracy = knn.score(x_test_umap_c2_std, y_test)

# {1: 1.0, 5: 0.9598475827577995, 10: 0.9524172422005239}

In [None]:
# Fit an RBF-kernel SVM on the UMAP training embedding
svm_clf = SVC(random_state=42, kernel='rbf').fit(x_train_umap_c2_std, y_train)

# Predict labels for the held-out test embedding
y_pred_svm_test = svm_clf.predict(x_test_umap_c2_std)

# Test accuracy of the SVM
svm_accuracy_umap_c2_std = accuracy_score(y_true=y_test, y_pred=y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_umap_c2_std:.4f}")

In [None]:
# Fresh RBF-kernel SVM used as the cross-validation estimator
svm_clf = SVC(random_state=42, kernel='rbf')

# 10-fold cross-validation on the UMAP training embedding
cv_scores_umap_c2_std = cross_val_score(estimator=svm_clf, X=x_train_umap_c2_std, y=y_train, cv=10)
cv_accuracy_umap_c2_std = np.mean(cv_scores_umap_c2_std)
cv_std_umap_c2_std = np.std(cv_scores_umap_c2_std)

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_umap_c2_std:.4f} ± {cv_std_umap_c2_std:.4f}")

In [None]:
# Gather the UMAP (2 components, standardized input) metrics and display them
results_umap_c2_std = dict([
    ('ARI', ari_umap_c2_std),
    ('Silhouette Score', silhouette_umap_c2_std),
    ('SVM Accuracy', svm_accuracy_umap_c2_std),
    ('k-NN Accuracy', knn_accuracy_umap_c2_std),
    ('10-Fold CV Accuracy', (cv_accuracy_umap_c2_std, cv_std_umap_c2_std)),
])

print("umap Results:", results_umap_c2_std, sep="\n")

----

In [None]:
# UMAP
from umap import UMAP

# Apply UMAP: fit on the normalized training data, then project the test data
umap = UMAP(n_components=2, n_neighbors=15, random_state=42)
x_train_umap_c2 = umap.fit_transform(x_train_normalized)
x_test_umap_c2 = umap.transform(x_test_normalized)

# Label the test embedding with a 1-NN classifier fitted on the training
# embedding. Fitted once; the predictions are shared by the ARI and silhouette
# computations (previously an identical model was fitted and applied twice).
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train_umap_c2, y_train)
y_test_pred_umap = knn.predict(x_test_umap_c2)

# ARI between the true test labels and the 1-NN predicted labels
ari_umap_c2 = adjusted_rand_score(y_test, y_test_pred_umap)

# Silhouette Score of the test embedding grouped by the 1-NN predicted labels
silhouette_umap_c2 = silhouette_score(x_test_umap_c2, y_test_pred_umap)

# SVM Accuracy (fit on train embedding, evaluate on test embedding)
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_umap_c2, y_train)
y_test_pred_umap_c2 = svm_clf.predict(x_test_umap_c2)
svm_accuracy_umap_c2 = accuracy_score(y_test, y_test_pred_umap_c2)

# k-NN Accuracy for varying k
knn_accuracies_umap_c2 = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_umap_c2, y_train)
    knn_accuracy = knn.score(x_test_umap_c2, y_test)
    knn_accuracies_umap_c2[k] = knn_accuracy

# 10-Fold Cross-Validation Accuracy on the training embedding
cv_scores_umap_c2 = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_umap_c2, y_train, cv=10)
cv_accuracy_umap_c2 = cv_scores_umap_c2.mean()
cv_std_umap_c2 = cv_scores_umap_c2.std()

# Results for UMAP
results_umap_c2 = {
    'ARI': ari_umap_c2,
    'Silhouette Score': silhouette_umap_c2,
    'SVM Accuracy': svm_accuracy_umap_c2,
    'k-NN Accuracy': knn_accuracies_umap_c2,
    '10-Fold CV Accuracy': (cv_accuracy_umap_c2, cv_std_umap_c2)
}

print("UMAP Results:")
print(results_umap_c2)

In [None]:
# Save intermediate data (UMAP embeddings and other computationally expensive results)
np.save("x_train_umap_c2.npy", x_train_umap_c2)
np.save("x_test_umap_c2.npy", x_test_umap_c2)
# Save the SVM predictions (y_test_pred_umap_c2); the previous code saved the
# 1-NN predictions (y_test_pred_umap) under this "SVM predictions" file.
np.save("y_test_pred_umap_c2.npy", y_test_pred_umap_c2)
# The CV scores are held in cv_scores_umap_c2; `cv_scores_umap` is not defined
# at this point. The file name is unchanged because the load cell reads it.
np.save("cv_scores_umap.npy", cv_scores_umap_c2)

In [None]:
# Reload the cached UMAP (2 components) arrays from disk
x_train_umap_c2 = np.load(file="x_train_umap_c2.npy")
x_test_umap_c2 = np.load(file="x_test_umap_c2.npy")
y_test_pred_umap_c2 = np.load(file="y_test_pred_umap_c2.npy")
cv_scores_umap = np.load(file="cv_scores_umap.npy")

In [24]:
# # k-NN Accuracy for varying k
# knn_accuracies_umap_c2 = {}
# for k in [1, 5, 10]:
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn.fit(x_train_umap_c2, y_train)
#     knn_accuracy = knn.score(x_test_umap_c2, y_test)
#     knn_accuracies_umap_c2[k] = knn_accuracy

In [28]:
# # 10-Fold Cross-Validation Accuracy
# cv_scores_umap_c2 = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_umap_c2, y_train, cv=10)
# cv_accuracy_umap_c2 = cv_scores_umap_c2.mean()
# cv_std_umap_c2 = cv_scores_umap_c2.std()

In [26]:
# # ARI
# ari_umap_c2 = adjusted_rand_score(y_train, KNeighborsClassifier(n_neighbors=1).fit(x_train_umap_c2, y_train).predict(x_train_umap_c2))

# # Silhouette Score
# silhouette_umap_c2 = silhouette_score(x_train_umap_c2, y_train)

In [30]:
# # SVM Accuracy
# svm_clf = SVC(kernel='rbf', random_state=42)
# svm_clf.fit(x_train_umap_c2, y_train)
# y_test_pred_umap_c2 = svm_clf.predict(x_test_umap_c2)
# svm_accuracy_umap_c2 = accuracy_score(y_test, y_test_pred_umap_c2)

In [None]:
# Gather the UMAP (2 components) metrics into one summary dict and display it
results_umap_c2 = dict([
    ('ARI', ari_umap_c2),
    ('Silhouette Score', silhouette_umap_c2),
    ('SVM Accuracy', svm_accuracy_umap_c2),
    ('k-NN Accuracy', knn_accuracies_umap_c2),
    ('10-Fold CV Accuracy', (cv_accuracy_umap_c2, cv_std_umap_c2)),
])

print("UMAP Results:", results_umap_c2, sep="\n")

In [None]:
# Helper function to convert NumPy types to native Python types
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside ``obj`` with plain Python ones.

    Dicts, lists and tuples are walked recursively (the container type is
    kept).  Any NumPy scalar becomes the equivalent Python scalar and NumPy
    arrays become nested lists.  Every other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # Tuples were previously returned untouched, NumPy items included
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar type, not only float32/64 and int32/64
        return obj.item()
    return obj

# Bundle the UMAP (2 components) metrics and strip NumPy scalar types for json
results_umap_c2_serializable = convert_to_serializable(dict([
    ('ARI', ari_umap_c2),
    ('Silhouette Score', silhouette_umap_c2),
    ('SVM Accuracy', svm_accuracy_umap_c2),
    ('k-NN Accuracy', knn_accuracies_umap_c2),
    ('10-Fold CV Accuracy', {'Mean': cv_accuracy_umap_c2, 'StdDev': cv_std_umap_c2}),
]))

# Write the summary to disk
with open("umap_c2_results.json", mode="w") as file:
    json.dump(obj=results_umap_c2_serializable, fp=file, indent=4)

print("UMAP results saved successfully!")


### UMAP n_components=50

In [None]:
# UMAP estimator class
from umap import UMAP

# 50-component UMAP: fit on the standardized training data, then project the test data
umap = UMAP(random_state=42, n_neighbors=15, n_components=50)
x_train_umap_c50 = umap.fit_transform(X=x_train_standardized1)
x_test_umap_c50 = umap.transform(X=x_test_standardized1)

In [14]:
# Persist the UMAP (50 components) embeddings
np.save(file="x_train_umap_c50.npy", arr=x_train_umap_c50)
np.save(file="x_test_umap_c50.npy", arr=x_test_umap_c50)

In [15]:
# Stack train and test embeddings into one full set
x_full_umap_c50 = np.vstack((x_train_umap_c50, x_test_umap_c50))

In [19]:
# Labels for the stacked train + test set, in the same order as the embeddings
y_full = np.hstack((y_train, y_test))

In [16]:
# Cluster the full (train + test) UMAP embedding into 10 groups (one per MNIST digit)
kmeans = KMeans(random_state=42, n_clusters=10)
cluster_labels_umap_c50 = kmeans.fit(x_full_umap_c50).labels_

In [None]:
# Adjusted Rand Index: agreement between the true labels and the K-Means clusters
ari_umap_c50 = adjusted_rand_score(labels_true=y_full, labels_pred=cluster_labels_umap_c50)
print(f"Adjusted Rand Index (ARI): {ari_umap_c50}")

In [None]:
# Silhouette score of the K-Means clusters in the full UMAP embedding
silhouette_umap_c50 = silhouette_score(X=x_full_umap_c50, labels=cluster_labels_umap_c50)
print(silhouette_umap_c50)

In [22]:
# k-NN test accuracy per neighbourhood size (fit on train embedding, score on test embedding)
knn_accuracy_umap_c50 = {}
for k in (100, 200, 400):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train_umap_c50, y_train)
    knn_accuracy_umap_c50[k] = knn_accuracy = knn.score(x_test_umap_c50, y_test)

# {1: 1.0, 5: 0.9598475827577995, 10: 0.9524172422005239}

In [None]:
# Fit an RBF-kernel SVM on the UMAP training embedding
svm_clf = SVC(random_state=42, kernel='rbf').fit(x_train_umap_c50, y_train)

# Predict labels for the held-out test embedding
y_pred_svm_test = svm_clf.predict(x_test_umap_c50)

# Test accuracy of the SVM
svm_accuracy_umap_c50 = accuracy_score(y_true=y_test, y_pred=y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_umap_c50:.4f}")

In [52]:
# Reload the cached cross-validation scores
cv_scores_umap_c50 = np.load(file='cv_scores_umap_c50.npy')

In [None]:
# Fresh RBF-kernel SVM used as the cross-validation estimator
svm_clf = SVC(random_state=42, kernel='rbf')

# 10-fold cross-validation on the UMAP training embedding
cv_scores_umap_c50 = cross_val_score(estimator=svm_clf, X=x_train_umap_c50, y=y_train, cv=10)
cv_accuracy_umap_c50 = np.mean(cv_scores_umap_c50)
cv_std_umap_c50 = np.std(cv_scores_umap_c50)

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_umap_c50:.4f} ± {cv_std_umap_c50:.4f}")

In [None]:
# Gather the UMAP (50 components) metrics into one summary dict and display it
results_umap_c50 = dict([
    ('ARI', ari_umap_c50),
    ('Silhouette Score', silhouette_umap_c50),
    ('SVM Accuracy', svm_accuracy_umap_c50),
    ('k-NN Accuracy', knn_accuracy_umap_c50),
    ('10-Fold CV Accuracy', (cv_accuracy_umap_c50, cv_std_umap_c50)),
])

print("umap Results:", results_umap_c50, sep="\n")

In [55]:
# Save intermediate data
np.save("x_train_umap_c50.npy", x_train_umap_c50)  # umap-reduced training data
np.save("x_test_umap_c50.npy", x_test_umap_c50)    # umap-reduced test data
# The SVM test predictions for this embedding are held in y_pred_svm_test;
# y_pred_svm is not assigned by the UMAP c50 SVM cell, so saving it stored
# predictions left over from other cells.
np.save("y_test_pred_umap_c50.npy", y_pred_svm_test)  # SVM predictions
np.save("cv_scores_umap_c50.npy", cv_scores_umap_c50)      # Cross-validation scores

In [41]:
# Reload the cached UMAP (50 components) arrays from disk
x_train_umap_c50 = np.load(file="x_train_umap_c50.npy")
x_test_umap_c50 = np.load(file="x_test_umap_c50.npy")
y_pred_svm = np.load(file="y_test_pred_umap_c50.npy")
cv_scores_umap_c50 = np.load(file="cv_scores_umap_c50.npy")

In [None]:
# Write the per-k k-NN accuracies as JSON
with open("knn_accuracy_umap_c50.json", mode="w") as file:
    json.dump(obj=knn_accuracy_umap_c50, fp=file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside ``obj`` with plain Python ones.

    Dicts, lists and tuples are walked recursively (the container type is
    kept).  Any NumPy scalar becomes the equivalent Python scalar and NumPy
    arrays become nested lists.  Every other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The CV summary below is a (mean, std) tuple; convert its items too
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar type, not only float32/64 and int32/64
        return obj.item()
    return obj

# Bundle the UMAP (50 components) metrics and strip NumPy scalar types for json
results_umap_c50_serializable = convert_to_serializable(dict([
    ('ARI', ari_umap_c50),
    ('Silhouette Score', silhouette_umap_c50),
    ('SVM Accuracy', svm_accuracy_umap_c50),
    ('k-NN Accuracy', knn_accuracy_umap_c50),
    ('10-Fold CV Accuracy', (cv_accuracy_umap_c50, cv_std_umap_c50)),
]))

# Write the summary to disk
with open("umap_c50_results.json", mode="w") as file:
    json.dump(obj=results_umap_c50_serializable, fp=file, indent=4)

print("umap results and intermediate data saved successfully!")

In [37]:
# Read the saved UMAP (50 components) results summary back from disk
with open("umap_c50_results.json", mode="r") as file:
    results_umap_c50 = json.load(fp=file)

In [None]:
# Show the UMAP (50 components) results summary
print(results_umap_c50)

### UMAP n_components=50 norm

In [None]:
# UMAP estimator class
from umap import UMAP

# 50-component UMAP: fit on the normalized training data, then project the test data
umap = UMAP(random_state=42, n_neighbors=15, n_components=50)
x_train_umap_c50_norm = umap.fit_transform(X=x_train_normalized)
x_test_umap_c50_norm = umap.transform(X=x_test_normalized)

In [8]:
# Persist the UMAP (50 components, normalized input) embeddings
np.save(file="x_train_umap_c50_norm.npy", arr=x_train_umap_c50_norm)
np.save(file="x_test_umap_c50_norm.npy", arr=x_test_umap_c50_norm)

In [9]:
# Stack train and test embeddings into one full set
x_full_umap_c50_norm = np.vstack((x_train_umap_c50_norm, x_test_umap_c50_norm))

In [10]:
# Labels for the stacked train + test set, in the same order as the embeddings
y_full = np.hstack((y_train, y_test))

In [11]:
# Cluster the full (train + test) UMAP embedding into 10 groups (one per MNIST digit)
kmeans = KMeans(random_state=42, n_clusters=10)
cluster_labels_umap_c50_norm = kmeans.fit(x_full_umap_c50_norm).labels_

In [None]:
# Adjusted Rand Index: agreement between the true labels and the K-Means clusters
ari_umap_c50_norm = adjusted_rand_score(labels_true=y_full, labels_pred=cluster_labels_umap_c50_norm)
print(f"Adjusted Rand Index (ARI): {ari_umap_c50_norm}")

In [None]:
# Silhouette score of the K-Means clusters in the full UMAP embedding
silhouette_umap_c50_norm = silhouette_score(X=x_full_umap_c50_norm, labels=cluster_labels_umap_c50_norm)
print(silhouette_umap_c50_norm)

In [14]:
# k-NN test accuracy per neighbourhood size (fit on train embedding, score on test embedding)
knn_accuracy_umap_c50_norm = {}
for k in (100, 200, 400):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train_umap_c50_norm, y_train)
    knn_accuracy_umap_c50_norm[k] = knn_accuracy = knn.score(x_test_umap_c50_norm, y_test)

# {1: 1.0, 5: 0.9598475827577995, 10: 0.9524172422005239}

In [None]:
# Fit an RBF-kernel SVM on the UMAP training embedding
svm_clf = SVC(random_state=42, kernel='rbf').fit(x_train_umap_c50_norm, y_train)

# Predict labels for the held-out test embedding
y_pred_svm_test = svm_clf.predict(x_test_umap_c50_norm)

# Test accuracy of the SVM
svm_accuracy_umap_c50_norm = accuracy_score(y_true=y_test, y_pred=y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_umap_c50_norm:.4f}")

In [25]:
# Cache the cross-validation scores on disk
np.save(file='cv_scores_umap_c50_norm.npy', arr=cv_scores_umap_c50_norm)

In [26]:
# Reload the cached cross-validation scores
cv_scores_umap_c50_norm = np.load(file='cv_scores_umap_c50_norm.npy')

In [None]:
# Fresh RBF-kernel SVM used as the cross-validation estimator
svm_clf = SVC(random_state=42, kernel='rbf')

# 10-fold cross-validation on the UMAP training embedding
cv_scores_umap_c50_norm = cross_val_score(estimator=svm_clf, X=x_train_umap_c50_norm, y=y_train, cv=10)
cv_accuracy_umap_c50_norm = np.mean(cv_scores_umap_c50_norm)
cv_std_umap_c50_norm = np.std(cv_scores_umap_c50_norm)

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_umap_c50_norm:.4f} ± {cv_std_umap_c50_norm:.4f}")

In [None]:
# Gather the UMAP (50 components, normalized input) metrics and display them
results_umap_c50_norm = dict([
    ('ARI', ari_umap_c50_norm),
    ('Silhouette Score', silhouette_umap_c50_norm),
    ('SVM Accuracy', svm_accuracy_umap_c50_norm),
    ('k-NN Accuracy', knn_accuracy_umap_c50_norm),
    ('10-Fold CV Accuracy', (cv_accuracy_umap_c50_norm, cv_std_umap_c50_norm)),
])

print("umap Results:", results_umap_c50_norm, sep="\n")

In [None]:
# Save intermediate data
np.save("x_train_umap_c50_norm.npy", x_train_umap_c50_norm)  # umap-reduced training data
np.save("x_test_umap_c50_norm.npy", x_test_umap_c50_norm)    # umap-reduced test data
# The SVM test predictions for this embedding are held in y_pred_svm_test;
# y_pred_svm is not assigned by the UMAP c50-norm SVM cell, so saving it stored
# predictions left over from other cells.
np.save("y_test_pred_umap_c50_norm.npy", y_pred_svm_test)  # SVM predictions
np.save("cv_scores_umap_c50_norm.npy", cv_scores_umap_c50_norm)      # Cross-validation scores

In [None]:
# Reload the cached UMAP (50 components, normalized input) arrays from disk
x_train_umap_c50_norm = np.load(file="x_train_umap_c50_norm.npy")
x_test_umap_c50_norm = np.load(file="x_test_umap_c50_norm.npy")
y_pred_svm = np.load(file="y_test_pred_umap_c50_norm.npy")
cv_scores_umap_c50_norm = np.load(file="cv_scores_umap_c50_norm.npy")

In [None]:
# Write the per-k k-NN accuracies as JSON
with open("knn_accuracy_umap_c50_norm.json", mode="w") as file:
    json.dump(obj=knn_accuracy_umap_c50_norm, fp=file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside ``obj`` with plain Python ones.

    Dicts, lists and tuples are walked recursively (the container type is
    kept).  Any NumPy scalar becomes the equivalent Python scalar and NumPy
    arrays become nested lists.  Every other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The CV summary below is a (mean, std) tuple; convert its items too
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar type, not only float32/64 and int32/64
        return obj.item()
    return obj

# Bundle the UMAP (50 components, normalized input) metrics and strip NumPy scalar types
results_umap_c50_norm_serializable = convert_to_serializable(dict([
    ('ARI', ari_umap_c50_norm),
    ('Silhouette Score', silhouette_umap_c50_norm),
    ('SVM Accuracy', svm_accuracy_umap_c50_norm),
    ('k-NN Accuracy', knn_accuracy_umap_c50_norm),
    ('10-Fold CV Accuracy', (cv_accuracy_umap_c50_norm, cv_std_umap_c50_norm)),
]))

# Write the summary to disk
with open("umap_c50_norm_results.json", mode="w") as file:
    json.dump(obj=results_umap_c50_norm_serializable, fp=file, indent=4)

print("umap results and intermediate data saved successfully!")

In [None]:
# Read the saved UMAP (50 components, normalized input) results summary back from disk
with open("umap_c50_norm_results.json", mode="r") as file:
    results_umap_c50_norm = json.load(fp=file)

In [None]:
# Show the summary loaded in the previous cell. The old name
# `results_umap__c2_norm` (double underscore, c2) is not defined anywhere in
# this section; the loaded variable is results_umap_c50_norm.
print(results_umap_c50_norm)

In [None]:
# Scatter the first two UMAP components of the training embedding, coloured by label
plt.figure(figsize=(10, 8))
plt.scatter(
    x=x_train_umap_c50_norm[:, 0],
    y=x_train_umap_c50_norm[:, 1],
    c=y_train,
    s=5,
    cmap="tab10",
    alpha=0.8,
)
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.title("UMAP Projection of MNIST Dataset")
plt.colorbar(label="MNIST Labels")
plt.show()

### UMAP n_components=2 norm

In [None]:
# UMAP estimator class
from umap import UMAP

# 2-component UMAP: fit on the normalized training data, then project the test data
umap = UMAP(random_state=42, n_neighbors=15, n_components=2)
x_train_umap_c2_norm = umap.fit_transform(X=x_train_normalized)
x_test_umap_c2_norm = umap.transform(X=x_test_normalized)

In [31]:
# Persist the UMAP (2 components, normalized input) embeddings
np.save(file="x_train_umap_c2_norm.npy", arr=x_train_umap_c2_norm)
np.save(file="x_test_umap_c2_norm.npy", arr=x_test_umap_c2_norm)

In [32]:
# Stack train and test embeddings into one full set
x_full_umap_c2_norm = np.vstack((x_train_umap_c2_norm, x_test_umap_c2_norm))

In [33]:
# Labels for the stacked train + test set, in the same order as the embeddings
y_full = np.hstack((y_train, y_test))

In [34]:
# Cluster the full (train + test) UMAP embedding into 10 groups (one per MNIST digit)
kmeans = KMeans(random_state=42, n_clusters=10)
cluster_labels_umap_c2_norm = kmeans.fit(x_full_umap_c2_norm).labels_

In [None]:
# Adjusted Rand Index: agreement between the true labels and the K-Means clusters
ari_umap_c2_norm = adjusted_rand_score(labels_true=y_full, labels_pred=cluster_labels_umap_c2_norm)
print(f"Adjusted Rand Index (ARI): {ari_umap_c2_norm}")

In [None]:
# Silhouette score of the K-Means clusters in the full UMAP embedding
silhouette_umap_c2_norm = silhouette_score(X=x_full_umap_c2_norm, labels=cluster_labels_umap_c2_norm)
print(silhouette_umap_c2_norm)

In [37]:
# k-NN test accuracy per neighbourhood size (fit on train embedding, score on test embedding)
knn_accuracy_umap_c2_norm = {}
for k in (100, 200, 400):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train_umap_c2_norm, y_train)
    knn_accuracy_umap_c2_norm[k] = knn_accuracy = knn.score(x_test_umap_c2_norm, y_test)

# {1: 1.0, 5: 0.9598475827577995, 10: 0.9524172422005239}

In [None]:
# Fit an RBF-kernel SVM on the UMAP training embedding
svm_clf = SVC(random_state=42, kernel='rbf').fit(x_train_umap_c2_norm, y_train)

# Predict labels for the held-out test embedding
y_pred_svm_test = svm_clf.predict(x_test_umap_c2_norm)

# Test accuracy of the SVM
svm_accuracy_umap_c2_norm = accuracy_score(y_true=y_test, y_pred=y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_umap_c2_norm:.4f}")

In [43]:
# Cache the cross-validation scores on disk
np.save(file='cv_scores_umap_c2_norm.npy', arr=cv_scores_umap_c2_norm)

In [None]:
# Fresh RBF-kernel SVM used as the cross-validation estimator
svm_clf = SVC(random_state=42, kernel='rbf')

# 10-fold cross-validation on the UMAP training embedding
cv_scores_umap_c2_norm = cross_val_score(estimator=svm_clf, X=x_train_umap_c2_norm, y=y_train, cv=10)
cv_accuracy_umap_c2_norm = np.mean(cv_scores_umap_c2_norm)
cv_std_umap_c2_norm = np.std(cv_scores_umap_c2_norm)

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_umap_c2_norm:.4f} ± {cv_std_umap_c2_norm:.4f}")

In [None]:
# Gather the UMAP (2 components, normalized input) metrics and display them
results_umap_c2_norm = dict([
    ('ARI', ari_umap_c2_norm),
    ('Silhouette Score', silhouette_umap_c2_norm),
    ('SVM Accuracy', svm_accuracy_umap_c2_norm),
    ('k-NN Accuracy', knn_accuracy_umap_c2_norm),
    ('10-Fold CV Accuracy', (cv_accuracy_umap_c2_norm, cv_std_umap_c2_norm)),
])

print("umap Results:", results_umap_c2_norm, sep="\n")

In [45]:
# Persist the UMAP (2 components, normalized input) arrays to .npy files
np.save(file="x_train_umap_c2_norm.npy", arr=x_train_umap_c2_norm)      # reduced training data
np.save(file="x_test_umap_c2_norm.npy", arr=x_test_umap_c2_norm)        # reduced test data
np.save(file="y_test_pred_umap_c2_norm.npy", arr=y_pred_svm_test)       # SVM test predictions
np.save(file="cv_scores_umap_c2_norm.npy", arr=cv_scores_umap_c2_norm)  # cross-validation scores

In [46]:
# Reload the cached UMAP (2 components, normalized input) arrays from disk
x_train_umap_c2_norm = np.load(file="x_train_umap_c2_norm.npy")
x_test_umap_c2_norm = np.load(file="x_test_umap_c2_norm.npy")
y_pred_svm = np.load(file="y_test_pred_umap_c2_norm.npy")
cv_scores_umap_c2_norm = np.load(file="cv_scores_umap_c2_norm.npy")

In [None]:
# Write the per-k k-NN accuracies as JSON
with open("knn_accuracy_umap_c2_norm.json", mode="w") as file:
    json.dump(obj=knn_accuracy_umap_c2_norm, fp=file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values in *obj* with built-in Python values.

    Args:
        obj: Arbitrary (possibly nested) structure of dicts, lists, tuples,
            NumPy arrays, NumPy scalars and plain Python values.

    Returns:
        A structure of the same layout in which
        * dict keys and values, list items and tuple items are converted
          recursively (the container type is kept),
        * NumPy arrays become nested lists,
        * NumPy scalars become ``bool`` / ``int`` / ``float``,
        * every other object is returned unchanged.
    """
    if isinstance(obj, dict):
        # Keys are converted too: json rejects NumPy integers as keys.
        return {
            convert_to_serializable(k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # e.g. the (mean, std) pair stored under '10-Fold CV Accuracy'
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        # Covers every NumPy integer width, not only int32/int64.
        return int(obj)
    if isinstance(obj, np.floating):
        # Covers every NumPy float width, not only float32/float64.
        return float(obj)
    return obj

# Gather the UMAP metrics, then strip NumPy scalar types so json can write them
umap_c2_norm_summary = {
    'ARI': ari_umap_c2_norm,
    'Silhouette Score': silhouette_umap_c2_norm,
    'SVM Accuracy': svm_accuracy_umap_c2_norm,
    'k-NN Accuracy': knn_accuracy_umap_c2_norm,
    '10-Fold CV Accuracy': (cv_accuracy_umap_c2_norm, cv_std_umap_c2_norm),
}
results_umap_c2_norm_serializable = convert_to_serializable(umap_c2_norm_summary)

# Persist the summary as pretty-printed JSON
with open("umap_c2_norm_results.json", "w") as file:
    json.dump(results_umap_c2_norm_serializable, fp=file, indent=4)

print("umap results and intermediate data saved successfully!")

In [48]:
# Load results from JSON file
# This rebinds results_umap_c2_norm to the parsed JSON content, replacing the
# in-memory dictionary built earlier.
with open("umap_c2_norm_results.json", "r") as file:
    results_umap_c2_norm = json.load(file)

In [None]:
# Show the UMAP results dictionary currently bound to results_umap_c2_norm
print(results_umap_c2_norm)

In [None]:
# Step 2: scatter the first two UMAP coordinates of the training embedding,
# coloured by the training labels
plt.figure(figsize=(10, 8))
umap_points = plt.scatter(
    x=x_train_umap_c2_norm[:, 0],
    y=x_train_umap_c2_norm[:, 1],
    c=y_train,
    s=5,
    alpha=0.8,
    cmap="tab10",
)
plt.title("UMAP Projection of MNIST Dataset")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
# Hand the scatter artist to the colorbar explicitly instead of relying on
# pyplot's "current image" lookup
plt.colorbar(umap_points, label="MNIST Labels")
plt.show()

---------

## MDS

### MDS n_components= 2

In [None]:
def downsample_mnist_consistent(x_data, y_labels, sample_fraction=0.1, random_state=42):
    """Pick a per-label random subset of sample indices.

    For every distinct value in ``y_labels`` a fraction ``sample_fraction`` of
    the positions carrying that label is drawn without replacement. Returning
    indices (rather than data) lets the caller select the same points from any
    array aligned with ``y_labels``.

    Args:
        x_data: Unused; kept so existing call sites keep working.
        y_labels: Array-like of labels, one per sample.
        sample_fraction: Fraction of each label's samples to keep, in [0, 1].
            The per-label count is truncated with ``int()``.
        random_state: Seed forwarded to ``sklearn.utils.resample``. The default
            of 42 reproduces the previously hard-coded behaviour.

    Returns:
        1-D integer array of selected indices, grouped by label in the order
        returned by ``np.unique``.

    Raises:
        ValueError: If ``sample_fraction`` is outside [0, 1].
    """
    if not 0 <= sample_fraction <= 1:
        raise ValueError(
            f"sample_fraction must be between 0 and 1, got {sample_fraction!r}"
        )

    y_labels = np.asarray(y_labels)
    sampled_indices = []
    for label in np.unique(y_labels):
        # Positions of every sample carrying the current label
        label_indices = np.where(y_labels == label)[0]
        n_keep = int(len(label_indices) * sample_fraction)
        if n_keep == 0:
            # Nothing to draw for this label; skip the resample call entirely
            continue
        sampled_indices.extend(
            resample(
                label_indices,
                n_samples=n_keep,
                replace=False,
                random_state=random_state,
            )
        )
    # Explicit index dtype so an empty selection is still usable for indexing
    return np.array(sampled_indices, dtype=np.intp)

# Draw the per-label 10% index samples for both splits
sampled_indices_train_mds = downsample_mnist_consistent(x_train_standardized1, y_train, sample_fraction=0.1)
sampled_indices_test_mds = downsample_mnist_consistent(x_test_standardized1, y_test, sample_fraction=0.1)

# Apply the same indices to features and labels so they stay aligned
x_train_sampled_mds, y_train_sampled_mds = (
    x_train_standardized1[sampled_indices_train_mds],
    y_train[sampled_indices_train_mds],
)
x_test_sampled_mds, y_test_sampled_mds = (
    x_test_standardized1[sampled_indices_test_mds],
    y_test[sampled_indices_test_mds],
)

print(f"Training set reduced to {len(x_train_sampled_mds)} samples.")
print(f"Test set reduced to {len(x_test_sampled_mds)} samples.")

In [None]:
# Save the sampled indices
# (positions into the full standardized arrays, as returned by
# downsample_mnist_consistent)
np.save("sampled_indices_train_mds.npy", sampled_indices_train_mds)
np.save("sampled_indices_test_mds.npy", sampled_indices_test_mds)

# Save the downsampled dataset
# (features and labels selected with the indices above)
np.save("x_train_sampled_mds.npy", x_train_sampled_mds)
np.save("y_train_sampled_mds.npy", y_train_sampled_mds)
np.save("x_test_sampled_mds.npy", x_test_sampled_mds)
np.save("y_test_sampled_mds.npy", y_test_sampled_mds)

print("Downsampling saved successfully!")

In [35]:
# Load sampled indices
sampled_indices_train_mds= np.load("sampled_indices_train_mds.npy")
sampled_indices_test_mds= np.load("sampled_indices_test_mds.npy")

# Load downsampled dataset
x_train_sampled_mds= np.load("x_train_sampled_mds.npy")
y_train_sampled_mds= np.load("y_train_sampled_mds.npy")
x_test_sampled_mds= np.load("x_test_sampled_mds.npy")
y_test_sampled_mds= np.load("y_test_sampled_mds.npy")

# Load the saved 2-component MDS embeddings.
# These files are written by the np.save cell further down, so they only exist
# after the MDS cells have been run at least once.
x_train_mds_c2= np.load("x_train_mds_c2.npy")
x_test_mds_c2= np.load("x_test_mds_c2.npy")

In [None]:
# Apply MDS
# scikit-learn's MDS cannot project unseen samples, and two independent
# fit_transform() calls yield embeddings in unrelated coordinate frames, which
# makes any classifier trained on one and evaluated on the other meaningless.
# Embed train and test jointly and split afterwards, exactly as the
# 50-component section below does.
mds = MDS(n_components=2, random_state=42, n_jobs=-1)
n_train_mds_c2 = len(x_train_sampled_mds)
x_full_mds_c2 = mds.fit_transform(np.vstack([x_train_sampled_mds, x_test_sampled_mds]))

# Rows are ordered train-first, so cut at the training-set size
x_train_mds_c2 = x_full_mds_c2[:n_train_mds_c2]
x_test_mds_c2 = x_full_mds_c2[n_train_mds_c2:]

In [12]:
# Save the 2-component MDS embeddings of the sampled train and test sets
np.save("x_train_mds_c2.npy", x_train_mds_c2)
np.save("x_test_mds_c2.npy", x_test_mds_c2)

In [93]:
# Label the test embedding with a 1-nearest-neighbour classifier fitted on the
# training embedding. Both scores below use these predictions, so the model is
# fitted and applied once instead of twice.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train_mds_c2, y_train_sampled_mds)
y_test_pred_mds = knn.predict(x_test_mds_c2)

# ARI: agreement between the true test labels and the 1-NN predictions
ari_mds_c2 = adjusted_rand_score(y_test_sampled_mds, y_test_pred_mds)

# Silhouette Score: separation of the test embedding when grouped by the
# 1-NN predictions
silhouette_mds_c2 = silhouette_score(x_test_mds_c2, y_test_pred_mds)
# NOTE(review): the 50-component section computes these two metrics from
# K-Means cluster labels instead of 1-NN predictions - confirm which is intended.

In [17]:
# Test accuracy of an RBF-kernel SVM trained on the MDS training embedding.
# fit() returns the estimator, so construction and training are chained.
svm_clf = SVC(random_state=42, kernel='rbf').fit(x_train_mds_c2, y_train_sampled_mds)
y_test_pred_mds_c2 = svm_clf.predict(x_test_mds_c2)
svm_accuracy_mds_c2 = accuracy_score(y_true=y_test_sampled_mds, y_pred=y_test_pred_mds_c2)

In [18]:
# Test accuracy of k-NN on the MDS embedding for several neighbourhood sizes
knn_accuracies_mds_c2 = dict()
for k in (1, 5, 10):
    # fit() hands back the estimator, so build-and-train fits on one line
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train_mds_c2, y_train_sampled_mds)
    knn_accuracy = knn.score(x_test_mds_c2, y_test_sampled_mds)
    knn_accuracies_mds_c2.update({k: knn_accuracy})

In [19]:
# 10-fold cross-validated accuracy of an RBF-kernel SVM on the MDS training embedding
cv_scores_mds_c2 = cross_val_score(
    estimator=SVC(random_state=42, kernel='rbf'),
    X=x_train_mds_c2,
    y=y_train_sampled_mds,
    cv=10,
)
# Mean and spread across the ten folds
cv_accuracy_mds_c2, cv_std_mds_c2 = cv_scores_mds_c2.mean(), cv_scores_mds_c2.std()

In [None]:
# Bundle every 2-component MDS evaluation metric under a readable key
results_mds_c2 = dict([
    ('ARI', ari_mds_c2),
    ('Silhouette Score', silhouette_mds_c2),
    ('SVM Accuracy', svm_accuracy_mds_c2),
    ('k-NN Accuracy', knn_accuracies_mds_c2),
    ('10-Fold CV Accuracy', (cv_accuracy_mds_c2, cv_std_mds_c2)),
])

# Header line, then the raw dictionary on the next line
print("MDS Results:", results_mds_c2, sep="\n")

In [None]:
import numpy as np
import json

# Save intermediate data
# Each array is written to a .npy file in the current working directory.
np.save("x_train_mds_c2.npy", x_train_mds_c2)  # MDS-reduced training data
np.save("x_test_mds_c2.npy", x_test_mds_c2)    # MDS-reduced test data
np.save("y_test_pred_mds_c2.npy", y_test_pred_mds_c2)  # SVM predictions
np.save("cv_scores_mds_c2.npy", cv_scores_mds_c2)      # Cross-validation scores

# Save k-NN accuracies to JSON
# The dictionary is keyed by k (integers); json writes such keys as strings.
with open("knn_accuracies_mds_c2.json", "w") as file:
    json.dump(knn_accuracies_mds_c2, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values in *obj* with built-in Python values.

    Args:
        obj: Arbitrary (possibly nested) structure of dicts, lists, tuples,
            NumPy arrays, NumPy scalars and plain Python values.

    Returns:
        A structure of the same layout in which
        * dict keys and values, list items and tuple items are converted
          recursively (the container type is kept),
        * NumPy arrays become nested lists,
        * NumPy scalars become ``bool`` / ``int`` / ``float``,
        * every other object is returned unchanged.
    """
    if isinstance(obj, dict):
        # Keys are converted too: json rejects NumPy integers as keys.
        return {
            convert_to_serializable(k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        # Covers every NumPy integer width, not only int32/int64.
        return int(obj)
    if isinstance(obj, np.floating):
        # Covers every NumPy float width, not only float32/float64.
        return float(obj)
    return obj

# Gather the MDS metrics (CV accuracy as a Mean/StdDev mapping), then strip
# NumPy scalar types so json can write them
cv_summary_mds_c2 = {'Mean': cv_accuracy_mds_c2, 'StdDev': cv_std_mds_c2}
mds_c2_summary = {
    'ARI': ari_mds_c2,
    'Silhouette Score': silhouette_mds_c2,
    'SVM Accuracy': svm_accuracy_mds_c2,
    'k-NN Accuracy': knn_accuracies_mds_c2,
    '10-Fold CV Accuracy': cv_summary_mds_c2,
}
results_mds_c2_serializable = convert_to_serializable(mds_c2_summary)

# Persist the summary as pretty-printed JSON
with open("mds_c2_results.json", "w") as file:
    json.dump(results_mds_c2_serializable, fp=file, indent=4)

print("MDS results and intermediate data saved successfully!")


In [None]:
# Plot the 2D MDS projection of the sampled training set, coloured by the true
# digit labels. Title, axis labels and legend title previously referred to
# PCA / principal components / clusters (copied from another section).
plt.figure(figsize=(10, 8))
sns.scatterplot(x=x_train_mds_c2[:, 0], y=x_train_mds_c2[:, 1], hue=y_train_sampled_mds, palette='tab10', s=10, legend='full')
plt.title("2D Scatter Plot of MDS-reduced Data")
plt.xlabel("MDS Component 1")
plt.ylabel("MDS Component 2")
# Anchor the legend outside the axes so it does not cover any points
plt.legend(title="Digit", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

------

### MDS n_components= 50

In [64]:
# Combine training and test sets
# Training rows come first; the split cell below relies on this ordering.
x_full_mds = np.vstack([x_train_sampled_mds, x_test_sampled_mds])


In [65]:
# Apply MDS once
# A single fit on the stacked train+test rows puts both splits in the same
# 50-dimensional coordinate frame.
mds = MDS(n_components=50, random_state=42, n_jobs=-1)
x_full_mds_c50 = mds.fit_transform(x_full_mds)

In [66]:
# The joint embedding is ordered train-first (see the vstack above), so the
# training-set size is the cut point between the two splits
n_train_rows_mds = len(y_train_sampled_mds)
x_train_mds_c50, x_test_mds_c50 = (
    x_full_mds_c50[:n_train_rows_mds],
    x_full_mds_c50[n_train_rows_mds:],
)

In [71]:
# Save the joint (train + test) 50-component MDS embedding
np.save('x_full_mds_c50.npy', x_full_mds_c50)

In [70]:
# Save the train and test slices of the 50-component MDS embedding
np.save('x_train_mds_c50.npy', x_train_mds_c50)
np.save('x_test_mds_c50.npy', x_test_mds_c50)

In [75]:
# Perform K-Means clustering on the MDS-reduced test data
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
# fit_predict both fits the model on the test embedding and returns one
# cluster index per test sample
cluster_labels_mds_c50 = kmeans.fit_predict(x_test_mds_c50)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# (the K-Means assignments on the test embedding)
ari_mds_c50 = adjusted_rand_score(y_test_sampled_mds, cluster_labels_mds_c50)
print(f"Adjusted Rand Index (ARI): {ari_mds_c50}")

In [None]:
# Silhouette Score
# Computed on the MDS test embedding, grouped by the K-Means cluster labels
silhouette_mds_c50 = silhouette_score(x_test_mds_c50, cluster_labels_mds_c50)
print(silhouette_mds_c50)

In [None]:
# Test accuracy of k-NN on the 50-component MDS embedding for large neighbourhoods
knn_accuracy_mds_c50 = dict()
for k in (100, 200, 400):
    # fit() hands back the estimator, so build-and-train fits on one line
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train_mds_c50, y_train_sampled_mds)
    knn_accuracy = knn.score(x_test_mds_c50, y_test_sampled_mds)
    knn_accuracy_mds_c50.update({k: knn_accuracy})

print(f"k-NN Accuracy: {knn_accuracy_mds_c50}")

In [None]:
# Fit an RBF-kernel SVM on the MDS training embedding (fit() returns the
# estimator, so construction and training are chained)
svm_clf = SVC(random_state=42, kernel='rbf').fit(x_train_mds_c50, y_train_sampled_mds)

# Score its predictions on the MDS test embedding
y_pred_svm = svm_clf.predict(x_test_mds_c50)
svm_accuracy_mds_c50 = accuracy_score(y_true=y_test_sampled_mds, y_pred=y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy_mds_c50:.4f}")

In [None]:
# 10-fold cross-validation of the SVM on the MDS training embedding
cv_scores_mds_c50 = cross_val_score(
    estimator=svm_clf,
    X=x_train_mds_c50,
    y=y_train_sampled_mds,
    cv=10,
)
# Mean and spread across the ten folds
cv_accuracy_mds_c50, cv_std_mds_c50 = cv_scores_mds_c50.mean(), cv_scores_mds_c50.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_mds_c50:.4f} ± {cv_std_mds_c50:.4f}")

In [None]:
# Bundle every 50-component MDS evaluation metric under a readable key
results_mds_c50 = dict([
    ('ARI', ari_mds_c50),
    ('Silhouette Score', silhouette_mds_c50),
    ('SVM Accuracy', svm_accuracy_mds_c50),
    ('k-NN Accuracy', knn_accuracy_mds_c50),
    ('10-Fold CV Accuracy', (cv_accuracy_mds_c50, cv_std_mds_c50)),
])

# Header line, then the raw dictionary on the next line
print("mds Results:", results_mds_c50, sep="\n")

In [None]:
# Save intermediate data
# Each array is written to a .npy file in the current working directory.
np.save("x_train_mds_c50.npy", x_train_mds_c50)  # mds-reduced training data
np.save("x_test_mds_c50.npy", x_test_mds_c50)    # mds-reduced test data
np.save("y_test_pred_mds_c50.npy", y_pred_svm)  # SVM predictions
np.save("cv_scores_mds_c50.npy", cv_scores_mds_c50)      # Cross-validation scores

# Save k-NN accuracies to JSON
# The dictionary is keyed by k (integers); json writes such keys as strings.
with open("knn_accuracy_mds_c50.json", "w") as file:
    json.dump(knn_accuracy_mds_c50, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values in *obj* with built-in Python values.

    Args:
        obj: Arbitrary (possibly nested) structure of dicts, lists, tuples,
            NumPy arrays, NumPy scalars and plain Python values.

    Returns:
        A structure of the same layout in which
        * dict keys and values, list items and tuple items are converted
          recursively (the container type is kept),
        * NumPy arrays become nested lists,
        * NumPy scalars become ``bool`` / ``int`` / ``float``,
        * every other object is returned unchanged.
    """
    if isinstance(obj, dict):
        # Keys are converted too: json rejects NumPy integers as keys.
        return {
            convert_to_serializable(k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # e.g. the (mean, std) pair stored under '10-Fold CV Accuracy'
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        # Covers every NumPy integer width, not only int32/int64.
        return int(obj)
    if isinstance(obj, np.floating):
        # Covers every NumPy float width, not only float32/float64.
        return float(obj)
    return obj

# Gather the 50-component MDS metrics, then strip NumPy scalar types so json
# can write them
mds_c50_summary = {
    'ARI': ari_mds_c50,
    'Silhouette Score': silhouette_mds_c50,
    'SVM Accuracy': svm_accuracy_mds_c50,
    'k-NN Accuracy': knn_accuracy_mds_c50,
    '10-Fold CV Accuracy': (cv_accuracy_mds_c50, cv_std_mds_c50),
}
results_mds_c50_serializable = convert_to_serializable(mds_c50_summary)

# Persist the summary as pretty-printed JSON
with open("mds_c50_results.json", "w") as file:
    json.dump(results_mds_c50_serializable, fp=file, indent=4)

print("mds results and intermediate data saved successfully!")

------

## Visual Comparison

In [None]:
# Load embeddings and cluster labels
from matplotlib.colors import to_rgba

x_train_pca_c2 = np.load("x_train_pca_c2.npy")
x_test_pca_c2 = np.load("x_test_pca_c2.npy")

x_train_tsne_c2 = np.load("x_train_tsne_c2.npy")
x_test_tsne_c2 = np.load("x_test_tsne_c2.npy")

x_train_isomap_c2 = np.load("x_train_isomap_c2.npy")
x_test_isomap_c2 = np.load("x_test_isomap_c2.npy")
y_test_sampled = np.load("y_test_sampled.npy")
y_train_sampled = np.load("y_train_sampled.npy")

x_train_lle_c2 = np.load("x_train_lle_c2.npy")
x_test_lle_c2 = np.load("x_test_lle_c2.npy")

x_train_umap = np.load('x_train_umap_c2.npy')
x_test_umap_c2_norm= np.load('x_test_umap_c2_norm.npy')

x_train_mds_c2 = np.load("x_train_mds_c2.npy")
x_test_mds_c2 = np.load("x_test_mds_c2.npy")
y_train_sampled_mds = np.load("y_train_sampled_mds.npy")
y_test_sampled_mds = np.load("y_test_sampled_mds.npy")

# Use test embeddings and labels for visualization.
# Each entry pairs an embedding with the label array of matching length.
methods = {
    'PCA': (x_test_pca_c2, y_test),
    'Isomap': (x_test_isomap_c2, y_test_sampled),
    'LLE': (x_test_lle_c2, y_test_sampled),
    'MDS': (x_test_mds_c2, y_test_sampled_mds),
    't-SNE': (x_test_tsne_c2, y_test),
    'UMAP': (x_test_umap_c2_norm, y_test)
}

# Fixed hue order. Without it seaborn derives the level order from the data,
# so relabelling the legend handles as "Digit 0..9" could attach the wrong
# digit to a colour whenever the label array is not sorted by digit.
digit_order = [str(i) for i in range(10)]

# Define the label names (digits 0-9)
label_names = [f"Digit {i}" for i in range(10)]

# Create a grid of subplots with two columns and three rows
fig, axes = plt.subplots(3, 2, figsize=(16, 18))  # Two columns, three rows
fig.subplots_adjust(hspace=0.4, wspace=0.4)  # Adjust spacing between plots

# Flatten axes for easier iteration
axes = axes.flatten()

for ax, (method, (embedding, labels)) in zip(axes, methods.items()):
    scatter = sns.scatterplot(
        x=embedding[:, 0],
        y=embedding[:, 1],
        hue=labels.astype(str),  # Ensure labels are strings
        hue_order=digit_order,  # Colour i and legend entry i both mean digit i
        palette='tab10',  # Use a standard tab10 palette for each plot
        s=40,  # Larger markers for better visibility
        ax=ax
    )
    ax.set_title(f'{method} Embeddings', fontsize=14, pad=10, loc='center')  # Larger font size

    # Hide x and y axis ticks and labels
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')

    # Set equal aspect ratio for symmetry
    ax.set_aspect('equal')

    # Add a legend for each plot; the auto-generated label texts are replaced
    # by label_names, so they are discarded here (this also avoids rebinding
    # the loop variable `labels`)
    handles, _ = scatter.get_legend_handles_labels()
    ax.legend(
        handles=handles, labels=label_names, title="Cluster", fontsize=10, loc='upper right',
        frameon=True, edgecolor="black"
    )

# Ensure the layout updates properly
plt.tight_layout()

# Show all plots
plt.show()


def _legend_handle_rgb(handle):
    """Return the (R, G, B) fill colour of a scatter legend handle as floats."""
    if hasattr(handle, "get_facecolor"):
        # Collection/patch style handle: take the first RGBA row
        rgba = np.atleast_2d(handle.get_facecolor())[0]
    else:
        # Line-style handle (presumably what newer seaborn releases create
        # for scatter legends - TODO confirm)
        rgba = to_rgba(handle.get_markerfacecolor())
    return tuple(float(c) for c in rgba[:3])


# Verification: Ensure colors and digits are correctly matched
print("\nVerifying colors and digits for each plot:")
expected_palette = sns.color_palette('tab10', 10)
for method, (embedding, labels) in methods.items():
    print(f"\nMethod: {method}")

    # Generate scatter plot to extract handles and colors
    scatter = sns.scatterplot(
        x=embedding[:, 0],
        y=embedding[:, 1],
        hue=labels.astype(str),
        hue_order=digit_order,
        palette='tab10',
        s=10,
        legend=True  # Ensure legend is generated
    )
    legend = scatter.get_legend()
    # `legendHandles` was renamed `legend_handles` in newer matplotlib;
    # support both spellings
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    plt.close()  # Close the plot since we only need the handles

    # Ensure there are exactly 10 handles for digits 0-9
    if len(handles) != 10:
        print(f"Error: Expected 10 clusters but got {len(handles)} for {method}.")
        continue

    # Check colors for each digit
    for digit in range(10):
        color_in_plot = _legend_handle_rgb(handles[digit])
        expected_color = tuple(float(c) for c in expected_palette[digit])

        # Compare RGB components with a tolerance rather than exact float equality
        match = bool(np.allclose(color_in_plot, expected_color))
        print(
            f"Digit {digit}: Color in plot {color_in_plot} | Expected color {expected_color} | Match: {match}"
        )


In [None]:
"x_train_umap_c50.npy", x_train_umap_c50)
np.save("x_test_umap_c50.npy", x_test_umap_c50)

In [None]:
# Load embeddings and cluster labels
from matplotlib.colors import to_rgba

x_train_pca_c50 = np.load("x_train_pca_c50.npy")
x_test_pca_c50 = np.load("x_test_pca_c50.npy")

x_train_tsne_c2 = np.load("x_train_tsne_c2.npy")
x_test_tsne_c2 = np.load("x_test_tsne_c2.npy")

x_train_isomap_c50 = np.load("x_train_isomap_c50.npy")
x_test_isomap_c50 = np.load("x_test_isomap_c50.npy")
y_test_sampled = np.load("y_test_sampled.npy")
y_train_sampled = np.load("y_train_sampled.npy")

x_train_lle_c50 = np.load("x_train_lle_c50.npy")
x_test_lle_c50 = np.load("x_test_lle_c50.npy")

x_train_umap_c50 = np.load('x_train_umap_c50.npy')
x_test_umap_c50 = np.load('x_test_umap_c50.npy')

x_train_mds_c50 = np.load("x_train_mds_c50.npy")
x_test_mds_c50 = np.load("x_test_mds_c50.npy")
y_train_sampled_mds = np.load("y_train_sampled_mds.npy")
y_test_sampled_mds = np.load("y_test_sampled_mds.npy")

# Use test embeddings and labels for visualization.
# Only the first two columns of each embedding are plotted below.
methods = {
    'PCA': (x_test_pca_c50, y_test),
    'Isomap': (x_test_isomap_c50, y_test_sampled),
    'LLE': (x_test_lle_c50, y_test_sampled),
    'MDS': (x_test_mds_c50, y_test_sampled_mds),
    't-SNE': (x_test_tsne_c2, y_test),
    'UMAP': (x_test_umap_c50, y_test)
}

# Fixed hue order. Without it seaborn derives the level order from the data,
# so relabelling the legend handles as "Digit 0..9" could attach the wrong
# digit to a colour whenever the label array is not sorted by digit.
digit_order = [str(i) for i in range(10)]

# Define the label names (digits 0-9)
label_names = [f"Digit {i}" for i in range(10)]

# Create a grid of subplots with two columns and three rows
fig, axes = plt.subplots(3, 2, figsize=(16, 18))  # Two columns, three rows
fig.subplots_adjust(hspace=0.4, wspace=0.4)  # Adjust spacing between plots

# Flatten axes for easier iteration
axes = axes.flatten()

for ax, (method, (embedding, labels)) in zip(axes, methods.items()):
    scatter = sns.scatterplot(
        x=embedding[:, 0],
        y=embedding[:, 1],
        hue=labels.astype(str),  # Ensure labels are strings
        hue_order=digit_order,  # Colour i and legend entry i both mean digit i
        palette='tab10',  # Use a standard tab10 palette for each plot
        s=40,  # Larger markers for better visibility
        ax=ax
    )
    ax.set_title(f'{method} Embeddings', fontsize=14, pad=10, loc='center')  # Larger font size

    # Hide x and y axis ticks and labels
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')

    # Set equal aspect ratio for symmetry
    ax.set_aspect('equal')

    # Add a legend for each plot; the auto-generated label texts are replaced
    # by label_names, so they are discarded here (this also avoids rebinding
    # the loop variable `labels`)
    handles, _ = scatter.get_legend_handles_labels()
    ax.legend(
        handles=handles, labels=label_names, title="Cluster", fontsize=10, loc='upper right',
        frameon=True, edgecolor="black"
    )

# Ensure the layout updates properly
plt.tight_layout()

# Show all plots
plt.show()


def _legend_handle_rgb(handle):
    """Return the (R, G, B) fill colour of a scatter legend handle as floats."""
    if hasattr(handle, "get_facecolor"):
        # Collection/patch style handle: take the first RGBA row
        rgba = np.atleast_2d(handle.get_facecolor())[0]
    else:
        # Line-style handle (presumably what newer seaborn releases create
        # for scatter legends - TODO confirm)
        rgba = to_rgba(handle.get_markerfacecolor())
    return tuple(float(c) for c in rgba[:3])


# Verification: Ensure colors and digits are correctly matched
print("\nVerifying colors and digits for each plot:")
expected_palette = sns.color_palette('tab10', 10)
for method, (embedding, labels) in methods.items():
    print(f"\nMethod: {method}")

    # Generate scatter plot to extract handles and colors
    scatter = sns.scatterplot(
        x=embedding[:, 0],
        y=embedding[:, 1],
        hue=labels.astype(str),
        hue_order=digit_order,
        palette='tab10',
        s=10,
        legend=True  # Ensure legend is generated
    )
    legend = scatter.get_legend()
    # `legendHandles` was renamed `legend_handles` in newer matplotlib;
    # support both spellings
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    plt.close()  # Close the plot since we only need the handles

    # Ensure there are exactly 10 handles for digits 0-9
    if len(handles) != 10:
        print(f"Error: Expected 10 clusters but got {len(handles)} for {method}.")
        continue

    # Check colors for each digit
    for digit in range(10):
        color_in_plot = _legend_handle_rgb(handles[digit])
        expected_color = tuple(float(c) for c in expected_palette[digit])

        # Compare RGB components with a tolerance rather than exact float equality
        match = bool(np.allclose(color_in_plot, expected_color))
        print(
            f"Digit {digit}: Color in plot {color_in_plot} | Expected color {expected_color} | Match: {match}"
        )


----------