# MSc. Thesis - Clustering and shape analysis of high dimensional biomedical data.

- **Author:** Dundas Lorenzo Agustin, s223288, MSc. Business Analytics, Technical University of Denmark.
- **Co-Supervisor:** Sebastián Basterrech, Postdoc, Department of Applied Mathematics and Computer Science, Technical University of Denmark.
- **Supervisor:** Line Katrine Harder Clemmensen, Professor, Department of Applied Mathematics and Computer Science, Technical University of Denmark.

-----------

# Imports

In [None]:
import json
import numpy as np
import pandas as pd
import struct
import random
import matplotlib.pyplot as plt
import umap
import seaborn as sns
import csv
import hdbscan
import networkx as nx
import pickle
import os
import tensorflow as tf
import zipfile
import cv2

from sklearn.metrics import silhouette_score, silhouette_samples, confusion_matrix, accuracy_score, davies_bouldin_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.metrics import pairwise_distances as sklearn_pairwise_distances
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from scipy.stats import mode, linregress, norm
from array import array
from os.path import join
from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding, MDS,trustworthiness
from sklearn.utils import resample
from scipy.stats import multivariate_normal
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from PIL import Image
from numpy import asarray
from IPython.display import SVG, Image
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from livelossplot.tf_keras import PlotLossesCallback
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, Conv2D, BatchNormalization, Activation, MaxPooling2D
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# MNIST Data Loader Class

In [28]:
# MNIST Data Loader Class

class MnistDataloader(object):
    """Load the MNIST dataset from its IDX-format binary files.

    The four file paths are only stored by the constructor; nothing is read
    from disk until :meth:`load_data` (or :meth:`read_images_labels`) is called.
    """

    def __init__(self, training_images_filepath, training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        """Store the paths of the training/test image and label files."""
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Read one IDX image file together with its IDX label file.

        Parameters
        ----------
        images_filepath : str
            Path of the image file (magic number 2051).
        labels_filepath : str
            Path of the label file (magic number 2049).

        Returns
        -------
        images : list of numpy.ndarray
            One ``(rows, cols)`` uint8 array per image, where ``rows`` and
            ``cols`` come from the image file header.
        labels : numpy.ndarray
            1-D uint8 array with every byte that follows the label header.

        Raises
        ------
        ValueError
            If a magic number is wrong, or the image file holds fewer pixel
            bytes than its header declares.
        """
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())

        # Convert labels to NumPy array
        labels = np.array(labels)

        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())

        pixels = np.array(image_data)
        expected = size * rows * cols
        if pixels.size < expected:
            raise ValueError(
                'Image file is truncated: header declares {} pixel bytes, found {}'.format(
                    expected, pixels.size))

        # Use the dimensions from the header instead of a hard-coded 28x28 so
        # any IDX image file can be read; trailing bytes beyond the declared
        # payload are ignored, as before.
        stacked = pixels[:expected].reshape(size, rows, cols)
        images = list(stacked)

        return images, labels

    def load_data(self):
        """Read both splits and return ``(x_train, y_train), (x_test, y_test)``."""
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)

        return (x_train, y_train), (x_test, y_test)

## Verify Reading Dataset via MnistDataloader class

In [None]:
# Verify Reading Dataset via MnistDataloader class
%matplotlib inline

# Set file paths of MNIST Datasets
# NOTE(review): input_path is an absolute path on the author's machine and
# must be edited before the notebook can run anywhere else.
input_path = 'C:/Users/Lorenzo/OneDrive/Documents/DTU/Python/2024 Fall/MSc Thesis'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')

# Helper function to show a list of images with their relating titles
def show_images(images, title_texts, cols=5):
    """Draw images in a grid of subplots on a new 30x20 figure.

    Parameters
    ----------
    images : sequence
        Images accepted by ``plt.imshow``; shown with a grayscale colormap.
    title_texts : sequence of str
        One title per image; an empty string leaves that subplot untitled.
        Images and titles are paired with ``zip``, so the shorter of the two
        sequences decides how many images are drawn.
    cols : int, optional
        Number of grid columns (default 5).

    Raises
    ------
    ValueError
        If ``cols`` is smaller than 1.
    """
    if cols < 1:
        raise ValueError("cols must be a positive integer")
    # Ceiling division: int(len(images) / cols) + 1 allocated an extra, empty
    # row whenever the number of images was an exact multiple of cols.
    rows = max(1, (len(images) + cols - 1) // cols)
    plt.figure(figsize=(30, 20))
    for index, (image, title_text) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, index)
        plt.imshow(image, cmap=plt.cm.gray)
        if title_text != '':
            plt.title(title_text, fontsize=15)

# Load MNIST dataset
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

# Show some random training and test images
images_2_show = []
titles_2_show = []
for i in range(0, 10):
    # randrange draws from [0, len(x_train)). The previous randint(1, 60000)
    # is inclusive at both ends, so it could return the out-of-range index
    # 60000 (IndexError) and could never return index 0.
    r = random.randrange(len(x_train))
    images_2_show.append(x_train[r])
    titles_2_show.append('training image [' + str(r) + '] = ' + str(y_train[r]))

for i in range(0, 5):
    r = random.randrange(len(x_test))
    images_2_show.append(x_test[r])
    titles_2_show.append('test image [' + str(r) + '] = ' + str(y_test[r]))

show_images(images_2_show, titles_2_show)

In [30]:
# Convert the lists of images returned by the data loader into NumPy arrays
x_train_array = np.array(x_train)
x_test_array = np.array(x_test)

- Option 1) Normalization done by standardization (zero mean and unit variance). Recommended for dimensionality reduction (DR) techniques and clustering.

In [None]:
from sklearn.preprocessing import StandardScaler

# Flatten every image into a single feature vector (one row per image)
x_train_flattened1 = x_train_array.reshape(x_train_array.shape[0], -1)
x_test_flattened1 = x_test_array.reshape(x_test_array.shape[0], -1)
# Standardize the data: the scaler is fitted on the training set only and the
# same fitted transformation is then applied to the test set
scaler = StandardScaler()
x_train_standardized1 = scaler.fit_transform(x_train_flattened1)
x_test_standardized1 = scaler.transform(x_test_flattened1)

# Mean and standard deviation over all values of the standardized training data
print("Data standardized: Mean =", x_train_standardized1.mean(), "Std Dev =", x_train_standardized1.std())

- Option 2) Normalization done by scaling: the images are flattened and their pixel values are scaled to [0, 1]. This approach is more commonly used for neural networks.

In [5]:
# Flatten each image into a 1-D vector (784 values for a 28x28 image)
x_train_flattened = np.array([img.flatten() for img in x_train_array])
x_test_flattened = np.array([img.flatten() for img in x_test_array])

In [6]:
# Normalizing by 255 scales the pixel intensity values to the [0, 1] range.
# This helps improve performance and consistency in clustering and dimensionality reduction algorithms,
# making it a common practice in image-based data processing.

x_train_normalized = x_train_flattened / 255.0
x_test_normalized = x_test_flattened / 255.0

In [None]:
# Per-feature statistics of the [0, 1]-scaled training data
train_mean = x_train_normalized.mean(axis=0)  # Mean for each feature
train_std = x_train_normalized.std(axis=0)    # Standard deviation for each feature

# Per-feature statistics of the [0, 1]-scaled test data
test_mean = x_test_normalized.mean(axis=0)  # Mean for each feature
test_std = x_test_normalized.std(axis=0)    # Standard deviation for each feature

# Print results
print("Train Data - Mean (per feature):")
print(train_mean)
print("Train Data - Standard Deviation (per feature):")
print(train_std)

print("\nTest Data - Mean (per feature):")
print(test_mean)
print("Test Data - Standard Deviation (per feature):")
print(test_std)

# Verify the scaling. x_train_normalized / x_test_normalized were produced by
# dividing by 255, so the property to check is that every value lies in
# [0, 1]. The previous zero-mean / unit-variance test describes StandardScaler
# output (Option 1) and therefore always reported "NOT properly normalized".
if x_train_normalized.min() >= 0.0 and x_train_normalized.max() <= 1.0:
    print("\nx_train_normalized is properly normalized (all values in [0, 1]).")
else:
    print("\nx_train_normalized is NOT properly normalized.")

if x_test_normalized.min() >= 0.0 and x_test_normalized.max() <= 1.0:
    print("x_test_normalized is properly normalized (all values in [0, 1]).")
else:
    print("x_test_normalized is NOT properly normalized.")


----------------

# Exploration and comparison of multiple algorithms - MNIST

## PCA

### n_components=2

In [None]:
# Apply PCA: fit the 2 components on the standardized training data only,
# then project the test data with the same fitted components
pca = PCA(n_components=2)
x_train_pca_c2 = pca.fit_transform(x_train_standardized1)
x_test_pca_c2 = pca.transform(x_test_standardized1)

In [None]:
# Stack the train and test projections (train rows first) for clustering
x_full_pca_c2 = np.vstack([x_train_pca_c2, x_test_pca_c2])

#### Clustering

In [None]:
# Perform K-Means clustering on the PCA-reduced full (train + test) data
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_pca_c2 = kmeans.fit_predict(x_full_pca_c2)

In [None]:
# True labels in the same row order as x_full_pca_c2 (train first, then test).
# Built here because y_full is otherwise first assigned in the t-SNE section
# further down, so a top-to-bottom run raised NameError at this point.
y_full = np.hstack([y_train, y_test])

# ARI
# Compute ARI between true labels and cluster labels
ari_pca_c2 = adjusted_rand_score(y_full, cluster_labels_pca_c2)
print(f"Adjusted Rand Index (ARI): {ari_pca_c2}")

# Silhouette Score
silhouette_pca_c2 = silhouette_score(x_full_pca_c2, cluster_labels_pca_c2)
print(silhouette_pca_c2)

#### Classifiers

In [None]:
# k-NN accuracy on the test set for several neighbourhood sizes k;
# results are stored as {k: accuracy}
knn_accuracies_pca_c2 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_pca_c2, y_train)
    knn_accuracy = knn.score(x_test_pca_c2, y_test)
    knn_accuracies_pca_c2[k] = knn_accuracy

In [None]:
# SVM Accuracy: RBF-kernel SVM fitted on the PCA-reduced training data and
# scored on the PCA-reduced test data
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_pca_c2, y_train)
y_test_pred_pca_c2= svm_clf.predict(x_test_pca_c2)
svm_accuracy_pca_c2 = accuracy_score(y_test, y_test_pred_pca_c2)

In [None]:
# 10-Fold Cross-Validation Accuracy of an RBF SVM on the PCA-reduced training
# data; the mean and standard deviation over the folds are kept
cv_scores_pca_c2 = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_pca_c2, y_train, cv=10)
cv_accuracy_pca_c2 = cv_scores_pca_c2.mean()
cv_std_pca_c2 = cv_scores_pca_c2.std()

In [None]:
# Results for PCA (n_components=2): collect all metrics in one dictionary
results_pca_c2 = {
    'ARI': ari_pca_c2,
    'Silhouette Score': silhouette_pca_c2,
    'SVM Accuracy': svm_accuracy_pca_c2,
    'k-NN Accuracy': knn_accuracies_pca_c2,
    '10-Fold CV Accuracy': (cv_accuracy_pca_c2, cv_std_pca_c2)
}

print("PCA Results:")
print(results_pca_c2)

In [None]:
# Save intermediate data as .npy files (paths are relative to the working directory)
np.save("x_train_pca_c2.npy", x_train_pca_c2)  # pca-reduced training data
np.save("x_test_pca_c2.npy", x_test_pca_c2)    # pca-reduced test data
np.save("y_test_pred_pca_c2.npy", y_test_pred_pca_c2)  # SVM predictions
np.save("cv_scores_pca_c2.npy", cv_scores_pca_c2)      # Cross-validation scores

# Save k-NN accuracies to JSON (the integer k keys are written as strings)
with open("knn_accuracies_pca_c2.json", "w") as file:
    json.dump(knn_accuracies_pca_c2, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python values.

    Dicts, lists and tuples are walked recursively and keep their container
    type. NumPy scalars of any kind become the matching Python scalar, NumPy
    arrays become nested lists, and NumPy-scalar dict keys are converted as
    well. Any other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # e.g. the (mean, std) pair stored under '10-Fold CV Accuracy';
        # previously tuples were passed through without conversion.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar width, not only float32/64 and int32/64.
        return obj.item()
    return obj

# Convert results to JSON-serializable format (same entries as results_pca_c2)
results_pca_c2_serializable = convert_to_serializable({
    'ARI': ari_pca_c2,
    'Silhouette Score': silhouette_pca_c2,
    'SVM Accuracy': svm_accuracy_pca_c2,
    'k-NN Accuracy': knn_accuracies_pca_c2,
    '10-Fold CV Accuracy': (cv_accuracy_pca_c2, cv_std_pca_c2)
})

# Save results summary to JSON
with open("pca_c2_results.json", "w") as file:
    json.dump(results_pca_c2_serializable, file, indent=4)

print("PCA results and intermediate data saved successfully!")

In [None]:
# Plot the 2D PCA projection of the training data, coloured by true digit label
plt.figure(figsize=(10, 8))
sns.scatterplot(x=x_train_pca_c2[:, 0], y=x_train_pca_c2[:, 1], hue=y_train, palette='tab10', s=10, legend='full')
plt.title("2D Scatter Plot of PCA-reduced Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
# The hue is y_train (ground-truth digits), not a clustering result, so the
# legend is titled accordingly instead of "Cluster".
plt.legend(title="Digit", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

---------

### PCA n_components=50

In [None]:
# Apply PCA with 50 components: fit on the standardized training data only,
# then project the test data with the same fitted components
pca = PCA(n_components=50)
x_train_pca_c50 = pca.fit_transform(x_train_standardized1)
x_test_pca_c50 = pca.transform(x_test_standardized1)

# Cache both projections on disk
np.save("x_train_pca_c50.npy", x_train_pca_c50)
np.save("x_test_pca_c50.npy", x_test_pca_c50)

print(f"Original number of features: {x_train_standardized1.shape[1]}")
print(f"Reduced number of features: {x_train_pca_c50.shape[1]}")

In [None]:
# Stack the train and test projections (train rows first) for clustering
x_full_pca_c50 = np.vstack([x_train_pca_c50, x_test_pca_c50])

#### Clustering

In [None]:
# Perform K-Means clustering on the PCA-reduced full (train + test) data
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_pca50 = kmeans.fit_predict(x_full_pca_c50)

In [None]:
# True labels in the same row order as x_full_pca_c50 (train first, then
# test). Built here because y_full is otherwise first assigned in the t-SNE
# section further down, so a top-to-bottom run raised NameError at this point.
y_full = np.hstack([y_train, y_test])

# ARI
# Compute ARI between true labels and cluster labels
ari_pca_c50 = adjusted_rand_score(y_full, cluster_labels_pca50)
print(f"Adjusted Rand Index (ARI): {ari_pca_c50}")

# Silhouette Score
silhouette_pca_c50 = silhouette_score(x_full_pca_c50, cluster_labels_pca50)
print(silhouette_pca_c50)

#### Classifiers

In [None]:
# k-NN accuracy on the test set for several neighbourhood sizes k;
# results are stored as {k: accuracy}
knn_accuracies_pca_c50 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_pca_c50, y_train)
    knn_accuracy = knn.score(x_test_pca_c50, y_test)
    knn_accuracies_pca_c50[k] = knn_accuracy

In [None]:
# SVM Accuracy: RBF-kernel SVM fitted on the PCA-reduced training data and
# scored on the PCA-reduced test data
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_pca_c50, y_train)
y_test_pred_pca_c50= svm_clf.predict(x_test_pca_c50)
svm_accuracy_pca_c50 = accuracy_score(y_test, y_test_pred_pca_c50)

In [None]:
# 10-Fold Cross-Validation Accuracy of an RBF SVM on the PCA-reduced training
# data; the mean and standard deviation over the folds are kept
cv_scores_pca_c50 = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_pca_c50, y_train, cv=10)
cv_accuracy_pca_c50 = cv_scores_pca_c50.mean()
cv_std_pca_c50 = cv_scores_pca_c50.std()

In [None]:
# Results for PCA (n_components=50): collect all metrics in one dictionary
results_pca_c50 = {
    'ARI': ari_pca_c50,
    'Silhouette Score': silhouette_pca_c50,
    'SVM Accuracy': svm_accuracy_pca_c50,
    'k-NN Accuracy': knn_accuracies_pca_c50,
    '10-Fold CV Accuracy': (cv_accuracy_pca_c50, cv_std_pca_c50)
}

print("PCA Results:")
print(results_pca_c50)

In [None]:
# Save intermediate data as .npy files (paths are relative to the working directory)
np.save("x_train_pca_c50.npy", x_train_pca_c50)  # pca-reduced training data
np.save("x_test_pca_c50.npy", x_test_pca_c50)    # pca-reduced test data
np.save("y_test_pred_pca_c50.npy", y_test_pred_pca_c50)  # SVM predictions
np.save("cv_scores_pca_c50.npy", cv_scores_pca_c50)      # Cross-validation scores

# Save k-NN accuracies to JSON (the integer k keys are written as strings)
with open("knn_accuracies_pca_c50.json", "w") as file:
    json.dump(knn_accuracies_pca_c50, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python values.

    Dicts, lists and tuples are walked recursively and keep their container
    type. NumPy scalars of any kind become the matching Python scalar, NumPy
    arrays become nested lists, and NumPy-scalar dict keys are converted as
    well. Any other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # e.g. the (mean, std) pair stored under '10-Fold CV Accuracy';
        # previously tuples were passed through without conversion.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar width, not only float32/64 and int32/64.
        return obj.item()
    return obj

# Convert results to JSON-serializable format (same entries as results_pca_c50)
results_pca_c50_serializable = convert_to_serializable({
    'ARI': ari_pca_c50,
    'Silhouette Score': silhouette_pca_c50,
    'SVM Accuracy': svm_accuracy_pca_c50,
    'k-NN Accuracy': knn_accuracies_pca_c50,
    '10-Fold CV Accuracy': (cv_accuracy_pca_c50, cv_std_pca_c50)
})

# Save results summary to JSON
with open("pca_c50_results.json", "w") as file:
    json.dump(results_pca_c50_serializable, file, indent=4)

print("PCA results and intermediate data saved successfully!")

-------

### PCA (95% explained variance)

In [None]:
# Apply PCA, keeping as many components as needed to explain 95% of the
# variance; fitted on the training data only, then applied to the test data
pca = PCA(0.95)
x_train_pca_95 = pca.fit_transform(x_train_standardized1)
x_test_pca_95 = pca.transform(x_test_standardized1)

# Cache both projections on disk
np.save("x_train_pca_95.npy", x_train_pca_95)
np.save("x_test_pca_95.npy", x_test_pca_95)

print(f"Original number of features: {x_train_standardized1.shape[1]}")
print(f"Reduced number of features: {x_train_pca_95.shape[1]}")

In [None]:
# Reload the cached projections from disk (replaces the in-memory arrays);
# requires the .npy files to exist in the working directory
x_train_pca_95= np.load("x_train_pca_95.npy")
x_test_pca_95= np.load("x_test_pca_95.npy")

#### Clustering

In [None]:
# Perform K-Means clustering on the PCA-reduced test data
# NOTE(review): the n_components=2 and n_components=50 sections cluster the
# stacked train + test data, whereas this one clusters the test set only, so
# the ARI / silhouette values are not directly comparable - confirm intent.
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_pca95 = kmeans.fit_predict(x_test_pca_95)

In [None]:
# ARI
# Compute ARI between the true test labels and the K-Means cluster labels
ari_pca_95 = adjusted_rand_score(y_test, cluster_labels_pca95)
print(f"Adjusted Rand Index (ARI): {ari_pca_95}")
# Disabled alternative: ARI computed from 1-NN predictions instead of K-Means labels
# ari_pca_95 = adjusted_rand_score(y_test, KNeighborsClassifier(n_neighbors=1).fit(x_train_pca_95, y_train).predict(x_test_pca_95))

# Silhouette Score of the K-Means clustering on the test projection
silhouette_pca_95 = silhouette_score(x_test_pca_95, cluster_labels_pca95)
print(silhouette_pca_95)
# Disabled alternative: silhouette computed from 1-NN predictions instead of K-Means labels
# silhouette_pca_95 = silhouette_score(x_test_pca_95, KNeighborsClassifier(n_neighbors=1).fit(x_train_pca_95, y_train).predict(x_test_pca_95))

#### Classifiers

In [None]:
# SVM Accuracy: RBF-kernel SVM fitted on the PCA-reduced training data and
# scored on the PCA-reduced test data
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_pca_95, y_train)
y_test_pred_pca_95= svm_clf.predict(x_test_pca_95)
svm_accuracy_pca = accuracy_score(y_test, y_test_pred_pca_95)

In [None]:
# k-NN accuracy on the test set for varying k, stored as {k: accuracy}
# NOTE(review): k is 1, 5, 10 here but 100, 200, 400 in the other sections -
# confirm which set is intended before comparing results across methods.
knn_accuracies_pca = {}
for k in [1, 5, 10]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_pca_95, y_train)
    knn_accuracy = knn.score(x_test_pca_95, y_test)
    knn_accuracies_pca[k] = knn_accuracy

# 10-Fold Cross-Validation Accuracy of an RBF SVM on the PCA-reduced training data
cv_scores_pca = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_pca_95, y_train, cv=10)
cv_accuracy_pca = cv_scores_pca.mean()
cv_std_pca = cv_scores_pca.std()


In [None]:
# Results for PCA (95% explained variance): collect all metrics in one dictionary
results_pca = {
    'ARI': ari_pca_95,
    'Silhouette Score': silhouette_pca_95,
    'SVM Accuracy': svm_accuracy_pca,
    'k-NN Accuracy': knn_accuracies_pca,
    '10-Fold CV Accuracy': (cv_accuracy_pca, cv_std_pca)
}

print("PCA Results:")
print(results_pca)

In [None]:
# Plot the first two principal components of the training data, coloured by
# true digit label
plt.figure(figsize=(10, 8))
sns.scatterplot(x=x_train_pca_95[:, 0], y=x_train_pca_95[:, 1], hue=y_train, palette='tab10', s=10, legend='full')
plt.title("2D Scatter Plot of PCA-reduced Data")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
# The hue is y_train (ground-truth digits), not a clustering result, so the
# legend is titled accordingly instead of "Cluster".
plt.legend(title="Digit", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
import json

# Save intermediate data (PCA embeddings and other computationally expensive results)
np.save("x_train_pca_95.npy", x_train_pca_95)
np.save("x_test_pca_95.npy", x_test_pca_95)
np.save("y_test_pred_pca_95.npy", y_test_pred_pca_95)  # Save SVM predictions
np.save("cv_scores_pca.npy", cv_scores_pca)      # Save cross-validation scores

# Save k-NN accuracies (the integer k keys are written as strings)
with open("knn_accuracies_pca.json", "w") as file:
    json.dump(knn_accuracies_pca, file, indent=4)

# Save PCA Results: the metrics plus the names of the files written above
results_pca_95 = {
    'ARI': ari_pca_95,
    'Silhouette Score': silhouette_pca_95,
    'SVM Accuracy': svm_accuracy_pca,
    'k-NN Accuracy': knn_accuracies_pca,
    '10-Fold CV Accuracy': {
        'Mean': cv_accuracy_pca,
        'StdDev': cv_std_pca
    },
    'Filepaths': {
        'x_train_pca_95': "x_train_pca_95.npy",
        'x_test_pca_95': "x_test_pca_95.npy",
        'y_test_pred_pca_95': "y_test_pred_pca_95.npy",
        'cv_scores_pca': "cv_scores_pca.npy",
        'knn_accuracies_pca': "knn_accuracies_pca.json"
    }
}

# Save results to a JSON file
# NOTE(review): unlike the other sections, the dictionary is dumped directly
# without a NumPy-to-Python conversion step - verify every value is a type
# the json module accepts.
with open("pca_95_results.json", "w") as file:
    json.dump(results_pca_95, file, indent=4)

print("PCA 95% results and all intermediate data saved successfully!")

-------------

## T-SNE

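### t-SNE n_components=2 (n_components=50 is not possible with scikit-learn's default Barnes-Hut t-SNE, which supports at most 3 output dimensions)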

In [None]:
# Combine training and testing sets for t-SNE (unsupervised embedding).
# Train rows come first in both arrays, so they stay aligned row by row.
x_full = np.vstack([x_train_standardized1, x_test_standardized1])  # Combine standardized train and test data
y_full = np.hstack([y_train, y_test])  # Combine train and test labels

In [None]:
# Apply t-SNE to the combined train + test data (x_full). The embedding is
# split back into train and test parts afterwards using the number of training
# rows, and it is clustered against y_full, so it must contain one row per
# sample of x_full. Fitting on x_train_standardized1 alone left the test slice
# empty and made the embedding length disagree with y_full.
tsne = TSNE(n_components=2, random_state=42, perplexity=15)
x_full_tsne = tsne.fit_transform(x_full)

# Cache the embedding on disk so it can be reloaded without refitting
np.save("x_full_tsne.npy", x_full_tsne)

In [None]:
# Reload the cached t-SNE embedding from disk (replaces the in-memory array);
# requires x_full_tsne.npy to exist in the working directory
x_full_tsne= np.load('x_full_tsne.npy')

In [None]:
# Split the embeddings back into train and test sets: the first
# x_train_standardized1.shape[0] rows are treated as train, the rest as test
x_train_tsne_c2 = x_full_tsne[:x_train_standardized1.shape[0], :]  # Train embeddings
x_test_tsne_c2 = x_full_tsne[x_train_standardized1.shape[0]:, :]  # Test embeddings

In [None]:
# Save intermediate data (the train and test parts of the t-SNE embedding)
np.save("x_train_tsne_c2.npy", x_train_tsne_c2)
np.save("x_test_tsne_c2.npy", x_test_tsne_c2)

#### Clustering

In [None]:
# Perform K-Means clustering on the full t-SNE embedding (x_full_tsne)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_tsne_c2 = kmeans.fit_predict(x_full_tsne)

In [None]:
# ARI
# Compute ARI between true labels (y_full) and cluster labels; both must have
# one entry per row of x_full_tsne
ari_tsne_c2 = adjusted_rand_score(y_full, cluster_labels_tsne_c2)
print(f"Adjusted Rand Index (ARI): {ari_tsne_c2}")

In [None]:
# Silhouette Score of the K-Means clustering in the t-SNE embedding space
silhouette_tsne_c2 = silhouette_score(x_full_tsne, cluster_labels_tsne_c2)
print(silhouette_tsne_c2)

#### Classifiers

In [None]:
# k-NN accuracy on the test embedding for several neighbourhood sizes k;
# results are stored as {k: accuracy}
knn_accuracy_tsne_c2 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_tsne_c2, y_train)
    knn_accuracy = knn.score(x_test_tsne_c2, y_test)
    knn_accuracy_tsne_c2[k] = knn_accuracy

In [None]:
# Train SVM on the t-SNE embeddings of the training data
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_tsne_c2, y_train)

# Predict on the t-SNE embeddings of the test data
y_pred_svm_test = svm_clf.predict(x_test_tsne_c2)  # Predict on test embeddings

# Compute SVM accuracy
svm_accuracy_tsne_c2 = accuracy_score(y_test, y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_tsne_c2:.4f}")

In [None]:
# SVM with RBF kernel
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation on the t-SNE training embeddings and keep
# the mean and standard deviation of the fold accuracies
cv_scores_tsne_c2 = cross_val_score(svm_clf, x_train_tsne_c2, y_train, cv=10)
cv_accuracy_tsne_c2 = cv_scores_tsne_c2.mean()
cv_std_tsne_c2 = cv_scores_tsne_c2.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_tsne_c2:.4f} ± {cv_std_tsne_c2:.4f}")

In [None]:
# Results for t-SNE (n_components=2): collect all metrics in one dictionary
results_tsne_c2 = {
    'ARI': ari_tsne_c2,
    'Silhouette Score': silhouette_tsne_c2,
    'SVM Accuracy': svm_accuracy_tsne_c2,
    'k-NN Accuracy': knn_accuracy_tsne_c2,
    '10-Fold CV Accuracy': (cv_accuracy_tsne_c2, cv_std_tsne_c2)
}

# These are the t-SNE metrics; the label previously said "umap Results:".
print("t-SNE Results:")
print(results_tsne_c2)

In [None]:
# Save intermediate data
np.save("x_train_tsne_c2.npy", x_train_tsne_c2)  # t-SNE-reduced training data
np.save("x_test_tsne_c2.npy", x_test_tsne_c2)    # t-SNE-reduced test data
np.save("y_test_pred_tsne_c2.npy", y_pred_svm_test)  # SVM predictions
np.save("cv_scores_tsne_c2.npy", cv_scores_tsne_c2)      # Cross-validation scores

In [None]:
# Save k-NN accuracies to JSON (the integer k keys are written as strings)
with open("knn_accuracy_tsne_c2.json", "w") as file:
    json.dump(knn_accuracy_tsne_c2, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python values.

    Dicts, lists and tuples are walked recursively and keep their container
    type. NumPy scalars of any kind become the matching Python scalar, NumPy
    arrays become nested lists, and NumPy-scalar dict keys are converted as
    well. Any other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # e.g. the (mean, std) pair stored under '10-Fold CV Accuracy';
        # previously tuples were passed through without conversion.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar width, not only float32/64 and int32/64.
        return obj.item()
    return obj

# Convert results to JSON-serializable format (same entries as results_tsne_c2)
results_tsne_c2_serializable = convert_to_serializable({
    'ARI': ari_tsne_c2,
    'Silhouette Score': silhouette_tsne_c2,
    'SVM Accuracy': svm_accuracy_tsne_c2,
    'k-NN Accuracy': knn_accuracy_tsne_c2,
    '10-Fold CV Accuracy': (cv_accuracy_tsne_c2, cv_std_tsne_c2)
})

# Save results summary to JSON
# NOTE(review): the file is named "tsne_c50_results.json" although these are
# the n_components=2 results - looks like a copy-paste slip; rename it (and
# any code that reads the file) once confirmed.
with open("tsne_c50_results.json", "w") as file:
    json.dump(results_tsne_c2_serializable, file, indent=4)

print("tsne results and intermediate data saved successfully!")

------------

## ISOMAP

### ISOMAP n_components=2

In [None]:
def downsample_mnist_consistent(x_data, y_labels, sample_fraction=0.35, random_state=42):
    """Return row indices of a per-class random subsample of a labelled dataset.

    For every distinct label, ``int(count * sample_fraction)`` indices are
    drawn without replacement. Returning indices (rather than data) lets the
    caller select the same points from several arrays.

    Parameters
    ----------
    x_data : array-like
        Not used; kept so existing calls keep working.
    y_labels : array-like
        One label per sample.
    sample_fraction : float, optional
        Fraction of each class to keep, between 0 and 1 (default 0.35).
    random_state : int, optional
        Seed passed to ``resample`` for every class (default 42, the value
        that was previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Integer indices, grouped by label in ascending label order.

    Raises
    ------
    ValueError
        If ``sample_fraction`` is outside [0, 1].
    """
    if not 0 <= sample_fraction <= 1:
        raise ValueError(
            "sample_fraction must be between 0 and 1, got {}".format(sample_fraction))
    # Accept plain sequences too: comparing a list with a scalar would not
    # broadcast element-wise.
    y_labels = np.asarray(y_labels)
    sampled_indices = []
    for label in np.unique(y_labels):
        # Select indices for the current label
        label_indices = np.where(y_labels == label)[0]
        n_keep = int(len(label_indices) * sample_fraction)
        if n_keep == 0:
            # Nothing to draw for this class
            continue
        sampled_indices.extend(
            resample(label_indices, n_samples=n_keep, replace=False, random_state=random_state)
        )
    # An explicit integer dtype keeps an empty result usable as an index array
    # (np.array([]) would be float64).
    return np.array(sampled_indices, dtype=np.intp)

# Downsample training data: keep 35% of each class, selected by index so data
# and labels stay aligned
sampled_indices_train = downsample_mnist_consistent(x_train_standardized1, y_train, sample_fraction=0.35)
x_train_sampled = x_train_standardized1[sampled_indices_train]
y_train_sampled = y_train[sampled_indices_train]

# Downsample test data in the same way
sampled_indices_test = downsample_mnist_consistent(x_test_standardized1, y_test, sample_fraction=0.35)
x_test_sampled = x_test_standardized1[sampled_indices_test]
y_test_sampled = y_test[sampled_indices_test]

print(f"Training set reduced to {len(x_train_sampled)} samples.")
print(f"Test set reduced to {len(x_test_sampled)} samples.")

In [None]:
# Save the sampled indices so the same subset can be reselected later
np.save("sampled_indices_train.npy", sampled_indices_train)
np.save("sampled_indices_test.npy", sampled_indices_test)

# Save the downsampled dataset (data and labels for both splits)
np.save("x_train_sampled.npy", x_train_sampled)
np.save("y_train_sampled.npy", y_train_sampled)
np.save("x_test_sampled.npy", x_test_sampled)
np.save("y_test_sampled.npy", y_test_sampled)

print("Downsampling saved successfully!")

In [None]:
# Load the sampled indices from disk (replaces the in-memory arrays)
sampled_indices_train= np.load("sampled_indices_train.npy")
sampled_indices_test= np.load("sampled_indices_test.npy")

# Load the downsampled dataset from disk
x_train_sampled= np.load("x_train_sampled.npy")
y_train_sampled= np.load("y_train_sampled.npy")
x_test_sampled= np.load("x_test_sampled.npy")
y_test_sampled= np.load("y_test_sampled.npy")

In [None]:
# Apply Isomap (2 components, 15 neighbours): fit on the downsampled training
# data, then map the downsampled test data with the fitted model
isomap = Isomap(n_components=2, n_neighbors=15)
x_train_isomap_c2 = isomap.fit_transform(x_train_sampled)
x_test_isomap_c2 = isomap.transform(x_test_sampled)

In [None]:
# Cache both Isomap embeddings on disk
np.save('x_train_isomap_c2.npy',x_train_isomap_c2)
np.save('x_test_isomap_c2.npy',x_test_isomap_c2)

#### Clustering

In [None]:
# Perform K-Means clustering on the Isomap embedding of the downsampled training data
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_isomap_c2 = kmeans.fit_predict(x_train_isomap_c2)

In [None]:
# ARI
# Compute ARI between the true labels of the downsampled training set and the cluster labels
ari_isomap_c2 = adjusted_rand_score(y_train_sampled, cluster_labels_isomap_c2)
print(f"Adjusted Rand Index (ARI): {ari_isomap_c2}")

In [None]:
# Silhouette Score of the K-Means clustering in the Isomap embedding space
silhouette_isomap_c2 = silhouette_score(x_train_isomap_c2, cluster_labels_isomap_c2)
print(silhouette_isomap_c2)

#### Classifiers

In [None]:
# k-NN accuracy for varying k, measured on the held-out (downsampled) test
# embedding. Scoring on x_train_isomap_c2 - the data the classifier was fitted
# on - reported training accuracy, which is not comparable with the test
# accuracies of the PCA and t-SNE sections.
knn_accuracy_isomap_c2 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_isomap_c2, y_train_sampled)
    knn_accuracy = knn.score(x_test_isomap_c2, y_test_sampled)
    knn_accuracy_isomap_c2[k] = knn_accuracy

In [None]:
# Train SVM on the Isomap embedding of the downsampled training data
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_isomap_c2, y_train_sampled)

# Predict on the held-out test embedding. Predicting on the training
# embedding measured training accuracy, and these predictions are later saved
# as "y_test_pred_isomap_c2.npy".
y_pred_svm = svm_clf.predict(x_test_isomap_c2)

# Compute SVM accuracy on the test set
svm_accuracy_isomap_c2 = accuracy_score(y_test_sampled, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy_isomap_c2:.4f}")


In [None]:
# SVM with RBF kernel
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation on the Isomap training embedding and keep
# the mean and standard deviation of the fold accuracies
cv_scores_isomap_c2 = cross_val_score(svm_clf, x_train_isomap_c2, y_train_sampled, cv=10)
cv_accuracy_isomap_c2 = cv_scores_isomap_c2.mean()
cv_std_isomap_c2 = cv_scores_isomap_c2.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_isomap_c2:.4f} ± {cv_std_isomap_c2:.4f}")

In [None]:
# Results for Isomap (n_components=2): collect all metrics in one dictionary
results_isomap_c2 = {
    'ARI': ari_isomap_c2,
    'Silhouette Score': silhouette_isomap_c2,
    'SVM Accuracy': svm_accuracy_isomap_c2,
    'k-NN Accuracy': knn_accuracy_isomap_c2,
    '10-Fold CV Accuracy': (cv_accuracy_isomap_c2, cv_std_isomap_c2)
}

print("Isomap Results:")
print(results_isomap_c2)

In [None]:
# Save intermediate data
np.save("x_train_isomap_c2.npy", x_train_isomap_c2)  # ISOMAP-reduced training data
np.save("x_test_isomap_c2.npy", x_test_isomap_c2)    # ISOMAP-reduced test data
# NOTE(review): y_pred_svm is whatever the SVM cell above predicted; check
# that it really holds test-set predictions before relying on this file name.
np.save("y_test_pred_isomap_c2.npy", y_pred_svm)  # SVM predictions
np.save("cv_scores_isomap_c2.npy", cv_scores_isomap_c2)      # Cross-validation scores

# Save k-NN accuracies to JSON (the integer k keys are written as strings)
with open("knn_accuracy_isomap_c2.json", "w") as file:
    json.dump(knn_accuracy_isomap_c2, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python values.

    Dicts, lists and tuples are walked recursively and keep their container
    type. NumPy scalars of any kind become the matching Python scalar, NumPy
    arrays become nested lists, and NumPy-scalar dict keys are converted as
    well. Any other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # e.g. the (mean, std) pair stored under '10-Fold CV Accuracy';
        # previously tuples were passed through without conversion.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # Covers every NumPy scalar width, not only float32/64 and int32/64.
        return obj.item()
    return obj

# Convert results to JSON-serializable format (same entries as results_isomap_c2)
results_isomap_c2_serializable = convert_to_serializable({
    'ARI': ari_isomap_c2,
    'Silhouette Score': silhouette_isomap_c2,
    'SVM Accuracy': svm_accuracy_isomap_c2,
    'k-NN Accuracy': knn_accuracy_isomap_c2,
    '10-Fold CV Accuracy': (cv_accuracy_isomap_c2, cv_std_isomap_c2)
})

# Save results summary to JSON
with open("isomap_c2_results.json", "w") as file:
    json.dump(results_isomap_c2_serializable, file, indent=4)

print("ISOMAP results and intermediate data saved successfully!")

-------

### ISOMAP n_components=50

In [None]:
# Apply Isomap (50 components, 15 neighbours): fit on the downsampled training
# data, then map the downsampled test data with the fitted model
isomap = Isomap(n_components=50, n_neighbors=15)
x_train_isomap_c50 = isomap.fit_transform(x_train_sampled)
x_test_isomap_c50 = isomap.transform(x_test_sampled)

#### Clustering

In [None]:
# Perform K-Means clustering on the Isomap embedding of the downsampled training data
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_isomap_c50 = kmeans.fit_predict(x_train_isomap_c50)

In [None]:
# ARI
# Compute ARI between the true labels of the downsampled training set and the cluster labels
ari_isomap_c50 = adjusted_rand_score(y_train_sampled, cluster_labels_isomap_c50)
print(f"Adjusted Rand Index (ARI): {ari_isomap_c50}")

In [None]:
# Silhouette Score of the K-Means clustering in the Isomap embedding space
silhouette_isomap_c50 = silhouette_score(x_train_isomap_c50, cluster_labels_isomap_c50)
print(silhouette_isomap_c50)

#### Classifiers

In [None]:
# k-NN accuracy for varying k, measured on the held-out (downsampled) test
# embedding. Scoring on x_train_isomap_c50 - the data the classifier was
# fitted on - reported training accuracy, which is not comparable with the
# test accuracies of the PCA and t-SNE sections.
knn_accuracy_isomap_c50 = {}
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_isomap_c50, y_train_sampled)
    knn_accuracy = knn.score(x_test_isomap_c50, y_test_sampled)
    knn_accuracy_isomap_c50[k] = knn_accuracy

In [None]:
# Train SVM on the Isomap embedding of the downsampled training data
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_isomap_c50, y_train_sampled)

# Predict on the held-out test embedding. Predicting on the training
# embedding measured training accuracy, and these predictions are later saved
# as "y_test_pred_isomap_c50.npy".
y_pred_svm = svm_clf.predict(x_test_isomap_c50)

# Compute SVM accuracy on the test set
svm_accuracy_isomap_c50 = accuracy_score(y_test_sampled, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy_isomap_c50:.4f}")


In [None]:
# SVM with RBF kernel
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation on the Isomap training embedding and keep
# the mean and standard deviation of the fold accuracies
cv_scores_isomap_c50 = cross_val_score(svm_clf, x_train_isomap_c50, y_train_sampled, cv=10)
cv_accuracy_isomap_c50 = cv_scores_isomap_c50.mean()
cv_std_isomap_c50 = cv_scores_isomap_c50.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_isomap_c50:.4f} ± {cv_std_isomap_c50:.4f}")

In [None]:
# Results for Isomap (n_components=50): collect all metrics in one dictionary
results_isomap_c50 = {
    'ARI': ari_isomap_c50,
    'Silhouette Score': silhouette_isomap_c50,
    'SVM Accuracy': svm_accuracy_isomap_c50,
    'k-NN Accuracy': knn_accuracy_isomap_c50,
    '10-Fold CV Accuracy': (cv_accuracy_isomap_c50, cv_std_isomap_c50)
}

print("Isomap Results:")
print(results_isomap_c50)

In [None]:
# Save intermediate data
np.save("x_train_isomap_c50.npy", x_train_isomap_c50)  # ISOMAP-reduced training data
np.save("x_test_isomap_c50.npy", x_test_isomap_c50)    # ISOMAP-reduced test data
# NOTE(review): y_pred_svm is whatever the SVM cell above predicted; check
# that it really holds test-set predictions before relying on this file name.
np.save("y_test_pred_isomap_c50.npy", y_pred_svm)  # SVM predictions
np.save("cv_scores_isomap_c50.npy", cv_scores_isomap_c50)      # Cross-validation scores

# Save k-NN accuracies to JSON (the integer k keys are written as strings)
with open("knn_accuracy_isomap_c50.json", "w") as file:
    json.dump(knn_accuracy_isomap_c50, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python objects.

    Dicts, lists and tuples are walked recursively and keep their container
    type; NumPy scalars (float16/32/64, the integer widths, ``bool_``, ...)
    become the matching Python scalar and NumPy arrays become nested lists.
    Any other object is returned unchanged.

    Args:
        obj: Arbitrary, possibly nested, result structure.

    Returns:
        A structure with the same layout that ``json.dump`` can serialise,
        provided the remaining leaves are themselves JSON-compatible.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are rejected by json.dump, so unwrap those as well.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The (mean, std) cross-validation pair is stored as a tuple.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # One branch for all NumPy scalar types instead of a fixed dtype list.
        return obj.item()
    return obj

# Convert results to JSON-serializable format
# Same fields as results_isomap_c50, passed through the helper so NumPy scalars
# become plain Python numbers before json.dump.
results_isomap_c50_serializable = convert_to_serializable({
    'ARI': ari_isomap_c50,
    'Silhouette Score': silhouette_isomap_c50,
    'SVM Accuracy': svm_accuracy_isomap_c50,
    'k-NN Accuracy': knn_accuracy_isomap_c50,
    '10-Fold CV Accuracy': (cv_accuracy_isomap_c50, cv_std_isomap_c50)
})

# Save results summary to JSON
with open("isomap_c50_results.json", "w") as file:
    json.dump(results_isomap_c50_serializable, file, indent=4)

print("ISOMAP results and intermediate data saved successfully!")

-----------

## LLE

### LLE n_components=2

In [None]:
# Apply LLE
# The model is fitted on the sampled training set only; the test set is then
# projected with that fitted model so both share one embedding space.
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=15, method='standard')
x_train_lle_c2 = lle.fit_transform(x_train_sampled)
x_test_lle_c2 = lle.transform(x_test_sampled)

#### Clustering

In [None]:
# Perform K-Means clustering on the LLE-reduced test data
# fit_predict fits the centroids and returns one cluster id per test row.
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_lle_c2 = kmeans.fit_predict(x_test_lle_c2)

In [None]:
# Reload the LLE embeddings from disk, replacing the in-memory arrays.
# NOTE(review): cluster_labels_lle_c2 above was computed from the in-memory test
# embedding; confirm the saved file holds the same embedding before scoring.
x_test_lle_c2= np.load('x_test_lle_c2.npy')
x_train_lle_c2= np.load('x_train_lle_c2.npy')

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# Both arrays refer to the sampled test set, which is what K-Means was run on.
ari_lle_c2 = adjusted_rand_score(y_test_sampled, cluster_labels_lle_c2)
print(f"Adjusted Rand Index (ARI): {ari_lle_c2}")

In [None]:
# Silhouette Score
# Computed in the 2-D LLE test embedding with the K-Means cluster assignments.
silhouette_lle_c2 = silhouette_score(x_test_lle_c2, cluster_labels_lle_c2)
print(silhouette_lle_c2)

#### Classifiers

In [None]:
# k-NN accuracy for varying k, measured on held-out data.
# Each classifier is fitted on the LLE training embedding and scored on the LLE
# test embedding. Scoring on the training set would count every point among its
# own neighbours and overstate accuracy; the UMAP and MDS sections also score on
# the test split.
knn_accuracy_lle_c2 = {}  # maps k -> test accuracy
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_lle_c2, y_train_sampled)
    knn_accuracy = knn.score(x_test_lle_c2, y_test_sampled)
    knn_accuracy_lle_c2[k] = knn_accuracy

In [None]:
# Train SVM on the LLE training embedding
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_lle_c2, y_train_sampled)

# Predict on the held-out test embedding. The variable keeps the name y_pred_svm
# because the save cell writes it to "y_test_pred_lle_c2.npy".
y_pred_svm = svm_clf.predict(x_test_lle_c2)

# Compute SVM accuracy against the test labels
svm_accuracy_lle_c2 = accuracy_score(y_test_sampled, y_pred_svm)
print(f"SVM Accuracy (Test): {svm_accuracy_lle_c2:.4f}")


In [None]:
# SVM with RBF kernel
# A fresh, unfitted estimator: cross_val_score clones and refits it for every fold.
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
# One accuracy per fold on the LLE training embedding; summarised as mean and std.
cv_scores_lle_c2 = cross_val_score(svm_clf, x_train_lle_c2, y_train_sampled, cv=10)
cv_accuracy_lle_c2 = cv_scores_lle_c2.mean()
cv_std_lle_c2 = cv_scores_lle_c2.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_lle_c2:.4f} ± {cv_std_lle_c2:.4f}")

In [None]:
# Results for lle
# Collects the metrics computed in the cells above into one dict for display.
results_lle_c2 = {
    'ARI': ari_lle_c2,
    'Silhouette Score': silhouette_lle_c2,
    'SVM Accuracy': svm_accuracy_lle_c2,
    'k-NN Accuracy': knn_accuracy_lle_c2,  # dict keyed by k
    '10-Fold CV Accuracy': (cv_accuracy_lle_c2, cv_std_lle_c2)  # (mean, std)
}

print("lle Results:")
print(results_lle_c2)

In [None]:
# Save intermediate data
# Files are written to the current working directory; the two embedding files
# are the ones the reload cell earlier in this section reads back.
np.save("x_train_lle_c2.npy", x_train_lle_c2)  # lle-reduced training data
np.save("x_test_lle_c2.npy", x_test_lle_c2)    # lle-reduced test data
np.save("y_test_pred_lle_c2.npy", y_pred_svm)  # SVM predictions
np.save("cv_scores_lle_c2.npy", cv_scores_lle_c2)      # Cross-validation scores

# Save k-NN accuracies to JSON
# The integer k keys are written as strings, as JSON object keys always are.
with open("knn_accuracy_lle_c2.json", "w") as file:
    json.dump(knn_accuracy_lle_c2, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python objects.

    Dicts, lists and tuples are walked recursively and keep their container
    type; NumPy scalars (float16/32/64, the integer widths, ``bool_``, ...)
    become the matching Python scalar and NumPy arrays become nested lists.
    Any other object is returned unchanged.

    Args:
        obj: Arbitrary, possibly nested, result structure.

    Returns:
        A structure with the same layout that ``json.dump`` can serialise,
        provided the remaining leaves are themselves JSON-compatible.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are rejected by json.dump, so unwrap those as well.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The (mean, std) cross-validation pair is stored as a tuple.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # One branch for all NumPy scalar types instead of a fixed dtype list.
        return obj.item()
    return obj

# Convert results to JSON-serializable format
# Same fields as results_lle_c2, passed through the helper so NumPy scalars
# become plain Python numbers before json.dump.
results_lle_c2_serializable = convert_to_serializable({
    'ARI': ari_lle_c2,
    'Silhouette Score': silhouette_lle_c2,
    'SVM Accuracy': svm_accuracy_lle_c2,
    'k-NN Accuracy': knn_accuracy_lle_c2,
    '10-Fold CV Accuracy': (cv_accuracy_lle_c2, cv_std_lle_c2)
})

# Save results summary to JSON
with open("lle_c2_results.json", "w") as file:
    json.dump(results_lle_c2_serializable, file, indent=4)

print("lle results and intermediate data saved successfully!")

### LLE n_components=50

In [None]:
# Apply LLE
# The model is fitted on the sampled training set only; the test set is then
# projected with that fitted model so both share one 50-D embedding space.
lle = LocallyLinearEmbedding(n_components=50, n_neighbors=15, method='standard')
x_train_lle_c50 = lle.fit_transform(x_train_sampled)
x_test_lle_c50 = lle.transform(x_test_sampled)

In [None]:
# Reload the 50-D LLE embeddings from disk, replacing the in-memory arrays
# (lets the expensive LLE fit above be skipped when the files already exist).
x_train_lle_c50= np.load('x_train_lle_c50.npy')
x_test_lle_c50= np.load('x_test_lle_c50.npy')

In [None]:
# Stack train then test so embeddings and labels stay row-aligned for clustering.
x_full_lle_c50 = np.vstack([x_train_lle_c50, x_test_lle_c50])
y_full_lle_c50 = np.hstack([y_train_sampled, y_test_sampled])

#### Clustering

In [None]:
# Perform K-Means clustering on the LLE-reduced data (train and test stacked)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_lle_c50 = kmeans.fit_predict(x_full_lle_c50)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# y_full_lle_c50 is stacked in the same train-then-test order as x_full_lle_c50.
ari_lle_c50 = adjusted_rand_score(y_full_lle_c50, cluster_labels_lle_c50)
print(f"Adjusted Rand Index (ARI): {ari_lle_c50}")

In [None]:
# Silhouette Score
# Computed in the 50-D LLE embedding of train+test with the K-Means assignments.
silhouette_lle_c50 = silhouette_score(x_full_lle_c50, cluster_labels_lle_c50)
print(silhouette_lle_c50)

#### Classifiers

In [None]:
# k-NN accuracy for varying k, measured on held-out data.
# Each classifier is fitted on the LLE training embedding and scored on the LLE
# test embedding. Scoring on the training set would count every point among its
# own neighbours and overstate accuracy; the UMAP and MDS sections also score on
# the test split.
knn_accuracy_lle_c50 = {}  # maps k -> test accuracy
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_lle_c50, y_train_sampled)
    knn_accuracy = knn.score(x_test_lle_c50, y_test_sampled)
    knn_accuracy_lle_c50[k] = knn_accuracy

In [None]:
# Train SVM on the LLE training embedding
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_lle_c50, y_train_sampled)

# Predict on the held-out test embedding. The variable keeps the name y_pred_svm
# because the save cell writes it to "y_test_pred_lle_c50.npy".
y_pred_svm = svm_clf.predict(x_test_lle_c50)

# Compute SVM accuracy against the test labels
svm_accuracy_lle_c50 = accuracy_score(y_test_sampled, y_pred_svm)
print(f"SVM Accuracy (Test): {svm_accuracy_lle_c50:.4f}")


In [None]:
# SVM with RBF kernel
# A fresh, unfitted estimator: cross_val_score clones and refits it for every fold.
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
# One accuracy per fold on the LLE training embedding; summarised as mean and std.
cv_scores_lle_c50 = cross_val_score(svm_clf, x_train_lle_c50, y_train_sampled, cv=10)
cv_accuracy_lle_c50 = cv_scores_lle_c50.mean()
cv_std_lle_c50 = cv_scores_lle_c50.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_lle_c50:.4f} ± {cv_std_lle_c50:.4f}")

In [None]:
# Results for lle
# Collects the metrics computed in the cells above into one dict for display.
results_lle_c50 = {
    'ARI': ari_lle_c50,
    'Silhouette Score': silhouette_lle_c50,
    'SVM Accuracy': svm_accuracy_lle_c50,
    'k-NN Accuracy': knn_accuracy_lle_c50,  # dict keyed by k
    '10-Fold CV Accuracy': (cv_accuracy_lle_c50, cv_std_lle_c50)  # (mean, std)
}

print("lle Results:")
print(results_lle_c50)

In [None]:
# Save intermediate data
# Files are written to the current working directory; the two embedding files
# are the ones the reload cell earlier in this section reads back.
np.save("x_train_lle_c50.npy", x_train_lle_c50)  # lle-reduced training data
np.save("x_test_lle_c50.npy", x_test_lle_c50)    # lle-reduced test data
np.save("y_test_pred_lle_c50.npy", y_pred_svm)  # SVM predictions
np.save("cv_scores_lle_c50.npy", cv_scores_lle_c50)      # Cross-validation scores

# Save k-NN accuracies to JSON
# The integer k keys are written as strings, as JSON object keys always are.
with open("knn_accuracy_lle_c50.json", "w") as file:
    json.dump(knn_accuracy_lle_c50, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python objects.

    Dicts, lists and tuples are walked recursively and keep their container
    type; NumPy scalars (float16/32/64, the integer widths, ``bool_``, ...)
    become the matching Python scalar and NumPy arrays become nested lists.
    Any other object is returned unchanged.

    Args:
        obj: Arbitrary, possibly nested, result structure.

    Returns:
        A structure with the same layout that ``json.dump`` can serialise,
        provided the remaining leaves are themselves JSON-compatible.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are rejected by json.dump, so unwrap those as well.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The (mean, std) cross-validation pair is stored as a tuple.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # One branch for all NumPy scalar types instead of a fixed dtype list.
        return obj.item()
    return obj

# Convert results to JSON-serializable format
# Same fields as results_lle_c50, passed through the helper so NumPy scalars
# become plain Python numbers before json.dump.
results_lle_c50_serializable = convert_to_serializable({
    'ARI': ari_lle_c50,
    'Silhouette Score': silhouette_lle_c50,
    'SVM Accuracy': svm_accuracy_lle_c50,
    'k-NN Accuracy': knn_accuracy_lle_c50,
    '10-Fold CV Accuracy': (cv_accuracy_lle_c50, cv_std_lle_c50)
})

# Save results summary to JSON
with open("lle_c50_results.json", "w") as file:
    json.dump(results_lle_c50_serializable, file, indent=4)

print("lle results and intermediate data saved successfully!")

-------

## UMAP

### UMAP n_components=2

In [None]:
# Apply UMAP
# The notebook header only does `import umap`, so the bare class name `UMAP` is
# undefined unless it is imported here; this is the first UMAP cell to run.
from umap import UMAP

# NOTE: binding the fitted model to `umap` shadows the `umap` module from the
# header; the name is kept unchanged because other cells may rely on it.
umap = UMAP(n_components=2, n_neighbors=15, random_state=42)
# Fit on the standardized training data, then project the test data with the same model.
x_train_umap_c2_std = umap.fit_transform(x_train_standardized1)
x_test_umap_c2_std = umap.transform(x_test_standardized1)

In [None]:
# Save intermediate data (umap embeddings and other computationally expensive results)
# Written to the current working directory.
np.save("x_train_umap_c2_std.npy", x_train_umap_c2_std)
np.save("x_test_umap_c2_std.npy", x_test_umap_c2_std)

In [None]:
# Stack train then test embeddings; the label vector used for ARI must follow the same order.
x_full_umap_c2_std = np.vstack([x_train_umap_c2_std, x_test_umap_c2_std])

#### Clustering

In [None]:
# Perform K-Means clustering on the UMAP-reduced data (train and test stacked)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_umap_c2_std = kmeans.fit_predict(x_full_umap_c2_std)

In [None]:
# ARI
# K-Means was run on the train embeddings stacked above the test embeddings, so
# the reference labels are stacked in the same order. y_full is built here
# because no earlier cell of this section defines it (the later UMAP sections
# define it with this same expression).
y_full = np.hstack([y_train, y_test])

# Compute ARI between true labels and cluster labels
ari_umap_c2_std = adjusted_rand_score(y_full, cluster_labels_umap_c2_std)
print(f"Adjusted Rand Index (ARI): {ari_umap_c2_std}")

In [None]:
# Silhouette Score
# Computed in the 2-D UMAP embedding of train+test with the K-Means assignments.
silhouette_umap_c2_std = silhouette_score(x_full_umap_c2_std, cluster_labels_umap_c2_std)
print(silhouette_umap_c2_std)

#### Classifiers

In [None]:
# k-NN Accuracy for varying k
# Fitted on the training embedding and scored on the held-out test embedding.
knn_accuracy_umap_c2_std = {}  # maps k -> test accuracy
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_umap_c2_std, y_train)
    knn_accuracy = knn.score(x_test_umap_c2_std, y_test)
    knn_accuracy_umap_c2_std[k] = knn_accuracy

In [None]:
# Train SVM on umap embeddings
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_umap_c2_std, y_train)

# Predict on the held-out test embeddings (not the training ones)
y_pred_svm_test = svm_clf.predict(x_test_umap_c2_std)  # Predict on test embeddings

# Compute SVM accuracy
svm_accuracy_umap_c2_std = accuracy_score(y_test, y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_umap_c2_std:.4f}")

In [None]:
# SVM with RBF kernel
# A fresh, unfitted estimator: cross_val_score clones and refits it for every fold.
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
# One accuracy per fold on the UMAP training embedding; summarised as mean and std.
cv_scores_umap_c2_std = cross_val_score(svm_clf, x_train_umap_c2_std, y_train, cv=10)
cv_accuracy_umap_c2_std = cv_scores_umap_c2_std.mean()
cv_std_umap_c2_std = cv_scores_umap_c2_std.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_umap_c2_std:.4f} ± {cv_std_umap_c2_std:.4f}")

In [None]:
# Results for umap
# Collects the metrics computed in the cells above into one dict for display.
results_umap_c2_std = {
    'ARI': ari_umap_c2_std,
    'Silhouette Score': silhouette_umap_c2_std,
    'SVM Accuracy': svm_accuracy_umap_c2_std,
    'k-NN Accuracy': knn_accuracy_umap_c2_std,  # dict keyed by k
    '10-Fold CV Accuracy': (cv_accuracy_umap_c2_std, cv_std_umap_c2_std)  # (mean, std)
}

print("umap Results:")
print(results_umap_c2_std)

----

### UMAP n_components=50

In [None]:
# UMAP
from umap import UMAP

# Apply UMAP
# NOTE: rebinding `umap` to the fitted model shadows the `umap` module imported
# in the notebook header.
umap = UMAP(n_components=50, n_neighbors=15, random_state=42)
# Fit on the standardized training data, then project the test data with the same model.
x_train_umap_c50 = umap.fit_transform(x_train_standardized1)
x_test_umap_c50 = umap.transform(x_test_standardized1)

In [None]:
# Save intermediate data (umap embeddings and other computationally expensive results)
# Written to the current working directory.
np.save("x_train_umap_c50.npy", x_train_umap_c50)
np.save("x_test_umap_c50.npy", x_test_umap_c50)

In [None]:
# Stack train then test embeddings; y_full is stacked in the same order.
x_full_umap_c50 = np.vstack([x_train_umap_c50, x_test_umap_c50])

In [None]:
# True labels stacked train-then-test, matching the row order of the stacked embeddings.
y_full = np.hstack([y_train, y_test])

#### Clustering

In [None]:
# Perform K-Means clustering on the UMAP-reduced data (train and test stacked)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_umap_c50 = kmeans.fit_predict(x_full_umap_c50)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# y_full follows the same train-then-test order as x_full_umap_c50.
ari_umap_c50 = adjusted_rand_score(y_full, cluster_labels_umap_c50)
print(f"Adjusted Rand Index (ARI): {ari_umap_c50}")

In [None]:
# Silhouette Score
# Computed in the 50-D UMAP embedding of train+test with the K-Means assignments.
silhouette_umap_c50 = silhouette_score(x_full_umap_c50, cluster_labels_umap_c50)
print(silhouette_umap_c50)

#### Classifier

In [None]:
# k-NN Accuracy for varying k
# Fitted on the training embedding and scored on the held-out test embedding.
knn_accuracy_umap_c50 = {}  # maps k -> test accuracy
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_umap_c50, y_train)
    knn_accuracy = knn.score(x_test_umap_c50, y_test)
    knn_accuracy_umap_c50[k] = knn_accuracy

In [None]:
# Train SVM on umap embeddings
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_umap_c50, y_train)

# Predict on the held-out test embeddings (not the training ones)
y_pred_svm_test = svm_clf.predict(x_test_umap_c50)  # Predict on test embeddings

# Compute SVM accuracy
svm_accuracy_umap_c50 = accuracy_score(y_test, y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_umap_c50:.4f}")

In [None]:
# Reload cross-validation scores saved by an earlier run.
# NOTE(review): the next cell recomputes and overwrites this array, and only that
# cell derives the mean/std — confirm whether this reload is meant as an alternative.
cv_scores_umap_c50= np.load('cv_scores_umap_c50.npy')

In [None]:
# SVM with RBF kernel
# A fresh, unfitted estimator: cross_val_score clones and refits it for every fold.
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
# One accuracy per fold on the UMAP training embedding; summarised as mean and std.
cv_scores_umap_c50 = cross_val_score(svm_clf, x_train_umap_c50, y_train, cv=10)
cv_accuracy_umap_c50 = cv_scores_umap_c50.mean()
cv_std_umap_c50 = cv_scores_umap_c50.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_umap_c50:.4f} ± {cv_std_umap_c50:.4f}")

In [None]:
# Results for umap
# Collects the metrics computed in the cells above into one dict for display.
results_umap_c50 = {
    'ARI': ari_umap_c50,
    'Silhouette Score': silhouette_umap_c50,
    'SVM Accuracy': svm_accuracy_umap_c50,
    'k-NN Accuracy': knn_accuracy_umap_c50,  # dict keyed by k
    '10-Fold CV Accuracy': (cv_accuracy_umap_c50, cv_std_umap_c50)  # (mean, std)
}

print("umap Results:")
print(results_umap_c50)

In [None]:
# Save intermediate data (written to the current working directory)
np.save("x_train_umap_c50.npy", x_train_umap_c50)  # umap-reduced training data
np.save("x_test_umap_c50.npy", x_test_umap_c50)    # umap-reduced test data
# This section's SVM cell stores its test predictions in y_pred_svm_test;
# y_pred_svm is a leftover from the earlier Isomap/LLE sections and would write
# predictions for a different embedding under this file name.
np.save("y_test_pred_umap_c50.npy", y_pred_svm_test)  # SVM predictions
np.save("cv_scores_umap_c50.npy", cv_scores_umap_c50)      # Cross-validation scores

In [None]:
# Load intermediate data
# Reads back the four files written by the save cell above. The predictions are
# loaded into y_pred_svm, not the y_pred_svm_test name used by the SVM cell.
x_train_umap_c50= np.load("x_train_umap_c50.npy") 
x_test_umap_c50= np.load("x_test_umap_c50.npy") 
y_pred_svm= np.load("y_test_pred_umap_c50.npy") 
cv_scores_umap_c50= np.load("cv_scores_umap_c50.npy")  

In [None]:
# Save k-NN accuracies to JSON
# The integer k keys are written as strings, as JSON object keys always are.
with open("knn_accuracy_umap_c50.json", "w") as file:
    json.dump(knn_accuracy_umap_c50, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python objects.

    Dicts, lists and tuples are walked recursively and keep their container
    type; NumPy scalars (float16/32/64, the integer widths, ``bool_``, ...)
    become the matching Python scalar and NumPy arrays become nested lists.
    Any other object is returned unchanged.

    Args:
        obj: Arbitrary, possibly nested, result structure.

    Returns:
        A structure with the same layout that ``json.dump`` can serialise,
        provided the remaining leaves are themselves JSON-compatible.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are rejected by json.dump, so unwrap those as well.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The (mean, std) cross-validation pair is stored as a tuple.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # One branch for all NumPy scalar types instead of a fixed dtype list.
        return obj.item()
    return obj

# Convert results to JSON-serializable format
# Same fields as results_umap_c50, passed through the helper so NumPy scalars
# become plain Python numbers before json.dump.
results_umap_c50_serializable = convert_to_serializable({
    'ARI': ari_umap_c50,
    'Silhouette Score': silhouette_umap_c50,
    'SVM Accuracy': svm_accuracy_umap_c50,
    'k-NN Accuracy': knn_accuracy_umap_c50,
    '10-Fold CV Accuracy': (cv_accuracy_umap_c50, cv_std_umap_c50)
})

# Save results summary to JSON
with open("umap_c50_results.json", "w") as file:
    json.dump(results_umap_c50_serializable, file, indent=4)

print("umap results and intermediate data saved successfully!")

In [None]:
# Load results from JSON file
# Replaces the in-memory results_umap_c50. After the JSON round trip the k-NN
# keys are strings and the (mean, std) pair is a list.
with open("umap_c50_results.json", "r") as file:
    results_umap_c50 = json.load(file)

In [None]:
# Show the summary for the 50-component UMAP run on standardized data.
print(results_umap_c50)

------

### UMAP n_components=50 norm

In [None]:
# UMAP
from umap import UMAP

# Apply UMAP
# NOTE: rebinding `umap` to the fitted model shadows the `umap` module imported
# in the notebook header.
umap = UMAP(n_components=50, n_neighbors=15, random_state=42)
# Fit on the normalized training data, then project the test data with the same model.
x_train_umap_c50_norm = umap.fit_transform(x_train_normalized)
x_test_umap_c50_norm = umap.transform(x_test_normalized)

In [None]:
# Save intermediate data (umap embeddings and other computationally expensive results)
# Written to the current working directory.
np.save("x_train_umap_c50_norm.npy", x_train_umap_c50_norm)
np.save("x_test_umap_c50_norm.npy", x_test_umap_c50_norm)

In [None]:
# Stack train then test embeddings; y_full is stacked in the same order.
x_full_umap_c50_norm = np.vstack([x_train_umap_c50_norm, x_test_umap_c50_norm])

In [None]:
# True labels stacked train-then-test, matching the row order of the stacked embeddings.
y_full = np.hstack([y_train, y_test])

#### Clustering

In [None]:
# Perform K-Means clustering on the UMAP-reduced data (train and test stacked)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_umap_c50_norm = kmeans.fit_predict(x_full_umap_c50_norm)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# y_full follows the same train-then-test order as x_full_umap_c50_norm.
ari_umap_c50_norm = adjusted_rand_score(y_full, cluster_labels_umap_c50_norm)
print(f"Adjusted Rand Index (ARI): {ari_umap_c50_norm}")

In [None]:
# Silhouette Score
# Computed in the 50-D UMAP embedding of train+test with the K-Means assignments.
silhouette_umap_c50_norm = silhouette_score(x_full_umap_c50_norm, cluster_labels_umap_c50_norm)
print(silhouette_umap_c50_norm)

#### Classifiers

In [None]:
# k-NN Accuracy for varying k
# Fitted on the training embedding and scored on the held-out test embedding.
knn_accuracy_umap_c50_norm = {}  # maps k -> test accuracy
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_umap_c50_norm, y_train)
    knn_accuracy = knn.score(x_test_umap_c50_norm, y_test)
    knn_accuracy_umap_c50_norm[k] = knn_accuracy

In [None]:
# Train SVM on umap embeddings
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_umap_c50_norm, y_train)

# Predict on the held-out test embeddings (not the training ones)
y_pred_svm_test = svm_clf.predict(x_test_umap_c50_norm)  # Predict on test embeddings

# Compute SVM accuracy
svm_accuracy_umap_c50_norm = accuracy_score(y_test, y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_umap_c50_norm:.4f}")

In [None]:
# NOTE(review): within this section cv_scores_umap_c50_norm is first assigned by
# the cells that follow (the np.load cell and the cross-validation cell), so on a
# fresh kernel this save raises NameError — run it only after one of those.
np.save('cv_scores_umap_c50_norm.npy', cv_scores_umap_c50_norm)

In [None]:
# Reload cross-validation scores saved by an earlier run; the next cell
# recomputes and overwrites this array.
cv_scores_umap_c50_norm= np.load('cv_scores_umap_c50_norm.npy')

In [None]:
# SVM with RBF kernel
# A fresh, unfitted estimator: cross_val_score clones and refits it for every fold.
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
# One accuracy per fold on the UMAP training embedding; summarised as mean and std.
cv_scores_umap_c50_norm = cross_val_score(svm_clf, x_train_umap_c50_norm, y_train, cv=10)
cv_accuracy_umap_c50_norm = cv_scores_umap_c50_norm.mean()
cv_std_umap_c50_norm = cv_scores_umap_c50_norm.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_umap_c50_norm:.4f} ± {cv_std_umap_c50_norm:.4f}")

In [None]:
# Results for umap
# Collects the metrics computed in the cells above into one dict for display.
results_umap_c50_norm = {
    'ARI': ari_umap_c50_norm,
    'Silhouette Score': silhouette_umap_c50_norm,
    'SVM Accuracy': svm_accuracy_umap_c50_norm,
    'k-NN Accuracy': knn_accuracy_umap_c50_norm,  # dict keyed by k
    '10-Fold CV Accuracy': (cv_accuracy_umap_c50_norm, cv_std_umap_c50_norm)  # (mean, std)
}

print("umap Results:")
print(results_umap_c50_norm)

In [None]:
# Save intermediate data (written to the current working directory)
np.save("x_train_umap_c50_norm.npy", x_train_umap_c50_norm)  # umap-reduced training data
np.save("x_test_umap_c50_norm.npy", x_test_umap_c50_norm)    # umap-reduced test data
# This section's SVM cell stores its test predictions in y_pred_svm_test;
# y_pred_svm still holds predictions from a previous section and would be saved
# under this run's file name by mistake.
np.save("y_test_pred_umap_c50_norm.npy", y_pred_svm_test)  # SVM predictions
np.save("cv_scores_umap_c50_norm.npy", cv_scores_umap_c50_norm)      # Cross-validation scores

In [None]:
# Load intermediate data
# Reads back the four files written by the save cell above. The predictions are
# loaded into y_pred_svm, not the y_pred_svm_test name used by the SVM cell.
x_train_umap_c50_norm= np.load("x_train_umap_c50_norm.npy") 
x_test_umap_c50_norm= np.load("x_test_umap_c50_norm.npy") 
y_pred_svm= np.load("y_test_pred_umap_c50_norm.npy") 
cv_scores_umap_c50_norm= np.load("cv_scores_umap_c50_norm.npy")  

In [None]:
# Save k-NN accuracies to JSON
# The integer k keys are written as strings, as JSON object keys always are.
with open("knn_accuracy_umap_c50_norm.json", "w") as file:
    json.dump(knn_accuracy_umap_c50_norm, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python objects.

    Dicts, lists and tuples are walked recursively and keep their container
    type; NumPy scalars (float16/32/64, the integer widths, ``bool_``, ...)
    become the matching Python scalar and NumPy arrays become nested lists.
    Any other object is returned unchanged.

    Args:
        obj: Arbitrary, possibly nested, result structure.

    Returns:
        A structure with the same layout that ``json.dump`` can serialise,
        provided the remaining leaves are themselves JSON-compatible.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are rejected by json.dump, so unwrap those as well.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The (mean, std) cross-validation pair is stored as a tuple.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # One branch for all NumPy scalar types instead of a fixed dtype list.
        return obj.item()
    return obj

# Convert results to JSON-serializable format
# Same fields as results_umap_c50_norm, passed through the helper so NumPy
# scalars become plain Python numbers before json.dump.
results_umap_c50_norm_serializable = convert_to_serializable({
    'ARI': ari_umap_c50_norm,
    'Silhouette Score': silhouette_umap_c50_norm,
    'SVM Accuracy': svm_accuracy_umap_c50_norm,
    'k-NN Accuracy': knn_accuracy_umap_c50_norm,
    '10-Fold CV Accuracy': (cv_accuracy_umap_c50_norm, cv_std_umap_c50_norm)
})

# Save results summary to JSON
with open("umap_c50_norm_results.json", "w") as file:
    json.dump(results_umap_c50_norm_serializable, file, indent=4)

print("umap results and intermediate data saved successfully!")

In [None]:
# Load results from JSON file
# Replaces the in-memory results_umap_c50_norm. After the JSON round trip the
# k-NN keys are strings and the (mean, std) pair is a list.
with open("umap_c50_norm_results.json", "r") as file:
    results_umap_c50_norm = json.load(file)

In [None]:
# Show the summary that the previous cell loaded for this section (50 components,
# normalized input). The earlier name `results_umap__c2_norm` was a typo that is
# never defined and pointed at the wrong run.
print(results_umap_c50_norm)

--------

### UMAP n_components=2 norm

In [None]:
# UMAP
from umap import UMAP

# Apply UMAP
# NOTE: rebinding `umap` to the fitted model shadows the `umap` module imported
# in the notebook header.
umap = UMAP(n_components=2, n_neighbors=15, random_state=42)
# Fit on the normalized training data, then project the test data with the same model.
x_train_umap_c2_norm = umap.fit_transform(x_train_normalized)
x_test_umap_c2_norm = umap.transform(x_test_normalized)

In [None]:
# Save intermediate data (umap embeddings and other computationally expensive results)
# Written to the current working directory.
np.save("x_train_umap_c2_norm.npy", x_train_umap_c2_norm)
np.save("x_test_umap_c2_norm.npy", x_test_umap_c2_norm)

In [None]:
# Stack train then test embeddings; y_full is stacked in the same order.
x_full_umap_c2_norm = np.vstack([x_train_umap_c2_norm, x_test_umap_c2_norm])

In [None]:
# True labels stacked train-then-test, matching the row order of the stacked embeddings.
y_full = np.hstack([y_train, y_test])

#### Clustering

In [None]:
# Perform K-Means clustering on the UMAP-reduced data (train and test stacked)
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_umap_c2_norm = kmeans.fit_predict(x_full_umap_c2_norm)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# y_full follows the same train-then-test order as x_full_umap_c2_norm.
ari_umap_c2_norm = adjusted_rand_score(y_full, cluster_labels_umap_c2_norm)
print(f"Adjusted Rand Index (ARI): {ari_umap_c2_norm}")

In [None]:
# Silhouette Score
# Computed in the 2-D UMAP embedding of train+test with the K-Means assignments.
silhouette_umap_c2_norm = silhouette_score(x_full_umap_c2_norm, cluster_labels_umap_c2_norm)
print(silhouette_umap_c2_norm)

#### Classifiers

In [None]:
# k-NN Accuracy for varying k
# Fitted on the training embedding and scored on the held-out test embedding.
knn_accuracy_umap_c2_norm = {}  # maps k -> test accuracy
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_umap_c2_norm, y_train)
    knn_accuracy = knn.score(x_test_umap_c2_norm, y_test)
    knn_accuracy_umap_c2_norm[k] = knn_accuracy

In [None]:
# Train SVM on umap embeddings
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_umap_c2_norm, y_train)

# Predict on the held-out test embeddings (not the training ones)
y_pred_svm_test = svm_clf.predict(x_test_umap_c2_norm)  # Predict on test embeddings

# Compute SVM accuracy
svm_accuracy_umap_c2_norm = accuracy_score(y_test, y_pred_svm_test)
print(f"SVM Accuracy (Test): {svm_accuracy_umap_c2_norm:.4f}")

In [None]:
# NOTE(review): within this section cv_scores_umap_c2_norm is first assigned by
# the cross-validation cell that follows, so on a fresh kernel this save raises
# NameError — run it only after that cell.
np.save('cv_scores_umap_c2_norm.npy', cv_scores_umap_c2_norm)

In [None]:
# SVM with RBF kernel
# A fresh, unfitted estimator: cross_val_score clones and refits it for every fold.
svm_clf = SVC(kernel='rbf', random_state=42)

# Perform 10-fold cross-validation
# One accuracy per fold on the UMAP training embedding; summarised as mean and std.
cv_scores_umap_c2_norm = cross_val_score(svm_clf, x_train_umap_c2_norm, y_train, cv=10)
cv_accuracy_umap_c2_norm = cv_scores_umap_c2_norm.mean()
cv_std_umap_c2_norm = cv_scores_umap_c2_norm.std()

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_umap_c2_norm:.4f} ± {cv_std_umap_c2_norm:.4f}")

In [None]:
# Results for umap
# Collects the metrics computed in the cells above into one dict for display.
results_umap_c2_norm = {
    'ARI': ari_umap_c2_norm,
    'Silhouette Score': silhouette_umap_c2_norm,
    'SVM Accuracy': svm_accuracy_umap_c2_norm,
    'k-NN Accuracy': knn_accuracy_umap_c2_norm,  # dict keyed by k
    '10-Fold CV Accuracy': (cv_accuracy_umap_c2_norm, cv_std_umap_c2_norm)  # (mean, std)
}

print("umap Results:")
print(results_umap_c2_norm)

In [None]:
# Save intermediate data
# Written to the current working directory; the predictions saved are this
# section's test-set predictions (y_pred_svm_test).
np.save("x_train_umap_c2_norm.npy", x_train_umap_c2_norm)  # umap-reduced training data
np.save("x_test_umap_c2_norm.npy", x_test_umap_c2_norm)    # umap-reduced test data
np.save("y_test_pred_umap_c2_norm.npy", y_pred_svm_test)  # SVM predictions
np.save("cv_scores_umap_c2_norm.npy", cv_scores_umap_c2_norm)      # Cross-validation scores

In [None]:
# Load intermediate data
# Reads back the four files written by the save cell above. The predictions are
# loaded into y_pred_svm, not the y_pred_svm_test name they were saved from.
x_train_umap_c2_norm= np.load("x_train_umap_c2_norm.npy") 
x_test_umap_c2_norm= np.load("x_test_umap_c2_norm.npy") 
y_pred_svm= np.load("y_test_pred_umap_c2_norm.npy") 
cv_scores_umap_c2_norm= np.load("cv_scores_umap_c2_norm.npy")  

In [None]:
# Save k-NN accuracies to JSON
# The integer k keys are written as strings, as JSON object keys always are.
with open("knn_accuracy_umap_c2_norm.json", "w") as file:
    json.dump(knn_accuracy_umap_c2_norm, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values inside *obj* with plain Python objects.

    Dicts, lists and tuples are walked recursively and keep their container
    type; NumPy scalars (float16/32/64, the integer widths, ``bool_``, ...)
    become the matching Python scalar and NumPy arrays become nested lists.
    Any other object is returned unchanged.

    Args:
        obj: Arbitrary, possibly nested, result structure.

    Returns:
        A structure with the same layout that ``json.dump`` can serialise,
        provided the remaining leaves are themselves JSON-compatible.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are rejected by json.dump, so unwrap those as well.
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        # The (mean, std) cross-validation pair is stored as a tuple.
        return tuple(convert_to_serializable(v) for v in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # One branch for all NumPy scalar types instead of a fixed dtype list.
        return obj.item()
    return obj

# Convert results to JSON-serializable format
# Same fields as results_umap_c2_norm, passed through the helper so NumPy
# scalars become plain Python numbers before json.dump.
results_umap_c2_norm_serializable = convert_to_serializable({
    'ARI': ari_umap_c2_norm,
    'Silhouette Score': silhouette_umap_c2_norm,
    'SVM Accuracy': svm_accuracy_umap_c2_norm,
    'k-NN Accuracy': knn_accuracy_umap_c2_norm,
    '10-Fold CV Accuracy': (cv_accuracy_umap_c2_norm, cv_std_umap_c2_norm)
})

# Save results summary to JSON
with open("umap_c2_norm_results.json", "w") as file:
    json.dump(results_umap_c2_norm_serializable, file, indent=4)

print("umap results and intermediate data saved successfully!")

In [None]:
# Load results from JSON file
# Replaces the in-memory results_umap_c2_norm. After the JSON round trip the
# k-NN keys are strings and the (mean, std) pair is a list.
with open("umap_c2_norm_results.json", "r") as file:
    results_umap_c2_norm = json.load(file)

In [None]:
# Show the summary for the 2-component UMAP run on normalized data.
print(results_umap_c2_norm)

In [None]:
# Step 2: Visualize the UMAP Results
# Scatter of the 2-D training embedding, one point per sample, coloured by its
# true label (y_train) using the 10-colour "tab10" palette.
plt.figure(figsize=(10, 8))
plt.scatter(x_train_umap_c2_norm[:, 0], x_train_umap_c2_norm[:, 1], c=y_train, cmap="tab10", s=5, alpha=0.8)
plt.title("UMAP Projection of MNIST Dataset")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.colorbar(label="MNIST Labels")
plt.show()

---------

## MDS

### MDS n_components= 2

In [None]:
def downsample_mnist_consistent(x_data, y_labels, sample_fraction=0.1, random_state=42):
    """Return indices of a class-stratified random subsample.

    For every distinct label, ``int(count * sample_fraction)`` indices are drawn
    without replacement. Indices, not rows, are returned so the caller can apply
    the same selection to the features and to the labels.

    Args:
        x_data: Feature array. Not read by this function; kept in the signature
            so existing calls keep working.
        y_labels: NumPy array of class labels (compared element-wise with
            ``==``); assumed 1-D, since only the first axis is indexed.
        sample_fraction: Fraction of each class to keep; the per-class count is
            truncated toward zero.
        random_state: Seed passed to ``sklearn.utils.resample`` for every class.
            The default (42) reproduces the previously hard-coded selection.

    Returns:
        1-D integer array of indices into ``y_labels``, grouped by label in the
        order returned by ``np.unique``.
    """
    per_label_indices = []
    for label in np.unique(y_labels):
        # Select indices for the current label
        label_indices = np.where(y_labels == label)[0]
        # Sample a fraction of points for this label
        n_keep = int(len(label_indices) * sample_fraction)
        per_label_indices.append(
            resample(label_indices, n_samples=n_keep, replace=False, random_state=random_state)
        )
    if not per_label_indices:
        # An empty list would otherwise become a float array, which cannot index.
        return np.empty(0, dtype=np.intp)
    return np.concatenate(per_label_indices).astype(np.intp, copy=False)

# Downsample training data
# The returned indices are applied to both features and labels so they stay aligned.
sampled_indices_train_mds = downsample_mnist_consistent(x_train_standardized1, y_train, sample_fraction=0.1)
x_train_sampled_mds = x_train_standardized1[sampled_indices_train_mds]
y_train_sampled_mds = y_train[sampled_indices_train_mds]

# Downsample test data
sampled_indices_test_mds = downsample_mnist_consistent(x_test_standardized1, y_test, sample_fraction=0.1)
x_test_sampled_mds = x_test_standardized1[sampled_indices_test_mds]
y_test_sampled_mds = y_test[sampled_indices_test_mds]

print(f"Training set reduced to {len(x_train_sampled_mds)} samples.")
print(f"Test set reduced to {len(x_test_sampled_mds)} samples.")

In [None]:
# Save the sampled indices
# Keeping the indices allows the same rows to be re-selected from the full arrays later.
np.save("sampled_indices_train_mds.npy", sampled_indices_train_mds)
np.save("sampled_indices_test_mds.npy", sampled_indices_test_mds)

# Save the downsampled dataset
np.save("x_train_sampled_mds.npy", x_train_sampled_mds)
np.save("y_train_sampled_mds.npy", y_train_sampled_mds)
np.save("x_test_sampled_mds.npy", x_test_sampled_mds)
np.save("y_test_sampled_mds.npy", y_test_sampled_mds)

print("Downsampling saved successfully!")

In [None]:
# Load sampled indices
sampled_indices_train_mds= np.load("sampled_indices_train_mds.npy")
sampled_indices_test_mds= np.load("sampled_indices_test_mds.npy")

# Load downsampled dataset
x_train_sampled_mds= np.load("x_train_sampled_mds.npy")
y_train_sampled_mds= np.load("y_train_sampled_mds.npy")
x_test_sampled_mds= np.load("x_test_sampled_mds.npy")
y_test_sampled_mds= np.load("y_test_sampled_mds.npy")

# Load the 2-D MDS embeddings.
# NOTE(review): these two files are only written by the cells below, so this
# load works only when they already exist from a previous run.
x_train_mds_c2= np.load("x_train_mds_c2.npy")
x_test_mds_c2= np.load("x_test_mds_c2.npy")

In [None]:
# Apply MDS
# sklearn's MDS has no transform(): every fit_transform() call solves an
# independent problem whose axes are arbitrary (rotation, reflection, shift).
# Train and test embeddings fitted separately therefore do not share a
# coordinate system, and a classifier trained on one cannot be evaluated on the
# other. Embed train and test together and split afterwards — the approach the
# 50-component MDS section uses.
mds = MDS(n_components=2, random_state=42, n_jobs=-1)
n_train_mds_c2 = len(x_train_sampled_mds)
x_full_mds_c2 = mds.fit_transform(np.vstack([x_train_sampled_mds, x_test_sampled_mds]))
x_train_mds_c2 = x_full_mds_c2[:n_train_mds_c2]  # first rows: training samples
x_test_mds_c2 = x_full_mds_c2[n_train_mds_c2:]   # remaining rows: test samples

In [None]:
# Save the 2-D MDS embeddings of the downsampled train and test sets
np.save("x_train_mds_c2.npy", x_train_mds_c2)
np.save("x_test_mds_c2.npy", x_test_mds_c2)

#### Clustering

In [None]:
# Perform K-Means clustering on the MDS-reduced test data
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_mds_c2 = kmeans.fit_predict(x_test_mds_c2)

In [None]:
# ARI
# Compute ARI between true labels and cluster labels
# Both arrays refer to the downsampled test set, which is what K-Means was run on.
ari_mds_c2 = adjusted_rand_score(y_test_sampled_mds, cluster_labels_mds_c2)
print(f"Adjusted Rand Index (ARI): {ari_mds_c2}")

In [None]:
# Silhouette Score
# Computed in the 2-D MDS test embedding with the K-Means cluster assignments.
silhouette_mds_c2 = silhouette_score(x_test_mds_c2, cluster_labels_mds_c2)
print(silhouette_mds_c2)

#### Classifiers

In [None]:
# k-NN accuracy for varying k: fitted on the MDS training embedding and scored
# on the MDS test embedding.
knn_accuracy_mds_c2 = {}  # maps k -> test accuracy
for k in [100, 200, 400]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_mds_c2, y_train_sampled_mds)
    knn_accuracy = knn.score(x_test_mds_c2, y_test_sampled_mds)
    knn_accuracy_mds_c2[k] = knn_accuracy

print(f"k-NN Accuracy: {knn_accuracy_mds_c2}")

In [None]:
# SVM Accuracy
# RBF SVM fitted on the MDS training embedding and evaluated on the MDS test
# embedding; the test predictions are kept for saving later.
svm_clf = SVC(kernel='rbf', random_state=42)
svm_clf.fit(x_train_mds_c2, y_train_sampled_mds)
y_test_pred_mds_c2 = svm_clf.predict(x_test_mds_c2)
svm_accuracy_mds_c2 = accuracy_score(y_test_sampled_mds, y_test_pred_mds_c2)

In [None]:
# 10-Fold Cross-Validation Accuracy
# One accuracy per fold on the MDS training embedding; summarised as mean and std.
cv_scores_mds_c2 = cross_val_score(SVC(kernel='rbf', random_state=42), x_train_mds_c2, y_train_sampled_mds, cv=10)
cv_accuracy_mds_c2 = cv_scores_mds_c2.mean()
cv_std_mds_c2 = cv_scores_mds_c2.std()

In [None]:
# Results for MDS
# Collects the metrics computed in the cells above into one dict for display.
results_mds_c2 = {
    'ARI': ari_mds_c2,
    'Silhouette Score': silhouette_mds_c2,
    'SVM Accuracy': svm_accuracy_mds_c2,
    # The k-NN cell stores its scores in knn_accuracy_mds_c2; the plural
    # "knn_accuracies_mds_c2" is never assigned and raised NameError.
    'k-NN Accuracy': knn_accuracy_mds_c2,
    '10-Fold CV Accuracy': (cv_accuracy_mds_c2, cv_std_mds_c2)  # (mean, std)
}

print("MDS Results:")
print(results_mds_c2)

In [None]:
# Save intermediate data (written to the current working directory)
np.save("x_train_mds_c2.npy", x_train_mds_c2)  # MDS-reduced training data
np.save("x_test_mds_c2.npy", x_test_mds_c2)    # MDS-reduced test data
np.save("y_test_pred_mds_c2.npy", y_test_pred_mds_c2)  # SVM predictions
np.save("cv_scores_mds_c2.npy", cv_scores_mds_c2)      # Cross-validation scores

# Save k-NN accuracies to JSON
# The dict produced by the k-NN cell is named knn_accuracy_mds_c2; the output
# file name is left unchanged so existing readers of it keep working.
with open("knn_accuracies_mds_c2.json", "w") as file:
    json.dump(knn_accuracy_mds_c2, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values with plain Python built-ins.

    Dicts are rebuilt with converted keys and values, lists/tuples/ndarrays
    become lists, and NumPy bool/integer/floating scalars of any width become
    ``bool``/``int``/``float``. Anything else is returned unchanged.

    Parameters
    ----------
    obj : any
        Value (possibly nested) to prepare for ``json.dump``.

    Returns
    -------
    The converted value, built only from built-in containers and scalars
    wherever a NumPy value was found.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are rejected by json.dump, so unwrap them as well
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, np.ndarray):
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj

# Convert results to JSON-serializable format.
# `knn_accuracy_mds_c2` is the dictionary filled by the k-NN cell; the name
# `knn_accuracies_mds_c2` used here before is not defined anywhere in that cell.
results_mds_c2_serializable = convert_to_serializable({
    'ARI': ari_mds_c2,
    'Silhouette Score': silhouette_mds_c2,
    'SVM Accuracy': svm_accuracy_mds_c2,
    'k-NN Accuracy': knn_accuracy_mds_c2,
    '10-Fold CV Accuracy': {
        'Mean': cv_accuracy_mds_c2,
        'StdDev': cv_std_mds_c2
    }
})

# Save results summary to JSON
with open("mds_c2_results.json", "w") as file:
    json.dump(results_mds_c2_serializable, file, indent=4)

print("MDS results and intermediate data saved successfully!")


In [None]:
# Plot the 2D MDS projection of the training data, coloured by `y_train_sampled_mds`.
# The title and axis labels previously described PCA / principal components,
# which does not match the MDS embedding being plotted.
plt.figure(figsize=(10, 8))
sns.scatterplot(x=x_train_mds_c2[:, 0], y=x_train_mds_c2[:, 1], hue=y_train_sampled_mds, palette='tab10', s=10, legend='full')
plt.title("2D Scatter Plot of MDS-reduced Data")
plt.xlabel("MDS Dimension 1")
plt.ylabel("MDS Dimension 2")
plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

------

### MDS n_components=50

In [None]:
# Combine training and test sets (training rows first, then test rows) so that
# a single MDS fit embeds both; the stacking order is relied on when the
# embedding is split back by len(y_train_sampled_mds).
x_full_mds = np.vstack([x_train_sampled_mds, x_test_sampled_mds])

In [None]:
# Apply MDS once to the stacked train+test matrix (50 output dimensions).
# fit_transform is the only call used here, so train and test rows are
# embedded jointly in the same fit.
mds = MDS(n_components=50, random_state=42, n_jobs=-1)
x_full_mds_c50 = mds.fit_transform(x_full_mds)

In [None]:
# Cut the joint embedding at the number of training labels: rows before the
# cut are the training embedding, the remaining rows the test embedding.
x_train_mds_c50, x_test_mds_c50 = np.split(
    x_full_mds_c50, [len(y_train_sampled_mds)]
)

In [None]:
# Persist the joint (train+test) 50-component MDS embedding
np.save('x_full_mds_c50.npy', x_full_mds_c50)

In [None]:
# Persist the train and test parts of the 50-component MDS embedding separately
np.save('x_train_mds_c50.npy', x_train_mds_c50)
np.save('x_test_mds_c50.npy', x_test_mds_c50)

#### Clustering

In [None]:
# Perform K-Means clustering on the MDS-reduced (n_components=50) test data
kmeans = KMeans(n_clusters=10, random_state=42)  # 10 clusters for MNIST digits (0-9)
cluster_labels_mds_c50 = kmeans.fit_predict(x_test_mds_c50)

In [None]:
# Adjusted Rand Index: agreement between the reference labels and the
# K-Means assignments computed on the MDS (c50) test embedding.
ari_mds_c50 = adjusted_rand_score(
    labels_true=y_test_sampled_mds,
    labels_pred=cluster_labels_mds_c50,
)
print(f"Adjusted Rand Index (ARI): {ari_mds_c50}")

In [None]:
# Mean silhouette coefficient of the K-Means clusters in the MDS (c50) test embedding
silhouette_mds_c50 = silhouette_score(
    X=x_test_mds_c50,
    labels=cluster_labels_mds_c50,
)
print(silhouette_mds_c50)

#### Classifiers

In [None]:
# k-NN test accuracy on the MDS (c50) embedding, keyed by the neighbourhood size k
knn_accuracy_mds_c50 = {}
for k in (100, 200, 400):
    knn = KNeighborsClassifier(n_neighbors=k).fit(x_train_mds_c50, y_train_sampled_mds)
    knn_accuracy = knn.score(x_test_mds_c50, y_test_sampled_mds)
    knn_accuracy_mds_c50[k] = knn_accuracy

print(f"k-NN Accuracy: {knn_accuracy_mds_c50}")

In [None]:
# RBF-kernel SVM: fit on the MDS (c50) training embedding, predict the test
# embedding, and report the resulting accuracy.
svm_clf = SVC(kernel='rbf', random_state=42).fit(x_train_mds_c50, y_train_sampled_mds)
y_pred_svm = svm_clf.predict(x_test_mds_c50)

svm_accuracy_mds_c50 = accuracy_score(
    y_true=y_test_sampled_mds,
    y_pred=y_pred_svm,
)
print(f"SVM Accuracy: {svm_accuracy_mds_c50:.4f}")

In [None]:
# 10-fold cross-validation of the SVM estimator on the MDS (c50) training
# embedding; mean and standard deviation over the folds are reported.
cv_scores_mds_c50 = cross_val_score(
    estimator=svm_clf,
    X=x_train_mds_c50,
    y=y_train_sampled_mds,
    cv=10,
)
cv_accuracy_mds_c50 = np.mean(cv_scores_mds_c50)
cv_std_mds_c50 = np.std(cv_scores_mds_c50)

print(f"10-Fold CV Accuracy (SVM): {cv_accuracy_mds_c50:.4f} ± {cv_std_mds_c50:.4f}")

In [None]:
# Results for MDS (n_components=50): collects the clustering scores and the
# classifier accuracies computed in the cells above into one dictionary.
results_mds_c50 = {
    'ARI': ari_mds_c50,
    'Silhouette Score': silhouette_mds_c50,
    'SVM Accuracy': svm_accuracy_mds_c50,
    'k-NN Accuracy': knn_accuracy_mds_c50,  # dict keyed by k (100, 200, 400)
    '10-Fold CV Accuracy': (cv_accuracy_mds_c50, cv_std_mds_c50)  # (mean, std)
}

print("mds Results:")
print(results_mds_c50)

In [None]:
# Persist the MDS (c50) artefacts: train/test embeddings, the SVM test
# predictions and the per-fold cross-validation scores.
for npy_path, npy_payload in (
    ("x_train_mds_c50.npy", x_train_mds_c50),
    ("x_test_mds_c50.npy", x_test_mds_c50),
    ("y_test_pred_mds_c50.npy", y_pred_svm),
    ("cv_scores_mds_c50.npy", cv_scores_mds_c50),
):
    np.save(npy_path, npy_payload)

# The k-NN accuracies (a dict keyed by k) go to a JSON file
with open("knn_accuracy_mds_c50.json", "w") as file:
    json.dump(knn_accuracy_mds_c50, file, indent=4)

# Helper function to convert to JSON-serializable format
def convert_to_serializable(obj):
    """Recursively replace NumPy values with plain Python built-ins.

    Dicts are rebuilt with converted keys and values, lists/tuples/ndarrays
    become lists, and NumPy bool/integer/floating scalars of any width become
    ``bool``/``int``/``float``. Anything else is returned unchanged.

    Parameters
    ----------
    obj : any
        Value (possibly nested) to prepare for ``json.dump``.

    Returns
    -------
    The converted value, built only from built-in containers and scalars
    wherever a NumPy value was found.
    """
    if isinstance(obj, dict):
        # NumPy scalar keys are rejected by json.dump, so unwrap them as well
        return {
            (k.item() if isinstance(k, np.generic) else k): convert_to_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        # Tuples (e.g. a (mean, std) pair) are walked too, so NumPy scalars
        # nested inside them are converted instead of being passed through.
        return [convert_to_serializable(v) for v in obj]
    if isinstance(obj, np.ndarray):
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj

# Convert results to JSON-serializable format.
# Unlike the c2 summary, the cross-validation entry is a (mean, std) tuple
# here, so it is written to the JSON file as a two-element array.
results_mds_c50_serializable = convert_to_serializable({
    'ARI': ari_mds_c50,
    'Silhouette Score': silhouette_mds_c50,
    'SVM Accuracy': svm_accuracy_mds_c50,
    'k-NN Accuracy': knn_accuracy_mds_c50,
    '10-Fold CV Accuracy': (cv_accuracy_mds_c50, cv_std_mds_c50)
})

# Save results summary to JSON
with open("mds_c50_results.json", "w") as file:
    json.dump(results_mds_c50_serializable, file, indent=4)

print("mds results and intermediate data saved successfully!")

------

## Visual Comparison

In [None]:
# Load the saved c2 embeddings of every method together with the label
# arrays that are later used to colour the points.
x_train_pca_c2 = np.load("x_train_pca_c2.npy")
x_test_pca_c2 = np.load("x_test_pca_c2.npy")

x_train_tsne_c2 = np.load("x_train_tsne_c2.npy")
x_test_tsne_c2 = np.load("x_test_tsne_c2.npy")

x_train_isomap_c2 = np.load("x_train_isomap_c2.npy")
x_test_isomap_c2 = np.load("x_test_isomap_c2.npy")
# Labels paired with the Isomap and LLE embeddings in the `methods` mapping below
y_test_sampled = np.load("y_test_sampled.npy")
y_train_sampled = np.load("y_train_sampled.npy")

x_train_lle_c2 = np.load("x_train_lle_c2.npy")
x_test_lle_c2 = np.load("x_test_lle_c2.npy")

# NOTE(review): the UMAP train array is bound to `x_train_umap` (no `_c2`
# suffix) and the test array comes from the `_norm` file — confirm both are intended.
x_train_umap = np.load('x_train_umap_c2.npy')
x_test_umap_c2_norm= np.load('x_test_umap_c2_norm.npy')

x_train_mds_c2 = np.load("x_train_mds_c2.npy")
x_test_mds_c2 = np.load("x_test_mds_c2.npy")
# Labels paired with the MDS embedding
y_train_sampled_mds = np.load("y_train_sampled_mds.npy")
y_test_sampled_mds = np.load("y_test_sampled_mds.npy")

# Use test embeddings and labels for visualization.
# Each entry maps a method name to (embedding, labels); the plotting loop
# reads columns 0 and 1 of the embedding and colours points by the labels.
# NOTE(review): `y_test` is not loaded in this cell — presumably it is defined
# earlier in the notebook and row-aligned with the PCA / t-SNE / UMAP test
# embeddings; verify before trusting the colours.
methods = {
    'PCA': (x_test_pca_c2, y_test),
    'Isomap': (x_test_isomap_c2, y_test_sampled),
    'LLE': (x_test_lle_c2, y_test_sampled),
    'MDS': (x_test_mds_c2, y_test_sampled_mds),
    't-SNE': (x_test_tsne_c2, y_test),
    'UMAP': (x_test_umap_c2_norm, y_test)
}

# Create a grid of subplots with two columns and three rows (one panel per method)
fig, axes = plt.subplots(3, 2, figsize=(16, 18))  # Two columns, three rows
fig.subplots_adjust(hspace=0.4, wspace=0.4)  # Adjust spacing between plots

# Flatten axes for easier iteration
axes = axes.flatten()

# Define the label names (digits 0-9)
label_names = [f"Digit {i}" for i in range(10)]

# Fixed hue order "0".."9". Without it seaborn orders string hue levels by
# first appearance in the data, so the i-th legend handle (relabelled
# "Digit i" below) would not necessarily be the colour of digit i, and the
# same digit could get different colours in different panels.
digit_order = [str(i) for i in range(10)]

for ax, (method, (embedding, labels)) in zip(axes, methods.items()):
    scatter = sns.scatterplot(
        x=embedding[:, 0],
        y=embedding[:, 1],
        # Cast through int so the strings are exactly "0".."9" and match
        # `digit_order` (assumes the labels are the digits 0-9, as the legend does).
        hue=np.asarray(labels).astype(int).astype(str),
        hue_order=digit_order,
        palette='tab10',  # Use a standard tab10 palette for each plot
        s=40,  # Larger markers for better visibility
        ax=ax
    )
    ax.set_title(f'{method} Embeddings', fontsize=14, pad=10, loc='center')  # Larger font size

    # Hide x and y axis ticks and labels
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')

    # Set equal aspect ratio for symmetry
    ax.set_aspect('equal')

    # Add a legend for each plot; handles now follow `digit_order`, so the
    # positional relabelling with `label_names` is correct.
    handles, _ = scatter.get_legend_handles_labels()
    ax.legend(
        handles=handles, labels=label_names, title="Cluster", fontsize=10, loc='upper right',
        frameon=True, edgecolor="black"
    )

# Ensure the layout updates properly
plt.tight_layout()

# Show all plots
plt.show()

# Verification: Ensure colors and digits are correctly matched.
# A throw-away scatter plot is drawn per method only to inspect its legend.
from matplotlib.colors import to_rgb

print("\nVerifying colors and digits for each plot:")
digit_order = [str(i) for i in range(10)]
expected_palette = sns.color_palette('tab10', 10)

for method, (embedding, labels) in methods.items():
    print(f"\nMethod: {method}")

    # Generate scatter plot to extract handles and colors. The explicit
    # hue_order ties legend position i to digit i; without it the check below
    # would pass for any ordering because colours are assigned positionally.
    scatter = sns.scatterplot(
        x=embedding[:, 0],
        y=embedding[:, 1],
        hue=np.asarray(labels).astype(int).astype(str),
        hue_order=digit_order,
        palette='tab10',
        s=10,
        legend=True  # Ensure legend is generated
    )
    legend = scatter.get_legend()
    # `legend_handles` is the current matplotlib attribute; fall back to the
    # older `legendHandles` spelling when it is not available.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    legend_texts = [text.get_text() for text in legend.get_texts()]
    plt.close()  # Close the plot since we only need the handles

    # Ensure there are exactly 10 handles for digits 0-9
    if len(handles) != 10:
        print(f"Error: Expected 10 clusters but got {len(handles)} for {method}.")
        continue

    # Check colors for each digit
    for digit in range(10):
        handle = handles[digit]
        # Extract color from the plot handle and normalize to (R, G, B).
        # NOTE(review): the handle type is assumed to depend on the seaborn
        # version (collection vs. line artist), hence the two access paths.
        if hasattr(handle, "get_facecolor"):
            color_in_plot = tuple(np.atleast_2d(handle.get_facecolor())[0][:3])
        else:
            color_in_plot = tuple(to_rgb(handle.get_markerfacecolor()))
        expected_color = tuple(expected_palette[digit])

        # Compare RGB components with a tolerance (floats) and make sure the
        # legend entry at this position really is this digit.
        label_ok = digit < len(legend_texts) and legend_texts[digit] == str(digit)
        match = bool(np.allclose(color_in_plot, expected_color)) and label_ok
        print(
            f"Digit {digit}: Color in plot {color_in_plot} | Expected color {expected_color} | Match: {match}"
        )


In [None]:
"x_train_umap_c50.npy", x_train_umap_c50)
np.save("x_test_umap_c50.npy", x_test_umap_c50)

In [None]:
# Load the saved c50 embeddings of every method (t-SNE is loaded from its c2
# files) together with the label arrays used later to colour the points.
x_train_pca_c50 = np.load("x_train_pca_c50.npy")
x_test_pca_c50 = np.load("x_test_pca_c50.npy")

# NOTE(review): t-SNE uses the c2 files in this c50 comparison — confirm intended.
x_train_tsne_c2 = np.load("x_train_tsne_c2.npy")
x_test_tsne_c2 = np.load("x_test_tsne_c2.npy")

x_train_isomap_c50 = np.load("x_train_isomap_c50.npy")
x_test_isomap_c50 = np.load("x_test_isomap_c50.npy")
# Labels paired with the Isomap and LLE embeddings in the `methods` mapping below
y_test_sampled = np.load("y_test_sampled.npy")
y_train_sampled = np.load("y_train_sampled.npy")

x_train_lle_c50 = np.load("x_train_lle_c50.npy")
x_test_lle_c50 = np.load("x_test_lle_c50.npy")

x_train_umap_c50 = np.load('x_train_umap_c50.npy')
x_test_umap_c50 = np.load('x_test_umap_c50.npy')



x_train_mds_c50 = np.load("x_train_mds_c50.npy")
x_test_mds_c50 = np.load("x_test_mds_c50.npy")
# Labels paired with the MDS embedding
y_train_sampled_mds = np.load("y_train_sampled_mds.npy")
y_test_sampled_mds = np.load("y_test_sampled_mds.npy")

# Use test embeddings and labels for visualization.
# Each entry maps a method name to (embedding, labels); only columns 0 and 1
# of each embedding are drawn by the plotting loop.
# NOTE(review): `y_test` is not loaded in this cell — presumably it is defined
# earlier in the notebook and row-aligned with the PCA / t-SNE / UMAP test
# embeddings; verify before trusting the colours.
methods = {
    'PCA': (x_test_pca_c50, y_test),
    'Isomap': (x_test_isomap_c50, y_test_sampled),
    'LLE': (x_test_lle_c50, y_test_sampled),
    'MDS': (x_test_mds_c50, y_test_sampled_mds),
    't-SNE': (x_test_tsne_c2, y_test),
    'UMAP': (x_test_umap_c50, y_test)
}

# Create a grid of subplots with two columns and three rows (one panel per method)
fig, axes = plt.subplots(3, 2, figsize=(16, 18))  # Two columns, three rows
fig.subplots_adjust(hspace=0.4, wspace=0.4)  # Adjust spacing between plots

# Flatten axes for easier iteration
axes = axes.flatten()

# Define the label names (digits 0-9)
label_names = [f"Digit {i}" for i in range(10)]

# Fixed hue order "0".."9". Without it seaborn orders string hue levels by
# first appearance in the data, so the i-th legend handle (relabelled
# "Digit i" below) would not necessarily be the colour of digit i, and the
# same digit could get different colours in different panels.
digit_order = [str(i) for i in range(10)]

for ax, (method, (embedding, labels)) in zip(axes, methods.items()):
    # Only the first two embedding dimensions are drawn
    scatter = sns.scatterplot(
        x=embedding[:, 0],
        y=embedding[:, 1],
        # Cast through int so the strings are exactly "0".."9" and match
        # `digit_order` (assumes the labels are the digits 0-9, as the legend does).
        hue=np.asarray(labels).astype(int).astype(str),
        hue_order=digit_order,
        palette='tab10',  # Use a standard tab10 palette for each plot
        s=40,  # Larger markers for better visibility
        ax=ax
    )
    ax.set_title(f'{method} Embeddings', fontsize=14, pad=10, loc='center')  # Larger font size

    # Hide x and y axis ticks and labels
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')

    # Set equal aspect ratio for symmetry
    ax.set_aspect('equal')

    # Add a legend for each plot; handles now follow `digit_order`, so the
    # positional relabelling with `label_names` is correct.
    handles, _ = scatter.get_legend_handles_labels()
    ax.legend(
        handles=handles, labels=label_names, title="Cluster", fontsize=10, loc='upper right',
        frameon=True, edgecolor="black"
    )

# Ensure the layout updates properly
plt.tight_layout()

# Show all plots
plt.show()

# Verification: Ensure colors and digits are correctly matched.
# A throw-away scatter plot is drawn per method only to inspect its legend.
from matplotlib.colors import to_rgb

print("\nVerifying colors and digits for each plot:")
digit_order = [str(i) for i in range(10)]
expected_palette = sns.color_palette('tab10', 10)

for method, (embedding, labels) in methods.items():
    print(f"\nMethod: {method}")

    # Generate scatter plot to extract handles and colors. The explicit
    # hue_order ties legend position i to digit i; without it the check below
    # would pass for any ordering because colours are assigned positionally.
    scatter = sns.scatterplot(
        x=embedding[:, 0],
        y=embedding[:, 1],
        hue=np.asarray(labels).astype(int).astype(str),
        hue_order=digit_order,
        palette='tab10',
        s=10,
        legend=True  # Ensure legend is generated
    )
    legend = scatter.get_legend()
    # `legend_handles` is the current matplotlib attribute; fall back to the
    # older `legendHandles` spelling when it is not available.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    legend_texts = [text.get_text() for text in legend.get_texts()]
    plt.close()  # Close the plot since we only need the handles

    # Ensure there are exactly 10 handles for digits 0-9
    if len(handles) != 10:
        print(f"Error: Expected 10 clusters but got {len(handles)} for {method}.")
        continue

    # Check colors for each digit
    for digit in range(10):
        handle = handles[digit]
        # Extract color from the plot handle and normalize to (R, G, B).
        # NOTE(review): the handle type is assumed to depend on the seaborn
        # version (collection vs. line artist), hence the two access paths.
        if hasattr(handle, "get_facecolor"):
            color_in_plot = tuple(np.atleast_2d(handle.get_facecolor())[0][:3])
        else:
            color_in_plot = tuple(to_rgb(handle.get_markerfacecolor()))
        expected_color = tuple(expected_palette[digit])

        # Compare RGB components with a tolerance (floats) and make sure the
        # legend entry at this position really is this digit.
        label_ok = digit < len(legend_texts) and legend_texts[digit] == str(digit)
        match = bool(np.allclose(color_in_plot, expected_color)) and label_ok
        print(
            f"Digit {digit}: Color in plot {color_in_plot} | Expected color {expected_color} | Match: {match}"
        )


------

# General UMAP - Hyperparameters experiments

In [10]:
# Turn every image into a 1-D vector and stack the vectors into one 2-D
# array per split, the layout passed to UMAP's fit_transform further down.
x_train_flattened = np.array([np.asarray(image).ravel() for image in x_train])
x_test_flattened = np.array([np.asarray(image).ravel() for image in x_test])

### UMAP n_neighbors=5, min_dist=0.1

In [44]:
# Reload the cached artefacts of the n_neighbors=5 / min_dist=0.1 / 35-run
# experiment (the cells below write these same file names).
umap_projections_5_01_35 = np.load('umap_projections_5_01_35.npy')
mean_projection_5_01_35 = np.load('mean_projection_5_01_35.npy')
std_projection_5_01_35 = np.load('std_projection_5_01_35.npy')

# Confidence-interval bounds of the centroid distance matrix (raw and normalized)
lower_limit_intconf_matrix_5_01_35 = np.load('lower_limit_intconf_matrix_5_01_35.npy')
upper_limit_intconf_matrix_5_01_35 = np.load('upper_limit_intconf_matrix_5_01_35.npy')
norm_lower_limit_intconf_matrix_5_01_35 = np.load('norm_lower_limit_intconf_matrix_5_01_35.npy')
norm_upper_limit_intconf_matrix_5_01_35 = np.load('norm_upper_limit_intconf_matrix_5_01_35.npy')

# Per-run centroid distance matrices and their mean
distance_matrices_5_01_35 = np.load('distance_matrices_neighbors_5_01_35.npy')
mean_distance_matrix_5_01_35 = np.load('mean_distance_matrix_neighbors_5_01_35.npy')

In [None]:
# UMAP hyperparameters for this experiment
n_neighbors, min_dist, n_components = 5, 0.1, 2
n_runs = 35  # number of independent UMAP fits

# One projection of the flattened training images is collected per run
umap_projections_5_01_35 = []
for run in range(n_runs):
    # random_state=None: no fixed seed, so the runs are allowed to differ
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_5_01_35.append(projection)

# Stack the runs along a new leading axis
umap_projections_5_01_35 = np.array(umap_projections_5_01_35)

# Point-wise mean and standard deviation over the run axis
mean_projection_5_01_35 = umap_projections_5_01_35.mean(axis=0)
std_projection_5_01_35 = umap_projections_5_01_35.std(axis=0)

# Cache everything on disk under the '_5_01_35' identifier
for npy_path, npy_payload in (
    ('umap_projections_5_01_35.npy', umap_projections_5_01_35),
    ('mean_projection_5_01_35.npy', mean_projection_5_01_35),
    ('std_projection_5_01_35.npy', std_projection_5_01_35),
):
    np.save(npy_path, npy_payload)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_5_01_35'.")


In [None]:
### USE THIS ###

# Run-level filter: rather than thresholding individual points, each run is
# summarised by its average point distance to the mean projection and compared
# with an aggregate threshold taken over all runs.

# Offset of every point in every run from the mean projection
offsets_from_mean = umap_projections_5_01_35 - mean_projection_5_01_35[None, :, :]

# Euclidean length of each offset -> shape (n_runs, n_samples)
distances_to_mean_5_01_35 = np.sqrt(np.sum(offsets_from_mean**2, axis=2))

# One average distance per run -> shape (n_runs,)
average_distances_per_run = np.mean(distances_to_mean_5_01_35, axis=1)

# Threshold: 90th percentile of the per-run averages
average_distance_threshold = np.percentile(average_distances_per_run, 90)

# Indices of the runs whose average distance does not exceed the threshold
valid_runs = np.flatnonzero(
    average_distances_per_run <= average_distance_threshold
).tolist()

print(f"Valid runs: {valid_runs}")
print(f"Number of valid runs: {len(valid_runs)}")

In [None]:
# Sanity check on the training labels: later cells index projections with
# boolean masks such as `y_train == cluster_label`, which needs an ndarray.
print(f"y_train type: {type(y_train)}")  # Should now be <class 'numpy.ndarray'>
print(f"y_train shape: {y_train.shape}")

------

#### Clustering

In [None]:
# K-Means is fitted separately on every UMAP run and the resulting centroids
# are collected in one array indexed as [run, cluster, dimension].
# NOTE(review): cluster indices come from independent fits, so index i in one
# run presumably need not denote the same group as index i in another run —
# confirm before comparing centroids per index across runs.
n_clusters = 10
n_runs = len(umap_projections_5_01_35)

kmeans_centroids_5_01 = np.zeros(
    (n_runs, n_clusters, umap_projections_5_01_35.shape[-1])
)

for run, umap_projection in enumerate(umap_projections_5_01_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(umap_projection)
    kmeans_centroids_5_01[run] = kmeans.cluster_centers_

In [None]:
# Spread of each centroid index over the runs, separately for the first (x)
# and second (y) coordinate of `kmeans_centroids_5_01`.
std_dev_x_5_01, std_dev_y_5_01 = np.zeros(10), np.zeros(10)

for i in range(10):
    # Coordinates of centroid i across all runs
    cluster_x_coords = kmeans_centroids_5_01[:, i, 0]
    cluster_y_coords = kmeans_centroids_5_01[:, i, 1]

    std_dev_x_5_01[i] = np.std(cluster_x_coords)
    std_dev_y_5_01[i] = np.std(cluster_y_coords)

print("Standard deviation of x coordinates per cluster:", std_dev_x_5_01)
print("Standard deviation of y coordinates per cluster:", std_dev_y_5_01)

#### Centroid stability

Standard deviation calculation

In [None]:

# Initialize arrays to store standard deviations (one entry per cluster index)
# NOTE(review): this cell reads `kmeans_centroids_5`, which is not assigned in
# the surrounding cells (the clustering cell above fills `kmeans_centroids_5_01`)
# — presumably it comes from an earlier experiment; confirm it is the intended array.
std_dev_x = np.zeros(10)
std_dev_y = np.zeros(10)

# Loop through each cluster to calculate std deviation for x and y coordinates
for i in range(10):
    # Extract all x and y coordinates for the i-th cluster over all runs
    cluster_x_coords = kmeans_centroids_5[:, i, 0]  # All x coords for cluster i
    cluster_y_coords = kmeans_centroids_5[:, i, 1]  # All y coords for cluster i
    
    # Calculate standard deviation in x and y
    std_dev_x[i] = np.std(cluster_x_coords)
    std_dev_y[i] = np.std(cluster_y_coords)

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# For every (trial, cluster) pair, record the centroid and whether both of its
# coordinates lie within mean ± 2 standard deviations of that cluster.
# NOTE(review): `kmeans_centroids_5` and `centroid_mean_5_35` are not assigned
# in the surrounding cells — presumably defined earlier; confirm.
data_v2 = []

for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_5[trial, cluster]
        mean_x, mean_y = centroid_mean_5_35[cluster]

        # 2-sigma interval per coordinate
        lower_bound_x = mean_x - 2 * std_dev_x[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x[cluster]
        lower_bound_y = mean_y - 2 * std_dev_y[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y[cluster]

        inside_2_std = (
            (lower_bound_x <= centroid_coord[0] <= upper_bound_x)
            and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)
        )

        # Trials are reported 1-based in the table
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

df_results_v2 = pd.DataFrame(
    data_v2,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)

In [None]:

# Group the DataFrame by Trial and check if all clusters in each trial are True for 'Inside 2 std'
# (result: one boolean per trial, indexed by the 1-based trial number)
trials_all_true = df_results_v2.groupby('Trial')['Inside 2 std'].all()

In [None]:
### NO NEED TO RE RUN ###
# Filter the trials where all clusters were True
# (boolean-mask the series with itself, then keep the trial numbers from the index)
trials_with_all_true = trials_all_true[trials_all_true].index.tolist()

In [None]:
### NO NEED TO RE RUN ###
# Output the list of trials whose centroids all fell inside the 2-std box
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
### NO NEED TO RE RUN ###
# Filter the trials where not all clusters were True
# (`~` inverts the boolean series, selecting trials with at least one False)
trials_with_some_false = trials_all_true[~trials_all_true].index.tolist()

# Output the list of trials where some clusters were False
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:
### NO NEED TO RE RUN ###
# Save the result table to a CSV file.
# The identifier is written literally: inside an f-string `{5_01}` is the
# integer literal 501 (underscore digit separator), which produced
# 'result_table_neighbors_v2_501_35.csv' instead of the '_5_01_35' naming
# used by every other artefact of this experiment.
df_results_v2.to_csv('result_table_neighbors_v2_5_01_35.csv', index=False)

-------

#### Distance Matrix calculation

In [None]:
# Label-based centroids: for each valid run, the centroid of label c is the
# mean of the projected points whose `y_train` value equals c (c = 0..9).
cluster_centroids_per_run = []

for run in valid_runs:
    projections = umap_projections_5_01_35[run]

    centroids = []
    for cluster_label in range(10):
        cluster_points = projections[y_train == cluster_label]
        centroid = cluster_points.mean(axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.array(centroids))

# Pairwise Euclidean distances between the ten centroids, one matrix per run
distance_matrices_5_01_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids, metric='euclidean')
    distance_matrices_5_01_35.append(distance_matrix)

# Stack to (n_valid_runs, 10, 10) and average over the run axis
distance_matrices_5_01_35 = np.array(distance_matrices_5_01_35)
mean_distance_matrix_5_01_35 = np.mean(distance_matrices_5_01_35, axis=0)

# Min-max scaling of the mean matrix
normalized_mean_distance_matrix_5_01_35 = (
    (mean_distance_matrix_5_01_35 - np.min(mean_distance_matrix_5_01_35))
    / (np.max(mean_distance_matrix_5_01_35) - np.min(mean_distance_matrix_5_01_35))
)

# Heatmap of the scaled mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_5_01_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Cache the per-run matrices and their mean
np.save('distance_matrices_neighbors_5_01_35.npy', distance_matrices_5_01_35)
np.save('mean_distance_matrix_neighbors_5_01_35.npy', mean_distance_matrix_5_01_35)

print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_5_01_35}")

#### Minimum Spanning Tree - MST

In [None]:
# Create a graph from the distance matrix (weights rounded to 3 decimals)
G_5_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_5_01_35,3))
# Persist the weighted adjacency matrix rather than the Graph object itself:
# np.save converts its argument to an array, and a Graph passed directly does
# not keep its edges/weights that way. The saved matrix can be turned back
# into a graph with nx.from_numpy_array(np.load('G_5_01_35.npy')).
np.save('G_5_01_35.npy', nx.to_numpy_array(G_5_01_35))

# Draw the graph
pos = nx.spring_layout(G_5_01_35, seed=42)  # positions for all nodes
nx.draw(G_5_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_5_01_35, 'weight')
nx.draw_networkx_edge_labels(G_5_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_5_01_35 = nx.minimum_spanning_tree(G_5_01_35)
# Persist the MST as its weighted adjacency matrix (zeros where there is no
# tree edge); saving the Graph object itself with np.save does not keep the
# edges or their weights. Reload with nx.from_numpy_array(np.load(...)).
np.save('mst_5_01_35.npy', nx.to_numpy_array(mst_5_01_35))

# Define positions for all nodes
pos = nx.spring_layout(mst_5_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_5_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_5_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_5_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=5, min_dist=0.1")
plt.show()

In [None]:
# Step 1: Calculate the standard deviation for each pair of clusters across all runs
# (reduction over axis 0 of the stacked per-run distance matrices; np.std is
# called with its default ddof, i.e. the population form)
distance_matrix_std_5_01_35 = np.std(distance_matrices_5_01_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
np.save("distance_matrix_std_5_01_35.npy", distance_matrix_std_5_01_35)

# Output the results
print("Standard Deviation Distance Matrix (5_01_35):\n", distance_matrix_std_5_01_35)

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Critical value for the normal distribution
# Sample size for the standard error = number of distance matrices actually
# averaged. The matrices are built from the valid runs only, so this can be
# smaller than the 35 UMAP runs that were previously hard-coded here.
n_runs = distance_matrices_5_01_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_5_01_35 = distance_matrix_std_5_01_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_5_01_35 = z_score * sem_matrix_5_01_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_5_01_35 = mean_distance_matrix_5_01_35 - margin_of_error_matrix_5_01_35
upper_limit_intconf_matrix_5_01_35 = mean_distance_matrix_5_01_35 + margin_of_error_matrix_5_01_35

# Ensure no negative values in the lower limit matrix (distances cannot be negative)
lower_limit_intconf_matrix_5_01_35 = np.maximum(lower_limit_intconf_matrix_5_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_5_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_5_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_5_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_5_01_35.npy', lower_limit_intconf_matrix_5_01_35)
np.save('upper_limit_intconf_matrix_5_01_35.npy', upper_limit_intconf_matrix_5_01_35)

Confidence interval

In [16]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so its smallest entry is 0 and its largest is 1.

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale.

    Returns
    -------
    np.ndarray
        ``(matrix - min) / (max - min)`` as floats. If every entry is equal
        (zero range) an all-zero array of the same shape is returned instead
        of dividing by zero.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: the ratio would be 0/0
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Rescale each confidence-bound matrix with normalize_matrix (each matrix is
# scaled by its own min and max) and cache the results on disk.
norm_lower_limit_intconf_matrix_5_01_35 = normalize_matrix(lower_limit_intconf_matrix_5_01_35)
norm_upper_limit_intconf_matrix_5_01_35 = normalize_matrix(upper_limit_intconf_matrix_5_01_35)
np.save('norm_lower_limit_intconf_matrix_5_01_35.npy', norm_lower_limit_intconf_matrix_5_01_35)
np.save('norm_upper_limit_intconf_matrix_5_01_35.npy', norm_upper_limit_intconf_matrix_5_01_35)

In [None]:
# Three side-by-side heatmaps: lower bound, mean and upper bound of the
# normalized centroid distance matrix.
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right
heatmap_panels = (
    (
        norm_lower_limit_intconf_matrix_5_01_35,
        "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=5, min_dist=0.1)",
        "",
    ),
    (
        normalized_mean_distance_matrix_5_01_35,
        "Normalized Mean Dist. Matrix (k=10, n_neighbors=5, min_dist=0.1)",
        "Cluster",
    ),
    (
        norm_upper_limit_intconf_matrix_5_01_35,
        "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=5, min_dist=0.1)",
        "",
    ),
)

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Draw the minimum spanning tree of a distance matrix on a given axes
def plot_mst(matrix, title, ax, color='red'):
    """Plot the MST of the graph whose edge weights are ``matrix`` (rounded to 3 decimals).

    Parameters
    ----------
    matrix : array passed to ``nx.from_numpy_array`` after rounding.
    title : str, title set on ``ax``.
    ax : matplotlib axes that receives the drawing.
    color : edge colour of the tree (default ``'red'``).
    """
    weighted_graph = nx.from_numpy_array(np.round(matrix, 3))
    tree = nx.minimum_spanning_tree(weighted_graph)

    # Seeded spring layout so repeated calls place the nodes identically
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# One row of three MST plots: lower limit (left), mean (middle), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, axes index, edge colour); drawn in the listed order
mst_panels = (
    (normalized_mean_distance_matrix_5_01_35, "UMAP MST - Mean Distances - n_neighbors=5 min_dist = 0.1", 1, 'red'),
    (norm_lower_limit_intconf_matrix_5_01_35, "UMAP MST - Lower Limit - n_neighbors=5 min_dist = 0.1", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_5_01_35, "UMAP MST - Upper Limit - n_neighbors=5 min_dist = 0.1", 2, 'green'),
)
for mst_matrix, mst_title, axes_index, edge_colour in mst_panels:
    plot_mst(mst_matrix, mst_title, axes[axes_index], color=edge_colour)

plt.tight_layout()
plt.show()

#### Adaptable Radius Final Version & Results

In [None]:
# Function to calculate cluster metrics
def calculate_cluster_metrics(umap_projections, y_labels, n_clusters=10):
    """
    Calculate average cluster radii and neighbor counts for each cluster over all runs.

    Clusters are the label values ``0 .. n_clusters - 1`` found in ``y_labels``.
    For every run the centre of a cluster is the mean of its points; the radius
    of a cluster is the mean point-to-centre distance, averaged over runs; the
    neighbour count is the number of points (of any label) lying within that
    radius of the cluster centre, averaged over runs.

    Parameters
    ----------
    umap_projections : sequence of 2-D arrays
        One (n_samples, n_dims) projection per run.
    y_labels : array-like of length n_samples
        Label of every sample, shared by all runs.
    n_clusters : int, default 10
        Number of label values to evaluate.

    Returns
    -------
    radii_per_cluster : list of float
        Average radius per cluster; NaN for a label with no samples.
    average_neighbor_counts : np.ndarray of shape (n_clusters,)
        Average neighbour count per cluster; 0 for a label with no samples.
    """
    y_labels = np.asarray(y_labels)
    projections = [np.asarray(x_umap) for x_umap in umap_projections]

    if not projections:
        # No runs: nothing to average
        return [np.nan] * n_clusters, np.full(n_clusters, np.nan)

    # One membership mask per label value. Using the same masks in every step
    # keeps centre i, radius i and count i tied to label i even when some
    # label has no samples (previously centres were indexed by their position
    # in np.unique, which breaks unless the labels are exactly 0..n_clusters-1).
    cluster_masks = [y_labels == cluster_idx for cluster_idx in range(n_clusters)]

    # Step 1: Calculate cluster centers for each run
    cluster_centers_full = []
    for x_umap in projections:
        centers_run = [
            np.mean(x_umap[mask], axis=0) if mask.any() else np.full(x_umap.shape[1], np.nan)
            for mask in cluster_masks
        ]
        cluster_centers_full.append(np.array(centers_run))

    # Step 2: Calculate average radii for each cluster
    radii_per_cluster = []
    for cluster_idx, mask in enumerate(cluster_masks):
        if not mask.any():
            radii_per_cluster.append(np.nan)
            continue
        radii_cluster = [
            # Mean distance of the cluster's points to their centre in this run
            np.mean(np.linalg.norm(x_umap[mask] - centers_run[cluster_idx], axis=1))
            for x_umap, centers_run in zip(projections, cluster_centers_full)
        ]
        radii_per_cluster.append(np.mean(radii_cluster))  # Average radius across runs

    # Step 3: Calculate neighbor counts for each cluster
    neighbor_counts_full = np.zeros((len(projections), n_clusters))
    for run_idx, (x_umap, centers_run) in enumerate(zip(projections, cluster_centers_full)):
        for cluster_idx, radius in enumerate(radii_per_cluster):
            if np.isnan(radius):
                continue  # empty cluster: count stays 0
            distances_to_center = np.linalg.norm(x_umap - centers_run[cluster_idx], axis=1)
            # Count points within the (run-averaged) radius
            neighbor_counts_full[run_idx, cluster_idx] = np.sum(distances_to_center <= radius)

    average_neighbor_counts = np.mean(neighbor_counts_full, axis=0)  # Average across runs

    return radii_per_cluster, average_neighbor_counts

# n_neighbors settings to compare, each with its stack of 35-run projections
# (min_dist = 0.1 in all of them, per the variable names)
n_neighbors_values = [5, 10, 20, 30, 50, 100]
projections_by_n_neighbors = {
    5: umap_projections_5_01_35,
    10: umap_projections_10_01_35,
    20: umap_projections_20_01_35,
    30: umap_projections_30_01_35,
    50: umap_projections_50_01_35,
    100: umap_projections_100_01_35,
}
results = []

for n_neighbors in n_neighbors_values:
    umap_projections = projections_by_n_neighbors[n_neighbors]

    # Per-cluster radius and neighbour count for this setting
    radii_per_cluster, average_neighbor_counts = calculate_cluster_metrics(umap_projections, y_train)

    # One result row per cluster
    for cluster_idx, (cluster_radius, cluster_neighbors) in enumerate(
        zip(radii_per_cluster, average_neighbor_counts)
    ):
        results.append({
            "N": n_neighbors,
            "Cluster": cluster_idx,
            "Radius": np.round(cluster_radius, 3),
            "Number of Neighbors": np.round(cluster_neighbors, 0)
        })

# Long-format table, saved for later use
df_results = pd.DataFrame(results)
df_results.to_csv("radius_neighbor_analysis_merged_MinDist_01.csv", index=False)

# Wide view: clusters as rows, one column per (metric, n_neighbors) pair
pivot_table = df_results.pivot(
    index="Cluster",
    columns="N",
    values=["Radius", "Number of Neighbors"],
)
print(pivot_table)

In [55]:
def plot_mean_neighbor_counts_across_runs(umap_projections_list, n_neighbors_values, y_labels, n_clusters=10):
    """
    Plot, for each n_neighbors setting, the per-run mean number of points lying
    inside the clusters' radii.

    For every run, each cluster (label value 0..n_clusters-1) gets a centre (the
    mean of its points) and a radius (the mean distance of its points to that
    centre). All points of the run, whatever their label, that lie within the
    radius of the centre are counted; the counts are then averaged over the
    clusters whose radius is strictly positive.

    Parameters
    ----------
    umap_projections_list : sequence
        One entry per n_neighbors setting; each entry is iterated run by run.
    n_neighbors_values : sequence
        Legend values, matched to `umap_projections_list` by position.
    y_labels : array-like
        Labels compared against each cluster index to build a boolean mask.
    n_clusters : int, default 10
        Number of label values examined.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    mean_counts_per_setting = []

    for runs_for_setting in umap_projections_list:
        mean_counts_this_setting = []

        for embedding in runs_for_setting:
            counts_in_radius = []

            for label in range(n_clusters):
                members = embedding[y_labels == label]
                if len(members) == 0:
                    # Empty cluster: nothing to measure, it does not enter the mean
                    continue

                centre = np.mean(members, axis=0)
                # Radius = average distance of the cluster's own points to its centre
                radius = np.mean(np.linalg.norm(members - centre, axis=1))
                if radius > 0:
                    # Count every point of the run inside the radius, not only members
                    dist_all_points = np.linalg.norm(embedding - centre, axis=1)
                    counts_in_radius.append(np.sum(dist_all_points <= radius))

            mean_counts_this_setting.append(np.mean(counts_in_radius))

        mean_counts_per_setting.append(mean_counts_this_setting)

    # One line per n_neighbors setting, x axis = 1-based run index
    plt.figure(figsize=(10, 6))
    for i, series in enumerate(mean_counts_per_setting):
        run_numbers = range(1, len(series) + 1)
        plt.plot(run_numbers, series, label=f'n_neighbors={n_neighbors_values[i]}', marker='o')

    plt.xlabel("Run Index")
    plt.ylabel("Mean Number of Points")
    plt.title("Mean Number of Points Across Runs for min_dist = 0.1")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # legend outside the axes, on the right
    plt.grid(True)
    plt.show()

In [None]:
# Stacks of UMAP runs for min_dist = 0.1, listed in the same order as n_neighbors_values
projections_min_dist_01 = [
    umap_projections_5_01_35,
    umap_projections_10_01_35,
    umap_projections_20_01_35,
    umap_projections_30_01_35,
    umap_projections_50_01_35,
    umap_projections_100_01_35,
]

plot_mean_neighbor_counts_across_runs(
    umap_projections_list=projections_min_dist_01,
    n_neighbors_values=n_neighbors_values,
    y_labels=y_train,
)

-----------

### UMAP n_neighbours=10 min_dist=0.1

In [16]:
# Reload the artefacts previously saved for n_neighbors=10, min_dist=0.1 (35 runs),
# so the analysis cells below can run without recomputing UMAP.
# Projections and their run-wise mean / standard deviation
umap_projections_10_01_35 = np.load('umap_projections_10_01_35.npy')
mean_projection_10_01_35 = np.load('mean_projection_10_01_35.npy')
std_projection_10_01_35 = np.load('std_projection_10_01_35.npy')

# Confidence-interval bounds of the centroid distance matrix
lower_limit_intconf_matrix_10_01_35 = np.load('lower_limit_intconf_matrix_10_01_35.npy')
upper_limit_intconf_matrix_10_01_35 = np.load('upper_limit_intconf_matrix_10_01_35.npy')

# Per-run distance matrices and their mean
distance_matrices_10_01_35 = np.load('distance_matrices_neighbors_10_01_35.npy')
mean_distance_matrix_10_01_35 = np.load('mean_distance_matrix_neighbors_10_01_35.npy')

# Min-max normalised confidence-interval bounds
norm_lower_limit_intconf_matrix_10_01_35 = np.load('norm_lower_limit_intconf_matrix_10_01_35.npy')
norm_upper_limit_intconf_matrix_10_01_35 = np.load('norm_upper_limit_intconf_matrix_10_01_35.npy')

In [None]:
# UMAP configuration for this section
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 35  # Number of independent repetitions

# Fit a fresh, unseeded UMAP model per repetition and stack the embeddings
# along a new leading "run" axis.
umap_projections_10_01_35 = np.array([
    umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    ).fit_transform(x_train_flattened)
    for _ in range(n_runs)
])

# Point-wise mean and standard deviation over the run axis
mean_projection_10_01_35 = umap_projections_10_01_35.mean(axis=0)
std_projection_10_01_35 = umap_projections_10_01_35.std(axis=0)

# Persist everything under the '_10_01_35' identifier
for file_name, payload in (
    ('umap_projections_10_01_35.npy', umap_projections_10_01_35),
    ('mean_projection_10_01_35.npy', mean_projection_10_01_35),
    ('std_projection_10_01_35.npy', std_projection_10_01_35),
):
    np.save(file_name, payload)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_10_01_35'.")

#### Clustering

In [None]:
# KMeans is fitted with this many clusters on every run
n_clusters = 10

# Run count and embedding dimensionality are read off the stacked projections
n_runs = umap_projections_10_01_35.shape[0]
embedding_dim = umap_projections_10_01_35.shape[2]

# kmeans_centroids_10_01[run, k] holds the k-th centroid found in that run
# NOTE(review): each fit numbers its clusters independently, so index k may not
# refer to the same group of points in different runs — confirm before comparing
# centroids across runs by index.
kmeans_centroids_10_01 = np.zeros((n_runs, n_clusters, embedding_dim))

for run, umap_projection in enumerate(umap_projections_10_01_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(umap_projection)
    kmeans_centroids_10_01[run] = kmeans.cluster_centers_

In [None]:
# Spread of each centroid index over the runs, separately for the two embedding
# coordinates (column 0 = x, column 1 = y).
std_dev_x_10_01 = np.array([np.std(kmeans_centroids_10_01[:, k, 0]) for k in range(10)])
std_dev_y_10_01 = np.array([np.std(kmeans_centroids_10_01[:, k, 1]) for k in range(10)])

# Report both vectors (one value per cluster index)
print("Standard deviation of x coordinates per cluster:", std_dev_x_10_01)
print("Standard deviation of y coordinates per cluster:", std_dev_y_10_01)

#### Centroid stability

Standard deviation calculation

In [None]:

# Per-cluster standard deviation of the centroid coordinates over all runs.
# NOTE(review): this reads `kmeans_centroids_10`, whereas the clustering cell of
# this section stores its result in `kmeans_centroids_10_01` — confirm which
# array is intended here.
std_dev_x = np.array([np.std(kmeans_centroids_10[:, k, 0]) for k in range(10)])  # x coordinate
std_dev_y = np.array([np.std(kmeans_centroids_10[:, k, 1]) for k in range(10)])  # y coordinate

# Report both vectors (one value per cluster index)
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Rows of the stability table: [trial number, cluster, centroid, inside-flag]
data_v2 = []

# For every (trial, cluster) pair, test whether the centroid lies inside the
# axis-aligned box "mean +/- 2 standard deviations" of that cluster.
# NOTE(review): `kmeans_centroids_10` and `centroid_mean_10_35` are not defined
# in this section — confirm they belong to the n_neighbors=10, min_dist=0.1 runs.
for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_10[trial, cluster]
        mean_x, mean_y = centroid_mean_10_35[cluster]

        # Box limits along x
        lower_bound_x = mean_x - 2 * std_dev_x[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x[cluster]
        # Box limits along y
        lower_bound_y = mean_y - 2 * std_dev_y[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y[cluster]

        within_x = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        within_y = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = within_x and within_y

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Tabulate the collected rows
column_names = ['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std']
df_results_v2 = pd.DataFrame(data_v2, columns=column_names)

In [None]:

# One boolean per trial: True only when every cluster of that trial is inside its 2-std box
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Keep only the trials flagged True and collect their trial numbers
fully_inside = trials_all_true[trials_all_true]
trials_with_all_true = fully_inside.index.tolist()

In [None]:
# Show the trial numbers for which every cluster centroid passed the 2-std check
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Complement of the previous selection: trials with at least one cluster outside its box
partly_outside = trials_all_true[~trials_all_true]
trials_with_some_false = partly_outside.index.tolist()

# Show those trial numbers
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The identifier is written as literal text: inside an f-string, `{10_01}` is the
# integer literal 1001 (underscores are digit separators), which would name the
# file '..._v2_1001_35.csv' instead of using the '10_01_35' identifier.
df_results_v2.to_csv('result_table_neighbors_v2_10_01_35.csv', index=False)

-----

#### Distance Matrix Calculation

In [None]:
# Label-based centroids: for each valid run, the mean embedding position of the
# points carrying each label 0-9 (array of 10 centroids per run).
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_10_01_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Euclidean distance between every pair of centroids, one matrix per run,
# stacked along a leading run axis
distance_matrices_10_01_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Element-wise average over the runs
mean_distance_matrix_10_01_35 = np.mean(distance_matrices_10_01_35, axis=0)

# Min-max rescaling of the mean matrix
smallest_distance = np.min(mean_distance_matrix_10_01_35)
largest_distance = np.max(mean_distance_matrix_10_01_35)
normalized_mean_distance_matrix_10_01_35 = (
    (mean_distance_matrix_10_01_35 - smallest_distance) / (largest_distance - smallest_distance)
)

# Heatmap of the rescaled matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_10_01_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=10)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean (the normalised matrix is not saved)
np.save('distance_matrices_neighbors_10_01_35.npy', distance_matrices_10_01_35)
np.save('mean_distance_matrix_neighbors_10_01_35.npy', mean_distance_matrix_10_01_35)

print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_10_01_35}")

#### Minimum Spanning Tree - MST

In [None]:
# Build a weighted graph whose adjacency matrix is the normalised mean distance
# matrix rounded to 3 decimals (zero entries produce no edge).
G_10_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_10_01_35, 3))

# Persist the graph as its weighted adjacency matrix. Passing the Graph object
# itself to np.save does not store its edges or weights, because np.save first
# coerces its argument to an array. Reload with
# nx.from_numpy_array(np.load('G_10_01_35.npy')).
np.save('G_10_01_35.npy', nx.to_numpy_array(G_10_01_35))

# Draw the graph
pos = nx.spring_layout(G_10_01_35, seed=42)  # fixed seed -> reproducible node positions
nx.draw(G_10_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_10_01_35, 'weight')
nx.draw_networkx_edge_labels(G_10_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
# plt.show()

In [None]:
# Compute the minimum spanning tree of the graph first: both the total weight
# and the drawing below depend on it (computing the total before this line
# raises NameError on a fresh kernel or silently reuses a stale tree).
mst_10_01_35 = nx.minimum_spanning_tree(G_10_01_35)

# Persist the tree as its weighted adjacency matrix. Passing the Graph object
# itself to np.save does not store its edges or weights. Reload with
# nx.from_numpy_array(np.load('mst_10_01_35.npy')).
np.save('mst_10_01_35.npy', nx.to_numpy_array(mst_10_01_35))

# Calculate the total weight of the MST (sum of its edge weights)
total_weight = sum(nx.get_edge_attributes(mst_10_01_35, 'weight').values())

# Print the total weight
print(f"Total weight of the MST: {total_weight}")

# Define positions for all nodes
pos = nx.spring_layout(mst_10_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_10_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_10_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_10_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=10, min_dist=0.1")
plt.show()

In [None]:
# Compute the minimum spanning tree (MST) of the graph
mst_10_01_35 = nx.minimum_spanning_tree(G_10_01_35)

# Save the MST for later use as its weighted adjacency matrix. Passing the Graph
# object itself to np.save does not store its edges or weights. Reload with
# nx.from_numpy_array(np.load('mst_10_01_35.npy')).
np.save('mst_10_01_35.npy', nx.to_numpy_array(mst_10_01_35))

# Define positions for all nodes in the MST using a spring layout
# (fixed seed so the layout is reproducible)
pos = nx.spring_layout(mst_10_01_35, seed=42)

# Increase figure size for better visibility
plt.figure(figsize=(12, 8))

# Draw the MST with larger nodes, thicker edges, and a larger font for labels
nx.draw(
    mst_10_01_35,
    pos,
    with_labels=True,
    node_color='lightblue',
    edge_color='red',
    node_size=1000,  # Larger node size for better visibility
    font_size=12,    # Larger font size for node labels
    width=3          # Thicker edge lines
)

# Get edge weights and format them to 2 decimal places for clarity
edge_labels = nx.get_edge_attributes(mst_10_01_35, 'weight')
formatted_edge_labels = {edge: f"{weight:.2f}" for edge, weight in edge_labels.items()}

# Draw edge labels with formatted weights
nx.draw_networkx_edge_labels(
    mst_10_01_35,
    pos,
    edge_labels=formatted_edge_labels,
    font_size=20,    # Font size for edge labels
    label_pos=0.5    # Position edge labels at the center of edges
)

# Add a title to the plot
plt.title("MST UMAP - n_neighbors=10, min_dist=0.1", fontsize=16)

# Display the plot
plt.show()

In [None]:
# Entry-wise standard deviation of the centroid distances over the run axis
# (np.std default, i.e. ddof=0)
distance_matrix_std_10_01_35 = np.std(a=distance_matrices_10_01_35, axis=0)

# Keep a copy on disk for later cells
np.save("distance_matrix_std_10_01_35.npy", distance_matrix_std_10_01_35)

print("Standard Deviation Distance Matrix (10_01_35):\n", distance_matrix_std_10_01_35)

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Number of runs that actually contributed a distance matrix. The matrices are
# built from `valid_runs`, so the count is read from the stacked array instead
# of assuming 35; a fixed 35 would shrink the interval whenever fewer runs are valid.
n_runs = distance_matrices_10_01_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_10_01_35 = distance_matrix_std_10_01_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_10_01_35 = z_score * sem_matrix_10_01_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_10_01_35 = mean_distance_matrix_10_01_35 - margin_of_error_matrix_10_01_35
upper_limit_intconf_matrix_10_01_35 = mean_distance_matrix_10_01_35 + margin_of_error_matrix_10_01_35

# Distances cannot be negative, so clip the lower bound at 0
lower_limit_intconf_matrix_10_01_35 = np.maximum(lower_limit_intconf_matrix_10_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_10_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_10_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_10_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_10_01_35.npy', lower_limit_intconf_matrix_10_01_35)
np.save('upper_limit_intconf_matrix_10_01_35.npy', upper_limit_intconf_matrix_10_01_35)

In [25]:
def normalize_matrix(matrix):
    """
    Min-max scale `matrix` so that its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale; converted to a float NumPy array.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with values in [0, 1]. When every entry is
        identical (zero range) an all-zero array is returned instead of the
        NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: there is no spread to rescale
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Min-max normalise each confidence-interval bound and store it on disk
norm_lower_limit_intconf_matrix_10_01_35 = normalize_matrix(matrix=lower_limit_intconf_matrix_10_01_35)
norm_upper_limit_intconf_matrix_10_01_35 = normalize_matrix(matrix=upper_limit_intconf_matrix_10_01_35)

for file_name, normalized_bound in (
    ('norm_lower_limit_intconf_matrix_10_01_35.npy', norm_lower_limit_intconf_matrix_10_01_35),
    ('norm_upper_limit_intconf_matrix_10_01_35.npy', norm_upper_limit_intconf_matrix_10_01_35),
):
    np.save(file_name, normalized_bound)

In [None]:
# Three side-by-side heatmaps: lower bound, mean, upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right;
# only the middle panel carries a y-axis label
heatmap_panels = [
    (
        norm_lower_limit_intconf_matrix_10_01_35,
        "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=10, min_dist=0.1)",
        "",
    ),
    (
        normalized_mean_distance_matrix_10_01_35,
        "Normalized Mean Dist. Matrix (k=10, n_neighbors=10, min_dist=0.1)",
        "Cluster",
    ),
    (
        norm_upper_limit_intconf_matrix_10_01_35,
        "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=10, min_dist=0.1)",
        "",
    ),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of a distance matrix on a given axis
def plot_mst(matrix, title, ax, color='red'):
    """
    Draw the minimum spanning tree derived from `matrix` on `ax`.

    The matrix is rounded to 3 decimals and used as a weighted adjacency
    matrix; the tree is laid out with a seeded spring layout and each edge is
    annotated with its weight.

    Parameters
    ----------
    matrix : array-like
        Square matrix of pairwise distances.
    title : str
        Title set on the axis.
    ax : matplotlib axis
        Axis that receives the drawing.
    color : str, default 'red'
        Edge colour.
    """
    rounded_distances = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_distances))

    # Seeded layout so repeated calls place the nodes identically
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight
    weights = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weights, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# One row of three axes: lower limit (left), mean (centre), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, axis position, edge colour); drawn in this order
mst_panels = [
    (normalized_mean_distance_matrix_10_01_35, "UMAP MST - Mean Distances - n_neighbors=10, min_dist=0.1", 1, 'red'),
    (norm_lower_limit_intconf_matrix_10_01_35, "UMAP MST - Lower Limit- n_neighbors=10, min_dist=0.1", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_10_01_35, "UMAP MST - Upper Limit- n_neighbors=10, min_dist=0.1", 2, 'green'),
]
for panel_matrix, panel_title, axis_position, edge_colour in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[axis_position], color=edge_colour)

# Tidy the spacing between panels and display
plt.tight_layout()
plt.show()

-------

### UMAP n_neighbours=20 min_dist=0.1

In [9]:
# Reload the artefacts previously saved for n_neighbors=20, min_dist=0.1 (35 runs),
# so the analysis cells below can run without recomputing UMAP.
# Projections and their run-wise mean / standard deviation
umap_projections_20_01_35 = np.load('umap_projections_20_01_35.npy')
mean_projection_20_01_35 = np.load('mean_projection_20_01_35.npy')
std_projection_20_01_35 = np.load('std_projection_20_01_35.npy')

# Confidence-interval bounds of the centroid distance matrix
lower_limit_intconf_matrix_20_01_35 = np.load('lower_limit_intconf_matrix_20_01_35.npy')
upper_limit_intconf_matrix_20_01_35 = np.load('upper_limit_intconf_matrix_20_01_35.npy')

# Per-run distance matrices and their mean
distance_matrices_20_01_35 = np.load('distance_matrices_neighbors_20_01_35.npy')
mean_distance_matrix_20_01_35 = np.load('mean_distance_matrix_neighbors_20_01_35.npy')

# Min-max normalised confidence-interval bounds
norm_lower_limit_intconf_matrix_20_01_35 = np.load('norm_lower_limit_intconf_matrix_20_01_35.npy')
norm_upper_limit_intconf_matrix_20_01_35 = np.load('norm_upper_limit_intconf_matrix_20_01_35.npy')

In [None]:
# UMAP configuration for this section
n_neighbors = 20
min_dist = 0.1
n_components = 2
n_runs = 35  # Number of independent repetitions

# Fit a fresh, unseeded UMAP model per repetition and stack the embeddings
# along a new leading "run" axis.
umap_projections_20_01_35 = np.array([
    umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    ).fit_transform(x_train_flattened)
    for _ in range(n_runs)
])

# Point-wise mean and standard deviation over the run axis
mean_projection_20_01_35 = umap_projections_20_01_35.mean(axis=0)
std_projection_20_01_35 = umap_projections_20_01_35.std(axis=0)

# Persist everything under the '_20_01_35' identifier
for file_name, payload in (
    ('umap_projections_20_01_35.npy', umap_projections_20_01_35),
    ('mean_projection_20_01_35.npy', mean_projection_20_01_35),
    ('std_projection_20_01_35.npy', std_projection_20_01_35),
):
    np.save(file_name, payload)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_20_01_35'.")

#### Clustering

In [None]:
# KMeans is fitted with this many clusters on every run
n_clusters = 10

# Run count and embedding dimensionality are read off the stacked projections
n_runs = umap_projections_20_01_35.shape[0]
embedding_dim = umap_projections_20_01_35.shape[2]

# kmeans_centroids_20_01[run, k] holds the k-th centroid found in that run
# NOTE(review): each fit numbers its clusters independently, so index k may not
# refer to the same group of points in different runs — confirm before comparing
# centroids across runs by index.
kmeans_centroids_20_01 = np.zeros((n_runs, n_clusters, embedding_dim))

for run, umap_projection in enumerate(umap_projections_20_01_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(umap_projection)
    kmeans_centroids_20_01[run] = kmeans.cluster_centers_

In [None]:
# Spread of each centroid index over the runs, separately for the two embedding
# coordinates (column 0 = x, column 1 = y).
std_dev_x_20_01 = np.array([np.std(kmeans_centroids_20_01[:, k, 0]) for k in range(10)])
std_dev_y_20_01 = np.array([np.std(kmeans_centroids_20_01[:, k, 1]) for k in range(10)])

# Report both vectors (one value per cluster index)
print("Standard deviation of x coordinates per cluster:", std_dev_x_20_01)
print("Standard deviation of y coordinates per cluster:", std_dev_y_20_01)

#### Centroid stability

Standard deviation calculation

In [None]:

# Per-cluster standard deviation of the centroid coordinates over all runs.
# NOTE(review): this reads `kmeans_centroids_20`, whereas the clustering cell of
# this section stores its result in `kmeans_centroids_20_01` — confirm which
# array is intended here.
std_dev_x = np.array([np.std(kmeans_centroids_20[:, k, 0]) for k in range(10)])  # x coordinate
std_dev_y = np.array([np.std(kmeans_centroids_20[:, k, 1]) for k in range(10)])  # y coordinate

# Report both vectors (one value per cluster index)
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Rows of the stability table: [trial number, cluster, centroid, inside-flag]
data_v2 = []

# For every (trial, cluster) pair, test whether the centroid lies inside the
# axis-aligned box "mean +/- 2 standard deviations" of that cluster.
# NOTE(review): `kmeans_centroids_20` and `centroid_mean_20_35` are not defined
# in this section — confirm they belong to the n_neighbors=20, min_dist=0.1 runs.
for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_20[trial, cluster]
        mean_x, mean_y = centroid_mean_20_35[cluster]

        # Box limits along x
        lower_bound_x = mean_x - 2 * std_dev_x[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x[cluster]
        # Box limits along y
        lower_bound_y = mean_y - 2 * std_dev_y[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y[cluster]

        within_x = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        within_y = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = within_x and within_y

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Tabulate the collected rows
column_names = ['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std']
df_results_v2 = pd.DataFrame(data_v2, columns=column_names)

In [None]:

# One boolean per trial: True only when every cluster of that trial is inside its 2-std box
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Keep only the trials flagged True and collect their trial numbers
fully_inside = trials_all_true[trials_all_true]
trials_with_all_true = fully_inside.index.tolist()

In [None]:
# Show the trial numbers for which every cluster centroid passed the 2-std check
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Complement of the previous selection: trials with at least one cluster outside its box
partly_outside = trials_all_true[~trials_all_true]
trials_with_some_false = partly_outside.index.tolist()

# Show those trial numbers
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:
# Save the result table to a CSV file.
# The identifier is written as literal text: inside an f-string, `{20_01}` is the
# integer literal 2001 (underscores are digit separators), which would name the
# file '..._v2_2001_35.csv' instead of using the '20_01_35' identifier.
df_results_v2.to_csv('result_table_neighbors_v2_20_01_35.csv', index=False)

#### Distance Matrix calculation

In [None]:
# Label-based centroids: for each valid run, the mean embedding position of the
# points carrying each label 0-9 (array of 10 centroids per run).
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_20_01_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Euclidean distance between every pair of centroids, one matrix per run,
# stacked along a leading run axis
distance_matrices_20_01_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Element-wise average over the runs
mean_distance_matrix_20_01_35 = np.mean(distance_matrices_20_01_35, axis=0)

# Min-max rescaling of the mean matrix
smallest_distance = np.min(mean_distance_matrix_20_01_35)
largest_distance = np.max(mean_distance_matrix_20_01_35)
normalized_mean_distance_matrix_20_01_35 = (
    (mean_distance_matrix_20_01_35 - smallest_distance) / (largest_distance - smallest_distance)
)

# Heatmap of the rescaled matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_20_01_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=20, min_dist=0.1)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean (the normalised matrix is not saved)
np.save('distance_matrices_neighbors_20_01_35.npy', distance_matrices_20_01_35)
np.save('mean_distance_matrix_neighbors_20_01_35.npy', mean_distance_matrix_20_01_35)

print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_20_01_35}")

In [None]:
# Build a weighted graph whose adjacency matrix is the normalised mean distance
# matrix rounded to 3 decimals (zero entries produce no edge).
G_20_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_20_01_35, 3))

# Persist the graph as its weighted adjacency matrix. Passing the Graph object
# itself to np.save does not store its edges or weights, because np.save first
# coerces its argument to an array. Reload with
# nx.from_numpy_array(np.load('G_20_01_35.npy')).
np.save('G_20_01_35.npy', nx.to_numpy_array(G_20_01_35))

# Draw the graph
pos = nx.spring_layout(G_20_01_35, seed=42)  # fixed seed -> reproducible node positions
nx.draw(G_20_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_20_01_35, 'weight')
nx.draw_networkx_edge_labels(G_20_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - MST

In [None]:
# Compute the minimum spanning tree of the graph
mst_20_01_35 = nx.minimum_spanning_tree(G_20_01_35)

# Persist the tree as its weighted adjacency matrix. Passing the Graph object
# itself to np.save does not store its edges or weights. Reload with
# nx.from_numpy_array(np.load('mst_20_01_35.npy')).
np.save('mst_20_01_35.npy', nx.to_numpy_array(mst_20_01_35))

# Define positions for all nodes (fixed seed -> reproducible layout)
pos = nx.spring_layout(mst_20_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_20_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_20_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_20_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=20, min_dist=0.1")
plt.show()

In [None]:
# Entry-wise standard deviation of the centroid distances over the run axis
# (np.std default, i.e. ddof=0)
distance_matrix_std_20_01_35 = np.std(a=distance_matrices_20_01_35, axis=0)

# Keep a copy on disk for later cells
np.save("distance_matrix_std_20_01_35.npy", distance_matrix_std_20_01_35)

print("Standard Deviation Distance Matrix (20_01_35):\n", distance_matrix_std_20_01_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Number of runs that actually contributed a distance matrix. The matrices are
# built from `valid_runs`, so the count is read from the stacked array instead
# of assuming 35; a fixed 35 would shrink the interval whenever fewer runs are valid.
n_runs = distance_matrices_20_01_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_20_01_35 = distance_matrix_std_20_01_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_20_01_35 = z_score * sem_matrix_20_01_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_20_01_35 = mean_distance_matrix_20_01_35 - margin_of_error_matrix_20_01_35
upper_limit_intconf_matrix_20_01_35 = mean_distance_matrix_20_01_35 + margin_of_error_matrix_20_01_35

# Distances cannot be negative, so clip the lower bound at 0
lower_limit_intconf_matrix_20_01_35 = np.maximum(lower_limit_intconf_matrix_20_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_20_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_20_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_20_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_20_01_35.npy', lower_limit_intconf_matrix_20_01_35)
np.save('upper_limit_intconf_matrix_20_01_35.npy', upper_limit_intconf_matrix_20_01_35)

In [37]:
def normalize_matrix(matrix):
    """
    Min-max scale `matrix` so that its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale; converted to a float NumPy array.

    Returns
    -------
    numpy.ndarray
        Array of the same shape with values in [0, 1]. When every entry is
        identical (zero range) an all-zero array is returned instead of the
        NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: there is no spread to rescale
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Min-max normalise each confidence-interval bound and store it on disk
norm_lower_limit_intconf_matrix_20_01_35 = normalize_matrix(matrix=lower_limit_intconf_matrix_20_01_35)
norm_upper_limit_intconf_matrix_20_01_35 = normalize_matrix(matrix=upper_limit_intconf_matrix_20_01_35)

for file_name, normalized_bound in (
    ('norm_lower_limit_intconf_matrix_20_01_35.npy', norm_lower_limit_intconf_matrix_20_01_35),
    ('norm_upper_limit_intconf_matrix_20_01_35.npy', norm_upper_limit_intconf_matrix_20_01_35),
):
    np.save(file_name, normalized_bound)

In [None]:
# Three side-by-side heatmaps: lower bound, mean, upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right;
# only the middle panel carries a y-axis label
heatmap_panels = [
    (
        norm_lower_limit_intconf_matrix_20_01_35,
        "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=20, min_dist=0.1)",
        "",
    ),
    (
        normalized_mean_distance_matrix_20_01_35,
        "Normalized Mean Dist. Matrix (k=10, n_neighbors=20, min_dist=0.1)",
        "Cluster",
    ),
    (
        norm_upper_limit_intconf_matrix_20_01_35,
        "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=20, min_dist=0.1)",
        "",
    ),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of a distance matrix on a given axis
def plot_mst(matrix, title, ax, color='red'):
    """
    Draw the minimum spanning tree derived from `matrix` on `ax`.

    The matrix is rounded to 3 decimals and used as a weighted adjacency
    matrix; the tree is laid out with a seeded spring layout and each edge is
    annotated with its weight.

    Parameters
    ----------
    matrix : array-like
        Square matrix of pairwise distances.
    title : str
        Title set on the axis.
    ax : matplotlib axis
        Axis that receives the drawing.
    color : str, default 'red'
        Edge colour.
    """
    rounded_distances = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_distances))

    # Seeded layout so repeated calls place the nodes identically
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight
    weights = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weights, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Set up the figure with three subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot MSTs for mean, lower, and upper matrices
plot_mst(normalized_mean_distance_matrix_20_01_35, "UMAP MST - Mean Distances - n_neighbors=20, min_dist=0.1", axes[1], color='red')
plot_mst(norm_lower_limit_intconf_matrix_20_01_35, "UMAP MST - Lower Limit - n_neighbors=20, min_dist=0.1", axes[0], color='blue')
plot_mst(norm_upper_limit_intconf_matrix_20_01_35, "UMAP MST - Upper Limit - n_neighbors=20, min_dist=0.1", axes[2], color='green')

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

### UMAP n_neighbors=30, min_dist=0.1

In [10]:
# Reload the arrays saved by the cells of this section (suffix _30_01_35)

# UMAP embeddings and their per-point statistics
umap_projections_30_01_35 = np.load('umap_projections_30_01_35.npy')
mean_projection_30_01_35 = np.load('mean_projection_30_01_35.npy')
std_projection_30_01_35 = np.load('std_projection_30_01_35.npy')

# Centroid distance matrices (per run and averaged)
distance_matrices_30_01_35 = np.load('distance_matrices_neighbors_30_01_35.npy')
mean_distance_matrix_30_01_35 = np.load('mean_distance_matrix_neighbors_30_01_35.npy')

# Confidence-interval bounds, raw and normalized
lower_limit_intconf_matrix_30_01_35 = np.load('lower_limit_intconf_matrix_30_01_35.npy')
upper_limit_intconf_matrix_30_01_35 = np.load('upper_limit_intconf_matrix_30_01_35.npy')
norm_lower_limit_intconf_matrix_30_01_35 = np.load('norm_lower_limit_intconf_matrix_30_01_35.npy')
norm_upper_limit_intconf_matrix_30_01_35 = np.load('norm_upper_limit_intconf_matrix_30_01_35.npy')

In [None]:
# UMAP settings for this experiment
n_neighbors, min_dist, n_components = 30, 0.1, 2
n_runs = 35  # how many times the embedding is recomputed

# Fit one embedding per run; random_state=None leaves UMAP unseeded
umap_projections_30_01_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_30_01_35.append(projection)

# Turn the per-run list into a single array (runs along axis 0)
umap_projections_30_01_35 = np.array(umap_projections_30_01_35)

# Mean and standard deviation over the run axis
mean_projection_30_01_35 = umap_projections_30_01_35.mean(axis=0)
std_projection_30_01_35 = umap_projections_30_01_35.std(axis=0)

# Persist all three arrays
for file_name, payload in (
    ('umap_projections_30_01_35.npy', umap_projections_30_01_35),
    ('mean_projection_30_01_35.npy', mean_projection_30_01_35),
    ('std_projection_30_01_35.npy', std_projection_30_01_35),
):
    np.save(file_name, payload)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_30_01_35'.")

#### Clustering

In [None]:
# KMeans with 10 clusters, fitted once per stored UMAP run
n_clusters = 10
n_runs = umap_projections_30_01_35.shape[0]

# Centroids indexed as [run, cluster, embedding dimension]
kmeans_centroids_30_01 = np.zeros((n_runs, n_clusters, umap_projections_30_01_35.shape[2]))

for run, umap_projection in enumerate(umap_projections_30_01_35):
    # Same seed for every run so only the embedding changes between fits
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_30_01[run] = kmeans.cluster_centers_

In [None]:
# Spread of each cluster index's centroid over all runs, per coordinate.
# Column 0 of the last axis is x, column 1 is y.
cluster_ids = range(10)
std_dev_x_30_01 = np.array(
    [np.std(kmeans_centroids_30_01[:, i, 0]) for i in cluster_ids], dtype=float
)
std_dev_y_30_01 = np.array(
    [np.std(kmeans_centroids_30_01[:, i, 1]) for i in cluster_ids], dtype=float
)

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x_30_01)
print("Standard deviation of y coordinates per cluster:", std_dev_y_30_01)

#### Centroid stability

Standard deviation calculation

In [None]:

# Per-cluster standard deviation of the centroid coordinates over all runs.
# NOTE(review): kmeans_centroids_30 is not assigned in this section (the
# clustering cell stores kmeans_centroids_30_01) - confirm it is the intended array.
cluster_ids = range(10)
std_dev_x = np.array(
    [np.std(kmeans_centroids_30[:, i, 0]) for i in cluster_ids], dtype=float
)
std_dev_y = np.array(
    [np.std(kmeans_centroids_30[:, i, 1]) for i in cluster_ids], dtype=float
)

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Build a table that records, for every trial and cluster, whether the KMeans
# centroid lies inside the +/- 2 standard-deviation box around the cluster's
# mean centroid.

# Mean centroid per cluster, taken from the same array the standard deviations
# (std_dev_x / std_dev_y) were computed from. The previous code read
# centroid_mean_20_35 here, i.e. the means of the n_neighbors=20 experiment,
# so the boxes were centred on the wrong configuration.
centroid_mean_30 = np.mean(kmeans_centroids_30, axis=0)

# Rows of the result table
data_v2 = []

# Loop through each trial and each cluster to evaluate the condition
for trial in range(35):
    for cluster in range(10):
        # Centroid coordinates for the current trial and cluster
        centroid_coord = kmeans_centroids_30[trial, cluster]

        # Bounds of the 2-standard-deviation range in x and y
        mean_x, mean_y = centroid_mean_30[cluster]
        lower_bound_x, upper_bound_x = mean_x - 2 * std_dev_x[cluster], mean_x + 2 * std_dev_x[cluster]
        lower_bound_y, upper_bound_y = mean_y - 2 * std_dev_y[cluster], mean_y + 2 * std_dev_y[cluster]

        # The centroid must be inside the range on both axes
        inside_2_std = (lower_bound_x <= centroid_coord[0] <= upper_bound_x) and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Create a DataFrame from the collected rows
df_results_v2 = pd.DataFrame(data_v2, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [None]:

# Group the DataFrame by Trial and check if all clusters in each trial are True for 'Inside 2 std'
trials_all_true = df_results_v2.groupby('Trial')['Inside 2 std'].all()

In [None]:
# Filter the trials where all clusters were True
trials_with_all_true = trials_all_true[trials_all_true].index.tolist()

In [None]:
# Output the list of trials
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Filter the trials where not all clusters were True
trials_with_some_false = trials_all_true[~trials_all_true].index.tolist()

# Output the list of trials where some clusters were False
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The file name is a plain string on purpose: inside an f-string, {30_01} is
# the integer literal 3001 (underscores are digit separators), which wrote the
# table to 'result_table_neighbors_v2_3001_35.csv' instead of the '_30_01_'
# name used by every other artefact of this experiment.
df_results_v2.to_csv('result_table_neighbors_v2_30_01_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# For every valid run: the mean UMAP position of the points carrying each label
# 0-9 in y_train (one centroid array per run)
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_30_01_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids of each run, stacked along axis 0
distance_matrices_30_01_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Average the distance matrices over the runs
mean_distance_matrix_30_01_35 = distance_matrices_30_01_35.mean(axis=0)

# Min-max scale the mean matrix to [0, 1]
smallest_distance = np.min(mean_distance_matrix_30_01_35)
largest_distance = np.max(mean_distance_matrix_30_01_35)
normalized_mean_distance_matrix_30_01_35 = (mean_distance_matrix_30_01_35 - smallest_distance) / (largest_distance - smallest_distance)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_30_01_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10,  n_neighbors=30, min_dist=0.1)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_30_01_35.npy', distance_matrices_30_01_35)
np.save('mean_distance_matrix_neighbors_30_01_35.npy', mean_distance_matrix_30_01_35)

# Show the (unnormalized) mean matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_30_01_35}")

In [None]:
# Build a weighted graph from the normalized mean distance matrix
# (weights rounded to three decimals so the edge labels stay readable)
G_30_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_30_01_35,3))

# Persist the graph as its weighted adjacency matrix. np.save() converts its
# argument to an ndarray, which does not keep a networkx Graph's edges and
# weights, so passing the Graph object itself does not store the graph.
# Reload with: nx.from_numpy_array(np.load('G_30_01_35.npy'))
np.save('G_30_01_35.npy', nx.to_numpy_array(G_30_01_35))

# Draw the graph with a seeded (reproducible) spring layout
pos = nx.spring_layout(G_30_01_35, seed=42)  # positions for all nodes
nx.draw(G_30_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_30_01_35, 'weight')
nx.draw_networkx_edge_labels(G_30_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - MST

In [None]:
# Compute the minimum spanning tree of the distance graph
mst_30_01_35 = nx.minimum_spanning_tree(G_30_01_35)

# Persist the tree as its weighted adjacency matrix. np.save() converts its
# argument to an ndarray, which does not keep a networkx Graph's edges and
# weights, so passing the Graph object itself does not store the tree.
# Reload with: nx.from_numpy_array(np.load('mst_30_01_35.npy'))
np.save('mst_30_01_35.npy', nx.to_numpy_array(mst_30_01_35))

# Seeded (reproducible) positions for all nodes
pos = nx.spring_layout(mst_30_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_30_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_30_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_30_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=30, min_dist=0.1")
plt.show()

In [None]:
# Element-wise standard deviation of the centroid distances over the run axis (axis 0)
distance_matrix_std_30_01_35 = distance_matrices_30_01_35.std(axis=0)

# Keep it on disk for the confidence-interval step
np.save("distance_matrix_std_30_01_35.npy", distance_matrix_std_30_01_35)

# Show the result
print("Standard Deviation Distance Matrix (30_01_35):\n", distance_matrix_std_30_01_35)

In [None]:
# 95% normal-approximation confidence interval for every entry of the mean
# centroid-distance matrix.
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Sample size = number of runs that actually contributed a distance matrix.
# The matrices are built from `valid_runs`, so this can be smaller than the 35
# UMAP runs; a hard-coded 35 would then understate the standard error.
n_runs = distance_matrices_30_01_35.shape[0]

# Step 1: Standard Error of the Mean (SEM)
sem_matrix_30_01_35 = distance_matrix_std_30_01_35 / np.sqrt(n_runs)

# Step 2: Margin of error
margin_of_error_matrix_30_01_35 = z_score * sem_matrix_30_01_35

# Step 3: Lower and upper confidence-interval matrices
lower_limit_intconf_matrix_30_01_35 = mean_distance_matrix_30_01_35 - margin_of_error_matrix_30_01_35
upper_limit_intconf_matrix_30_01_35 = mean_distance_matrix_30_01_35 + margin_of_error_matrix_30_01_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_30_01_35 = np.maximum(lower_limit_intconf_matrix_30_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_30_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_30_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_30_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_30_01_35.npy', lower_limit_intconf_matrix_30_01_35)
np.save('upper_limit_intconf_matrix_30_01_35.npy', upper_limit_intconf_matrix_30_01_35)

In [47]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` to the range [0, 1].

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale. The minimum and maximum are taken over
        all entries, not per row or column.

    Returns
    -------
    numpy.ndarray
        Array of the same shape in which the smallest entry maps to 0 and
        the largest to 1. When every entry is identical, an all-zero array
        is returned instead of the NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: there is no spread to rescale.
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Rescale each confidence-interval bound matrix and persist it right away.
norm_lower_limit_intconf_matrix_30_01_35 = normalize_matrix(
    lower_limit_intconf_matrix_30_01_35
)
np.save(
    'norm_lower_limit_intconf_matrix_30_01_35.npy',
    norm_lower_limit_intconf_matrix_30_01_35,
)

norm_upper_limit_intconf_matrix_30_01_35 = normalize_matrix(
    upper_limit_intconf_matrix_30_01_35
)
np.save(
    'norm_upper_limit_intconf_matrix_30_01_35.npy',
    norm_upper_limit_intconf_matrix_30_01_35,
)

In [None]:
# One row of three heatmaps: lower bound, mean and upper bound (left to right)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel
heatmap_panels = [
    (
        norm_lower_limit_intconf_matrix_30_01_35,
        "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=30, min_dist=0.1)",
        "",
    ),
    (
        normalized_mean_distance_matrix_30_01_35,
        "Normalized Mean Dist. Matrix (k=10, n_neighbors=30, min_dist=0.1)",
        "Cluster",
    ),
    (
        norm_upper_limit_intconf_matrix_30_01_35,
        "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=30, min_dist=0.1)",
        "",
    ),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    Parameters
    ----------
    matrix : array-like
        Distance matrix handed to ``nx.from_numpy_array`` after rounding to
        three decimals; its entries become the edge weights.
    title : str
        Title set on ``ax``.
    ax : matplotlib axes
        Axes that receive the drawing.
    color : str, optional
        Edge colour of the tree (default ``'red'``).
    """
    # Weighted graph from the rounded matrix, reduced to its spanning tree
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Seeded layout so the node positions are reproducible
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        ax=ax,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
    )

    # Write each edge's weight (distance) next to it
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)


# Three panels, left to right: lower limit, mean, upper limit
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

plot_mst(norm_lower_limit_intconf_matrix_30_01_35, "UMAP MST - Lower Limit - n_neighbors=30, min_dist=0.1", axes[0], color='blue')
plot_mst(normalized_mean_distance_matrix_30_01_35, "UMAP MST - Mean Distances - n_neighbors=30, min_dist=0.1", axes[1], color='red')
plot_mst(norm_upper_limit_intconf_matrix_30_01_35, "UMAP MST - Upper Limit - n_neighbors=30, min_dist=0.1", axes[2], color='green')

# Tighten the spacing between panels before showing
plt.tight_layout()
plt.show()

--------

### UMAP n_neighbors=50, min_dist=0.1

In [2]:
# Reload the arrays saved by the cells of this section (suffix _50_01_35)

# UMAP embeddings and their per-point statistics
umap_projections_50_01_35 = np.load('umap_projections_50_01_35.npy')
mean_projection_50_01_35 = np.load('mean_projection_50_01_35.npy')
std_projection_50_01_35 = np.load('std_projection_50_01_35.npy')

# Centroid distance matrices (per run and averaged)
distance_matrices_50_01_35 = np.load('distance_matrices_neighbors_50_01_35.npy')
mean_distance_matrix_50_01_35 = np.load('mean_distance_matrix_neighbors_50_01_35.npy')

# Confidence-interval bounds, raw and normalized
lower_limit_intconf_matrix_50_01_35 = np.load('lower_limit_intconf_matrix_50_01_35.npy')
upper_limit_intconf_matrix_50_01_35 = np.load('upper_limit_intconf_matrix_50_01_35.npy')
norm_lower_limit_intconf_matrix_50_01_35 = np.load('norm_lower_limit_intconf_matrix_50_01_35.npy')
norm_upper_limit_intconf_matrix_50_01_35 = np.load('norm_upper_limit_intconf_matrix_50_01_35.npy')

In [None]:
# UMAP settings for this experiment
n_neighbors, min_dist, n_components = 50, 0.1, 2
n_runs = 35  # how many times the embedding is recomputed

# Fit one embedding per run; random_state=None leaves UMAP unseeded
umap_projections_50_01_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_50_01_35.append(projection)

# Turn the per-run list into a single array (runs along axis 0)
umap_projections_50_01_35 = np.array(umap_projections_50_01_35)

# Mean and standard deviation over the run axis
mean_projection_50_01_35 = umap_projections_50_01_35.mean(axis=0)
std_projection_50_01_35 = umap_projections_50_01_35.std(axis=0)

# Persist all three arrays
for file_name, payload in (
    ('umap_projections_50_01_35.npy', umap_projections_50_01_35),
    ('mean_projection_50_01_35.npy', mean_projection_50_01_35),
    ('std_projection_50_01_35.npy', std_projection_50_01_35),
):
    np.save(file_name, payload)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_50_01_35'.")

#### Clustering

In [None]:
# KMeans with 10 clusters, fitted once per stored UMAP run
n_clusters = 10
n_runs = umap_projections_50_01_35.shape[0]

# Centroids indexed as [run, cluster, embedding dimension]
kmeans_centroids_50_01 = np.zeros((n_runs, n_clusters, umap_projections_50_01_35.shape[2]))

for run, umap_projection in enumerate(umap_projections_50_01_35):
    # Same seed for every run so only the embedding changes between fits
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_50_01[run] = kmeans.cluster_centers_

In [None]:
# Spread of each cluster index's centroid over all runs, per coordinate.
# Column 0 of the last axis is x, column 1 is y.
cluster_ids = range(10)
std_dev_x_50_01 = np.array(
    [np.std(kmeans_centroids_50_01[:, i, 0]) for i in cluster_ids], dtype=float
)
std_dev_y_50_01 = np.array(
    [np.std(kmeans_centroids_50_01[:, i, 1]) for i in cluster_ids], dtype=float
)

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x_50_01)
print("Standard deviation of y coordinates per cluster:", std_dev_y_50_01)

#### Centroid stability

Standard deviation calculation

In [None]:

# Per-cluster standard deviation of the centroid coordinates over all runs.
# NOTE(review): kmeans_centroids_50 is not assigned in this section (the
# clustering cell stores kmeans_centroids_50_01) - confirm it is the intended array.
cluster_ids = range(10)
std_dev_x = np.array(
    [np.std(kmeans_centroids_50[:, i, 0]) for i in cluster_ids], dtype=float
)
std_dev_y = np.array(
    [np.std(kmeans_centroids_50[:, i, 1]) for i in cluster_ids], dtype=float
)

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# One row per (trial, cluster): is the centroid inside the box spanning
# 2 standard deviations either side of the cluster's mean centroid?
data_v2 = []

for trial in range(35):
    for cluster in range(10):
        # Centroid of this cluster in this trial
        centroid_coord = kmeans_centroids_50[trial, cluster]

        # Box centre and half-widths (2 standard deviations per axis)
        center_x, center_y = centroid_mean_50_35[cluster]
        half_width_x = 2 * std_dev_x[cluster]
        half_width_y = 2 * std_dev_y[cluster]

        # Inside means inside on both axes
        x_inside = (center_x - half_width_x) <= centroid_coord[0] <= (center_x + half_width_x)
        y_inside = (center_y - half_width_y) <= centroid_coord[1] <= (center_y + half_width_y)
        inside_2_std = x_inside and y_inside

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Tabulate the collected rows
df_results_v2 = pd.DataFrame(
    data_v2,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)

In [None]:

# Group the DataFrame by Trial and check if all clusters in each trial are True for 'Inside 2 std'
trials_all_true = df_results_v2.groupby('Trial')['Inside 2 std'].all()

In [None]:
# Filter the trials where all clusters were True
trials_with_all_true = trials_all_true[trials_all_true].index.tolist()

In [None]:
# Output the list of trials
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Filter the trials where not all clusters were True
trials_with_some_false = trials_all_true[~trials_all_true].index.tolist()

# Output the list of trials where some clusters were False
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The file name is a plain string on purpose: inside an f-string, {50_01} is
# the integer literal 5001 (underscores are digit separators), which wrote the
# table to 'result_table_neighbors_v2_5001_35.csv' instead of the '_50_01_'
# name used by every other artefact of this experiment.
df_results_v2.to_csv('result_table_neighbors_v2_50_01_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# For every valid run: the mean UMAP position of the points carrying each label
# 0-9 in y_train (one centroid array per run)
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_50_01_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids of each run, stacked along axis 0
distance_matrices_50_01_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Average the distance matrices over the runs
mean_distance_matrix_50_01_35 = distance_matrices_50_01_35.mean(axis=0)

# Min-max scale the mean matrix to [0, 1]
smallest_distance = np.min(mean_distance_matrix_50_01_35)
largest_distance = np.max(mean_distance_matrix_50_01_35)
normalized_mean_distance_matrix_50_01_35 = (mean_distance_matrix_50_01_35 - smallest_distance) / (largest_distance - smallest_distance)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_50_01_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=50, min_dist=0.1)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_50_01_35.npy', distance_matrices_50_01_35)
np.save('mean_distance_matrix_neighbors_50_01_35.npy', mean_distance_matrix_50_01_35)

# Show the (unnormalized) mean matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_50_01_35}")

#### Minimum Spanning Tree - MST

In [None]:
# Build a weighted graph from the normalized mean distance matrix
# (weights rounded to three decimals so the edge labels stay readable)
G_50_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_50_01_35,3))

# Persist the graph as its weighted adjacency matrix. np.save() converts its
# argument to an ndarray, which does not keep a networkx Graph's edges and
# weights, so passing the Graph object itself does not store the graph.
# Reload with: nx.from_numpy_array(np.load('G_50_01_35.npy'))
np.save('G_50_01_35.npy', nx.to_numpy_array(G_50_01_35))

# Draw the graph with a seeded (reproducible) spring layout
pos = nx.spring_layout(G_50_01_35, seed=42)  # positions for all nodes
nx.draw(G_50_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_50_01_35, 'weight')
nx.draw_networkx_edge_labels(G_50_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [3]:
# Min-max scale the mean distance matrix to [0, 1]
smallest_distance = np.min(mean_distance_matrix_50_01_35)
largest_distance = np.max(mean_distance_matrix_50_01_35)
normalized_mean_distance_matrix_50_01_35 = (mean_distance_matrix_50_01_35 - smallest_distance) / (largest_distance - smallest_distance)

In [None]:
# Compute the minimum spanning tree first: the total-weight summary below
# reads it, and previously ran before the tree had been assigned.
mst_50_01_35 = nx.minimum_spanning_tree(G_50_01_35)

# Persist the tree as its weighted adjacency matrix. np.save() converts its
# argument to an ndarray, which does not keep a networkx Graph's edges and
# weights, so passing the Graph object itself does not store the tree.
# Reload with: nx.from_numpy_array(np.load('mst_50_01_35.npy'))
np.save('mst_50_01_35.npy', nx.to_numpy_array(mst_50_01_35))

# Total weight of the MST = sum of its edge weights
total_weight_50 = sum(nx.get_edge_attributes(mst_50_01_35, 'weight').values())

# Print the total weight
print(f"Total weight of the MST: {total_weight_50}")

# Seeded (reproducible) positions for all nodes
pos = nx.spring_layout(mst_50_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_50_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_50_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_50_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=50, min_dist=0.1")
plt.show()

In [None]:
# Element-wise standard deviation of the centroid distances over the run axis (axis 0)
distance_matrix_std_50_01_35 = distance_matrices_50_01_35.std(axis=0)

# Keep it on disk for the confidence-interval step
np.save("distance_matrix_std_50_01_35.npy", distance_matrix_std_50_01_35)

# Show the result
print("Standard Deviation Distance Matrix (50_01_35):\n", distance_matrix_std_50_01_35)


In [None]:
# 95% normal-approximation confidence interval for every entry of the mean
# centroid-distance matrix.
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Sample size = number of runs that actually contributed a distance matrix.
# The matrices are built from `valid_runs`, so this can be smaller than the 35
# UMAP runs; a hard-coded 35 would then understate the standard error.
n_runs = distance_matrices_50_01_35.shape[0]

# Step 1: Standard Error of the Mean (SEM)
sem_matrix_50_01_35 = distance_matrix_std_50_01_35 / np.sqrt(n_runs)

# Step 2: Margin of error
margin_of_error_matrix_50_01_35 = z_score * sem_matrix_50_01_35

# Step 3: Lower and upper confidence-interval matrices
lower_limit_intconf_matrix_50_01_35 = mean_distance_matrix_50_01_35 - margin_of_error_matrix_50_01_35
upper_limit_intconf_matrix_50_01_35 = mean_distance_matrix_50_01_35 + margin_of_error_matrix_50_01_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_50_01_35 = np.maximum(lower_limit_intconf_matrix_50_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_50_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_50_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_50_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_50_01_35.npy', lower_limit_intconf_matrix_50_01_35)
np.save('upper_limit_intconf_matrix_50_01_35.npy', upper_limit_intconf_matrix_50_01_35)

In [57]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` to the range [0, 1].

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale. The minimum and maximum are taken over
        all entries, not per row or column.

    Returns
    -------
    numpy.ndarray
        Array of the same shape in which the smallest entry maps to 0 and
        the largest to 1. When every entry is identical, an all-zero array
        is returned instead of the NaNs a 0/0 division would produce.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: there is no spread to rescale.
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Rescale each confidence-interval bound matrix and persist it right away.
norm_lower_limit_intconf_matrix_50_01_35 = normalize_matrix(
    lower_limit_intconf_matrix_50_01_35
)
np.save(
    'norm_lower_limit_intconf_matrix_50_01_35.npy',
    norm_lower_limit_intconf_matrix_50_01_35,
)

norm_upper_limit_intconf_matrix_50_01_35 = normalize_matrix(
    upper_limit_intconf_matrix_50_01_35
)
np.save(
    'norm_upper_limit_intconf_matrix_50_01_35.npy',
    norm_upper_limit_intconf_matrix_50_01_35,
)

In [None]:
# One row of three heatmaps: lower bound, mean and upper bound (left to right)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel
heatmap_panels = [
    (
        norm_lower_limit_intconf_matrix_50_01_35,
        "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=50, min_dist=0.1)",
        "",
    ),
    (
        normalized_mean_distance_matrix_50_01_35,
        "Normalized Mean Dist. Matrix (k=10, n_neighbors=50, min_dist=0.1)",
        "Cluster",
    ),
    (
        norm_upper_limit_intconf_matrix_50_01_35,
        "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=50, min_dist=0.1)",
        "",
    ),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    matrix: distance matrix; values are rounded to 3 decimals and used as
            edge weights.
    title:  text set as the axes title.
    ax:     matplotlib axes that receives the drawing.
    color:  colour used for the tree edges.
    """
    rounded_weights = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_weights))

    # Fixed seed keeps the node layout reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    node_and_edge_style = dict(
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )
    nx.draw(tree, layout, **node_and_edge_style)

    # Annotate every tree edge with its weight
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# One row of three axes: lower limit (left), mean (centre), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, axes index, edge colour), drawn in this order
mst_panels = (
    (normalized_mean_distance_matrix_50_01_35,
     "UMAP MST - Mean Distances - n_neighbors=50, min_dist=0.1", 1, 'red'),
    (norm_lower_limit_intconf_matrix_50_01_35,
     "UMAP MST - Lower Limit - n_neighbors=50, min_dist=0.1", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_50_01_35,
     "UMAP MST - Upper Limit - n_neighbors=50, min_dist=0.1", 2, 'green'),
)
for panel_matrix, panel_title, panel_index, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

# Tighten the spacing between panels before rendering
plt.tight_layout()
plt.show()

------------

### UMAP n_neighbours=100, min_dist=0.1

In [41]:
# Reload the arrays previously saved for n_neighbors=100, min_dist=0.1
# so the expensive cells below do not have to be re-run.

# UMAP embeddings and their element-wise mean / standard deviation
umap_projections_100_01_35 = np.load('umap_projections_100_01_35.npy')
mean_projection_100_01_35 = np.load('mean_projection_100_01_35.npy')
std_projection_100_01_35 = np.load('std_projection_100_01_35.npy')

# Pairwise distance matrices and their mean
distance_matrices_100_01_35 = np.load('distance_matrices_neighbors_100_01_35.npy')
mean_distance_matrix_100_01_35 = np.load('mean_distance_matrix_neighbors_100_01_35.npy')

# Confidence-interval bounds, raw and normalised
lower_limit_intconf_matrix_100_01_35 = np.load('lower_limit_intconf_matrix_100_01_35.npy')
upper_limit_intconf_matrix_100_01_35 = np.load('upper_limit_intconf_matrix_100_01_35.npy')
norm_lower_limit_intconf_matrix_100_01_35 = np.load('norm_lower_limit_intconf_matrix_100_01_35.npy')
norm_upper_limit_intconf_matrix_100_01_35 = np.load('norm_upper_limit_intconf_matrix_100_01_35.npy')

In [None]:
# UMAP configuration for this experiment
n_neighbors = 100
min_dist = 0.1
n_components = 2
n_runs = 35  # how many independent embeddings to compute

# Fit a fresh UMAP model per run; random_state=None leaves each run unseeded.
# The result stacks the embeddings along a leading run axis.
umap_projections_100_01_35 = np.array([
    umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    ).fit_transform(x_train_flattened)
    for _ in range(n_runs)
])

# Element-wise mean and standard deviation over the run axis
mean_projection_100_01_35 = np.mean(umap_projections_100_01_35, axis=0)
std_projection_100_01_35 = np.std(umap_projections_100_01_35, axis=0)

# Persist all three arrays
np.save('umap_projections_100_01_35.npy', umap_projections_100_01_35)
np.save('mean_projection_100_01_35.npy', mean_projection_100_01_35)
np.save('std_projection_100_01_35.npy', std_projection_100_01_35)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_100_01_35'.")

#### Clustering

In [None]:
# Cluster every UMAP run with KMeans and keep the fitted cluster centres
n_clusters = 10

# Run count and embedding dimensionality are read off the stored projections
n_runs = umap_projections_100_01_35.shape[0]
embedding_dim = umap_projections_100_01_35.shape[2]

# kmeans_centroids_100_01[r, c] is the centre of cluster c in run r
kmeans_centroids_100_01 = np.zeros((n_runs, n_clusters, embedding_dim))

for run_index, run_projection in enumerate(umap_projections_100_01_35):
    fitted_model = KMeans(n_clusters=n_clusters, random_state=42).fit(run_projection)
    kmeans_centroids_100_01[run_index] = fitted_model.cluster_centers_

In [None]:
# Spread of each cluster centre across runs, computed separately for the
# x coordinate (component 0) and the y coordinate (component 1)
std_dev_x_100_01 = np.array(
    [np.std(kmeans_centroids_100_01[:, cluster_id, 0]) for cluster_id in range(10)],
    dtype=float,
)
std_dev_y_100_01 = np.array(
    [np.std(kmeans_centroids_100_01[:, cluster_id, 1]) for cluster_id in range(10)],
    dtype=float,
)

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x_100_01)
print("Standard deviation of y coordinates per cluster:", std_dev_y_100_01)

#### Centroid stability

Standard deviation calculation

In [None]:

# Standard deviation, across runs, of every KMeans centre for this
# configuration (n_neighbors=100, min_dist=0.1).
# The centroids are read from `kmeans_centroids_100_01`, the array filled in
# the Clustering step of this section; the previous name `kmeans_centroids_100`
# is not created anywhere in this section.
# NOTE(review): each run is clustered by a separate KMeans fit, so index i is
# not guaranteed to denote the same cluster in every run — confirm centroids
# are matched across runs before interpreting these spreads.

# Reduce over the run axis (axis 0); one value per cluster, whatever their number
std_dev_x = np.std(kmeans_centroids_100_01[:, :, 0], axis=0)
std_dev_y = np.std(kmeans_centroids_100_01[:, :, 1], axis=0)

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Build one row per (trial, cluster) stating whether that KMeans centroid lies
# within two standard deviations of the cluster's mean centroid on both axes.
#
# Every statistic is derived from `kmeans_centroids_100_01`, the centroids of
# this section, so the table describes the n_neighbors=100, min_dist=0.1 runs.
# NOTE(review): the previous version read the centroids from
# `kmeans_centroids_100` and the means from `centroid_mean_100_35`, neither of
# which is defined in this section. The across-run mean of the same centroids
# is used here — confirm this matches the intended definition.
n_trials_100_01, n_clusters_100_01 = kmeans_centroids_100_01.shape[:2]
centroid_mean_over_runs = np.mean(kmeans_centroids_100_01, axis=0)
centroid_std_over_runs = np.std(kmeans_centroids_100_01, axis=0)

data_v2 = []
for trial in range(n_trials_100_01):
    for cluster in range(n_clusters_100_01):
        # Centroid of this cluster in this trial
        centroid_coord = kmeans_centroids_100_01[trial, cluster]

        # Mean +/- 2 std interval on each axis
        mean_x, mean_y = centroid_mean_over_runs[cluster]
        spread_x, spread_y = centroid_std_over_runs[cluster]
        lower_bound_x, upper_bound_x = mean_x - 2 * spread_x, mean_x + 2 * spread_x
        lower_bound_y, upper_bound_y = mean_y - 2 * spread_y, mean_y + 2 * spread_y

        # Inside only when both coordinates fall in their interval
        inside_2_std = (lower_bound_x <= centroid_coord[0] <= upper_bound_x) and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Create a DataFrame from the list of data
df_results_v2 = pd.DataFrame(data_v2, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [None]:

# Per trial: True only if every cluster row of that trial has 'Inside 2 std' set
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Keep the trial numbers whose per-trial flag is True
fully_inside_trials = trials_all_true.loc[trials_all_true]
trials_with_all_true = fully_inside_trials.index.tolist()

In [None]:
# Show which trials had every centroid inside its 2-std range
all_true_label = "Trials where all clusters were True:"
print(all_true_label, trials_with_all_true)

In [None]:
# Trials in which at least one cluster fell outside its 2-std range
some_outside_mask = ~trials_all_true
trials_with_some_false = trials_all_true.loc[some_outside_mask].index.tolist()

# Report them
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The name is written as a plain string on purpose: inside an f-string,
# `{100_01}` is evaluated as the integer literal 10001 (underscores are digit
# separators), which named the file "result_table_neighbors_v2_10001_35.csv".
df_results_v2.to_csv('result_table_neighbors_v2_100_01_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Centroid of every label group (labels 0-9 in y_train) for each valid run
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_100_01_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# One Euclidean centroid-to-centroid distance matrix per run, stacked along axis 0
distance_matrices_100_01_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Average the matrices over the run axis
mean_distance_matrix_100_01_35 = np.mean(distance_matrices_100_01_35, axis=0)

# Min-max scale the mean matrix
smallest_mean_distance = np.min(mean_distance_matrix_100_01_35)
largest_mean_distance = np.max(mean_distance_matrix_100_01_35)
normalized_mean_distance_matrix_100_01_35 = (
    (mean_distance_matrix_100_01_35 - smallest_mean_distance)
    / (largest_mean_distance - smallest_mean_distance)
)

# Heatmap of the scaled mean matrix
plt.figure(figsize=(10, 8))
sns.heatmap(
    normalized_mean_distance_matrix_100_01_35,
    annot=True,
    cmap="viridis",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=100, min_dist=0.1)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_100_01_35.npy', distance_matrices_100_01_35)
np.save('mean_distance_matrix_neighbors_100_01_35.npy', mean_distance_matrix_100_01_35)

print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_100_01_35}")

#### Minimum Spanning Tree - MST

In [None]:
# Build a weighted graph whose edge weights are the normalized mean distances
# (rounded to 3 decimals)
G_100_01_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_100_01_35, 3))

# np.save is meant for array data; passing the Graph object itself does not
# store the weighted adjacency as a numeric array. Save the adjacency matrix
# instead — it can be turned back into a graph with nx.from_numpy_array.
np.save('G_100_01_35.npy', nx.to_numpy_array(G_100_01_35))

# Draw the graph (seeded layout so the picture is reproducible)
pos = nx.spring_layout(G_100_01_35, seed=42)  # positions for all nodes
nx.draw(G_100_01_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_100_01_35, 'weight')
nx.draw_networkx_edge_labels(G_100_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph.
# This has to happen before anything reads `mst_100_01_35`; previously the
# total weight was computed first, from a tree that did not exist yet.
mst_100_01_35 = nx.minimum_spanning_tree(G_100_01_35)

# Store the tree as its weighted adjacency matrix (np.save is meant for array
# data, not Graph objects); nx.from_numpy_array rebuilds the tree from it.
np.save('mst_100_01_35.npy', nx.to_numpy_array(mst_100_01_35))

# Calculate the total weight of the MST
total_weight_100 = sum(nx.get_edge_attributes(mst_100_01_35, 'weight').values())

# Print the total weight
print(f"Total weight of the MST: {total_weight_100}")

# Define positions for all nodes
pos = nx.spring_layout(mst_100_01_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_100_01_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_100_01_35, 'weight')
nx.draw_networkx_edge_labels(mst_100_01_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=100, min_dist=0.1")
plt.show()

In [None]:
# Element-wise standard deviation of the pairwise distances over the run axis.
# NOTE(review): the default ddof=0 (population std) is used — confirm that is
# the intended estimator for the confidence interval computed afterwards.
distance_matrix_std_100_01_35 = distance_matrices_100_01_35.std(axis=0)

# Keep a copy on disk for later use
np.save("distance_matrix_std_100_01_35.npy", distance_matrix_std_100_01_35)

# Show the matrix
print("Standard Deviation Distance Matrix (100_01_35):\n", distance_matrix_std_100_01_35)

In [None]:
# 95% confidence interval for every entry of the mean distance matrix,
# using the normal approximation: mean +/- z * std / sqrt(n).
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Critical value for the normal distribution

# Sample size = number of distance matrices that were actually averaged.
# The matrices are built only for the valid runs, so this is read from the
# data instead of being fixed at 35.
n_runs = distance_matrices_100_01_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_100_01_35 = distance_matrix_std_100_01_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_100_01_35 = z_score * sem_matrix_100_01_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_100_01_35 = mean_distance_matrix_100_01_35 - margin_of_error_matrix_100_01_35
upper_limit_intconf_matrix_100_01_35 = mean_distance_matrix_100_01_35 + margin_of_error_matrix_100_01_35

# Distances cannot be negative, so clip the lower limit at 0
lower_limit_intconf_matrix_100_01_35 = np.maximum(lower_limit_intconf_matrix_100_01_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_100_01_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_100_01_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_100_01_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_100_01_35.npy', lower_limit_intconf_matrix_100_01_35)
np.save('upper_limit_intconf_matrix_100_01_35.npy', upper_limit_intconf_matrix_100_01_35)

In [67]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so that its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array_like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``, same shape as the input. When all
        entries are equal the range is zero, so an all-zero array is returned
        instead of dividing by zero.
    """
    values = np.asarray(matrix)
    lowest = np.min(values)
    span = np.max(values) - lowest
    if span == 0:
        # Constant input: 0/0 would produce NaNs and a RuntimeWarning.
        return np.zeros_like(values, dtype=float)
    return (values - lowest) / span

# Lower confidence bound: rescale to [0, 1] and store on disk
norm_lower_limit_intconf_matrix_100_01_35 = normalize_matrix(
    lower_limit_intconf_matrix_100_01_35
)
# Upper confidence bound: same treatment
norm_upper_limit_intconf_matrix_100_01_35 = normalize_matrix(
    upper_limit_intconf_matrix_100_01_35
)

np.save(
    'norm_lower_limit_intconf_matrix_100_01_35.npy',
    norm_lower_limit_intconf_matrix_100_01_35,
)
np.save(
    'norm_upper_limit_intconf_matrix_100_01_35.npy',
    norm_upper_limit_intconf_matrix_100_01_35,
)

In [None]:
# Side-by-side heatmaps: lower bound (left), mean (centre), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, in left-to-right order
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_100_01_35,
     "Normalized Lower bound Dist. Matrix (k=10, n_neighbors=100, min_dist=0.1)",
     ""),
    (normalized_mean_distance_matrix_100_01_35,
     "Normalized Mean Dist. Matrix (k=10, n_neighbors=100, min_dist=0.1)",
     "Cluster"),
    (norm_upper_limit_intconf_matrix_100_01_35,
     "Normalized Upper bound Dist. Matrix (k=10, n_neighbors=100, min_dist=0.1)",
     ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    matrix: distance matrix; values are rounded to 3 decimals and used as
            edge weights.
    title:  text set as the axes title.
    ax:     matplotlib axes that receives the drawing.
    color:  colour used for the tree edges.
    """
    rounded_weights = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_weights))

    # Fixed seed keeps the node layout reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    node_and_edge_style = dict(
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )
    nx.draw(tree, layout, **node_and_edge_style)

    # Annotate every tree edge with its weight
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# One row of three axes: lower limit (left), mean (centre), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, axes index, edge colour), drawn in this order
mst_panels = (
    (normalized_mean_distance_matrix_100_01_35,
     "UMAP MST - Mean Distances - n_neighbors=100, min_dist=0.1", 1, 'red'),
    (norm_lower_limit_intconf_matrix_100_01_35,
     "UMAP MST - Lower Limit - n_neighbors=100, min_dist=0.1", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_100_01_35,
     "UMAP MST - Upper Limit - n_neighbors=100, min_dist=0.1", 2, 'green'),
)
for panel_matrix, panel_title, panel_index, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

# Tighten the spacing between panels before rendering
plt.tight_layout()
plt.show()

--------

--------

### UMAP n_neighbours=5, min_dist=0.0125

In [14]:
# Reload the saved UMAP embeddings for n_neighbors=5, min_dist=0.0125,
# together with their element-wise mean and standard deviation
umap_projections_5_00125_35 = np.load('umap_projections_5_00125_35.npy')
mean_umap_projection_5_00125_35 = np.load('mean_projection_5_00125_35.npy')
std_projection_umap_5_00125_35 = np.load('std_projection_5_00125_35.npy')

In [None]:
# UMAP configuration for this experiment
n_neighbors = 5
min_dist = 0.0125
n_components = 2
n_runs = 35  # how many independent embeddings to compute

# Fit a fresh UMAP model per run; random_state=None leaves each run unseeded.
# The result stacks the embeddings along a leading run axis.
umap_projections_5_00125_35 = np.array([
    umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    ).fit_transform(x_train_flattened)
    for _ in range(n_runs)
])

# Element-wise mean and standard deviation over the run axis
mean_projection_5_00125_35 = np.mean(umap_projections_5_00125_35, axis=0)
std_projection_5_00125_35 = np.std(umap_projections_5_00125_35, axis=0)

# Persist all three arrays
np.save('umap_projections_5_00125_35.npy', umap_projections_5_00125_35)
np.save('mean_projection_5_00125_35.npy', mean_projection_5_00125_35)
np.save('std_projection_5_00125_35.npy', std_projection_5_00125_35)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_5_00125_35'.")


#### Clustering

In [None]:
# Cluster every UMAP run with KMeans and keep the fitted cluster centres
n_clusters = 10

# Run count and embedding dimensionality are read off the stored projections
n_runs = umap_projections_5_00125_35.shape[0]
embedding_dim = umap_projections_5_00125_35.shape[2]

# kmeans_centroids_5_00125[r, c] is the centre of cluster c in run r
kmeans_centroids_5_00125 = np.zeros((n_runs, n_clusters, embedding_dim))

for run_index, run_projection in enumerate(umap_projections_5_00125_35):
    fitted_model = KMeans(n_clusters=n_clusters, random_state=42).fit(run_projection)
    kmeans_centroids_5_00125[run_index] = fitted_model.cluster_centers_

In [None]:
# Spread of each cluster centre across runs, computed separately for the
# x coordinate (component 0) and the y coordinate (component 1)
std_dev_x_5_00125 = np.array(
    [np.std(kmeans_centroids_5_00125[:, cluster_id, 0]) for cluster_id in range(10)],
    dtype=float,
)
std_dev_y_5_00125 = np.array(
    [np.std(kmeans_centroids_5_00125[:, cluster_id, 1]) for cluster_id in range(10)],
    dtype=float,
)

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x_5_00125)
print("Standard deviation of y coordinates per cluster:", std_dev_y_5_00125)

#### Centroid stability

Standard deviation calculation

In [None]:

# Per-cluster standard deviation of the KMeans centres across runs, stored
# under the generic names std_dev_x / std_dev_y
std_dev_x = np.array(
    [np.std(kmeans_centroids_5_00125[:, cluster_id, 0]) for cluster_id in range(10)],
    dtype=float,
)
std_dev_y = np.array(
    [np.std(kmeans_centroids_5_00125[:, cluster_id, 1]) for cluster_id in range(10)],
    dtype=float,
)

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# One row per (trial, cluster): is the KMeans centroid within two standard
# deviations of the cluster's mean centroid on both axes?
data_v2 = []

for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_5_00125[trial, cluster]

        # Centre and half-width of the accepted interval on each axis
        center_x, center_y = centroid_mean_5_00125_35[cluster]
        half_width_x = 2 * std_dev_x[cluster]
        half_width_y = 2 * std_dev_y[cluster]

        x_within = center_x - half_width_x <= centroid_coord[0] <= center_x + half_width_x
        y_within = center_y - half_width_y <= centroid_coord[1] <= center_y + half_width_y
        inside_2_std = x_within and y_within

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Tabulate the rows
result_columns = ['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std']
df_results_v2 = pd.DataFrame(data_v2, columns=result_columns)

In [None]:

# Per trial: True only if every cluster row of that trial has 'Inside 2 std' set
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Keep the trial numbers whose per-trial flag is True
fully_inside_trials = trials_all_true.loc[trials_all_true]
trials_with_all_true = fully_inside_trials.index.tolist()

In [None]:
# Show which trials had every centroid inside its 2-std range
all_true_label = "Trials where all clusters were True:"
print(all_true_label, trials_with_all_true)

In [None]:
# Trials in which at least one cluster fell outside its 2-std range
some_outside_mask = ~trials_all_true
trials_with_some_false = trials_all_true.loc[some_outside_mask].index.tolist()

# Report them
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The name is written as a plain string on purpose: inside an f-string,
# `{5_00125}` is evaluated as the integer literal 500125 (underscores are digit
# separators), which named the file "result_table_neighbors_v2_500125_35.csv".
df_results_v2.to_csv('result_table_neighbors_v2_5_00125_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Centroid of every label group (labels 0-9 in y_train) for each valid run
cluster_centroids_per_run = []

# Iterate over valid runs to calculate centroids for each cluster
for run in valid_runs:
    # Extract the UMAP projections for this run
    projections = umap_projections_5_00125_35[run]

    # Mean position of the points carrying each label
    centroids = [
        np.mean(projections[y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ]
    cluster_centroids_per_run.append(np.array(centroids))  # Store centroids for this run

# Pairwise Euclidean distances between the centroids of each run
distance_matrices_5_00125_35 = np.array([
    cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    for centroids in cluster_centroids_per_run
])  # Shape: (n_valid_runs, 10, 10)

# Calculate the mean distance matrix across all valid runs
mean_distance_matrix_5_00125_35 = np.mean(distance_matrices_5_00125_35, axis=0)  # Shape: (10, 10)

# Normalize the mean distance matrix (min-max scaling)
normalized_mean_distance_matrix_5_00125_35 = (mean_distance_matrix_5_00125_35 - np.min(mean_distance_matrix_5_00125_35)) / (np.max(mean_distance_matrix_5_00125_35) - np.min(mean_distance_matrix_5_00125_35))

# Plot the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_5_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
# Title fixed: the previous text read "n_neighbors=5, , min_dists=0.0125"
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5, min_dist=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix.
# File names no longer carry the stray space before ".npy", so they follow the
# same "<name>_5_00125_35.npy" pattern as the other arrays of this section.
np.save('distance_matrices_neighbors_5_00125_35.npy', distance_matrices_5_00125_35)
np.save('mean_distance_matrix_neighbors_5_00125_35.npy', mean_distance_matrix_5_00125_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_5_00125_35}")

#### Minimum Spanning Tree - MST

In [None]:
# Build a weighted graph whose edge weights are the normalized mean distances
# (rounded to 3 decimals)
G_5_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_5_00125_35, 3))

# np.save is meant for array data; passing the Graph object itself does not
# store the weighted adjacency as a numeric array. Save the adjacency matrix
# instead — it can be turned back into a graph with nx.from_numpy_array.
# The stray space before ".npy" in the file name is removed as well.
np.save('G_5_00125_35.npy', nx.to_numpy_array(G_5_00125_35))

# Draw the graph (seeded layout so the picture is reproducible)
pos = nx.spring_layout(G_5_00125_35, seed=42)  # positions for all nodes
nx.draw(G_5_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_5_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_5_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_5_00125_35 = nx.minimum_spanning_tree(G_5_00125_35)

# Store the tree as its weighted adjacency matrix (np.save is meant for array
# data, not Graph objects); nx.from_numpy_array rebuilds the tree from it.
# The stray space before ".npy" in the file name is removed as well.
np.save('mst_5_00125_35.npy', nx.to_numpy_array(mst_5_00125_35))

# Define positions for all nodes
pos = nx.spring_layout(mst_5_00125_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_5_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_5_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_5_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=5, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Element-wise standard deviation of the pairwise distances over the
# run axis.
# NOTE(review): the default ddof=0 (population std) is used — confirm that is
# the intended estimator for the confidence interval computed afterwards.
distance_matrix_std_5_00125_35 = np.std(distance_matrices_5_00125_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use.
# The stray space before ".npy" is removed so the file follows the same naming
# pattern as the other arrays of this section.
np.save("distance_matrix_std_5_00125_35.npy", distance_matrix_std_5_00125_35)

# Output the results
print("Standard Deviation Distance Matrix (5_00125_35):\n", distance_matrix_std_5_00125_35)


In [None]:
# 95% confidence interval for every entry of the mean distance matrix,
# using the normal approximation: mean +/- z * std / sqrt(n).
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Critical value for the normal distribution

# Sample size = number of distance matrices that were actually averaged.
# The matrices are built only for the valid runs, so this is read from the
# data instead of being fixed at 35.
n_runs = distance_matrices_5_00125_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_5_00125_35 = distance_matrix_std_5_00125_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_5_00125_35 = z_score * sem_matrix_5_00125_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_5_00125_35 = mean_distance_matrix_5_00125_35 - margin_of_error_matrix_5_00125_35
upper_limit_intconf_matrix_5_00125_35 = mean_distance_matrix_5_00125_35 + margin_of_error_matrix_5_00125_35

# Distances cannot be negative, so clip the lower limit at 0
lower_limit_intconf_matrix_5_00125_35 = np.maximum(lower_limit_intconf_matrix_5_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_5_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_5_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_5_00125_35)

# Save the matrices for future use.
# The stray space before ".npy" is removed so the files follow the same naming
# pattern as the normalized bounds saved afterwards.
np.save('lower_limit_intconf_matrix_5_00125_35.npy', lower_limit_intconf_matrix_5_00125_35)
np.save('upper_limit_intconf_matrix_5_00125_35.npy', upper_limit_intconf_matrix_5_00125_35)

In [77]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so that its smallest entry maps to 0 and its largest to 1.

    Parameters
    ----------
    matrix : array_like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``, same shape as the input. When all
        entries are equal the range is zero, so an all-zero array is returned
        instead of dividing by zero.
    """
    values = np.asarray(matrix)
    lowest = np.min(values)
    span = np.max(values) - lowest
    if span == 0:
        # Constant input: 0/0 would produce NaNs and a RuntimeWarning.
        return np.zeros_like(values, dtype=float)
    return (values - lowest) / span

# Lower confidence bound: rescale to [0, 1] and store on disk
norm_lower_limit_intconf_matrix_5_00125_35 = normalize_matrix(
    lower_limit_intconf_matrix_5_00125_35
)
# Upper confidence bound: same treatment
norm_upper_limit_intconf_matrix_5_00125_35 = normalize_matrix(
    upper_limit_intconf_matrix_5_00125_35
)

np.save(
    'norm_lower_limit_intconf_matrix_5_00125_35.npy',
    norm_lower_limit_intconf_matrix_5_00125_35,
)
np.save(
    'norm_upper_limit_intconf_matrix_5_00125_35.npy',
    norm_upper_limit_intconf_matrix_5_00125_35,
)

In [None]:
# Side-by-side heatmaps: lower bound (left), mean (centre), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, in left-to-right order
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_5_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=5, min_dist=0.0125",
     ""),
    (normalized_mean_distance_matrix_5_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=5, min_dist=0.0125",
     "Cluster"),
    (norm_upper_limit_intconf_matrix_5_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=5, min_dist=0.0125",
     ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    The matrix is rounded to 3 decimals, turned into a weighted graph, and
    only the edges of its minimum spanning tree are drawn (in ``color``),
    each labelled with its weight. ``title`` becomes the axis title.
    """
    # Weighted graph from the rounded matrix, reduced to its spanning tree
    rounded_weights = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_weights))

    # Fixed seed so the node layout is reproducible
    layout = nx.spring_layout(tree, seed=42)

    # Nodes and tree edges
    draw_options = dict(with_labels=True, node_color='lightblue', edge_color=color,
                        node_size=500, font_size=10, width=2)
    nx.draw(tree, layout, ax=ax, **draw_options)

    # Edge weights as labels
    weight_labels = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weight_labels, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three side-by-side axes: lower limit | mean | upper limit
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, axis position, edge colour), in the order they are drawn
mst_panels = [
    (normalized_mean_distance_matrix_5_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=5, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_5_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=5, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_5_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=5, min_dist=0.0125", 2, 'green'),
]
for panel_matrix, panel_title, axis_index, edge_colour in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[axis_index], color=edge_colour)

# Tidy spacing and render
plt.tight_layout()
plt.show()

--------

### UMAP n_neighbors=10, min_dist=0.0125

In [15]:
# Reload the stored projections and their per-point mean / std from disk
cached_files_10_00125_35 = (
    'umap_projections_10_00125_35.npy',
    'mean_projection_10_00125_35.npy',
    'std_projection_10_00125_35.npy',
)
(umap_projections_10_00125_35,
 mean_umap_projection_10_00125_35,
 std_projection_umap_10_00125_35) = map(np.load, cached_files_10_00125_35)

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors = 10
min_dist = 0.0125
n_components = 2
n_runs = 35  # number of repetitions

# random_state=None: no fixed seed, so the runs are allowed to differ
umap_settings = dict(n_neighbors=n_neighbors, min_dist=min_dist,
                     n_components=n_components, random_state=None)

# Fit a fresh model on x_train_flattened for every repetition
run_embeddings = []
for run in range(n_runs):
    umap_model = umap.UMAP(**umap_settings)
    projection = umap_model.fit_transform(x_train_flattened)
    run_embeddings.append(projection)

# Stack the runs along a new leading axis, then summarise across that axis
umap_projections_10_00125_35 = np.array(run_embeddings)
mean_projection_10_00125_35 = umap_projections_10_00125_35.mean(axis=0)
std_projection_10_00125_35 = umap_projections_10_00125_35.std(axis=0)

# Write the three arrays to disk
for file_name, payload in (
    ('umap_projections_10_00125_35.npy', umap_projections_10_00125_35),
    ('mean_projection_10_00125_35.npy', mean_projection_10_00125_35),
    ('std_projection_10_00125_35.npy', std_projection_10_00125_35),
):
    np.save(file_name, payload)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_10_00125_35'.")


#### Clustering

In [None]:
# Cluster count, and number of stored UMAP runs (leading axis of the array)
n_clusters = 10
n_runs = umap_projections_10_00125_35.shape[0]

# kmeans_centroids_10_00125[run, k] holds centroid k of the KMeans fit on that run
centroid_store_shape = (n_runs, n_clusters, umap_projections_10_00125_35.shape[2])
kmeans_centroids_10_00125 = np.zeros(centroid_store_shape)

# Fit KMeans (fixed seed) on each run's projection and keep its centroids
for run, umap_projection in enumerate(umap_projections_10_00125_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_10_00125[run] = kmeans.cluster_centers_

In [None]:
# Per-cluster spread of the centroid coordinates across runs (x and y separately)
std_dev_x_10_00125, std_dev_y_10_00125 = np.zeros(10), np.zeros(10)

for i in range(10):
    # Coordinates of centroid i in every run
    cluster_x_coords = kmeans_centroids_10_00125[:, i, 0]
    cluster_y_coords = kmeans_centroids_10_00125[:, i, 1]

    std_dev_x_10_00125[i] = cluster_x_coords.std()
    std_dev_y_10_00125[i] = cluster_y_coords.std()

# Report both vectors
for axis_name, spread in (("x", std_dev_x_10_00125), ("y", std_dev_y_10_00125)):
    print(f"Standard deviation of {axis_name} coordinates per cluster:", spread)

#### Centroid stability

Standard deviation calculation

In [None]:

# Per-cluster spread of the centroid coordinates across runs (x and y separately)
std_dev_x, std_dev_y = np.zeros(10), np.zeros(10)

for i in range(10):
    # Coordinates of centroid i in every run
    cluster_x_coords = kmeans_centroids_10_00125[:, i, 0]
    cluster_y_coords = kmeans_centroids_10_00125[:, i, 1]

    std_dev_x[i] = cluster_x_coords.std()
    std_dev_y[i] = cluster_y_coords.std()

# Report both vectors
for axis_name, spread in (("x", std_dev_x), ("y", std_dev_y)):
    print(f"Standard deviation of {axis_name} coordinates per cluster:", spread)

In [None]:

# Build a table that flags, for every trial and cluster, whether the KMeans
# centroid lies within mean ± 2 standard deviations on both axes.
# NOTE(review): this compares cluster index i across runs, i.e. it assumes
# index i denotes the same cluster in every run — confirm the centroids were
# aligned before relying on the result.

# Trial and cluster counts come from the centroid array itself instead of the
# previously hard-coded 35 and 10, so a different number of runs cannot cause
# an IndexError or silently skip runs.
n_trials_v2, n_clusters_v2 = kmeans_centroids_10_00125.shape[:2]

data_v2 = []
for trial in range(n_trials_v2):
    for cluster in range(n_clusters_v2):
        # Centroid coordinates for the current trial and cluster
        centroid_coord = kmeans_centroids_10_00125[trial, cluster]

        # Bounds of the 2-standard-deviation range for x and y
        mean_x, mean_y = centroid_mean_10_00125_35[cluster]
        lower_bound_x, upper_bound_x = mean_x - 2 * std_dev_x[cluster], mean_x + 2 * std_dev_x[cluster]
        lower_bound_y, upper_bound_y = mean_y - 2 * std_dev_y[cluster], mean_y + 2 * std_dev_y[cluster]

        # True only when both coordinates fall inside their range
        inside_2_std = (lower_bound_x <= centroid_coord[0] <= upper_bound_x) and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)

        # Trials are numbered from 1 in the table
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# One row per (trial, cluster)
df_results_v2 = pd.DataFrame(data_v2, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [None]:

# Per trial: True only when every one of its clusters is flagged 'Inside 2 std'
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Trial numbers for which the per-trial flag is True
trials_with_all_true = trials_all_true.loc[trials_all_true].index.tolist()

In [None]:
# Report the trials in which every cluster passed the check
all_true_caption = "Trials where all clusters were True:"
print(all_true_caption, trials_with_all_true)

In [None]:
# Trials in which at least one cluster failed the check
failed_trial_mask = ~trials_all_true
trials_with_some_false = trials_all_true.loc[failed_trial_mask].index.tolist()

# Report them
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The parameter tag is written as plain text: inside an f-string placeholder,
# `{10_00125}` is evaluated as the integer literal 1000125 (underscores are
# digit separators), which wrote the table to
# 'result_table_neighbors_v2_1000125_35.csv' instead of the intended name.
df_results_v2.to_csv('result_table_neighbors_v2_10_00125_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Label-based centroids: for every valid run, the mean projected position of
# the points carrying each label 0-9 in y_train
cluster_centroids_per_run = []
for run in valid_runs:
    projections = umap_projections_10_00125_35[run]
    centroids = np.array([
        projections[y_train == cluster_label].mean(axis=0)
        for cluster_label in range(10)
    ])
    cluster_centroids_per_run.append(centroids)

# Euclidean distance between every pair of centroids, one matrix per run
distance_matrices_10_00125_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Average the matrices over the runs (leading axis)
mean_distance_matrix_10_00125_35 = distance_matrices_10_00125_35.mean(axis=0)

# Min-max scale the mean matrix
smallest_mean_distance = np.min(mean_distance_matrix_10_00125_35)
largest_mean_distance = np.max(mean_distance_matrix_10_00125_35)
normalized_mean_distance_matrix_10_00125_35 = (
    (mean_distance_matrix_10_00125_35 - smallest_mean_distance)
    / (largest_mean_distance - smallest_mean_distance)
)

# Annotated heatmap of the scaled matrix
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_10_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=10, , min_dists=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Store the per-run matrices and their mean
np.save('distance_matrices_neighbors_10_00125_35 .npy', distance_matrices_10_00125_35)
np.save('mean_distance_matrix_neighbors_10_00125_35 .npy', mean_distance_matrix_10_00125_35)

# Show the (unscaled) mean matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_10_00125_35}")

#### Minimum Spanning Tree - MST

In [None]:
# Create a weighted graph from the normalized mean distance matrix
# (weights rounded to 3 decimals).
G_10_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_10_00125_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save stores arrays:
# handing it the Graph object itself does not serialise the weighted edges.
# The matrix can be turned back into a graph with nx.from_numpy_array.
# NOTE(review): the file name contains a space before '.npy'; kept unchanged
# so that existing files/loaders still match — confirm whether it is intended.
np.save(
    'G_10_00125_35 .npy',
    nx.to_numpy_array(G_10_00125_35, nodelist=sorted(G_10_00125_35.nodes)),
)

# Draw the graph (fixed seed for a reproducible layout)
pos = nx.spring_layout(G_10_00125_35, seed=42)  # positions for all nodes
nx.draw(G_10_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_10_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_10_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_10_00125_35 = nx.minimum_spanning_tree(G_10_00125_35)

# Persist the tree as its weighted adjacency matrix (0 marks "no edge").
# np.save stores arrays: handing it the Graph object itself does not
# serialise the weighted edges.
# NOTE(review): the file name contains a space before '.npy'; kept unchanged
# so that existing files/loaders still match — confirm whether it is intended.
np.save(
    'mst_10_00125_35 .npy',
    nx.to_numpy_array(mst_10_00125_35, nodelist=sorted(mst_10_00125_35.nodes)),
)

# Define positions for all nodes (fixed seed for a reproducible layout)
pos = nx.spring_layout(mst_10_00125_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_10_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_10_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_10_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=10, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Element-wise standard deviation of the pairwise distances across
# the per-run matrices (axis 0 indexes the run)
distance_matrix_std_10_00125_35 = np.std(distance_matrices_10_00125_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
np.save("distance_matrix_std_10_00125_35 .npy", distance_matrix_std_10_00125_35)

# Output the results (the label previously named the 5_00125_35 experiment)
print("Standard Deviation Distance Matrix (10_00125_35):\n", distance_matrix_std_10_00125_35)

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Sample size = number of per-run distance matrices that were averaged.
# Those matrices are built from `valid_runs`, so the count is read from the
# array instead of assuming all 35 runs were kept.
n_runs = distance_matrices_10_00125_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_10_00125_35 = distance_matrix_std_10_00125_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_10_00125_35 = z_score * sem_matrix_10_00125_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_10_00125_35 = mean_distance_matrix_10_00125_35 - margin_of_error_matrix_10_00125_35
upper_limit_intconf_matrix_10_00125_35 = mean_distance_matrix_10_00125_35 + margin_of_error_matrix_10_00125_35

# Distances cannot be negative: clip the lower limit at 0
lower_limit_intconf_matrix_10_00125_35 = np.maximum(lower_limit_intconf_matrix_10_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_10_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_10_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_10_00125_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_10_00125_35 .npy', lower_limit_intconf_matrix_10_00125_35)
np.save('upper_limit_intconf_matrix_10_00125_35 .npy', upper_limit_intconf_matrix_10_00125_35)

In [87]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so that its values span [0, 1].

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. When every entry is equal the
        range is zero; an all-zero array of the same shape is returned
        instead of the NaNs that 0/0 would produce.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: nothing to scale, avoid dividing by zero.
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Rescale each confidence-limit matrix and store it right away
norm_lower_limit_intconf_matrix_10_00125_35 = normalize_matrix(lower_limit_intconf_matrix_10_00125_35)
np.save('norm_lower_limit_intconf_matrix_10_00125_35.npy', norm_lower_limit_intconf_matrix_10_00125_35)

norm_upper_limit_intconf_matrix_10_00125_35 = normalize_matrix(upper_limit_intconf_matrix_10_00125_35)
np.save('norm_upper_limit_intconf_matrix_10_00125_35.npy', norm_upper_limit_intconf_matrix_10_00125_35)

In [None]:
# One row of three panels: lower bound | mean | upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel, left to right
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_10_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=10, min_dist=0.0125", ""),
    (normalized_mean_distance_matrix_10_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=10, min_dist=0.0125", "Cluster"),
    (norm_upper_limit_intconf_matrix_10_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=10, min_dist=0.0125", ""),
]

# Draw every matrix as an annotated heatmap on its own axis
for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    The matrix is rounded to 3 decimals, turned into a weighted graph, and
    only the edges of its minimum spanning tree are drawn (in ``color``),
    each labelled with its weight. ``title`` becomes the axis title.
    """
    # Weighted graph from the rounded matrix, reduced to its spanning tree
    rounded_weights = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_weights))

    # Fixed seed so the node layout is reproducible
    layout = nx.spring_layout(tree, seed=42)

    # Nodes and tree edges
    draw_options = dict(with_labels=True, node_color='lightblue', edge_color=color,
                        node_size=500, font_size=10, width=2)
    nx.draw(tree, layout, ax=ax, **draw_options)

    # Edge weights as labels
    weight_labels = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weight_labels, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three side-by-side axes: lower limit | mean | upper limit
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, axis position, edge colour), in the order they are drawn
mst_panels = [
    (normalized_mean_distance_matrix_10_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=10, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_10_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=10, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_10_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=10, min_dist=0.0125", 2, 'green'),
]
for panel_matrix, panel_title, axis_index, edge_colour in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[axis_index], color=edge_colour)

# Tidy spacing and render
plt.tight_layout()
plt.show()

--------

### UMAP n_neighbors=20, min_dist=0.0125

In [16]:
# Reload the stored projections and their per-point mean / std from disk
cached_files_20_00125_35 = (
    'umap_projections_20_00125_35.npy',
    'mean_projection_20_00125_35.npy',
    'std_projection_20_00125_35.npy',
)
(umap_projections_20_00125_35,
 mean_umap_projection_20_00125_35,
 std_projection_umap_20_00125_35) = map(np.load, cached_files_20_00125_35)

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors = 20
min_dist = 0.0125
n_components = 2
n_runs = 35  # number of repetitions

# random_state=None: no fixed seed, so the runs are allowed to differ
umap_settings = dict(n_neighbors=n_neighbors, min_dist=min_dist,
                     n_components=n_components, random_state=None)

# Fit a fresh model on x_train_flattened for every repetition
run_embeddings = []
for run in range(n_runs):
    umap_model = umap.UMAP(**umap_settings)
    projection = umap_model.fit_transform(x_train_flattened)
    run_embeddings.append(projection)

# Stack the runs along a new leading axis, then summarise across that axis
umap_projections_20_00125_35 = np.array(run_embeddings)
mean_projection_20_00125_35 = umap_projections_20_00125_35.mean(axis=0)
std_projection_20_00125_35 = umap_projections_20_00125_35.std(axis=0)

# Write the three arrays to disk
for file_name, payload in (
    ('umap_projections_20_00125_35.npy', umap_projections_20_00125_35),
    ('mean_projection_20_00125_35.npy', mean_projection_20_00125_35),
    ('std_projection_20_00125_35.npy', std_projection_20_00125_35),
):
    np.save(file_name, payload)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_20_00125_35'.")


#### Clustering

In [None]:
# Cluster count, and number of stored UMAP runs (leading axis of the array)
n_clusters = 10
n_runs = umap_projections_20_00125_35.shape[0]

# kmeans_centroids_20_00125[run, k] holds centroid k of the KMeans fit on that run
centroid_store_shape = (n_runs, n_clusters, umap_projections_20_00125_35.shape[2])
kmeans_centroids_20_00125 = np.zeros(centroid_store_shape)

# Fit KMeans (fixed seed) on each run's projection and keep its centroids
for run, umap_projection in enumerate(umap_projections_20_00125_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_20_00125[run] = kmeans.cluster_centers_

In [None]:
# Per-cluster spread of the centroid coordinates across runs (x and y separately)
std_dev_x_20_00125, std_dev_y_20_00125 = np.zeros(10), np.zeros(10)

for i in range(10):
    # Coordinates of centroid i in every run
    cluster_x_coords = kmeans_centroids_20_00125[:, i, 0]
    cluster_y_coords = kmeans_centroids_20_00125[:, i, 1]

    std_dev_x_20_00125[i] = cluster_x_coords.std()
    std_dev_y_20_00125[i] = cluster_y_coords.std()

# Report both vectors
for axis_name, spread in (("x", std_dev_x_20_00125), ("y", std_dev_y_20_00125)):
    print(f"Standard deviation of {axis_name} coordinates per cluster:", spread)

#### Centroid stability

Standard deviation calculation

In [None]:

# Per-cluster spread of the centroid coordinates across runs (x and y separately)
std_dev_x, std_dev_y = np.zeros(10), np.zeros(10)

for i in range(10):
    # Coordinates of centroid i in every run
    cluster_x_coords = kmeans_centroids_20_00125[:, i, 0]
    cluster_y_coords = kmeans_centroids_20_00125[:, i, 1]

    std_dev_x[i] = cluster_x_coords.std()
    std_dev_y[i] = cluster_y_coords.std()

# Report both vectors
for axis_name, spread in (("x", std_dev_x), ("y", std_dev_y)):
    print(f"Standard deviation of {axis_name} coordinates per cluster:", spread)

In [None]:

# Build a table that flags, for every trial and cluster, whether the KMeans
# centroid lies within mean ± 2 standard deviations on both axes.
# NOTE(review): this compares cluster index i across runs, i.e. it assumes
# index i denotes the same cluster in every run — confirm the centroids were
# aligned before relying on the result.

# Trial and cluster counts come from the centroid array itself instead of the
# previously hard-coded 35 and 10, so a different number of runs cannot cause
# an IndexError or silently skip runs.
n_trials_v2, n_clusters_v2 = kmeans_centroids_20_00125.shape[:2]

data_v2 = []
for trial in range(n_trials_v2):
    for cluster in range(n_clusters_v2):
        # Centroid coordinates for the current trial and cluster
        centroid_coord = kmeans_centroids_20_00125[trial, cluster]

        # Bounds of the 2-standard-deviation range for x and y
        mean_x, mean_y = centroid_mean_20_00125_35[cluster]
        lower_bound_x, upper_bound_x = mean_x - 2 * std_dev_x[cluster], mean_x + 2 * std_dev_x[cluster]
        lower_bound_y, upper_bound_y = mean_y - 2 * std_dev_y[cluster], mean_y + 2 * std_dev_y[cluster]

        # True only when both coordinates fall inside their range
        inside_2_std = (lower_bound_x <= centroid_coord[0] <= upper_bound_x) and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)

        # Trials are numbered from 1 in the table
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# One row per (trial, cluster)
df_results_v2 = pd.DataFrame(data_v2, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [None]:

# Per trial: True only when every one of its clusters is flagged 'Inside 2 std'
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Trial numbers for which the per-trial flag is True
trials_with_all_true = trials_all_true.loc[trials_all_true].index.tolist()

In [None]:
# Report the trials in which every cluster passed the check
all_true_caption = "Trials where all clusters were True:"
print(all_true_caption, trials_with_all_true)

In [None]:
# Trials in which at least one cluster failed the check
failed_trial_mask = ~trials_all_true
trials_with_some_false = trials_all_true.loc[failed_trial_mask].index.tolist()

# Report them
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The parameter tag is written as plain text: inside an f-string placeholder,
# `{20_00125}` is evaluated as the integer literal 2000125 (underscores are
# digit separators), which wrote the table to
# 'result_table_neighbors_v2_2000125_35.csv' instead of the intended name.
df_results_v2.to_csv('result_table_neighbors_v2_20_00125_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Label-based centroids: for every valid run, the mean projected position of
# the points carrying each label 0-9 in y_train
cluster_centroids_per_run = []

for run in valid_runs:
    # UMAP projection of this run
    projections = umap_projections_20_00125_35[run]

    centroids = []
    for cluster_label in range(10):  # Assuming 10 clusters (digits 0-9)
        cluster_points = projections[y_train == cluster_label]  # Points with this label
        centroid = np.mean(cluster_points, axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.array(centroids))

# Pairwise Euclidean distances between the centroids, one matrix per run
distance_matrices_20_00125_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    distance_matrices_20_00125_35.append(distance_matrix)

# Stack into one array
distance_matrices_20_00125_35 = np.array(distance_matrices_20_00125_35)  # Shape: (n_valid_runs, 10, 10)

# Mean distance matrix across all valid runs
mean_distance_matrix_20_00125_35 = np.mean(distance_matrices_20_00125_35, axis=0)  # Shape: (10, 10)

# Min-max scale the mean distance matrix
normalized_mean_distance_matrix_20_00125_35 = (mean_distance_matrix_20_00125_35 - np.min(mean_distance_matrix_20_00125_35)) / (np.max(mean_distance_matrix_20_00125_35) - np.min(mean_distance_matrix_20_00125_35))

# Plot the normalized mean distance matrix as a heatmap.
# The title previously reported n_neighbors=10 although this section uses
# n_neighbors=20 (and contained a stray ", ,").
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_20_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=20, min_dist=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
# NOTE(review): the file names contain a space before '.npy'; kept unchanged
# so that existing files/loaders still match — confirm whether it is intended.
np.save('distance_matrices_neighbors_20_00125_35 .npy', distance_matrices_20_00125_35)
np.save('mean_distance_matrix_neighbors_20_00125_35 .npy', mean_distance_matrix_20_00125_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_20_00125_35}")

#### Minimum Spanning Tree - MST

In [None]:
# Create a weighted graph from the normalized mean distance matrix
# (weights rounded to 3 decimals).
G_20_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_20_00125_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save stores arrays:
# handing it the Graph object itself does not serialise the weighted edges.
# The matrix can be turned back into a graph with nx.from_numpy_array.
# NOTE(review): the file name contains a space before '.npy'; kept unchanged
# so that existing files/loaders still match — confirm whether it is intended.
np.save(
    'G_20_00125_35 .npy',
    nx.to_numpy_array(G_20_00125_35, nodelist=sorted(G_20_00125_35.nodes)),
)

# Draw the graph (fixed seed for a reproducible layout)
pos = nx.spring_layout(G_20_00125_35, seed=42)  # positions for all nodes
nx.draw(G_20_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_20_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_20_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph
mst_20_00125_35 = nx.minimum_spanning_tree(G_20_00125_35)

# Persist the tree as its weighted adjacency matrix (0 marks "no edge").
# np.save stores arrays: handing it the Graph object itself does not
# serialise the weighted edges.
# NOTE(review): the file name contains a space before '.npy'; kept unchanged
# so that existing files/loaders still match — confirm whether it is intended.
np.save(
    'mst_20_00125_35 .npy',
    nx.to_numpy_array(mst_20_00125_35, nodelist=sorted(mst_20_00125_35.nodes)),
)

# Define positions for all nodes (fixed seed for a reproducible layout)
pos = nx.spring_layout(mst_20_00125_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_20_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_20_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_20_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=20, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Element-wise standard deviation of the pairwise distances across
# the per-run matrices (axis 0 indexes the run)
distance_matrix_std_20_00125_35 = np.std(distance_matrices_20_00125_35, axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
np.save("distance_matrix_std_20_00125_35 .npy", distance_matrix_std_20_00125_35)

# Output the results (the label previously named the 5_00125_35 experiment)
print("Standard Deviation Distance Matrix (20_00125_35):\n", distance_matrix_std_20_00125_35)

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Sample size = number of per-run distance matrices that were averaged.
# Those matrices are built from `valid_runs`, so the count is read from the
# array instead of assuming all 35 runs were kept.
n_runs = distance_matrices_20_00125_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_20_00125_35 = distance_matrix_std_20_00125_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_20_00125_35 = z_score * sem_matrix_20_00125_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_20_00125_35 = mean_distance_matrix_20_00125_35 - margin_of_error_matrix_20_00125_35
upper_limit_intconf_matrix_20_00125_35 = mean_distance_matrix_20_00125_35 + margin_of_error_matrix_20_00125_35

# Distances cannot be negative: clip the lower limit at 0
lower_limit_intconf_matrix_20_00125_35 = np.maximum(lower_limit_intconf_matrix_20_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_20_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_20_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_20_00125_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_20_00125_35 .npy', lower_limit_intconf_matrix_20_00125_35)
np.save('upper_limit_intconf_matrix_20_00125_35 .npy', upper_limit_intconf_matrix_20_00125_35)

In [97]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so that its values span [0, 1].

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. When every entry is equal the
        range is zero; an all-zero array of the same shape is returned
        instead of the NaNs that 0/0 would produce.
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: nothing to scale, avoid dividing by zero.
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Rescale each confidence-limit matrix and store it right away
norm_lower_limit_intconf_matrix_20_00125_35 = normalize_matrix(lower_limit_intconf_matrix_20_00125_35)
np.save('norm_lower_limit_intconf_matrix_20_00125_35.npy', norm_lower_limit_intconf_matrix_20_00125_35)

norm_upper_limit_intconf_matrix_20_00125_35 = normalize_matrix(upper_limit_intconf_matrix_20_00125_35)
np.save('norm_upper_limit_intconf_matrix_20_00125_35.npy', norm_upper_limit_intconf_matrix_20_00125_35)

In [None]:
# One row of three panels: lower bound | mean | upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel, left to right
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_20_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=20, min_dist=0.0125", ""),
    (normalized_mean_distance_matrix_20_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=20, min_dist=0.0125", "Cluster"),
    (norm_upper_limit_intconf_matrix_20_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=20, min_dist=0.0125", ""),
]

# Draw every matrix as an annotated heatmap on its own axis
for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    Parameters
    ----------
    matrix : array-like
        Square matrix passed to ``nx.from_numpy_array`` as edge weights.
    title : str
        Title set on the axes.
    ax : matplotlib axes
        Axes that receive the drawing.
    color : str, optional
        Edge colour of the tree (default ``'red'``).
    """
    # Weights are rounded to 3 decimals before building the graph, so the
    # edge labels drawn below show at most 3 decimals.
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed keeps the node layout reproducible between calls.
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(tree, layout, with_labels=True, node_color='lightblue', edge_color=color,
            node_size=500, font_size=10, width=2, ax=ax)
    nx.draw_networkx_edge_labels(tree, layout,
                                 edge_labels=nx.get_edge_attributes(tree, 'weight'),
                                 font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three side-by-side panels for the MSTs.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, panel index, edge colour); the mean tree goes in the middle.
mst_panels = [
    (normalized_mean_distance_matrix_20_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=20, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_20_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=20, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_20_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=20, min_dist=0.0125", 2, 'green'),
]
for panel_matrix, panel_title, panel_index, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

plt.tight_layout()
plt.show()

-----

### UMAP n_neighbors=30, min_dist=0.0125

In [17]:
# Reload the stacked projections and their per-point mean / std from disk.
# NOTE(review): the mean/std variable names here differ from the ones assigned
# by the cell that computes them (mean_projection_*, std_projection_*).
(umap_projections_30_00125_35,
 mean_umap_projection_30_00125_35,
 std_projection_umap_30_00125_35) = [
    np.load(npy_path)
    for npy_path in (
        'umap_projections_30_00125_35.npy',
        'mean_projection_30_00125_35.npy',
        'std_projection_30_00125_35.npy',
    )
]

In [None]:
# UMAP hyper-parameters for this configuration
n_neighbors, min_dist, n_components = 30, 0.0125, 2
n_runs = 35  # number of independent embeddings

# Collect one embedding per run; random_state=None leaves every run unseeded.
umap_projections_30_00125_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_30_00125_35.append(projection)

# Combine the per-run embeddings into a single array (runs on the first axis)
umap_projections_30_00125_35 = np.array(umap_projections_30_00125_35)
np.save('umap_projections_30_00125_35.npy', umap_projections_30_00125_35)

# Element-wise mean over the run axis
mean_projection_30_00125_35 = umap_projections_30_00125_35.mean(axis=0)
np.save('mean_projection_30_00125_35.npy', mean_projection_30_00125_35)

# Element-wise standard deviation over the run axis
std_projection_30_00125_35 = umap_projections_30_00125_35.std(axis=0)
np.save('std_projection_30_00125_35.npy', std_projection_30_00125_35)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_30_00125_35'.")


#### Clustering

In [None]:
n_clusters = 10

# One KMeans fit per stored UMAP run
n_runs = umap_projections_30_00125_35.shape[0]

# Centroid store indexed as [run, cluster, embedding dimension]
kmeans_centroids_30_00125 = np.zeros((n_runs, n_clusters, umap_projections_30_00125_35.shape[2]))

# NOTE(review): KMeans numbers its clusters independently in every fit, so
# index i may not refer to the same group in different runs - confirm before
# comparing centroids by index across runs.
for run, umap_projection in enumerate(umap_projections_30_00125_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_30_00125[run] = kmeans.cluster_centers_

In [None]:
# Spread of every centroid index across runs, separately for x and y
std_dev_x_30_00125, std_dev_y_30_00125 = np.zeros(10), np.zeros(10)

for i in range(10):
    # First axis runs over the runs; last axis holds (x, y)
    cluster_x_coords = kmeans_centroids_30_00125[:, i, 0]
    cluster_y_coords = kmeans_centroids_30_00125[:, i, 1]

    std_dev_x_30_00125[i], std_dev_y_30_00125[i] = cluster_x_coords.std(), cluster_y_coords.std()

print("Standard deviation of x coordinates per cluster:", std_dev_x_30_00125)
print("Standard deviation of y coordinates per cluster:", std_dev_y_30_00125)

#### Centroid stability

Standard deviation calculation

In [None]:

# Generic (un-suffixed) copies of the per-cluster spreads; the 2-std check
# below reads std_dev_x / std_dev_y.
std_dev_x = np.zeros(10)
std_dev_y = np.zeros(10)

for i in range(10):
    # Coordinates of centroid i in every run
    cluster_x_coords = kmeans_centroids_30_00125[:, i, 0]
    cluster_y_coords = kmeans_centroids_30_00125[:, i, 1]

    std_dev_x[i] = cluster_x_coords.std()
    std_dev_y[i] = cluster_y_coords.std()

print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Rows of the result table: [trial number, cluster, centroid, inside flag]
data_v2 = []

# NOTE(review): centroid_mean_30_00125_35 is not assigned anywhere in this
# section - confirm it is computed before this cell runs (presumably the
# per-cluster mean of kmeans_centroids_30_00125 over runs).
for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_30_00125[trial, cluster]
        mean_x, mean_y = centroid_mean_30_00125_35[cluster]

        # Mean +/- 2 standard deviations, independently per axis
        lower_bound_x = mean_x - 2 * std_dev_x[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x[cluster]
        lower_bound_y = mean_y - 2 * std_dev_y[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y[cluster]

        # The centroid counts as "inside" only if both coordinates are in range
        inside_2_std = (
            (lower_bound_x <= centroid_coord[0] <= upper_bound_x)
            and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)
        )

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

df_results_v2 = pd.DataFrame(
    data_v2,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)

In [None]:

# Per trial: True only when every cluster of that trial is flagged 'Inside 2 std'
trials_all_true = (
    df_results_v2
    .groupby('Trial')['Inside 2 std']
    .all()
)

In [None]:
# Trial numbers whose flag is True
trials_with_all_true = trials_all_true.loc[trials_all_true].index.tolist()

In [None]:
# Report the trials in which every cluster passed the 2-std check
print(
    "Trials where all clusters were True:",
    trials_with_all_true,
)

In [None]:
# Trial numbers in which at least one cluster failed the 2-std check
trials_with_some_false = trials_all_true.loc[~trials_all_true].index.tolist()

print(
    "Trials where some clusters were False:",
    trials_with_some_false,
)

In [None]:

# Save the per-trial / per-cluster result table to CSV.
# The configuration tag is written as plain text: inside an f-string,
# {30_00125} is evaluated as the integer literal 3000125 (underscores are
# digit separators), which named the file '..._v2_3000125_35.csv'.
df_results_v2.to_csv('result_table_neighbors_v2_30_00125_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Per-run centroids of the labelled classes (labels 0-9 in y_train).
# NOTE(review): valid_runs is defined outside this section and is used
# directly as an index into the stacked projections - confirm it holds
# 0-based run indices.
cluster_centroids_per_run = []

for run in valid_runs:
    projections = umap_projections_30_00125_35[run]

    centroids = []
    for cluster_label in range(10):
        # Embedded points whose label equals cluster_label, then their mean position
        cluster_points = projections[y_train == cluster_label]
        centroid = np.mean(cluster_points, axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.array(centroids))

# Pairwise Euclidean distances between the 10 centroids, one matrix per run
distance_matrices_30_00125_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids, metric='euclidean')
    distance_matrices_30_00125_35.append(distance_matrix)

# Stack to (number of valid runs, 10, 10) and average over the runs
distance_matrices_30_00125_35 = np.array(distance_matrices_30_00125_35)
mean_distance_matrix_30_00125_35 = np.mean(distance_matrices_30_00125_35, axis=0)

# Min-max scale the mean matrix to [0, 1]
normalized_mean_distance_matrix_30_00125_35 = (
    (mean_distance_matrix_30_00125_35 - np.min(mean_distance_matrix_30_00125_35))
    / (np.max(mean_distance_matrix_30_00125_35) - np.min(mean_distance_matrix_30_00125_35))
)

# Heatmap of the normalized mean distances.
# The title previously reported n_neighbors=10; this section uses n_neighbors=30.
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_30_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=30, , min_dists=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_30_00125_35 .npy', distance_matrices_30_00125_35)
np.save('mean_distance_matrix_neighbors_30_00125_35 .npy', mean_distance_matrix_30_00125_35)

print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_30_00125_35}")

In [None]:
# Weighted graph built from the normalized mean distances (rounded to 3 decimals)
G_30_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_30_00125_35, 3))

# np.save() coerces its argument with np.asanyarray(), which reduces a networkx
# Graph to its node list and drops the edges and weights. Store the weighted
# adjacency matrix instead, so the graph can be rebuilt with
# nx.from_numpy_array(np.load(...)).
np.save('G_30_00125_35 .npy', nx.to_numpy_array(G_30_00125_35))

# Draw the graph with a reproducible spring layout
pos = nx.spring_layout(G_30_00125_35, seed=42)
nx.draw(G_30_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Annotate every edge with its weight (distance)
edge_labels = nx.get_edge_attributes(G_30_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_30_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - MST

In [None]:
# Minimum spanning tree of the cluster-distance graph
mst_30_00125_35 = nx.minimum_spanning_tree(G_30_00125_35)

# np.save() would coerce the Graph to just its node list; save the weighted
# adjacency matrix of the tree so it can be reloaded with nx.from_numpy_array().
np.save('mst_30_00125_35 .npy', nx.to_numpy_array(mst_30_00125_35))

# Reproducible layout for the tree
pos = nx.spring_layout(mst_30_00125_35, seed=42)

# Draw only the tree edges
nx.draw(mst_30_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Label the tree edges with their weights (distances)
edge_labels = nx.get_edge_attributes(mst_30_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_30_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=30, min_dist=0.0125")
plt.show()

In [None]:
# Standard deviation of every pairwise centroid distance over the run axis
distance_matrix_std_30_00125_35 = np.std(distance_matrices_30_00125_35, axis=0)

# Keep a copy on disk for later use
np.save("distance_matrix_std_30_00125_35 .npy", distance_matrix_std_30_00125_35)

# The printed label previously read '5_00125_35' (copied from another
# configuration); it now names the configuration actually shown.
print("Standard Deviation Distance Matrix (30_00125_35):\n", distance_matrix_std_30_00125_35)


In [None]:
# Two-sided normal confidence interval for the mean pairwise distances
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # critical value of the standard normal

# The standard deviations were taken over the runs stacked in
# distance_matrices_30_00125_35 (valid runs only), so the standard error must
# use that sample size rather than a hard-coded 35.
n_runs = distance_matrices_30_00125_35.shape[0]

# Standard error of the mean, then the margin of error
sem_matrix_30_00125_35 = distance_matrix_std_30_00125_35 / np.sqrt(n_runs)
margin_of_error_matrix_30_00125_35 = z_score * sem_matrix_30_00125_35

# Interval limits; the lower limit is clipped at 0 because distances cannot be negative
lower_limit_intconf_matrix_30_00125_35 = np.maximum(
    mean_distance_matrix_30_00125_35 - margin_of_error_matrix_30_00125_35, 0
)
upper_limit_intconf_matrix_30_00125_35 = mean_distance_matrix_30_00125_35 + margin_of_error_matrix_30_00125_35

print("Mean Distance Matrix:\n", mean_distance_matrix_30_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_30_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_30_00125_35)

# Save both limits for future use
np.save('lower_limit_intconf_matrix_30_00125_35 .npy', lower_limit_intconf_matrix_30_00125_35)
np.save('upper_limit_intconf_matrix_30_00125_35 .npy', upper_limit_intconf_matrix_30_00125_35)

In [107]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so that its entries lie in [0, 1].

    Parameters
    ----------
    matrix : array-like
        Numeric values; minimum and maximum are taken over all entries.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. If every entry is equal the range
        is zero, so an all-zero array of the same shape is returned instead
        of dividing by zero (which would fill the result with NaN).
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: there is no spread to rescale.
        return np.zeros(matrix.shape)
    return (matrix - lowest) / value_range

# Rescale the lower confidence bound with normalize_matrix and persist it.
norm_lower_limit_intconf_matrix_30_00125_35 = normalize_matrix(lower_limit_intconf_matrix_30_00125_35)
np.save('norm_lower_limit_intconf_matrix_30_00125_35.npy', norm_lower_limit_intconf_matrix_30_00125_35)

# Same treatment for the upper confidence bound.
norm_upper_limit_intconf_matrix_30_00125_35 = normalize_matrix(upper_limit_intconf_matrix_30_00125_35)
np.save('norm_upper_limit_intconf_matrix_30_00125_35.npy', norm_upper_limit_intconf_matrix_30_00125_35)

In [None]:
# One row of three panels: lower bound, mean, upper bound (left to right).
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel, in drawing order.
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_30_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=30, min_dist=0.0125",
     ""),
    (normalized_mean_distance_matrix_30_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=30, min_dist=0.0125",
     "Cluster"),
    (norm_upper_limit_intconf_matrix_30_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=30, min_dist=0.0125",
     ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    Parameters
    ----------
    matrix : array-like
        Square matrix passed to ``nx.from_numpy_array`` as edge weights.
    title : str
        Title set on the axes.
    ax : matplotlib axes
        Axes that receive the drawing.
    color : str, optional
        Edge colour of the tree (default ``'red'``).
    """
    # Weights are rounded to 3 decimals before building the graph, so the
    # edge labels drawn below show at most 3 decimals.
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed keeps the node layout reproducible between calls.
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(tree, layout, with_labels=True, node_color='lightblue', edge_color=color,
            node_size=500, font_size=10, width=2, ax=ax)
    nx.draw_networkx_edge_labels(tree, layout,
                                 edge_labels=nx.get_edge_attributes(tree, 'weight'),
                                 font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three side-by-side panels for the MSTs.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, panel index, edge colour); the mean tree goes in the middle.
mst_panels = [
    (normalized_mean_distance_matrix_30_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=30, min_dist=0.0125", 1, 'red'),
    (norm_lower_limit_intconf_matrix_30_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=30, min_dist=0.0125", 0, 'blue'),
    (norm_upper_limit_intconf_matrix_30_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=30, min_dist=0.0125", 2, 'green'),
]
for panel_matrix, panel_title, panel_index, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, axes[panel_index], color=panel_color)

plt.tight_layout()
plt.show()

------------

### UMAP n_neighbors=50, min_dist=0.0125

In [6]:
# Reload the stacked projections, their per-point mean / std, and the distance
# matrices from disk. Two of the file names contain a space before '.npy',
# matching the names used when they are saved.
(umap_projections_50_00125_35,
 mean_umap_projection_50_00125_35,
 std_projection_umap_50_00125_35,
 distance_matrices_50_00125_35,
 mean_distance_matrix_50_00125_35) = [
    np.load(npy_path)
    for npy_path in (
        'umap_projections_50_00125_35.npy',
        'mean_projection_50_00125_35.npy',
        'std_projection_50_00125_35.npy',
        'distance_matrices_neighbors_50_00125_35 .npy',
        'mean_distance_matrix_neighbors_50_00125_35 .npy',
    )
]

In [None]:
# UMAP hyper-parameters for this configuration
n_neighbors, min_dist, n_components = 50, 0.0125, 2
n_runs = 35  # number of independent embeddings

# Collect one embedding per run; random_state=None leaves every run unseeded.
umap_projections_50_00125_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_50_00125_35.append(projection)

# Combine the per-run embeddings into a single array (runs on the first axis)
umap_projections_50_00125_35 = np.array(umap_projections_50_00125_35)
np.save('umap_projections_50_00125_35.npy', umap_projections_50_00125_35)

# Element-wise mean over the run axis
mean_projection_50_00125_35 = umap_projections_50_00125_35.mean(axis=0)
np.save('mean_projection_50_00125_35.npy', mean_projection_50_00125_35)

# Element-wise standard deviation over the run axis
std_projection_50_00125_35 = umap_projections_50_00125_35.std(axis=0)
np.save('std_projection_50_00125_35.npy', std_projection_50_00125_35)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_50_00125_35'.")


#### Clustering

In [None]:
n_clusters = 10

# One KMeans fit per stored UMAP run
n_runs = umap_projections_50_00125_35.shape[0]

# Centroid store indexed as [run, cluster, embedding dimension]
kmeans_centroids_50_00125 = np.zeros((n_runs, n_clusters, umap_projections_50_00125_35.shape[2]))

# NOTE(review): KMeans numbers its clusters independently in every fit, so
# index i may not refer to the same group in different runs - confirm before
# comparing centroids by index across runs.
for run, umap_projection in enumerate(umap_projections_50_00125_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_50_00125[run] = kmeans.cluster_centers_

In [None]:
# Spread of every centroid index across runs, separately for x and y
std_dev_x_50_00125, std_dev_y_50_00125 = np.zeros(10), np.zeros(10)

for i in range(10):
    # First axis runs over the runs; last axis holds (x, y)
    cluster_x_coords = kmeans_centroids_50_00125[:, i, 0]
    cluster_y_coords = kmeans_centroids_50_00125[:, i, 1]

    std_dev_x_50_00125[i], std_dev_y_50_00125[i] = cluster_x_coords.std(), cluster_y_coords.std()

print("Standard deviation of x coordinates per cluster:", std_dev_x_50_00125)
print("Standard deviation of y coordinates per cluster:", std_dev_y_50_00125)

#### Centroid stability

Standard deviation calculation

In [None]:

# Generic (un-suffixed) copies of the per-cluster spreads; the 2-std check
# below reads std_dev_x / std_dev_y.
std_dev_x = np.zeros(10)
std_dev_y = np.zeros(10)

for i in range(10):
    # Coordinates of centroid i in every run
    cluster_x_coords = kmeans_centroids_50_00125[:, i, 0]
    cluster_y_coords = kmeans_centroids_50_00125[:, i, 1]

    std_dev_x[i] = cluster_x_coords.std()
    std_dev_y[i] = cluster_y_coords.std()

print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Rows of the result table: [trial number, cluster, centroid, inside flag]
data_v2 = []

# NOTE(review): centroid_mean_50_00125_35 is not assigned anywhere in this
# section - confirm it is computed before this cell runs (presumably the
# per-cluster mean of kmeans_centroids_50_00125 over runs).
for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_50_00125[trial, cluster]
        mean_x, mean_y = centroid_mean_50_00125_35[cluster]

        # Mean +/- 2 standard deviations, independently per axis
        lower_bound_x = mean_x - 2 * std_dev_x[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x[cluster]
        lower_bound_y = mean_y - 2 * std_dev_y[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y[cluster]

        # The centroid counts as "inside" only if both coordinates are in range
        inside_2_std = (
            (lower_bound_x <= centroid_coord[0] <= upper_bound_x)
            and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)
        )

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

df_results_v2 = pd.DataFrame(
    data_v2,
    columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'],
)

In [None]:

# Per trial: True only when every cluster of that trial is flagged 'Inside 2 std'
trials_all_true = (
    df_results_v2
    .groupby('Trial')['Inside 2 std']
    .all()
)

In [None]:
# Trial numbers whose flag is True
trials_with_all_true = trials_all_true.loc[trials_all_true].index.tolist()

In [None]:
# Report the trials in which every cluster passed the 2-std check
print(
    "Trials where all clusters were True:",
    trials_with_all_true,
)

In [None]:
# Trial numbers in which at least one cluster failed the 2-std check
trials_with_some_false = trials_all_true.loc[~trials_all_true].index.tolist()

print(
    "Trials where some clusters were False:",
    trials_with_some_false,
)

In [None]:

# Save the per-trial / per-cluster result table to CSV.
# The configuration tag is written as plain text: inside an f-string,
# {50_00125} is evaluated as the integer literal 5000125 (underscores are
# digit separators), which named the file '..._v2_5000125_35.csv'.
df_results_v2.to_csv('result_table_neighbors_v2_50_00125_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Per-run centroids of the labelled classes (labels 0-9 in y_train).
# NOTE(review): valid_runs is defined outside this section and is used
# directly as an index into the stacked projections - confirm it holds
# 0-based run indices.
cluster_centroids_per_run = []

for run in valid_runs:
    projections = umap_projections_50_00125_35[run]

    centroids = []
    for cluster_label in range(10):
        # Embedded points whose label equals cluster_label, then their mean position
        cluster_points = projections[y_train == cluster_label]
        centroid = cluster_points.mean(axis=0)
        centroids.append(centroid)

    cluster_centroids_per_run.append(np.array(centroids))

# Pairwise Euclidean distances between the 10 centroids, one matrix per run
distance_matrices_50_00125_35 = []
for centroids in cluster_centroids_per_run:
    distance_matrix = cdist(centroids, centroids, metric='euclidean')
    distance_matrices_50_00125_35.append(distance_matrix)

# Stack to (number of valid runs, 10, 10) and average over the runs
distance_matrices_50_00125_35 = np.array(distance_matrices_50_00125_35)
mean_distance_matrix_50_00125_35 = distance_matrices_50_00125_35.mean(axis=0)

# Min-max scale the mean matrix to [0, 1]
normalized_mean_distance_matrix_50_00125_35 = (
    (mean_distance_matrix_50_00125_35 - mean_distance_matrix_50_00125_35.min())
    / (mean_distance_matrix_50_00125_35.max() - mean_distance_matrix_50_00125_35.min())
)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_50_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=50, , min_dists=0.0125)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_50_00125_35 .npy', distance_matrices_50_00125_35)
np.save('mean_distance_matrix_neighbors_50_00125_35 .npy', mean_distance_matrix_50_00125_35)

print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_50_00125_35}")

In [None]:
# Weighted graph built from the normalized mean distances (rounded to 3 decimals)
G_50_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_50_00125_35, 3))

# np.save() coerces its argument with np.asanyarray(), which reduces a networkx
# Graph to its node list and drops the edges and weights. Store the weighted
# adjacency matrix instead, so the graph can be rebuilt with
# nx.from_numpy_array(np.load(...)).
np.save('G_50_00125_35 .npy', nx.to_numpy_array(G_50_00125_35))

# Draw the graph with a reproducible spring layout
pos = nx.spring_layout(G_50_00125_35, seed=42)
nx.draw(G_50_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Annotate every edge with its weight (distance)
edge_labels = nx.get_edge_attributes(G_50_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_50_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - MST

In [None]:
# Minimum spanning tree of the cluster-distance graph.
# This must come first: the total weight below reads mst_50_00125_35, which
# previously was used before being assigned (NameError on a fresh kernel, or a
# stale tree from an earlier execution).
mst_50_00125_35 = nx.minimum_spanning_tree(G_50_00125_35)

# Sum of the edge weights of the tree
total_weight_50_00125 = sum(nx.get_edge_attributes(mst_50_00125_35, 'weight').values())
print(f"Total weight of the MST: {total_weight_50_00125}")

# np.save() would coerce the Graph to just its node list; save the weighted
# adjacency matrix of the tree so it can be reloaded with nx.from_numpy_array().
np.save('mst_50_00125_35 .npy', nx.to_numpy_array(mst_50_00125_35))

# Reproducible layout for the tree
pos = nx.spring_layout(mst_50_00125_35, seed=42)

# Draw only the tree edges
nx.draw(mst_50_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Label the tree edges with their weights (distances)
edge_labels = nx.get_edge_attributes(mst_50_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_50_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=50, min_dist=0.0125")
plt.show()

In [None]:
# Standard deviation of every pairwise centroid distance over the run axis
distance_matrix_std_50_00125_35 = np.std(distance_matrices_50_00125_35, axis=0)

# Keep a copy on disk for later use
np.save("distance_matrix_std_50_00125_35 .npy", distance_matrix_std_50_00125_35)

# The printed label previously read '5_00125_35' (copied from another
# configuration); it now names the configuration actually shown.
print("Standard Deviation Distance Matrix (50_00125_35):\n", distance_matrix_std_50_00125_35)

In [None]:
# Two-sided normal confidence interval for the mean pairwise distances
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # critical value of the standard normal

# The standard deviations were taken over the runs stacked in
# distance_matrices_50_00125_35 (valid runs only), so the standard error must
# use that sample size rather than a hard-coded 35.
n_runs = distance_matrices_50_00125_35.shape[0]

# Standard error of the mean, then the margin of error
sem_matrix_50_00125_35 = distance_matrix_std_50_00125_35 / np.sqrt(n_runs)
margin_of_error_matrix_50_00125_35 = z_score * sem_matrix_50_00125_35

# Interval limits; the lower limit is clipped at 0 because distances cannot be negative
lower_limit_intconf_matrix_50_00125_35 = np.maximum(
    mean_distance_matrix_50_00125_35 - margin_of_error_matrix_50_00125_35, 0
)
upper_limit_intconf_matrix_50_00125_35 = mean_distance_matrix_50_00125_35 + margin_of_error_matrix_50_00125_35

print("Mean Distance Matrix:\n", mean_distance_matrix_50_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_50_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_50_00125_35)

# Save both limits for future use
np.save('lower_limit_intconf_matrix_50_00125_35 .npy', lower_limit_intconf_matrix_50_00125_35)
np.save('upper_limit_intconf_matrix_50_00125_35 .npy', upper_limit_intconf_matrix_50_00125_35)

In [117]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so that its entries lie in [0, 1].

    Parameters
    ----------
    matrix : array-like
        Numeric values; minimum and maximum are taken over all entries.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. If every entry is equal the range
        is zero, so an all-zero array of the same shape is returned instead
        of dividing by zero (which would fill the result with NaN).
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: there is no spread to rescale.
        return np.zeros(matrix.shape)
    return (matrix - lowest) / value_range

# Rescale the lower confidence bound with normalize_matrix and persist it.
norm_lower_limit_intconf_matrix_50_00125_35 = normalize_matrix(lower_limit_intconf_matrix_50_00125_35)
np.save('norm_lower_limit_intconf_matrix_50_00125_35.npy', norm_lower_limit_intconf_matrix_50_00125_35)

# Same treatment for the upper confidence bound.
norm_upper_limit_intconf_matrix_50_00125_35 = normalize_matrix(upper_limit_intconf_matrix_50_00125_35)
np.save('norm_upper_limit_intconf_matrix_50_00125_35.npy', norm_upper_limit_intconf_matrix_50_00125_35)

In [None]:
# One row of three panels: lower bound, mean, upper bound (left to right).
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel, in drawing order.
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_50_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=50, min_dist=0.0125",
     ""),
    (normalized_mean_distance_matrix_50_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=50, min_dist=0.0125",
     "Cluster"),
    (norm_upper_limit_intconf_matrix_50_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=50, min_dist=0.0125",
     ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    ``matrix`` is rounded to 3 decimals and read as a weighted adjacency
    matrix. The tree's edges are drawn in ``color`` and labelled with their
    weights, and ``title`` is set as the axes title.
    """
    rounded_weights = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_weights))

    # A fixed seed keeps the spring layout reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    # Nodes and tree edges
    nx.draw(tree, layout, with_labels=True, node_color='lightblue', edge_color=color, node_size=500, font_size=10, width=2, ax=ax)

    # Edge weights as labels
    weight_by_edge = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weight_by_edge, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three MSTs side by side: lower limit (left), mean (centre), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

mst_panels = [
    (axes[0], norm_lower_limit_intconf_matrix_50_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=50, min_dist=0.0125", 'blue'),
    (axes[1], normalized_mean_distance_matrix_50_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=50, min_dist=0.0125", 'red'),
    (axes[2], norm_upper_limit_intconf_matrix_50_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=50, min_dist=0.0125", 'green'),
]
for panel_ax, panel_matrix, panel_title, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Tighten spacing, then render
plt.tight_layout()
plt.show()

------

### UMAP n_neighbours=100, min_dist=0.0125

In [19]:
# Reload the cached n_neighbors=100, min_dist=0.0125 results from disk:
# the per-run projections plus their mean and standard deviation arrays
std_projection_umap_100_00125_35 = np.load('std_projection_100_00125_35.npy')
mean_umap_projection_100_00125_35 = np.load('mean_projection_100_00125_35.npy')
umap_projections_100_00125_35 = np.load('umap_projections_100_00125_35.npy')

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors, min_dist, n_components = 100, 0.0125, 2
n_runs = 35  # number of independent embeddings

# random_state=None leaves UMAP unseeded, so each run produces its own embedding
umap_projections_100_00125_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_100_00125_35.append(projection)

# Stack the runs into one array, one run per entry along axis 0
umap_projections_100_00125_35 = np.array(umap_projections_100_00125_35)

# Element-wise mean and standard deviation across the runs
mean_projection_100_00125_35 = umap_projections_100_00125_35.mean(axis=0)
std_projection_100_00125_35 = umap_projections_100_00125_35.std(axis=0)

# Persist the raw projections together with both summaries
for file_name, payload in (
    ('umap_projections_100_00125_35.npy', umap_projections_100_00125_35),
    ('mean_projection_100_00125_35.npy', mean_projection_100_00125_35),
    ('std_projection_100_00125_35.npy', std_projection_100_00125_35),
):
    np.save(file_name, payload)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_100_00125_35'.")


#### Clustering

In [None]:
# KMeans setup: 10 clusters, one fit per stored UMAP run
n_clusters = 10
n_runs = len(umap_projections_100_00125_35)

# Centroid store, indexed [run, cluster, embedding dimension]
kmeans_centroids_100_00125 = np.zeros((n_runs, n_clusters, umap_projections_100_00125_35.shape[2]))

# NOTE(review): KMeans numbers its clusters independently in every fit, so
# cluster index i is not guaranteed to denote the same group in different
# runs - confirm centroids are matched before comparing them per index.
for run in range(n_runs):
    umap_projection = umap_projections_100_00125_35[run]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_100_00125[run] = kmeans.cluster_centers_

In [None]:
# Spread of each cluster centroid across runs, separately for the x and y
# embedding coordinates (one value per cluster index 0..9)
std_dev_x_100_00125 = np.array([np.std(kmeans_centroids_100_00125[:, i, 0]) for i in range(10)])
std_dev_y_100_00125 = np.array([np.std(kmeans_centroids_100_00125[:, i, 1]) for i in range(10)])

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x_100_00125)
print("Standard deviation of y coordinates per cluster:", std_dev_y_100_00125)

#### Centroid stability

Standard deviation calculation

In [None]:

# Across-run standard deviation of every cluster centroid, per coordinate.
# Stored under the generic names std_dev_x / std_dev_y used by the 2-std check below.
std_dev_x = np.array([np.std(kmeans_centroids_100_00125[:, i, 0]) for i in range(10)])
std_dev_y = np.array([np.std(kmeans_centroids_100_00125[:, i, 1]) for i in range(10)])

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# One row per (trial, cluster): the centroid and whether it lies inside the
# mean +/- 2 standard deviation box for that cluster.
# centroid_mean_100_00125_35 is defined outside this cell.
data_v2 = []

for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_100_00125[trial, cluster]
        mean_x, mean_y = centroid_mean_100_00125_35[cluster]

        # Box edges on each axis
        lower_bound_x = mean_x - 2 * std_dev_x[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x[cluster]
        lower_bound_y = mean_y - 2 * std_dev_y[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y[cluster]

        x_in_range = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        y_in_range = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = x_in_range and y_in_range

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Tabulate the collected rows
df_results_v2 = pd.DataFrame(data_v2, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [None]:

# Group the DataFrame by Trial and check if all clusters in each trial are True for 'Inside 2 std'
trials_all_true = df_results_v2.groupby('Trial')['Inside 2 std'].all()

In [None]:
# Filter the trials where all clusters were True
trials_with_all_true = trials_all_true[trials_all_true].index.tolist()

In [None]:
# Output the list of trials
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Filter the trials where not all clusters were True
trials_with_some_false = trials_all_true[~trials_all_true].index.tolist()

# Output the list of trials where some clusters were False
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The run identifier is written as literal text: inside an f-string placeholder,
# {100_00125} is evaluated as the integer literal 10000125 (underscores are digit
# separators), which produced the file name "result_table_neighbors_v2_10000125_35.csv".
df_results_v2.to_csv('result_table_neighbors_v2_100_00125_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Label-based centroids (digits 0-9) of every valid run.
# NOTE(review): valid_runs is defined outside this cell and is used directly as an
# index into the projections array; the 'Trial' ids in df_results_v2 are 1-based,
# so confirm valid_runs holds 0-based run indices.
cluster_centroids_per_run = []
for run in valid_runs:
    projections = umap_projections_100_00125_35[run]
    centroids = np.array([
        np.mean(projections[y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    cluster_centroids_per_run.append(centroids)

# Pairwise Euclidean centroid distances per run, stacked to (n_valid_runs, 10, 10)
distance_matrices_100_00125_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Average over the valid runs -> (10, 10)
mean_distance_matrix_100_00125_35 = distance_matrices_100_00125_35.mean(axis=0)

# Min-max normalize the mean matrix
matrix_min = np.min(mean_distance_matrix_100_00125_35)
matrix_max = np.max(mean_distance_matrix_100_00125_35)
normalized_mean_distance_matrix_100_00125_35 = (mean_distance_matrix_100_00125_35 - matrix_min) / (matrix_max - matrix_min)

# Heatmap of the normalized mean distances
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_100_00125_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Normalized Mean Distance Matrix (k=10, n_neighbors=100, min_dists=0.0125)")
ax.set_xlabel("Cluster")
ax.set_ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_100_00125_35 .npy', distance_matrices_100_00125_35)
np.save('mean_distance_matrix_neighbors_100_00125_35 .npy', mean_distance_matrix_100_00125_35)

# Show the mean matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_100_00125_35 }")

In [None]:
# Create a weighted graph from the normalized mean distance matrix (rounded to 3 decimals)
G_100_00125_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_100_00125_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save converts its argument
# to an array; a Graph iterates over its node ids, so saving the Graph object itself
# would not store the edges or their weights. Reload with
# nx.from_numpy_array(np.load(...)).
np.save('G_100_00125_35 .npy', nx.to_numpy_array(G_100_00125_35))

# Draw the graph (fixed seed -> reproducible layout)
pos = nx.spring_layout(G_100_00125_35, seed=42)  # positions for all nodes
nx.draw(G_100_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_100_00125_35, 'weight')
nx.draw_networkx_edge_labels(G_100_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - MST

In [None]:
# Compute the minimum spanning tree of the graph
mst_100_00125_35 = nx.minimum_spanning_tree(G_100_00125_35)

# Persist the tree as its weighted adjacency matrix (zeros where there is no tree
# edge). np.save converts its argument to an array; a Graph iterates over its node
# ids, so saving the Graph object itself would not store the edges or their weights.
np.save('mst_100_00125_35 .npy', nx.to_numpy_array(mst_100_00125_35))

# Define positions for all nodes (fixed seed -> reproducible layout)
pos = nx.spring_layout(mst_100_00125_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_100_00125_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_100_00125_35, 'weight')
nx.draw_networkx_edge_labels(mst_100_00125_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=100, min_dist=0.0125")
plt.show()

In [None]:
# Step 1: Standard deviation of every pairwise centroid distance across the runs
# stacked along axis 0 -> shape (n_clusters, n_clusters)
distance_matrix_std_100_00125_35 = np.std(distance_matrices_100_00125_35, axis=0)

# Step 2: Save the standard deviation matrix for future use
np.save("distance_matrix_std_100_00125_35 .npy", distance_matrix_std_100_00125_35)

# Output the results (label corrected: it previously named the 5_00125_35 configuration)
print("Standard Deviation Distance Matrix (100_00125_35):\n", distance_matrix_std_100_00125_35)

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# The mean and standard deviation matrices were computed over the runs stacked in
# distance_matrices_100_00125_35 (the valid runs), so the standard error must use
# that count rather than a hard-coded 35.
n_runs = distance_matrices_100_00125_35.shape[0]

# Step 1: Standard Error of the Mean (SEM)
sem_matrix_100_00125_35 = distance_matrix_std_100_00125_35 / np.sqrt(n_runs)

# Step 2: Margin of error
margin_of_error_matrix_100_00125_35 = z_score * sem_matrix_100_00125_35

# Step 3: Lower and upper confidence interval matrices
lower_limit_intconf_matrix_100_00125_35 = mean_distance_matrix_100_00125_35 - margin_of_error_matrix_100_00125_35
upper_limit_intconf_matrix_100_00125_35 = mean_distance_matrix_100_00125_35 + margin_of_error_matrix_100_00125_35

# Distances cannot be negative, so clip the lower limit at 0
lower_limit_intconf_matrix_100_00125_35 = np.maximum(lower_limit_intconf_matrix_100_00125_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_100_00125_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_100_00125_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_100_00125_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_100_00125_35 .npy', lower_limit_intconf_matrix_100_00125_35)
np.save('upper_limit_intconf_matrix_100_00125_35 .npy', upper_limit_intconf_matrix_100_00125_35)

In [127]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` to the range [0, 1].

    Parameters
    ----------
    matrix : array-like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)`` element-wise. When every entry is
        equal the range is zero, so an all-zero array of the same shape is
        returned instead of dividing by zero.
    """
    values = np.asarray(matrix)
    lowest = np.min(values)
    span = np.max(values) - lowest
    if span == 0:
        # Constant input: avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return np.zeros_like(values, dtype=float)
    return (values - lowest) / span

# Lower confidence bound: rescale to [0, 1] and persist
norm_lower_limit_intconf_matrix_100_00125_35 = normalize_matrix(lower_limit_intconf_matrix_100_00125_35)
np.save('norm_lower_limit_intconf_matrix_100_00125_35.npy', norm_lower_limit_intconf_matrix_100_00125_35)

# Upper confidence bound: rescale to [0, 1] and persist
norm_upper_limit_intconf_matrix_100_00125_35 = normalize_matrix(upper_limit_intconf_matrix_100_00125_35)
np.save('norm_upper_limit_intconf_matrix_100_00125_35.npy', norm_upper_limit_intconf_matrix_100_00125_35)

In [None]:
# One row of three heatmaps: lower bound, mean, upper bound (all normalized)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, title, y-axis label) for each panel, left to right
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_100_00125_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=100, min_dist=0.0125", ""),
    (normalized_mean_distance_matrix_100_00125_35,
     "Normalized Mean Dist. Matrix - n_neighbors=100, min_dist=0.0125", "Cluster"),
    (norm_upper_limit_intconf_matrix_100_00125_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=100, min_dist=0.0125", ""),
]
for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on ``ax``.

    ``matrix`` is rounded to 3 decimals and read as a weighted adjacency
    matrix. The tree's edges are drawn in ``color`` and labelled with their
    weights, and ``title`` is set as the axes title.
    """
    rounded_weights = np.round(matrix, 3)
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(rounded_weights))

    # A fixed seed keeps the spring layout reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    # Nodes and tree edges
    nx.draw(tree, layout, with_labels=True, node_color='lightblue', edge_color=color, node_size=500, font_size=10, width=2, ax=ax)

    # Edge weights as labels
    weight_by_edge = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weight_by_edge, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)

# Three MSTs side by side: lower limit (left), mean (centre), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

mst_panels = [
    (axes[0], norm_lower_limit_intconf_matrix_100_00125_35,
     "UMAP MST - Lower Limit - n_neighbors=100, min_dist=0.0125", 'blue'),
    (axes[1], normalized_mean_distance_matrix_100_00125_35,
     "UMAP MST - Mean Distances - n_neighbors=100, min_dist=0.0125", 'red'),
    (axes[2], norm_upper_limit_intconf_matrix_100_00125_35,
     "UMAP MST - Upper Limit - n_neighbors=100, min_dist=0.0125", 'green'),
]
for panel_ax, panel_matrix, panel_title, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Tighten spacing, then render
plt.tight_layout()
plt.show()

#### Adaptable Radius for min_dist= 0.0125

In [None]:
## For min_dist=0.0125.
# Function to calculate cluster metrics
def calculate_cluster_metrics(umap_projections, y_labels, n_clusters=10):
    """
    Calculate average cluster radii and neighbor counts for each cluster over all runs.

    Clusters are the integer labels ``0 .. n_clusters - 1`` looked up in
    ``y_labels``. In every run the centre of a cluster is the mean of its
    points and its radius is the mean distance of those points to the centre.
    The radii are averaged over the runs, and that average radius is then used
    to count, per run, how many points (of any label) lie within it of the
    cluster centre.

    Parameters
    ----------
    umap_projections : sequence of array-like, each of shape (n_samples, n_dims)
        One embedding per run, with rows in the same order as ``y_labels``.
    y_labels : array-like of shape (n_samples,)
        Cluster label of every sample.
    n_clusters : int, default 10
        Number of labels to evaluate.

    Returns
    -------
    radii_per_cluster : list of float
        Run-averaged radius per cluster; NaN for a label that has no samples.
    average_neighbor_counts : numpy.ndarray of shape (n_clusters,)
        Run-averaged number of points within the average radius of the
        cluster centre; 0 for a label that has no samples.

    Raises
    ------
    ValueError
        If ``umap_projections`` contains no runs.
    """
    n_runs = len(umap_projections)
    if n_runs == 0:
        raise ValueError("umap_projections must contain at least one run")

    labels = np.asarray(y_labels)
    # Membership depends only on the labels, so build the masks once. Centres,
    # radii and counts are all indexed by the label value itself, which keeps
    # them aligned even when some label in 0..n_clusters-1 is absent.
    masks = [labels == cluster_idx for cluster_idx in range(n_clusters)]

    runs = [np.asarray(x_umap) for x_umap in umap_projections]
    n_dims = runs[0].shape[1]

    # Step 1: per-run cluster centres and radii (NaN marks an empty cluster)
    centers = np.full((n_runs, n_clusters, n_dims), np.nan)
    radii = np.full((n_runs, n_clusters), np.nan)
    for run_idx, x_umap in enumerate(runs):
        for cluster_idx, mask in enumerate(masks):
            if not mask.any():
                continue
            cluster_points = x_umap[mask]
            center = cluster_points.mean(axis=0)
            centers[run_idx, cluster_idx] = center
            radii[run_idx, cluster_idx] = np.linalg.norm(cluster_points - center, axis=1).mean()

    # Step 2: average radius of each cluster across runs
    radii_per_cluster = list(radii.mean(axis=0))

    # Step 3: points within the average radius of each centre, per run
    neighbor_counts = np.zeros((n_runs, n_clusters))
    for run_idx, x_umap in enumerate(runs):
        for cluster_idx, radius in enumerate(radii_per_cluster):
            if np.isnan(radius):
                continue  # empty cluster: no centre to measure from
            distances_to_center = np.linalg.norm(x_umap - centers[run_idx, cluster_idx], axis=1)
            neighbor_counts[run_idx, cluster_idx] = np.sum(distances_to_center <= radius)

    average_neighbor_counts = neighbor_counts.mean(axis=0)  # Average across runs

    return radii_per_cluster, average_neighbor_counts

# n_neighbors settings to compare
n_neighbors_values = [5, 10, 20, 30, 50, 100]

# Stored projections (min_dist=0.0125), keyed by the n_neighbors value they were built with
projections_by_n_neighbors = {
    5: umap_projections_5_00125_35,
    10: umap_projections_10_00125_35,
    20: umap_projections_20_00125_35,
    30: umap_projections_30_00125_35,
    50: umap_projections_50_00125_35,
    100: umap_projections_100_00125_35,
}

results = []
for n_neighbors in n_neighbors_values:
    umap_projections = projections_by_n_neighbors[n_neighbors]

    # Radius and neighbor count of every cluster for this setting
    radii_per_cluster, average_neighbor_counts = calculate_cluster_metrics(umap_projections, y_train)

    # One result row per cluster
    results.extend(
        {
            "N": n_neighbors,
            "Cluster": cluster_idx,
            "Radius": np.round(cluster_radius, 3),
            "Number of Neighbors": np.round(average_neighbor_counts[cluster_idx], 0),
        }
        for cluster_idx, cluster_radius in enumerate(radii_per_cluster)
    )

# Collect the rows in a DataFrame and keep a copy on disk
df_results = pd.DataFrame(results)
df_results.to_csv("radius_neighbor_analysis_merged_MinDist_00125.csv", index=False)

# Clusters as rows, n_neighbors settings as columns
pivot_table = df_results.pivot(index="Cluster", columns="N", values=["Radius", "Number of Neighbors"])
print(pivot_table)

In [None]:
def _mean_points_within_cluster_radius(x_umap, masks):
    """Return the mean, over usable clusters, of the points lying within each cluster's own radius."""
    neighbor_counts = []
    for mask in masks:
        cluster_points = x_umap[mask]
        if len(cluster_points) == 0:
            continue  # label absent: nothing to measure
        cluster_center = np.mean(cluster_points, axis=0)
        dynamic_radius = np.mean(np.linalg.norm(cluster_points - cluster_center, axis=1))
        if dynamic_radius > 0:  # zero-radius clusters are left out of the average
            distances_to_center = np.linalg.norm(x_umap - cluster_center, axis=1)
            neighbor_counts.append(np.sum(distances_to_center <= dynamic_radius))
    # No usable cluster: report NaN explicitly instead of averaging an empty list
    return np.mean(neighbor_counts) if neighbor_counts else np.nan


def plot_mean_neighbor_counts_across_runs(umap_projections_list, n_neighbors_values, y_labels, n_clusters=10, min_dist=0.0125):
    """
    Plot mean number of neighbors across runs for different n_neighbors values.

    For every run, each cluster (labels ``0 .. n_clusters - 1``) gets a centre
    (mean of its points) and a radius (mean distance of its points to that
    centre); the points of any label within that radius are counted and the
    counts are averaged over the clusters. One line per entry of
    ``umap_projections_list`` is drawn against the 1-based run index.

    Parameters
    ----------
    umap_projections_list : sequence
        One collection of per-run projections for each n_neighbors setting.
    n_neighbors_values : sequence
        Legend value for each entry of ``umap_projections_list`` (same order).
    y_labels : array-like
        Cluster label of every sample.
    n_clusters : int, default 10
        Number of labels to evaluate.
    min_dist : float, default 0.0125
        Value shown in the plot title only.
    """
    y_labels = np.asarray(y_labels)
    # The label masks are identical for every run, so build them once
    masks = [y_labels == cluster_idx for cluster_idx in range(n_clusters)]

    neighbor_counts_avg_runs = [
        [_mean_points_within_cluster_radius(np.asarray(x_umap), masks) for x_umap in umap_projections]
        for umap_projections in umap_projections_list
    ]

    # Plot results
    plt.figure(figsize=(10, 6))
    for i, counts in enumerate(neighbor_counts_avg_runs):
        plt.plot(range(1, len(counts) + 1), counts, label=f'n_neighbors={n_neighbors_values[i]}', marker='o')

    plt.xlabel("Run Index")
    plt.ylabel("Mean Number of Points")
    plt.title(f"Mean Number of Points Across Runs for min_dist = {min_dist}")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # Legend outside the axes, to the right
    plt.grid(True)
    plt.show()

In [None]:
# Mean per-cluster point counts, run by run, for the six n_neighbors settings
# computed with min_dist=0.0125 (list order must match n_neighbors_values)
plot_mean_neighbor_counts_across_runs(
    [
        umap_projections_5_00125_35,
        umap_projections_10_00125_35,
        umap_projections_20_00125_35,
        umap_projections_30_00125_35,
        umap_projections_50_00125_35,
        umap_projections_100_00125_35,
    ],
    n_neighbors_values,
    y_train,
)

In [None]:
# Read back the radius / neighbor table saved for min_dist=0.0125
df_results_00125 = pd.read_csv('radius_neighbor_analysis_merged_MinDist_00125.csv')

# Density here is the neighbor count divided by the cluster radius
df_results_00125['Density'] = df_results_00125['Number of Neighbors'].div(df_results_00125['Radius'])

# Row holding the largest density
densest_index = df_results_00125['Density'].idxmax()
max_density_row = df_results_00125.loc[densest_index]

# Pull out the density and the (cluster, n_neighbors) pair it belongs to
max_density, max_cluster, max_n_neighbors = (
    max_density_row[column] for column in ('Density', 'Cluster', 'N')
)

# Report
print(f"Highest Density: {max_density:.2f}")
print(f"Cluster: {int(max_cluster)}")
print(f"n_neighbors: {int(max_n_neighbors)}")

------------

------------

### UMAP n_neighbours=5, min_dist=0.8

In [20]:
# Reload the cached n_neighbors=5, min_dist=0.8 results from disk:
# the per-run projections plus their mean and standard deviation arrays
std_projection_umap_5_08_35 = np.load('std_projection_5_08_35.npy')
mean_umap_projection_5_08_35 = np.load('mean_projection_5_08_35.npy')
umap_projections_5_08_35 = np.load('umap_projections_5_08_35.npy')

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors, min_dist, n_components = 5, 0.8, 2
n_runs = 35  # number of independent embeddings

# random_state=None leaves UMAP unseeded, so each run produces its own embedding
umap_projections_5_08_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_5_08_35.append(projection)

# Stack the runs into one array, one run per entry along axis 0
umap_projections_5_08_35 = np.array(umap_projections_5_08_35)

# Element-wise mean and standard deviation across the runs
mean_projection_5_08_35 = umap_projections_5_08_35.mean(axis=0)
std_projection_5_08_35 = umap_projections_5_08_35.std(axis=0)

# Persist the raw projections together with both summaries
for file_name, payload in (
    ('umap_projections_5_08_35.npy', umap_projections_5_08_35),
    ('mean_projection_5_08_35.npy', mean_projection_5_08_35),
    ('std_projection_5_08_35.npy', std_projection_5_08_35),
):
    np.save(file_name, payload)

print("UMAP projections, mean, and standard deviation have been saved with identifiers '_5_08_35'.")


#### Clustering

In [None]:
# KMeans setup: 10 clusters, one fit per stored UMAP run
n_clusters = 10
n_runs = len(umap_projections_5_08_35)

# Centroid store, indexed [run, cluster, embedding dimension]
kmeans_centroids_5_08 = np.zeros((n_runs, n_clusters, umap_projections_5_08_35.shape[2]))

# NOTE(review): KMeans numbers its clusters independently in every fit, so
# cluster index i is not guaranteed to denote the same group in different
# runs - confirm centroids are matched before comparing them per index.
for run in range(n_runs):
    umap_projection = umap_projections_5_08_35[run]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_5_08[run] = kmeans.cluster_centers_

In [None]:
# Spread of each cluster centroid across runs, separately for the x and y
# embedding coordinates (one value per cluster index 0..9)
std_dev_x_5_08 = np.array([np.std(kmeans_centroids_5_08[:, i, 0]) for i in range(10)])
std_dev_y_5_08 = np.array([np.std(kmeans_centroids_5_08[:, i, 1]) for i in range(10)])

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x_5_08)
print("Standard deviation of y coordinates per cluster:", std_dev_y_5_08)

#### Centroid stability

Standard deviation calculation

In [None]:

# Across-run standard deviation of every cluster centroid, per coordinate.
# Stored under the generic names std_dev_x / std_dev_y used by the 2-std check below.
std_dev_x = np.array([np.std(kmeans_centroids_5_08[:, i, 0]) for i in range(10)])
std_dev_y = np.array([np.std(kmeans_centroids_5_08[:, i, 1]) for i in range(10)])

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# One row per (trial, cluster): the centroid and whether it lies inside the
# mean +/- 2 standard deviation box for that cluster.
# centroid_mean_5_08_35 is defined outside this cell.
data_v2 = []

for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_5_08[trial, cluster]
        mean_x, mean_y = centroid_mean_5_08_35[cluster]

        # Box edges on each axis
        lower_bound_x = mean_x - 2 * std_dev_x[cluster]
        upper_bound_x = mean_x + 2 * std_dev_x[cluster]
        lower_bound_y = mean_y - 2 * std_dev_y[cluster]
        upper_bound_y = mean_y + 2 * std_dev_y[cluster]

        x_in_range = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        y_in_range = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = x_in_range and y_in_range

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Tabulate the collected rows
df_results_v2 = pd.DataFrame(data_v2, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [None]:

# Group the DataFrame by Trial and check if all clusters in each trial are True for 'Inside 2 std'
trials_all_true = df_results_v2.groupby('Trial')['Inside 2 std'].all()

In [None]:
# Filter the trials where all clusters were True
trials_with_all_true = trials_all_true[trials_all_true].index.tolist()

In [None]:
# Output the list of trials
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Filter the trials where not all clusters were True
trials_with_some_false = trials_all_true[~trials_all_true].index.tolist()

# Output the list of trials where some clusters were False
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The run identifier is written as literal text: inside an f-string placeholder,
# {5_08} is evaluated as the integer literal 508 (underscores are digit
# separators), which produced the file name "result_table_neighbors_v2_508_35.csv".
df_results_v2.to_csv('result_table_neighbors_v2_5_08_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Label-based centroids (digits 0-9) of every valid run.
# NOTE(review): valid_runs is defined outside this cell and is used directly as an
# index into the projections array; the 'Trial' ids in df_results_v2 are 1-based,
# so confirm valid_runs holds 0-based run indices.
cluster_centroids_per_run = []
for run in valid_runs:
    # UMAP projection of this run
    projections = umap_projections_5_08_35[run]

    # Centroid of the points carrying each label
    centroids = np.array([
        np.mean(projections[y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    cluster_centroids_per_run.append(centroids)

# Pairwise Euclidean centroid distances per run, stacked to (n_valid_runs, 10, 10)
distance_matrices_5_08_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Mean distance matrix across all valid runs -> (10, 10)
mean_distance_matrix_5_08_35 = np.mean(distance_matrices_5_08_35, axis=0)

# Min-max normalize the mean distance matrix
normalized_mean_distance_matrix_5_08_35 = (mean_distance_matrix_5_08_35 - np.min(mean_distance_matrix_5_08_35)) / (np.max(mean_distance_matrix_5_08_35) - np.min(mean_distance_matrix_5_08_35))

# Plot the normalized mean distance matrix as a heatmap.
# The title now names this configuration; it previously carried the
# n_neighbors=100, min_dist=0.0125 text copied from another section.
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_5_08_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=5, min_dist=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
np.save('distance_matrices_neighbors_5_08_35 .npy', distance_matrices_5_08_35)
np.save('mean_distance_matrix_neighbors_5_08_35 .npy', mean_distance_matrix_5_08_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_5_08_35 }")

In [None]:
# Create a weighted graph from the normalized mean distance matrix (rounded to 3 decimals)
G_5_08_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_5_08_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save converts its argument
# to an array; a Graph iterates over its node ids, so saving the Graph object itself
# would not store the edges or their weights. Reload with
# nx.from_numpy_array(np.load(...)).
np.save('G_5_08_35 .npy', nx.to_numpy_array(G_5_08_35))

# Draw the graph (fixed seed -> reproducible layout)
pos = nx.spring_layout(G_5_08_35, seed=42)  # positions for all nodes
nx.draw(G_5_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_5_08_35, 'weight')
nx.draw_networkx_edge_labels(G_5_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - MST

In [None]:
# Compute the minimum spanning tree of the graph
mst_5_08_35 = nx.minimum_spanning_tree(G_5_08_35)

# Persist the tree as its weighted adjacency matrix (zeros where there is no tree
# edge). np.save converts its argument to an array; a Graph iterates over its node
# ids, so saving the Graph object itself would not store the edges or their weights.
np.save('mst_5_08_35 .npy', nx.to_numpy_array(mst_5_08_35))

# Define positions for all nodes (fixed seed -> reproducible layout)
pos = nx.spring_layout(mst_5_08_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_5_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_5_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_5_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=5, min_dist=0.8")
plt.show()

In [None]:
# Step 1: Standard deviation of every pairwise centroid distance across the runs
# stacked along axis 0 -> shape (n_clusters, n_clusters)
distance_matrix_std_5_08_35 = np.std(distance_matrices_5_08_35, axis=0)

# Step 2: Save the standard deviation matrix for future use
np.save("distance_matrix_std_5_08_35 .npy", distance_matrix_std_5_08_35)

# Output the results (label corrected: it previously named the 5_00125_35 configuration)
print("Standard Deviation Distance Matrix (5_08_35):\n", distance_matrix_std_5_08_35)

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# The mean and standard deviation matrices were computed over the runs stacked in
# distance_matrices_5_08_35 (the valid runs), so the standard error must use that
# count rather than a hard-coded 35.
n_runs = distance_matrices_5_08_35.shape[0]

# Step 1: Standard Error of the Mean (SEM)
sem_matrix_5_08_35 = distance_matrix_std_5_08_35 / np.sqrt(n_runs)

# Step 2: Margin of error
margin_of_error_matrix_5_08_35 = z_score * sem_matrix_5_08_35

# Step 3: Lower and upper confidence interval matrices
lower_limit_intconf_matrix_5_08_35 = mean_distance_matrix_5_08_35 - margin_of_error_matrix_5_08_35
upper_limit_intconf_matrix_5_08_35 = mean_distance_matrix_5_08_35 + margin_of_error_matrix_5_08_35

# Distances cannot be negative, so clip the lower limit at 0
lower_limit_intconf_matrix_5_08_35 = np.maximum(lower_limit_intconf_matrix_5_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_5_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_5_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_5_08_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_5_08_35 .npy', lower_limit_intconf_matrix_5_08_35)
np.save('upper_limit_intconf_matrix_5_08_35 .npy', upper_limit_intconf_matrix_5_08_35)

In [137]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so that its values span the [0, 1] range.

    Parameters
    ----------
    matrix : array_like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. If every entry is equal the range
        is zero; an all-zero array of the same shape is returned in that case
        instead of dividing by zero (which would produce NaN).
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: there is no spread to rescale
        return np.zeros(matrix.shape, dtype=float)
    return (matrix - lowest) / span

# Rescale each confidence-limit matrix with normalize_matrix and store it on disk
norm_lower_limit_intconf_matrix_5_08_35 = normalize_matrix(lower_limit_intconf_matrix_5_08_35)
norm_upper_limit_intconf_matrix_5_08_35 = normalize_matrix(upper_limit_intconf_matrix_5_08_35)

for npy_name, normalized_limit in (
    ('norm_lower_limit_intconf_matrix_5_08_35.npy', norm_lower_limit_intconf_matrix_5_08_35),
    ('norm_upper_limit_intconf_matrix_5_08_35.npy', norm_upper_limit_intconf_matrix_5_08_35),
):
    np.save(npy_name, normalized_limit)

In [None]:
# One row of three heatmaps: lower bound (left), mean (middle), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel, in left-to-right order
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_5_08_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=5, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_5_08_35,
     "Normalized Mean Dist. Matrix - n_neighbors=5, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_5_08_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=5, min_dist=0.8", ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of a distance matrix on one axis
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree derived from ``matrix`` on ``ax``.

    ``matrix`` is rounded to 3 decimals and read as a weighted adjacency
    matrix. The tree's edges are drawn in ``color`` and labelled with their
    weights, and ``title`` becomes the axis title.
    """
    weighted_graph = nx.from_numpy_array(np.round(matrix, 3))
    spanning_tree = nx.minimum_spanning_tree(weighted_graph)

    # A fixed seed makes the spring layout reproducible between calls
    node_positions = nx.spring_layout(spanning_tree, seed=42)

    nx.draw(
        spanning_tree,
        node_positions,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        spanning_tree,
        node_positions,
        edge_labels=nx.get_edge_attributes(spanning_tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# One row of three MST plots: lower limit (left), mean (middle), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, target axis, edge colour); drawn in the order listed
mst_panels = (
    (normalized_mean_distance_matrix_5_08_35,
     "UMAP MST - Mean Distances - n_neighbors=5, min_dist=0.8", axes[1], 'red'),
    (norm_lower_limit_intconf_matrix_5_08_35,
     "UMAP MST - Lower Limit - n_neighbors=5, min_dist=0.8", axes[0], 'blue'),
    (norm_upper_limit_intconf_matrix_5_08_35,
     "UMAP MST - Upper Limit - n_neighbors=5, min_dist=0.8", axes[2], 'green'),
)
for panel_matrix, panel_title, panel_ax, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

------

### UMAP n_neighbours=10, min_dist=0.8

In [21]:
# Reload the stored arrays for the n_neighbors=10, min_dist=0.8 configuration
# (the .npy files written by the UMAP cell of this section).
umap_projections_10_08_35 = np.load('umap_projections_10_08_35.npy')
mean_umap_projection_10_08_35 = np.load('mean_projection_10_08_35.npy')
std_projection_umap_10_08_35 = np.load('std_projection_10_08_35.npy')

In [None]:
# UMAP hyper-parameters for this configuration
n_neighbors, min_dist, n_components = 10, 0.8, 2
n_runs = 35  # Number of independent UMAP fits

# One projection per run, collected in order
umap_projections_10_08_35 = []
for run in range(n_runs):
    # random_state=None leaves every fit unseeded
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_10_08_35.append(projection)

# Stack the runs along a new leading axis
umap_projections_10_08_35 = np.array(umap_projections_10_08_35)

# Element-wise mean and standard deviation across runs
mean_projection_10_08_35 = np.mean(umap_projections_10_08_35, axis=0)
std_projection_10_08_35 = np.std(umap_projections_10_08_35, axis=0)

# Persist the stacked projections and both summaries
for npy_name, npy_payload in (
    ('umap_projections_10_08_35.npy', umap_projections_10_08_35),
    ('mean_projection_10_08_35.npy', mean_projection_10_08_35),
    ('std_projection_10_08_35.npy', std_projection_10_08_35),
):
    np.save(npy_name, npy_payload)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_10_08_35'.")


#### Clustering

In [None]:
# Cluster count and run count (the latter read from the stacked projections)
n_clusters = 10
n_runs = umap_projections_10_08_35.shape[0]

# kmeans_centroids_10_08[run, cluster] holds one centroid per cluster and run
kmeans_centroids_10_08 = np.zeros((n_runs, n_clusters, umap_projections_10_08_35.shape[2]))

# Fit KMeans (same random_state for every run) on each run's projection
for run, umap_projection in enumerate(umap_projections_10_08_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_10_08[run] = kmeans.cluster_centers_

In [None]:
# Per-cluster spread of the KMeans centroids across runs, one value per cluster
# index: coordinate 0 is reported as x and coordinate 1 as y.
std_dev_x_10_08 = np.array([np.std(kmeans_centroids_10_08[:, i, 0]) for i in range(10)])
std_dev_y_10_08 = np.array([np.std(kmeans_centroids_10_08[:, i, 1]) for i in range(10)])

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_10_08)
print("Standard deviation of y coordinates per cluster:", std_dev_y_10_08)

#### Centroid stability

Standard deviation calculation

In [None]:

# Generic-named copies of the per-cluster centroid spread (std_dev_x / std_dev_y)
std_dev_x = np.zeros(10)
std_dev_y = np.zeros(10)

for i in range(10):
    # Centroid of cluster index i in every run: one row per run
    centroid_track = kmeans_centroids_10_08[:, i]
    std_dev_x[i] = np.std(centroid_track[:, 0])
    std_dev_y[i] = np.std(centroid_track[:, 1])

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Rows of the result table: [trial number, cluster, centroid, inside-2-std flag]
# NOTE(review): centroid_mean_10_08_35 is not defined in the cells visible here;
# confirm it is computed earlier in the notebook.
# NOTE(review): cluster indices are compared position-wise across runs; verify
# that KMeans index i refers to the same group in every run.
data_v2 = []

for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_10_08[trial, cluster]

        # Half-width of the acceptance band: two standard deviations per axis
        mean_x, mean_y = centroid_mean_10_08_35[cluster]
        band_x = 2 * std_dev_x[cluster]
        band_y = 2 * std_dev_y[cluster]

        x_in_band = mean_x - band_x <= centroid_coord[0] <= mean_x + band_x
        y_in_band = mean_y - band_y <= centroid_coord[1] <= mean_y + band_y
        inside_2_std = x_in_band and y_in_band

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Assemble the table
df_results_v2 = pd.DataFrame(data_v2, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [None]:

# Per trial: True only when every cluster row of that trial has 'Inside 2 std' set
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Trial numbers whose per-trial flag is True (all clusters inside the band)
fully_inside = trials_all_true.loc[trials_all_true]
trials_with_all_true = fully_inside.index.tolist()

In [None]:
# Report the trial numbers in which every cluster centroid passed the 2-std check
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Trial numbers with at least one cluster centroid outside the 2-std band
partly_outside = trials_all_true.loc[~trials_all_true]
trials_with_some_false = partly_outside.index.tolist()

# Report them
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The configuration tag is written as plain text: inside an f-string
# placeholder, 10_08 is an integer literal with a digit separator (1008),
# which would name the file "result_table_neighbors_v2_1008_35.csv".
df_results_v2.to_csv('result_table_neighbors_v2_10_08_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Label-based centroids: for every valid run, the mean projected position of
# the samples of each label 0-9 (rows selected with y_train == label).
cluster_centroids_per_run = [
    np.array([
        np.mean(umap_projections_10_08_35[run][y_train == cluster_label], axis=0)
        for cluster_label in range(10)
    ])
    for run in valid_runs
]

# Pairwise Euclidean distances between the centroids of each run, stacked so
# that axis 0 indexes the valid runs
distance_matrices_10_08_35 = np.array([
    cdist(run_centroids, run_centroids, metric='euclidean')
    for run_centroids in cluster_centroids_per_run
])

# Average distance matrix over the valid runs
mean_distance_matrix_10_08_35 = np.mean(distance_matrices_10_08_35, axis=0)

# Min-max normalization of the mean distance matrix
mean_matrix_min = np.min(mean_distance_matrix_10_08_35)
mean_matrix_max = np.max(mean_distance_matrix_10_08_35)
normalized_mean_distance_matrix_10_08_35 = (
    (mean_distance_matrix_10_08_35 - mean_matrix_min) / (mean_matrix_max - mean_matrix_min)
)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_10_08_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=10, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_10_08_35 .npy', distance_matrices_10_08_35)
np.save('mean_distance_matrix_neighbors_10_08_35 .npy', mean_distance_matrix_10_08_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_10_08_35}")

In [None]:
# Create a weighted graph from the normalized mean distance matrix (rounded to 3 decimals)
G_10_08_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_10_08_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save() coerces its
# argument to an array, so handing it the Graph object does not store the edge
# weights as a matrix that np.load() can return; the adjacency matrix can be
# turned back into the graph with nx.from_numpy_array().
np.save('G_10_08_35 .npy', nx.to_numpy_array(G_10_08_35))

# Draw the graph
pos = nx.spring_layout(G_10_08_35, seed=42)  # positions for all nodes
nx.draw(G_10_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_10_08_35, 'weight')
nx.draw_networkx_edge_labels(G_10_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - MST

In [None]:
# Compute the minimum spanning tree of the graph
mst_10_08_35 = nx.minimum_spanning_tree(G_10_08_35)

# Persist the tree as its weighted adjacency matrix (zero where there is no
# edge). np.save() coerces its argument to an array, so saving the Graph object
# itself does not store the tree's edges as a matrix np.load() can return.
np.save('mst_10_08_35 .npy', nx.to_numpy_array(mst_10_08_35))

# Define positions for all nodes
pos = nx.spring_layout(mst_10_08_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_10_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_10_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_10_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=10, min_dist=0.8")
plt.show()

In [None]:
# Spread of every pairwise cluster distance across the runs (axis 0 of the stack)
distance_matrix_std_10_08_35 = np.std(
    distance_matrices_10_08_35,
    axis=0,
)

# Keep a copy on disk for later use
np.save("distance_matrix_std_10_08_35 .npy", distance_matrix_std_10_08_35)

# Show the matrix
print("Standard Deviation Distance Matrix (10_08_35):\n", distance_matrix_std_10_08_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Sample size for the standard error: the number of runs stacked in
# distance_matrices_10_08_35 (the valid runs the mean and standard deviation
# were computed over). A hard-coded 35 is wrong whenever runs were excluded.
n_runs = len(distance_matrices_10_08_35)

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_10_08_35 = distance_matrix_std_10_08_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_10_08_35 = z_score * sem_matrix_10_08_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_10_08_35 = mean_distance_matrix_10_08_35 - margin_of_error_matrix_10_08_35
upper_limit_intconf_matrix_10_08_35 = mean_distance_matrix_10_08_35 + margin_of_error_matrix_10_08_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_10_08_35 = np.maximum(lower_limit_intconf_matrix_10_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_10_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_10_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_10_08_35)

# Save the matrices for future use
# NOTE(review): both file names contain a space before ".npy"; kept unchanged.
np.save('lower_limit_intconf_matrix_10_08_35 .npy', lower_limit_intconf_matrix_10_08_35)
np.save('upper_limit_intconf_matrix_10_08_35 .npy', upper_limit_intconf_matrix_10_08_35)

In [147]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so that its values span the [0, 1] range.

    Parameters
    ----------
    matrix : array_like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. If every entry is equal the range
        is zero; an all-zero array of the same shape is returned in that case
        instead of dividing by zero (which would produce NaN).
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: there is no spread to rescale
        return np.zeros(matrix.shape, dtype=float)
    return (matrix - lowest) / span

# Rescale each confidence-limit matrix with normalize_matrix and store it on disk
norm_lower_limit_intconf_matrix_10_08_35 = normalize_matrix(lower_limit_intconf_matrix_10_08_35)
norm_upper_limit_intconf_matrix_10_08_35 = normalize_matrix(upper_limit_intconf_matrix_10_08_35)

for npy_name, normalized_limit in (
    ('norm_lower_limit_intconf_matrix_10_08_35.npy', norm_lower_limit_intconf_matrix_10_08_35),
    ('norm_upper_limit_intconf_matrix_10_08_35.npy', norm_upper_limit_intconf_matrix_10_08_35),
):
    np.save(npy_name, normalized_limit)

In [None]:
# One row of three heatmaps: lower bound (left), mean (middle), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel, in left-to-right order
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_10_08_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=10, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_10_08_35,
     "Normalized Mean Dist. Matrix - n_neighbors=10, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_10_08_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=10, min_dist=0.8", ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of a distance matrix on one axis
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree derived from ``matrix`` on ``ax``.

    ``matrix`` is rounded to 3 decimals and read as a weighted adjacency
    matrix. The tree's edges are drawn in ``color`` and labelled with their
    weights, and ``title`` becomes the axis title.
    """
    weighted_graph = nx.from_numpy_array(np.round(matrix, 3))
    spanning_tree = nx.minimum_spanning_tree(weighted_graph)

    # A fixed seed makes the spring layout reproducible between calls
    node_positions = nx.spring_layout(spanning_tree, seed=42)

    nx.draw(
        spanning_tree,
        node_positions,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        spanning_tree,
        node_positions,
        edge_labels=nx.get_edge_attributes(spanning_tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# One row of three MST plots: lower limit (left), mean (middle), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, target axis, edge colour); drawn in the order listed
mst_panels = (
    (normalized_mean_distance_matrix_10_08_35,
     "UMAP MST - Mean Distances - n_neighbors=10, min_dist=0.8", axes[1], 'red'),
    (norm_lower_limit_intconf_matrix_10_08_35,
     "UMAP MST - Lower Limit - n_neighbors=10, min_dist=0.8", axes[0], 'blue'),
    (norm_upper_limit_intconf_matrix_10_08_35,
     "UMAP MST - Upper Limit - n_neighbors=10, min_dist=0.8", axes[2], 'green'),
)
for panel_matrix, panel_title, panel_ax, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

--------

### UMAP n_neighbours=20, min_dist=0.8

In [22]:
# Reload the stored arrays for the n_neighbors=20, min_dist=0.8 configuration
# (the .npy files written by the UMAP cell of this section).
umap_projections_20_08_35 = np.load('umap_projections_20_08_35.npy')
mean_umap_projection_20_08_35 = np.load('mean_projection_20_08_35.npy')
std_projection_umap_20_08_35 = np.load('std_projection_20_08_35.npy')

In [None]:
# UMAP hyper-parameters for this configuration
n_neighbors, min_dist, n_components = 20, 0.8, 2
n_runs = 35  # Number of independent UMAP fits

# One projection per run, collected in order
umap_projections_20_08_35 = []
for run in range(n_runs):
    # random_state=None leaves every fit unseeded
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_20_08_35.append(projection)

# Stack the runs along a new leading axis
umap_projections_20_08_35 = np.array(umap_projections_20_08_35)

# Element-wise mean and standard deviation across runs
mean_projection_20_08_35 = np.mean(umap_projections_20_08_35, axis=0)
std_projection_20_08_35 = np.std(umap_projections_20_08_35, axis=0)

# Persist the stacked projections and both summaries
for npy_name, npy_payload in (
    ('umap_projections_20_08_35.npy', umap_projections_20_08_35),
    ('mean_projection_20_08_35.npy', mean_projection_20_08_35),
    ('std_projection_20_08_35.npy', std_projection_20_08_35),
):
    np.save(npy_name, npy_payload)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_20_08_35'.")


#### Clustering

In [None]:
# Cluster count and run count (the latter read from the stacked projections)
n_clusters = 10
n_runs = umap_projections_20_08_35.shape[0]

# kmeans_centroids_20_08[run, cluster] holds one centroid per cluster and run
kmeans_centroids_20_08 = np.zeros((n_runs, n_clusters, umap_projections_20_08_35.shape[2]))

# Fit KMeans (same random_state for every run) on each run's projection
for run, umap_projection in enumerate(umap_projections_20_08_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_20_08[run] = kmeans.cluster_centers_

In [None]:
# Per-cluster spread of the KMeans centroids across runs, one value per cluster
# index: coordinate 0 is reported as x and coordinate 1 as y.
std_dev_x_20_08 = np.array([np.std(kmeans_centroids_20_08[:, i, 0]) for i in range(10)])
std_dev_y_20_08 = np.array([np.std(kmeans_centroids_20_08[:, i, 1]) for i in range(10)])

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x_20_08)
print("Standard deviation of y coordinates per cluster:", std_dev_y_20_08)

#### Centroid stability

Standard deviation calculation

In [None]:

# Generic-named copies of the per-cluster centroid spread (std_dev_x / std_dev_y)
std_dev_x = np.zeros(10)
std_dev_y = np.zeros(10)

for i in range(10):
    # Centroid of cluster index i in every run: one row per run
    centroid_track = kmeans_centroids_20_08[:, i]
    std_dev_x[i] = np.std(centroid_track[:, 0])
    std_dev_y[i] = np.std(centroid_track[:, 1])

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Rows of the result table: [trial number, cluster, centroid, inside-2-std flag]
# NOTE(review): centroid_mean_20_08_35 is not defined in the cells visible here;
# confirm it is computed earlier in the notebook.
# NOTE(review): cluster indices are compared position-wise across runs; verify
# that KMeans index i refers to the same group in every run.
data_v2 = []

for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_20_08[trial, cluster]

        # Half-width of the acceptance band: two standard deviations per axis
        mean_x, mean_y = centroid_mean_20_08_35[cluster]
        band_x = 2 * std_dev_x[cluster]
        band_y = 2 * std_dev_y[cluster]

        x_in_band = mean_x - band_x <= centroid_coord[0] <= mean_x + band_x
        y_in_band = mean_y - band_y <= centroid_coord[1] <= mean_y + band_y
        inside_2_std = x_in_band and y_in_band

        # Trials are reported 1-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Assemble the table
df_results_v2 = pd.DataFrame(data_v2, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [None]:

# Per trial: True only when every cluster row of that trial has 'Inside 2 std' set
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Trial numbers whose per-trial flag is True (all clusters inside the band)
fully_inside = trials_all_true.loc[trials_all_true]
trials_with_all_true = fully_inside.index.tolist()

In [None]:
# Report the trial numbers in which every cluster centroid passed the 2-std check
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Trial numbers with at least one cluster centroid outside the 2-std band
partly_outside = trials_all_true.loc[~trials_all_true]
trials_with_some_false = partly_outside.index.tolist()

# Report them
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The configuration tag is written as plain text: inside an f-string
# placeholder, 20_08 is an integer literal with a digit separator (2008),
# which would name the file "result_table_neighbors_v2_2008_35.csv".
df_results_v2.to_csv('result_table_neighbors_v2_20_08_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Label-based centroids: for every valid run, the mean projected position of
# the samples of each label 0-9 (rows selected with y_train == label).
cluster_centroids_per_run = []

for run in valid_runs:
    # UMAP projection of this run
    projections = umap_projections_20_08_35[run]

    centroids = [
        np.mean(projections[y_train == cluster_label], axis=0)
        for cluster_label in range(10)  # Assuming 10 clusters (digits 0-9)
    ]
    cluster_centroids_per_run.append(np.array(centroids))

# Pairwise Euclidean distances between the centroids of each run, stacked so
# that axis 0 indexes the valid runs
distance_matrices_20_08_35 = np.array([
    cdist(centroids, centroids, metric='euclidean')
    for centroids in cluster_centroids_per_run
])

# Calculate the mean distance matrix across all valid runs
mean_distance_matrix_20_08_35 = np.mean(distance_matrices_20_08_35, axis=0)

# Min-max normalization of the mean distance matrix
normalized_mean_distance_matrix_20_08_35 = (
    (mean_distance_matrix_20_08_35 - np.min(mean_distance_matrix_20_08_35))
    / (np.max(mean_distance_matrix_20_08_35) - np.min(mean_distance_matrix_20_08_35))
)

# Plot the normalized mean distance matrix as a heatmap; the title states the
# parameters of this section (n_neighbors=20, min_dist=0.8)
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_20_08_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=20, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
# NOTE(review): both file names contain a space before ".npy"; kept unchanged.
np.save('distance_matrices_neighbors_20_08_35 .npy', distance_matrices_20_08_35)
np.save('mean_distance_matrix_neighbors_20_08_35 .npy', mean_distance_matrix_20_08_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_20_08_35}")

In [None]:
# Create a weighted graph from the normalized mean distance matrix (rounded to 3 decimals)
G_20_08_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_20_08_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save() coerces its
# argument to an array, so handing it the Graph object does not store the edge
# weights as a matrix that np.load() can return; the adjacency matrix can be
# turned back into the graph with nx.from_numpy_array().
np.save('G_20_08_35 .npy', nx.to_numpy_array(G_20_08_35))

# Draw the graph
pos = nx.spring_layout(G_20_08_35, seed=42)  # positions for all nodes
nx.draw(G_20_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_20_08_35, 'weight')
nx.draw_networkx_edge_labels(G_20_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - Calculation

In [None]:
# Compute the minimum spanning tree of the graph
mst_20_08_35 = nx.minimum_spanning_tree(G_20_08_35)

# Persist the tree as its weighted adjacency matrix (zero where there is no
# edge). np.save() coerces its argument to an array, so saving the Graph object
# itself does not store the tree's edges as a matrix np.load() can return.
np.save('mst_20_08_35 .npy', nx.to_numpy_array(mst_20_08_35))

# Define positions for all nodes
pos = nx.spring_layout(mst_20_08_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_20_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_20_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_20_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=20, min_dist=0.8")
plt.show()

In [None]:
# Spread of every pairwise cluster distance across the runs (axis 0 of the stack)
distance_matrix_std_20_08_35 = np.std(
    distance_matrices_20_08_35,
    axis=0,
)

# Keep a copy on disk for later use
np.save("distance_matrix_std_20_08_35 .npy", distance_matrix_std_20_08_35)

# Show the matrix
print("Standard Deviation Distance Matrix (20_08_35):\n", distance_matrix_std_20_08_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal

# Sample size for the standard error: the number of runs stacked in
# distance_matrices_20_08_35 (the valid runs the mean and standard deviation
# were computed over). A hard-coded 35 is wrong whenever runs were excluded.
n_runs = len(distance_matrices_20_08_35)

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_20_08_35 = distance_matrix_std_20_08_35 / np.sqrt(n_runs)

# Step 2: Calculate the margin of error
margin_of_error_matrix_20_08_35 = z_score * sem_matrix_20_08_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_20_08_35 = mean_distance_matrix_20_08_35 - margin_of_error_matrix_20_08_35
upper_limit_intconf_matrix_20_08_35 = mean_distance_matrix_20_08_35 + margin_of_error_matrix_20_08_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_20_08_35 = np.maximum(lower_limit_intconf_matrix_20_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_20_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_20_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_20_08_35)

# Save the matrices for future use
# NOTE(review): both file names contain a space before ".npy"; kept unchanged.
np.save('lower_limit_intconf_matrix_20_08_35 .npy', lower_limit_intconf_matrix_20_08_35)
np.save('upper_limit_intconf_matrix_20_08_35 .npy', upper_limit_intconf_matrix_20_08_35)

In [157]:
def normalize_matrix(matrix):
    """Min-max scale ``matrix`` so that its values span the [0, 1] range.

    Parameters
    ----------
    matrix : array_like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. If every entry is equal the range
        is zero; an all-zero array of the same shape is returned in that case
        instead of dividing by zero (which would produce NaN).
    """
    matrix = np.asarray(matrix)
    lowest = np.min(matrix)
    span = np.max(matrix) - lowest
    if span == 0:
        # Constant input: there is no spread to rescale
        return np.zeros(matrix.shape, dtype=float)
    return (matrix - lowest) / span

# Rescale each confidence-limit matrix with normalize_matrix and store it on disk
norm_lower_limit_intconf_matrix_20_08_35 = normalize_matrix(lower_limit_intconf_matrix_20_08_35)
norm_upper_limit_intconf_matrix_20_08_35 = normalize_matrix(upper_limit_intconf_matrix_20_08_35)

for npy_name, normalized_limit in (
    ('norm_lower_limit_intconf_matrix_20_08_35.npy', norm_lower_limit_intconf_matrix_20_08_35),
    ('norm_upper_limit_intconf_matrix_20_08_35.npy', norm_upper_limit_intconf_matrix_20_08_35),
):
    np.save(npy_name, normalized_limit)

In [None]:
# One row of three heatmaps: lower bound (left), mean (middle), upper bound (right)
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel, in left-to-right order
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_20_08_35,
     "Normalized Lower bound Dist. Matrix - n_neighbors=20, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_20_08_35,
     "Normalized Mean Dist. Matrix - n_neighbors=20, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_20_08_35,
     "Normalized Upper bound Dist. Matrix - n_neighbors=20, min_dist=0.8", ""),
]

for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Helper that draws the minimum spanning tree of a distance matrix on one axis
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree derived from ``matrix`` on ``ax``.

    ``matrix`` is rounded to 3 decimals and read as a weighted adjacency
    matrix. The tree's edges are drawn in ``color`` and labelled with their
    weights, and ``title`` becomes the axis title.
    """
    weighted_graph = nx.from_numpy_array(np.round(matrix, 3))
    spanning_tree = nx.minimum_spanning_tree(weighted_graph)

    # A fixed seed makes the spring layout reproducible between calls
    node_positions = nx.spring_layout(spanning_tree, seed=42)

    nx.draw(
        spanning_tree,
        node_positions,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        spanning_tree,
        node_positions,
        edge_labels=nx.get_edge_attributes(spanning_tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Set up the figure with three subplots
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot MSTs for mean, lower, and upper matrices
plot_mst(normalized_mean_distance_matrix_20_08_35 , "UMAP MST - Mean Distances - n_neighbors=20, min_dist=0.8", axes[1], color='red')
plot_mst(norm_lower_limit_intconf_matrix_20_08_35 , "UMAP MST - Lower Limit - n_neighbors=20, min_dist=0.8", axes[0], color='blue')
plot_mst(norm_upper_limit_intconf_matrix_20_08_35 , "UMAP MST - Upper Limit - n_neighbors=20, min_dist=0.8", axes[2], color='green')

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

-------

### UMAP n_neighbours=30, min_dist=0.8

In [23]:
# Reload the stored UMAP results for n_neighbors=30, min_dist=0.8:
# all projections, their mean and their standard deviation
umap_projections_30_08_35 = np.load('umap_projections_30_08_35.npy')
mean_umap_projection_30_08_35 = np.load('mean_projection_30_08_35.npy')
std_projection_umap_30_08_35 = np.load('std_projection_30_08_35.npy')

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors, min_dist, n_components = 30, 0.8, 2
n_runs = 35  # how many independent embeddings are computed

# Collect one embedding per run; random_state=None leaves every run unseeded
umap_projections_30_08_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_30_08_35.append(projection)

# Stack the per-run embeddings along a new leading axis
umap_projections_30_08_35 = np.array(umap_projections_30_08_35)

# Element-wise mean and standard deviation across the runs
mean_projection_30_08_35 = umap_projections_30_08_35.mean(axis=0)
std_projection_30_08_35 = umap_projections_30_08_35.std(axis=0)

# Persist the projections together with their mean and standard deviation
for file_name, payload in (
    ('umap_projections_30_08_35.npy', umap_projections_30_08_35),
    ('mean_projection_30_08_35.npy', mean_projection_30_08_35),
    ('std_projection_30_08_35.npy', std_projection_30_08_35),
):
    np.save(file_name, payload)

# Confirm that everything was written
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_30_08_35'.")


#### Clustering

In [None]:
# KMeans is fitted with this many clusters on every run
n_clusters = 10

# Run count and embedding dimensionality are read off the stored projections
n_runs, _, n_dimensions = umap_projections_30_08_35.shape

# Centroids of every run, indexed as [run, cluster, coordinate]
kmeans_centroids_30_08 = np.zeros((n_runs, n_clusters, n_dimensions))

# Fit one KMeans model per projection and keep its cluster centres
for run, umap_projection in enumerate(umap_projections_30_08_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_30_08[run] = kmeans.cluster_centers_

In [None]:
# Spread of each cluster index's centroid over all runs, separately for x and y
std_dev_x_30_08 = np.array([np.std(kmeans_centroids_30_08[:, i, 0]) for i in range(10)])
std_dev_y_30_08 = np.array([np.std(kmeans_centroids_30_08[:, i, 1]) for i in range(10)])

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x_30_08)
print("Standard deviation of y coordinates per cluster:", std_dev_y_30_08)

#### Centroid stability

Standard deviation calculation

In [None]:

# Per-cluster standard deviation of the KMeans centroid coordinates across runs.
# The results go into the generic names std_dev_x / std_dev_y.
std_dev_x = np.array([np.std(kmeans_centroids_30_08[:, i, 0]) for i in range(10)])
std_dev_y = np.array([np.std(kmeans_centroids_30_08[:, i, 1]) for i in range(10)])

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Rows of the result table: [trial (1-based), cluster, centroid coordinates, inside flag]
data_v2 = []

# Check, for every trial and cluster, whether the centroid stays within
# mean +/- 2 standard deviations on both axes
for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_30_08[trial, cluster]

        # Reference centre and the 2-std window around it
        mean_x, mean_y = centroid_mean_30_08_35[cluster]
        half_width_x = 2 * std_dev_x[cluster]
        half_width_y = 2 * std_dev_y[cluster]
        lower_bound_x = mean_x - half_width_x
        upper_bound_x = mean_x + half_width_x
        lower_bound_y = mean_y - half_width_y
        upper_bound_y = mean_y + half_width_y

        # The centroid counts as inside only if both coordinates are in range
        x_in_range = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        y_in_range = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = x_in_range and y_in_range

        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Tabulate the collected rows
result_columns = ['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std']
df_results_v2 = pd.DataFrame(data_v2, columns=result_columns)

In [None]:

# Per trial: True only when every cluster row has 'Inside 2 std' set
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Trial numbers whose clusters were all flagged True
trials_with_all_true = trials_all_true.loc[trials_all_true].index.tolist()

In [None]:
# Print the trial numbers collected in trials_with_all_true
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Trial numbers in which at least one cluster was flagged False
trials_with_some_false = trials_all_true.loc[~trials_all_true].index.tolist()

# Show them
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The name is a plain string on purpose: inside an f-string, `{30_08}` is the integer
# literal 3008 (underscores are digit separators), which wrote the table to
# 'result_table_neighbors_v2_3008_35.csv' instead of the '_30_08_35' identifier
# used by every other artefact of this experiment.
df_results_v2.to_csv('result_table_neighbors_v2_30_08_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Per-run centroids: for every valid run, average the projected points of each label 0-9
cluster_centroids_per_run = []
for run in valid_runs:
    projections = umap_projections_30_08_35[run]
    centroids = [np.mean(projections[y_train == cluster_label], axis=0) for cluster_label in range(10)]
    cluster_centroids_per_run.append(np.array(centroids))

# Pairwise Euclidean distances between the centroids of each run,
# stacked along the first axis (one matrix per valid run)
distance_matrices_30_08_35 = np.array(
    [cdist(run_centroids, run_centroids, metric='euclidean') for run_centroids in cluster_centroids_per_run]
)

# Average distance matrix over the valid runs
mean_distance_matrix_30_08_35 = distance_matrices_30_08_35.mean(axis=0)

# Min-max normalization of the mean distance matrix
matrix_min = np.min(mean_distance_matrix_30_08_35)
matrix_max = np.max(mean_distance_matrix_30_08_35)
normalized_mean_distance_matrix_30_08_35 = (mean_distance_matrix_30_08_35 - matrix_min) / (matrix_max - matrix_min)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_30_08_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=30, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_30_08_35 .npy', distance_matrices_30_08_35)
np.save('mean_distance_matrix_neighbors_30_08_35 .npy', mean_distance_matrix_30_08_35)

# Show the (unnormalized) mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_30_08_35 }")

In [None]:
# Build a weighted graph from the normalized mean distance matrix (rounded to 3 decimals)
G_30_08_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_30_08_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save is an array serializer,
# so handing it the Graph object itself does not store the edge weights; the graph can
# be rebuilt with nx.from_numpy_array(np.load('G_30_08_35 .npy')).
np.save('G_30_08_35 .npy', nx.to_numpy_array(G_30_08_35))

# Draw the graph; the seed keeps the spring layout reproducible
pos = nx.spring_layout(G_30_08_35, seed=42)  # positions for all nodes
nx.draw(G_30_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_30_08_35, 'weight')
nx.draw_networkx_edge_labels(G_30_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - MST

In [None]:
# Compute the minimum spanning tree of the graph
mst_30_08_35 = nx.minimum_spanning_tree(G_30_08_35)

# Persist the tree as its weighted adjacency matrix. np.save is an array serializer,
# so handing it the Graph object itself does not store the edge weights; the tree can
# be rebuilt with nx.from_numpy_array(np.load('mst_30_08_35 .npy')).
np.save('mst_30_08_35 .npy', nx.to_numpy_array(mst_30_08_35))

# Define positions for all nodes (seeded so the layout is reproducible)
pos = nx.spring_layout(mst_30_08_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_30_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_30_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_30_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=30, min_dist=0.8")
plt.show()

In [None]:
# Element-wise standard deviation of the pairwise distances, taken over the first
# axis of the stacked per-run distance matrices
distance_matrix_std_30_08_35 = distance_matrices_30_08_35.std(axis=0)

# Keep the standard deviation matrix on disk for later use
np.save("distance_matrix_std_30_08_35 .npy", distance_matrix_std_30_08_35)

# Show the result
print("Standard Deviation Distance Matrix (30_08_35):\n", distance_matrix_std_30_08_35)


In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
n_runs = 35  # Nominal number of UMAP runs (kept for compatibility with other cells)

# The mean and standard deviation matrices are taken over the first axis of
# distance_matrices_30_08_35, which holds one matrix per *valid* run. The standard
# error must therefore use that sample size rather than the nominal run count.
n_valid_runs_30_08_35 = distance_matrices_30_08_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_30_08_35 = distance_matrix_std_30_08_35 / np.sqrt(n_valid_runs_30_08_35)

# Step 2: Calculate the margin of error
margin_of_error_matrix_30_08_35 = z_score * sem_matrix_30_08_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_30_08_35 = mean_distance_matrix_30_08_35 - margin_of_error_matrix_30_08_35
upper_limit_intconf_matrix_30_08_35 = mean_distance_matrix_30_08_35 + margin_of_error_matrix_30_08_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_30_08_35 = np.maximum(lower_limit_intconf_matrix_30_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_30_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_30_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_30_08_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_30_08_35 .npy', lower_limit_intconf_matrix_30_08_35)
np.save('upper_limit_intconf_matrix_30_08_35 .npy', upper_limit_intconf_matrix_30_08_35)

In [167]:
def normalize_matrix(matrix):
    """Min-max scale `matrix` so its smallest entry becomes 0 and its largest 1."""
    smallest = np.min(matrix)
    value_range = np.max(matrix) - smallest
    return (matrix - smallest) / value_range

# Lower confidence bound: rescale to [0, 1] and store it
norm_lower_limit_intconf_matrix_30_08_35 = normalize_matrix(lower_limit_intconf_matrix_30_08_35)
np.save('norm_lower_limit_intconf_matrix_30_08_35.npy', norm_lower_limit_intconf_matrix_30_08_35)

# Upper confidence bound: rescale to [0, 1] and store it
norm_upper_limit_intconf_matrix_30_08_35 = normalize_matrix(upper_limit_intconf_matrix_30_08_35)
np.save('norm_upper_limit_intconf_matrix_30_08_35.npy', norm_upper_limit_intconf_matrix_30_08_35)

In [None]:
# One row of three panels: lower bound, mean, upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel, left to right
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_30_08_35, "Normalized Lower bound Dist. Matrix - n_neighbors=30, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_30_08_35, "Normalized Mean Dist. Matrix - n_neighbors=30, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_30_08_35, "Normalized Upper bound Dist. Matrix - n_neighbors=30, min_dist=0.8", ""),
]

# Render every normalized matrix as an annotated heatmap on its own axes
for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on `ax`.

    Args:
        matrix: Distance matrix handed to `nx.from_numpy_array` after
            rounding to 3 decimals; its entries become the edge weights.
        title: Title set on `ax`.
        ax: Matplotlib axes that receives the drawing.
        color: Colour used for the tree edges.
    """
    # Graph from the rounded matrix, reduced to its minimum spanning tree
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Fixed seed keeps the node layout reproducible between calls
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(tree, layout, with_labels=True, node_color='lightblue', edge_color=color, node_size=500, font_size=10, width=2, ax=ax)

    # Annotate every tree edge with its weight (the distance)
    weights = nx.get_edge_attributes(tree, 'weight')
    nx.draw_networkx_edge_labels(tree, layout, edge_labels=weights, font_size=8, label_pos=0.3, ax=ax)

    ax.set_title(title)


# Three side-by-side panels: lower limit (left), mean (centre), upper limit (right)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (matrix, title, target axes, edge colour) in drawing order
mst_panels = [
    (normalized_mean_distance_matrix_30_08_35, "UMAP MST - Mean Distances - n_neighbors=30, min_dist=0.8", axes[1], 'red'),
    (norm_lower_limit_intconf_matrix_30_08_35, "UMAP MST - Lower Limit - n_neighbors=30, min_dist=0.8", axes[0], 'blue'),
    (norm_upper_limit_intconf_matrix_30_08_35, "UMAP MST - Upper Limit - n_neighbors=30, min_dist=0.8", axes[2], 'green'),
]
for panel_matrix, panel_title, panel_ax, panel_color in mst_panels:
    plot_mst(panel_matrix, panel_title, panel_ax, color=panel_color)

# Tighten spacing between the panels
plt.tight_layout()
plt.show()

------------

### UMAP n_neighbours=50, min_dist=0.8

In [12]:
# Reload the stored UMAP results for n_neighbors=50, min_dist=0.8:
# all projections, their mean and their standard deviation
umap_projections_50_08_35 = np.load('umap_projections_50_08_35.npy')
mean_umap_projection_50_08_35 = np.load('mean_projection_50_08_35.npy')
std_projection_umap_50_08_35 = np.load('std_projection_50_08_35.npy')

# Reload the stored per-run distance matrices and their mean
distance_matrices_50_08_35 = np.load('distance_matrices_neighbors_50_08_35 .npy')
mean_distance_matrix_50_08_35 = np.load('mean_distance_matrix_neighbors_50_08_35 .npy')

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors, min_dist, n_components = 50, 0.8, 2
n_runs = 35  # how many independent embeddings are computed

# Collect one embedding per run; random_state=None leaves every run unseeded
umap_projections_50_08_35 = []
for run in range(n_runs):
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=None,
    )
    projection = umap_model.fit_transform(x_train_flattened)
    umap_projections_50_08_35.append(projection)

# Stack the per-run embeddings along a new leading axis
umap_projections_50_08_35 = np.array(umap_projections_50_08_35)

# Element-wise mean and standard deviation across the runs
mean_projection_50_08_35 = umap_projections_50_08_35.mean(axis=0)
std_projection_50_08_35 = umap_projections_50_08_35.std(axis=0)

# Persist the projections together with their mean and standard deviation
for file_name, payload in (
    ('umap_projections_50_08_35.npy', umap_projections_50_08_35),
    ('mean_projection_50_08_35.npy', mean_projection_50_08_35),
    ('std_projection_50_08_35.npy', std_projection_50_08_35),
):
    np.save(file_name, payload)

# Confirm that everything was written
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_50_08_35'.")

#### Clustering

In [None]:
# KMeans is fitted with this many clusters on every run
n_clusters = 10

# Run count and embedding dimensionality are read off the stored projections
n_runs, _, n_dimensions = umap_projections_50_08_35.shape

# Centroids of every run, indexed as [run, cluster, coordinate]
kmeans_centroids_50_08 = np.zeros((n_runs, n_clusters, n_dimensions))

# Fit one KMeans model per projection and keep its cluster centres
for run, umap_projection in enumerate(umap_projections_50_08_35):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(umap_projection)
    kmeans_centroids_50_08[run] = kmeans.cluster_centers_

In [None]:
# Spread of each cluster index's centroid over all runs, separately for x and y
std_dev_x_50_08 = np.array([np.std(kmeans_centroids_50_08[:, i, 0]) for i in range(10)])
std_dev_y_50_08 = np.array([np.std(kmeans_centroids_50_08[:, i, 1]) for i in range(10)])

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x_50_08)
print("Standard deviation of y coordinates per cluster:", std_dev_y_50_08)

#### Centroid stability

Standard deviation calculation

In [None]:

# Per-cluster standard deviation of the KMeans centroid coordinates across runs.
# The results go into the generic names std_dev_x / std_dev_y.
std_dev_x = np.array([np.std(kmeans_centroids_50_08[:, i, 0]) for i in range(10)])
std_dev_y = np.array([np.std(kmeans_centroids_50_08[:, i, 1]) for i in range(10)])

# Report both vectors
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Rows of the result table: [trial (1-based), cluster, centroid coordinates, inside flag]
data_v2 = []

# Check, for every trial and cluster, whether the centroid stays within
# mean +/- 2 standard deviations on both axes
for trial in range(35):
    for cluster in range(10):
        centroid_coord = kmeans_centroids_50_08[trial, cluster]

        # Reference centre and the 2-std window around it
        mean_x, mean_y = centroid_mean_50_08_35[cluster]
        half_width_x = 2 * std_dev_x[cluster]
        half_width_y = 2 * std_dev_y[cluster]
        lower_bound_x = mean_x - half_width_x
        upper_bound_x = mean_x + half_width_x
        lower_bound_y = mean_y - half_width_y
        upper_bound_y = mean_y + half_width_y

        # The centroid counts as inside only if both coordinates are in range
        x_in_range = lower_bound_x <= centroid_coord[0] <= upper_bound_x
        y_in_range = lower_bound_y <= centroid_coord[1] <= upper_bound_y
        inside_2_std = x_in_range and y_in_range

        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Tabulate the collected rows
result_columns = ['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std']
df_results_v2 = pd.DataFrame(data_v2, columns=result_columns)

In [None]:

# Per trial: True only when every cluster row has 'Inside 2 std' set
inside_flags_by_trial = df_results_v2.groupby('Trial')['Inside 2 std']
trials_all_true = inside_flags_by_trial.all()

In [None]:
# Trial numbers whose clusters were all flagged True
trials_with_all_true = trials_all_true.loc[trials_all_true].index.tolist()

In [None]:
# Print the trial numbers collected in trials_with_all_true
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Trial numbers in which at least one cluster was flagged False
trials_with_some_false = trials_all_true.loc[~trials_all_true].index.tolist()

# Show them
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The name is a plain string on purpose: inside an f-string, `{50_08}` is the integer
# literal 5008 (underscores are digit separators), which wrote the table to
# 'result_table_neighbors_v2_5008_35.csv' instead of the '_50_08_35' identifier
# used by every other artefact of this experiment.
df_results_v2.to_csv('result_table_neighbors_v2_50_08_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# Per-run centroids: for every valid run, average the projected points of each label 0-9
cluster_centroids_per_run = []
for run in valid_runs:
    projections = umap_projections_50_08_35[run]
    centroids = [np.mean(projections[y_train == cluster_label], axis=0) for cluster_label in range(10)]
    cluster_centroids_per_run.append(np.array(centroids))

# Pairwise Euclidean distances between the centroids of each run,
# stacked along the first axis (one matrix per valid run)
distance_matrices_50_08_35 = np.array(
    [cdist(run_centroids, run_centroids, metric='euclidean') for run_centroids in cluster_centroids_per_run]
)

# Average distance matrix over the valid runs
mean_distance_matrix_50_08_35 = distance_matrices_50_08_35.mean(axis=0)

# Min-max normalization of the mean distance matrix
matrix_min = np.min(mean_distance_matrix_50_08_35)
matrix_max = np.max(mean_distance_matrix_50_08_35)
normalized_mean_distance_matrix_50_08_35 = (mean_distance_matrix_50_08_35 - matrix_min) / (matrix_max - matrix_min)

# Heatmap of the normalized mean distances
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_50_08_35, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=50, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Persist the per-run matrices and their mean
np.save('distance_matrices_neighbors_50_08_35 .npy', distance_matrices_50_08_35)
np.save('mean_distance_matrix_neighbors_50_08_35 .npy', mean_distance_matrix_50_08_35)

# Show the (unnormalized) mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_50_08_35 }")

#### Minimum Spanning Tree - MST

In [None]:
# Build a weighted graph from the normalized mean distance matrix (rounded to 3 decimals)
G_50_08_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_50_08_35, 3))

# Persist the graph as its weighted adjacency matrix. np.save is an array serializer,
# so handing it the Graph object itself does not store the edge weights; the graph can
# be rebuilt with nx.from_numpy_array(np.load('G_50_08_35 .npy')).
np.save('G_50_08_35 .npy', nx.to_numpy_array(G_50_08_35))

# Draw the graph; the seed keeps the spring layout reproducible
pos = nx.spring_layout(G_50_08_35, seed=42)  # positions for all nodes
nx.draw(G_50_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_50_08_35, 'weight')
nx.draw_networkx_edge_labels(G_50_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

In [None]:
# Compute the minimum spanning tree of the graph.
# This has to happen before the total weight is calculated: the weight is read from
# mst_50_08_35, so computing it first would fail on a fresh kernel or report a stale tree.
mst_50_08_35 = nx.minimum_spanning_tree(G_50_08_35)

# Persist the tree as its weighted adjacency matrix. np.save is an array serializer,
# so handing it the Graph object itself does not store the edge weights; the tree can
# be rebuilt with nx.from_numpy_array(np.load('mst_50_08_35 .npy')).
np.save('mst_50_08_35 .npy', nx.to_numpy_array(mst_50_08_35))

# Calculate the total weight of the MST (sum of its edge weights)
total_weight_50_08_35 = sum(nx.get_edge_attributes(mst_50_08_35, 'weight').values())

# Print the total weight
print(f"Total weight of the MST: {total_weight_50_08_35}")

# Define positions for all nodes (seeded so the layout is reproducible)
pos = nx.spring_layout(mst_50_08_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_50_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_50_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_50_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=50, min_dist=0.8")
plt.show()

In [None]:
# Element-wise standard deviation of the pairwise distances, taken over the first
# axis of the stacked per-run distance matrices
distance_matrix_std_50_08_35 = distance_matrices_50_08_35.std(axis=0)

# Keep the standard deviation matrix on disk for later use
np.save("distance_matrix_std_50_08_35 .npy", distance_matrix_std_50_08_35)

# Show the result
print("Standard Deviation Distance Matrix (50_08_35):\n", distance_matrix_std_50_08_35)

In [None]:
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
n_runs = 35  # Nominal number of UMAP runs (kept for compatibility with other cells)

# The mean and standard deviation matrices are taken over the first axis of
# distance_matrices_50_08_35, which holds one matrix per *valid* run. The standard
# error must therefore use that sample size rather than the nominal run count.
n_valid_runs_50_08_35 = distance_matrices_50_08_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_50_08_35 = distance_matrix_std_50_08_35 / np.sqrt(n_valid_runs_50_08_35)

# Step 2: Calculate the margin of error
margin_of_error_matrix_50_08_35 = z_score * sem_matrix_50_08_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_50_08_35 = mean_distance_matrix_50_08_35 - margin_of_error_matrix_50_08_35
upper_limit_intconf_matrix_50_08_35 = mean_distance_matrix_50_08_35 + margin_of_error_matrix_50_08_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_50_08_35 = np.maximum(lower_limit_intconf_matrix_50_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_50_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_50_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_50_08_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_50_08_35 .npy', lower_limit_intconf_matrix_50_08_35)
np.save('upper_limit_intconf_matrix_50_08_35 .npy', upper_limit_intconf_matrix_50_08_35)

In [177]:
def normalize_matrix(matrix):
    """Min-max scale `matrix` so its smallest entry becomes 0 and its largest 1."""
    smallest = np.min(matrix)
    value_range = np.max(matrix) - smallest
    return (matrix - smallest) / value_range

# Lower confidence bound: rescale to [0, 1] and store it
norm_lower_limit_intconf_matrix_50_08_35 = normalize_matrix(lower_limit_intconf_matrix_50_08_35)
np.save('norm_lower_limit_intconf_matrix_50_08_35.npy', norm_lower_limit_intconf_matrix_50_08_35)

# Upper confidence bound: rescale to [0, 1] and store it
norm_upper_limit_intconf_matrix_50_08_35 = normalize_matrix(upper_limit_intconf_matrix_50_08_35)
np.save('norm_upper_limit_intconf_matrix_50_08_35.npy', norm_upper_limit_intconf_matrix_50_08_35)

In [None]:
# One row of three panels: lower bound, mean, upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

# (matrix, panel title, y-axis label) for each panel, left to right
heatmap_panels = [
    (norm_lower_limit_intconf_matrix_50_08_35, "Normalized Lower bound Dist. Matrix - n_neighbors=50, min_dist=0.8", ""),
    (normalized_mean_distance_matrix_50_08_35, "Normalized Mean Dist. Matrix - n_neighbors=50, min_dist=0.8", "Cluster"),
    (norm_upper_limit_intconf_matrix_50_08_35, "Normalized Upper bound Dist. Matrix - n_neighbors=50, min_dist=0.8", ""),
]

# Render every normalized matrix as an annotated heatmap on its own axes
for panel_ax, (panel_matrix, panel_title, panel_ylabel) in zip(axes, heatmap_panels):
    sns.heatmap(panel_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Cluster")
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on a Matplotlib axis.

    Parameters
    ----------
    matrix : square array of pairwise distances; rounded to 3 decimals before
        being used as the weighted adjacency matrix of the graph.
    title : title set on ``ax``.
    ax : Matplotlib axis that receives the drawing.
    color : colour of the tree edges.
    """
    # Weighted graph from the rounded matrix, reduced to its spanning tree
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Seeded layout so the node placement is repeatable
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Three side-by-side axes: lower limit (left), mean (centre), upper limit (right)
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# One MST per matrix; the mean tree goes on the centre axis
plot_mst(
    matrix=normalized_mean_distance_matrix_50_08_35,
    title="UMAP MST - Mean Distances - n_neighbors=50, min_dist=0.8",
    ax=axes[1],
    color='red',
)
plot_mst(
    matrix=norm_lower_limit_intconf_matrix_50_08_35,
    title="UMAP MST - Lower Limit - n_neighbors=50, min_dist=0.8",
    ax=axes[0],
    color='blue',
)
plot_mst(
    matrix=norm_upper_limit_intconf_matrix_50_08_35,
    title="UMAP MST - Upper Limit - n_neighbors=50, min_dist=0.8",
    ax=axes[2],
    color='green',
)

# Tighten the spacing between panels and render
plt.tight_layout()
plt.show()

-----

### UMAP n_neighbours=100, min_dist=0.8

In [25]:
# Reload the stacked UMAP projections and their per-point mean / std across runs
# from the .npy files written by the UMAP cell of this section, so the fits need not be rerun.
# NOTE(review): the mean/std arrays are bound to *_umap_* names here, whereas the cell that
# computes them assigns mean_projection_100_08_35 / std_projection_100_08_35 — confirm which
# names the later cells expect.
umap_projections_100_08_35= np.load('umap_projections_100_08_35.npy')
mean_umap_projection_100_08_35= np.load('mean_projection_100_08_35.npy')
std_projection_umap_100_08_35= np.load('std_projection_100_08_35.npy')

In [None]:
# Fit UMAP n_runs times on x_train_flattened with n_neighbors=100, min_dist=0.8 and stack the
# 2-D embeddings; then store the stack plus its element-wise mean and std across runs.
# Define parameters
n_neighbors = 100
min_dist = 0.8
n_components = 2
n_runs = 35  # Number of runs

# Store UMAP projections for each run
umap_projections_100_08_35 = []

# Run UMAP multiple times
for run in range(n_runs):
    # Create UMAP model
    # random_state=None: every run uses a different random initialisation
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components,random_state=None)  # Allow randomness
    
    # Fit and transform the data
    projection = umap_model.fit_transform(x_train_flattened)
    
    # Store the projection
    umap_projections_100_08_35.append(projection)

# Convert the list of projections to a numpy array (runs stacked along axis 0)
umap_projections_100_08_35 = np.array(umap_projections_100_08_35)

# Calculate mean and standard deviation of projections across runs
# NOTE(review): the runs are fitted independently, so their embeddings presumably are not
# aligned with each other (rotation / reflection); confirm that an element-wise mean across
# runs is meaningful for the downstream analysis.
mean_projection_100_08_35 = np.mean(umap_projections_100_08_35, axis=0)
std_projection_100_08_35 = np.std(umap_projections_100_08_35, axis=0)

# Save the projections, mean, and standard deviation
np.save('umap_projections_100_08_35.npy', umap_projections_100_08_35)
np.save('mean_projection_100_08_35.npy', mean_projection_100_08_35)
np.save('std_projection_100_08_35.npy', std_projection_100_08_35)

# Output confirmation
print("UMAP projections, mean, and standard deviation have been saved with identifiers '_100_08_35'.")


#### Clustering

In [None]:
# Fit KMeans (10 clusters, fixed seed) on the embedding of every UMAP run and keep the
# resulting centroids in one array indexed as [run, cluster, coordinate].
# Number of clusters (e.g., 10)
n_clusters = 10

# Number of runs (e.g., 35), taken from the first axis of the stacked projections
n_runs = umap_projections_100_08_35.shape[0]

# Array to store KMeans centroids for all runs
kmeans_centroids_100_08 = np.zeros((n_runs, n_clusters, umap_projections_100_08_35.shape[2]))

# Apply KMeans for each run and store centroids
# NOTE(review): each run is clustered by a separate KMeans fit, so cluster index i is not
# guaranteed to denote the same group of points in different runs — confirm the centroids are
# matched across runs before comparing them index by index.
for run in range(n_runs):
    umap_projection = umap_projections_100_08_35[run]  # Shape (n_samples, n_dimensions)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(umap_projection)
    kmeans_centroids_100_08[run] = kmeans.cluster_centers_

In [None]:
# Spread of every KMeans centroid across runs, one entry per cluster index
std_dev_x_100_08 = np.zeros(10)
std_dev_y_100_08 = np.zeros(10)

for i in range(10):
    # Coordinate 0 is x, coordinate 1 is y; the std is taken over the run axis
    std_dev_x_100_08[i] = np.std(kmeans_centroids_100_08[:, i, 0])
    std_dev_y_100_08[i] = np.std(kmeans_centroids_100_08[:, i, 1])

# Report both arrays
print("Standard deviation of x coordinates per cluster:", std_dev_x_100_08)
print("Standard deviation of y coordinates per cluster:", std_dev_y_100_08)

#### Centroid stability

Standard deviation calculation

In [None]:

# Same per-cluster standard deviation as above, stored under the unsuffixed names
# std_dev_x / std_dev_y that the 2-std stability check reads.
# Initialize arrays to store standard deviations
std_dev_x = np.zeros(10)
std_dev_y = np.zeros(10)

# Loop through each cluster to calculate std deviation for x and y coordinates
for i in range(10):
    # Extract all x and y coordinates for the i-th cluster over all runs
    cluster_x_coords = kmeans_centroids_100_08[:, i, 0]  # All x coords for cluster i
    cluster_y_coords = kmeans_centroids_100_08[:, i, 1]  # All y coords for cluster i
    
    # Calculate standard deviation in x and y (np.std default: population std, ddof=0)
    std_dev_x[i] = np.std(cluster_x_coords)
    std_dev_y[i] = np.std(cluster_y_coords)

# Output the results
print("Standard deviation of x coordinates per cluster:", std_dev_x)
print("Standard deviation of y coordinates per cluster:", std_dev_y)

In [None]:

# Build one row per (trial, cluster) telling whether the KMeans centroid of that trial lies
# within mean +/- 2 standard deviations in both x and y.
data_v2 = []

# Take the loop bounds from the centroid array instead of hard-coding 35 runs / 10 clusters,
# so the table always matches the data that was actually clustered.
n_trials_v2, n_clusters_v2 = kmeans_centroids_100_08.shape[:2]

# NOTE(review): centroid_mean_100_08_35 is not assigned anywhere in this section; confirm it is
# computed from kmeans_centroids_100_08 (mean over the run axis) before this cell is executed.
for trial in range(n_trials_v2):
    for cluster in range(n_clusters_v2):
        # Centroid coordinates for the current trial and cluster
        centroid_coord = kmeans_centroids_100_08[trial, cluster]

        # Bounds of the 2-standard-deviation window around the mean centroid
        mean_x, mean_y = centroid_mean_100_08_35[cluster]
        lower_bound_x, upper_bound_x = mean_x - 2 * std_dev_x[cluster], mean_x + 2 * std_dev_x[cluster]
        lower_bound_y, upper_bound_y = mean_y - 2 * std_dev_y[cluster], mean_y + 2 * std_dev_y[cluster]

        # The centroid must be inside the window on both axes
        inside_2_std = (lower_bound_x <= centroid_coord[0] <= upper_bound_x) and (lower_bound_y <= centroid_coord[1] <= upper_bound_y)

        # Trials are stored 1-based (trial + 1); clusters stay 0-based
        data_v2.append([trial + 1, cluster, centroid_coord, inside_2_std])

# Create a DataFrame from the list of data
df_results_v2 = pd.DataFrame(data_v2, columns=['Trial', 'Cluster', 'Centroid Coord', 'Inside 2 std'])

In [None]:

# Group the DataFrame by Trial and check if all clusters in each trial are True for 'Inside 2 std'
# Result: a boolean Series indexed by the values of the 'Trial' column, True only when every
# cluster row of that trial is inside its 2-std window.
trials_all_true = df_results_v2.groupby('Trial')['Inside 2 std'].all()

In [None]:
# Filter the trials where all clusters were True
# Boolean-mask the Series with itself and keep the surviving index labels (the 'Trial' values).
# NOTE(review): these are 'Trial' labels, not necessarily 0-based run indices — check the
# offset before using them to index the projection arrays.
trials_with_all_true = trials_all_true[trials_all_true].index.tolist()

In [None]:
# Output the list of trials whose centroids all stayed inside their 2-std window
print("Trials where all clusters were True:", trials_with_all_true)

In [None]:
# Filter the trials where not all clusters were True
# (~ inverts the boolean Series, so this is the complement of the all-True trials)
trials_with_some_false = trials_all_true[~trials_all_true].index.tolist()

# Output the list of trials where some clusters were False
print("Trials where some clusters were False:", trials_with_some_false)

In [None]:

# Save the result table to a CSV file.
# The name is a plain string: inside an f-string, {100_08} is the integer literal 10008
# (underscores are digit separators), which produced 'result_table_neighbors_v2_10008_35.csv'
# instead of the intended '_100_08_35' identifier.
df_results_v2.to_csv('result_table_neighbors_v2_100_08_35.csv', index=False)

#### Distance Matrix Calculation

In [None]:
# For every run listed in valid_runs: compute the centroid of each digit class (using the true
# labels y_train) in that run's UMAP embedding, then the 10x10 Euclidean distance matrix between
# centroids. The matrices are averaged over runs, min-max normalised, plotted and saved.
# NOTE(review): valid_runs is not assigned in this section — confirm it refers to the runs of
# this configuration and holds 0-based indices into umap_projections_100_08_35.
# Placeholder for cluster centroids (center of cluster i for each valid run)
cluster_centroids_per_run = []

# Iterate over valid runs to calculate centroids for each cluster
for run in valid_runs:
    # Extract the UMAP projections for this run
    projections = umap_projections_100_08_35 [run]

    # Calculate centroids for each cluster (digits 0-9)
    centroids = []
    for cluster_label in range(10):  # Assuming 10 clusters (digits 0-9)
        cluster_points = projections[y_train == cluster_label]  # Points in this cluster
        centroid = np.mean(cluster_points, axis=0)  # Calculate the centroid
        centroids.append(centroid)
    
    cluster_centroids_per_run.append(np.array(centroids))  # Store centroids for this run

# Calculate pairwise distances between centroids for each run
distance_matrices_100_08_35  = []
for centroids in cluster_centroids_per_run:
    # Calculate the pairwise Euclidean distance between centroids for this run
    distance_matrix = cdist(centroids, centroids, metric='euclidean')  # Shape: (10, 10)
    distance_matrices_100_08_35 .append(distance_matrix)

# Convert the list of distance matrices to a NumPy array
distance_matrices_100_08_35  = np.array(distance_matrices_100_08_35 )  # Shape: (n_valid_runs, 10, 10)

# Calculate the mean distance matrix across all valid runs
mean_distance_matrix_100_08_35  = np.mean(distance_matrices_100_08_35 , axis=0)  # Shape: (10, 10)

# Normalize the mean distance matrix (min-max scaling to [0, 1])
normalized_mean_distance_matrix_100_08_35  = (mean_distance_matrix_100_08_35  - np.min(mean_distance_matrix_100_08_35 )) / (np.max(mean_distance_matrix_100_08_35 ) - np.min(mean_distance_matrix_100_08_35 ))

# Plot the normalized mean distance matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(normalized_mean_distance_matrix_100_08_35 , annot=True, cmap="viridis", fmt=".2f", linewidths=0.5)
plt.title("Normalized Mean Distance Matrix (k=10, n_neighbors=100, min_dists=0.8)")
plt.xlabel("Cluster")
plt.ylabel("Cluster")
plt.show()

# Save the distance matrices and mean distance matrix
# NOTE(review): both file names contain a space before '.npy'; keep it in sync with whatever
# loads these files.
np.save('distance_matrices_neighbors_100_08_35 .npy', distance_matrices_100_08_35)
np.save('mean_distance_matrix_neighbors_100_08_35 .npy', mean_distance_matrix_100_08_35)

# Output the mean distance matrix
print(f"Mean distance matrix across all valid runs:\n{mean_distance_matrix_100_08_35 }")

In [None]:
# Create a weighted graph whose adjacency matrix is the normalized mean distance matrix
# (rounded to 3 decimals so the edge labels stay readable)
G_100_08_35 = nx.from_numpy_array(np.round(normalized_mean_distance_matrix_100_08_35, 3))

# Persist the graph with pickle. np.save() expects array-like data and converts its argument
# to an array first, which does not yield a faithful, reloadable copy of a networkx Graph
# (its edges and weights).
with open('G_100_08_35.pkl', 'wb') as graph_file:
    pickle.dump(G_100_08_35, graph_file)

# Draw the graph
pos = nx.spring_layout(G_100_08_35, seed=42)  # seeded positions for all nodes
nx.draw(G_100_08_35, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=800, font_size=10)

# Draw edge labels (distances)
edge_labels = nx.get_edge_attributes(G_100_08_35, 'weight')
nx.draw_networkx_edge_labels(G_100_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)
plt.show()

#### Minimum Spanning Tree - MST

In [None]:
# Compute the minimum spanning tree of the full distance graph
mst_100_08_35 = nx.minimum_spanning_tree(G_100_08_35)

# Persist the tree with pickle. np.save() expects array-like data and converts its argument
# to an array first, which does not yield a faithful, reloadable copy of a networkx Graph
# (its edges and weights).
with open('mst_100_08_35.pkl', 'wb') as mst_file:
    pickle.dump(mst_100_08_35, mst_file)

# Define positions for all nodes (seeded so the layout is repeatable)
pos = nx.spring_layout(mst_100_08_35, seed=42)

# Draw the minimum spanning tree only
nx.draw(mst_100_08_35, pos, with_labels=True, node_color='lightblue', edge_color='red', node_size=500, font_size=10, width=2)

# Draw edge labels (distances) for the MST
edge_labels = nx.get_edge_attributes(mst_100_08_35, 'weight')
nx.draw_networkx_edge_labels(mst_100_08_35, pos, edge_labels=edge_labels, font_size=8, label_pos=0.3)

plt.title("MST UMAP - n_neighbors=100, min_dist=0.8")
plt.show()

In [None]:
# Element-wise standard deviation of the per-run centroid distance matrices.
# Step 1: Calculate the standard deviation for each pair of clusters across all runs
# (axis 0 is the run axis; np.std defaults to the population std, ddof=0)
distance_matrix_std_100_08_35  = np.std(distance_matrices_100_08_35 , axis=0)  # Shape: (n_clusters, n_clusters)

# Step 2: Save the standard deviation matrix for future use
# NOTE(review): the file name contains a space before '.npy'.
np.save("distance_matrix_std_100_08_35 .npy", distance_matrix_std_100_08_35 )

# Output the results
print("Standard Deviation Distance Matrix (100_08_35):\n", distance_matrix_std_100_08_35 )


In [None]:
# 95% confidence interval (normal approximation) for every entry of the mean distance matrix.
# Parameters
confidence_level = 0.95
z_score = norm.ppf((1 + confidence_level) / 2)  # Two-sided critical value of the standard normal
n_runs = 35  # Number of UMAP runs performed (kept because other cells read n_runs)

# The mean and std matrices are computed over the runs stacked in distance_matrices_100_08_35
# (the valid runs only), so the standard error must use that count rather than the nominal 35.
n_runs_in_stats_100_08_35 = distance_matrices_100_08_35.shape[0]

# Step 1: Calculate the Standard Error of the Mean (SEM)
sem_matrix_100_08_35 = distance_matrix_std_100_08_35 / np.sqrt(n_runs_in_stats_100_08_35)

# Step 2: Calculate the margin of error
margin_of_error_matrix_100_08_35 = z_score * sem_matrix_100_08_35

# Step 3: Compute the lower and upper confidence interval matrices
lower_limit_intconf_matrix_100_08_35 = mean_distance_matrix_100_08_35 - margin_of_error_matrix_100_08_35
upper_limit_intconf_matrix_100_08_35 = mean_distance_matrix_100_08_35 + margin_of_error_matrix_100_08_35

# Distances cannot be negative, so clip the lower limit at zero
lower_limit_intconf_matrix_100_08_35 = np.maximum(lower_limit_intconf_matrix_100_08_35, 0)

# Output the results
print("Mean Distance Matrix:\n", mean_distance_matrix_100_08_35)
print("\nLower Limit Matrix:\n", lower_limit_intconf_matrix_100_08_35)
print("\nUpper Limit Matrix:\n", upper_limit_intconf_matrix_100_08_35)

# Save the matrices for future use
np.save('lower_limit_intconf_matrix_100_08_35 .npy', lower_limit_intconf_matrix_100_08_35)
np.save('upper_limit_intconf_matrix_100_08_35 .npy', upper_limit_intconf_matrix_100_08_35)

In [187]:
def normalize_matrix(matrix):
    """Min-max scale a matrix to the range [0, 1].

    Parameters
    ----------
    matrix : array_like
        Numeric values to rescale.

    Returns
    -------
    numpy.ndarray
        ``(matrix - min) / (max - min)``. When every entry is identical the
        range is zero, so an all-zero array is returned instead of dividing
        by zero (which would fill the result with NaN).
    """
    matrix = np.asarray(matrix, dtype=float)
    lowest = np.min(matrix)
    value_range = np.max(matrix) - lowest
    if value_range == 0:
        # Constant input: there is no spread to rescale
        return np.zeros_like(matrix)
    return (matrix - lowest) / value_range

# Lower confidence bound: rescale to [0, 1] and persist
norm_lower_limit_intconf_matrix_100_08_35 = normalize_matrix(lower_limit_intconf_matrix_100_08_35)
np.save('norm_lower_limit_intconf_matrix_100_08_35.npy', norm_lower_limit_intconf_matrix_100_08_35)

# Upper confidence bound: rescale to [0, 1] and persist
norm_upper_limit_intconf_matrix_100_08_35 = normalize_matrix(upper_limit_intconf_matrix_100_08_35)
np.save('norm_upper_limit_intconf_matrix_100_08_35.npy', norm_upper_limit_intconf_matrix_100_08_35)

In [None]:
def _heatmap_panel(ax, matrix, title, ylabel):
    """Draw one annotated viridis heatmap of *matrix* on *ax* and label it."""
    sns.heatmap(matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Cluster")
    ax.set_ylabel(ylabel)


# One row with three panels: lower bound | mean | upper bound
fig, axes = plt.subplots(1, 3, figsize=(21, 9))

_heatmap_panel(
    axes[0],
    norm_lower_limit_intconf_matrix_100_08_35,
    "Normalized Lower bound Dist. Matrix - n_neighbors=100, min_dist=0.8",
    "",
)
_heatmap_panel(
    axes[1],
    normalized_mean_distance_matrix_100_08_35,
    "Normalized Mean Dist. Matrix - n_neighbors=100, min_dist=0.8",
    "Cluster",
)
_heatmap_panel(
    axes[2],
    norm_upper_limit_intconf_matrix_100_08_35,
    "Normalized Upper bound Dist. Matrix - n_neighbors=100, min_dist=0.8",
    "",
)

plt.tight_layout()
plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on a Matplotlib axis.

    Parameters
    ----------
    matrix : square array of pairwise distances; rounded to 3 decimals before
        being used as the weighted adjacency matrix of the graph.
    title : title set on ``ax``.
    ax : Matplotlib axis that receives the drawing.
    color : colour of the tree edges.
    """
    # Weighted graph from the rounded matrix, reduced to its spanning tree
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Seeded layout so the node placement is repeatable
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Three side-by-side axes: lower limit (left), mean (centre), upper limit (right)
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6))

# One MST per matrix; the mean tree goes on the centre axis
plot_mst(
    matrix=normalized_mean_distance_matrix_100_08_35,
    title="UMAP MST - Mean Distances - n_neighbors=100, min_dist=0.8",
    ax=axes[1],
    color='red',
)
plot_mst(
    matrix=norm_lower_limit_intconf_matrix_100_08_35,
    title="UMAP MST - Lower Limit - n_neighbors=100, min_dist=0.8",
    ax=axes[0],
    color='blue',
)
plot_mst(
    matrix=norm_upper_limit_intconf_matrix_100_08_35,
    title="UMAP MST - Upper Limit - n_neighbors=100, min_dist=0.8",
    ax=axes[2],
    color='green',
)

# Tighten the spacing between panels and render
plt.tight_layout()
plt.show()

#### Adaptable Radius for min_dist=0.8

In [None]:
## For min_dist=0.8.

# Function to calculate cluster metrics
def calculate_cluster_metrics(umap_projections, y_labels, n_clusters=10):
    """
    Calculate average cluster radii and neighbor counts for each cluster over all runs.

    Parameters
    ----------
    umap_projections : sequence of arrays, one (n_samples, n_dims) embedding per run.
    y_labels : array_like of length n_samples with the cluster label of every point.
    n_clusters : maximum number of clusters to evaluate. Clusters are the sorted
        unique values of ``y_labels`` (at most the first ``n_clusters`` of them),
        so the labels do not have to be exactly 0..n_clusters-1.

    Returns
    -------
    radii_per_cluster : list of float
        For every cluster, the mean distance of its points to the cluster
        centroid, averaged over runs.
    average_neighbor_counts : numpy.ndarray
        For every cluster, the number of points (of any label) lying within
        that average radius of the run's centroid, averaged over runs.
    """
    y_labels = np.asarray(y_labels)

    # One boolean mask per cluster, built once; the same order is used in every step so that
    # centres, radii and counts always refer to the same label.
    labels = np.unique(y_labels)[:n_clusters]
    masks = [y_labels == label for label in labels]

    # Step 1: cluster centres for each run -> shape (n_runs, n_clusters, n_dims)
    cluster_centers_full = np.array([
        [np.mean(x_umap[mask], axis=0) for mask in masks]
        for x_umap in umap_projections
    ])

    # Step 2: mean point-to-centre distance per run, averaged across runs
    radii_per_cluster = []
    for cluster_idx, mask in enumerate(masks):
        radii_cluster = [
            np.mean(np.linalg.norm(x_umap[mask] - cluster_centers_full[run_idx][cluster_idx], axis=1))
            for run_idx, x_umap in enumerate(umap_projections)
        ]
        radii_per_cluster.append(np.mean(radii_cluster))

    # Step 3: count all points within the average radius of each run's centre
    neighbor_counts_full = np.array([
        [
            np.sum(np.linalg.norm(x_umap - cluster_center, axis=1) <= radius)
            for cluster_center, radius in zip(centers_run, radii_per_cluster)
        ]
        for x_umap, centers_run in zip(umap_projections, cluster_centers_full)
    ])  # Shape: (n_runs, n_clusters)
    average_neighbor_counts = np.mean(neighbor_counts_full, axis=0)  # Average across runs

    return radii_per_cluster, average_neighbor_counts

# n_neighbors settings to compare, each paired with its stack of UMAP runs (min_dist=0.8)
n_neighbors_values = [5, 10, 20, 30, 50, 100]
projections_by_n_neighbors = {
    5: umap_projections_5_08_35,
    10: umap_projections_10_08_35,
    20: umap_projections_20_08_35,
    30: umap_projections_30_08_35,
    50: umap_projections_50_08_35,
    100: umap_projections_100_08_35,
}
results = []

for n_neighbors in n_neighbors_values:
    umap_projections = projections_by_n_neighbors[n_neighbors]

    # Average radius and neighbour count of every cluster for this setting
    radii_per_cluster, average_neighbor_counts = calculate_cluster_metrics(umap_projections, y_train)

    # One result row per cluster
    for cluster_idx in range(len(radii_per_cluster)):
        row = {
            "N": n_neighbors,
            "Cluster": cluster_idx,
            "Radius": np.round(radii_per_cluster[cluster_idx], 3),
            "Number of Neighbors": np.round(average_neighbor_counts[cluster_idx], 0),
        }
        results.append(row)

# Collect the rows in a DataFrame and keep a copy on disk for later cells
df_results = pd.DataFrame(results)
df_results.to_csv("radius_neighbor_analysis_merged_MinDist_08.csv", index=False)

# Wide view: clusters as rows, one column per (metric, n_neighbors) pair
pivot_table = df_results.pivot(index="Cluster", columns="N", values=["Radius", "Number of Neighbors"])
print(pivot_table)

In [None]:
# Plot the neighbour counts for the six min_dist=0.8 configurations.
# plot_mean_neighbor_counts_across_runs is defined elsewhere in the notebook; the projection
# stacks are passed in the same order as n_neighbors_values ([5, 10, 20, 30, 50, 100]).
plot_mean_neighbor_counts_across_runs(
    umap_projections_list=[
        umap_projections_5_08_35,
        umap_projections_10_08_35,
        umap_projections_20_08_35,
        umap_projections_30_08_35,
        umap_projections_50_08_35,
        umap_projections_100_08_35
    ],
    n_neighbors_values=n_neighbors_values,
    y_labels=y_train
)

In [None]:
# Read back the radius / neighbour table written for min_dist=0.8
df_results_08 = pd.read_csv('radius_neighbor_analysis_merged_MinDist_08.csv')

# Density = average neighbour count divided by the average radius
df_results_08['Density'] = df_results_08['Number of Neighbors'].div(df_results_08['Radius'])

# Row holding the largest density
max_density_index = df_results_08['Density'].idxmax()
max_density_row = df_results_08.loc[max_density_index]

# Pull out the winning density together with its cluster and n_neighbors
max_n_neighbors = max_density_row['N']
max_cluster = max_density_row['Cluster']
max_density = max_density_row['Density']

# Report the three values, one per line
print(
    f"Highest Density: {max_density:.2f}\n"
    f"Cluster: {int(max_cluster)}\n"
    f"n_neighbors: {int(max_n_neighbors)}"
)

-------

-------

### General UMAP Comparison

In [None]:
from matplotlib.backends.backend_pdf import PdfPages

In [None]:
# For every base cluster, draw a grouped bar chart of the mean centroid distance to each other
# cluster (one bar per n_neighbors, error bars = confidence interval) and write one page per
# base cluster to a PDF.

# Define clusters
clusters = np.arange(10)  # Clusters from 0 to 9

# Define colors for each n_neighbors
colors = {5: "orange", 10: "blue", 20: "yellow", 30: "grey", 50: "green", 100: "red"}

# (mean, lower limit, upper limit) matrices per n_neighbors, looked up once instead of being
# re-listed for every base cluster
ci_matrices = {
    5: (mean_distance_matrix_5_01_35, lower_limit_intconf_matrix_5_01_35, upper_limit_intconf_matrix_5_01_35),
    10: (mean_distance_matrix_10_01_35, lower_limit_intconf_matrix_10_01_35, upper_limit_intconf_matrix_10_01_35),
    20: (mean_distance_matrix_20_01_35, lower_limit_intconf_matrix_20_01_35, upper_limit_intconf_matrix_20_01_35),
    30: (mean_distance_matrix_30_01_35, lower_limit_intconf_matrix_30_01_35, upper_limit_intconf_matrix_30_01_35),
    50: (mean_distance_matrix_50_01_35, lower_limit_intconf_matrix_50_01_35, upper_limit_intconf_matrix_50_01_35),
    100: (mean_distance_matrix_100_01_35, lower_limit_intconf_matrix_100_01_35, upper_limit_intconf_matrix_100_01_35),
}

width = 0.15  # Bar width

# Create a PDF to save all the plots
with PdfPages("Cluster_Confidence_Intervals.pdf") as pdf:
    # Iterate over each cluster as the base cluster
    for base_cluster in clusters:

        # Row of the base cluster in every matrix, with the self-distance entry removed
        data = {
            n_nb: {
                "mean": np.delete(mean_matrix[base_cluster], base_cluster),
                "lower": np.delete(lower_matrix[base_cluster], base_cluster),
                "upper": np.delete(upper_matrix[base_cluster], base_cluster),
            }
            for n_nb, (mean_matrix, lower_matrix, upper_matrix) in ci_matrices.items()
        }

        # Define clusters to be compared against (excluding the base cluster)
        compare_clusters = np.delete(clusters, base_cluster)

        # Plotting
        fig, ax = plt.subplots(figsize=(16, 8))

        x = np.arange(len(compare_clusters))  # X positions for clusters

        for idx, (n_neighbors, values) in enumerate(data.items()):
            # Bars are drawn centred on their position, so the offsets must be symmetric
            # around zero for the group to sit on its tick: idx - (n_bars - 1) / 2
            x_positions = x + (idx - (len(data) - 1) / 2) * width

            # Plot bars for the mean distances
            ax.bar(
                x_positions,
                values["mean"],  # Mean distances
                yerr=[
                    values["mean"] - values["lower"],  # Lower error
                    values["upper"] - values["mean"]   # Upper error
                ],
                width=width,
                color=colors[n_neighbors],
                alpha=0.7,
                label=f"n={n_neighbors}",
                capsize=5
            )

        # Add labels, title, and legend
        ax.set_xlabel("Clusters", fontsize=14)
        ax.set_ylabel("Distance", fontsize=14)
        ax.set_title(f"Confidence Intervals of Distances from Cluster {base_cluster} to Other Clusters", fontsize=16)
        ax.set_xticks(x)
        ax.set_xticklabels([f"{i}" for i in compare_clusters], fontsize=12)
        ax.legend(title="n_neighbors", fontsize=10)
        ax.grid(axis="y", linestyle="--", alpha=0.7)

        plt.tight_layout()

        # Save the figure to the PDF and release it
        pdf.savefig(fig)
        plt.close(fig)

print("PDF with cluster confidence intervals has been successfully created.")

In [None]:
# For every base cluster, show a grouped bar chart of the mean centroid distance to each other
# cluster (one bar per n_neighbors, error bars = confidence interval).

# Define clusters
clusters = np.arange(10)  # Clusters from 0 to 9

# Define colors for each n_neighbors
colors = {5: "orange", 10: "blue", 20: "yellow", 30: "grey", 50: "green", 100: "red"}

# (mean, lower limit, upper limit) matrices per n_neighbors, looked up once instead of being
# re-listed for every base cluster
ci_matrices = {
    5: (mean_distance_matrix_5_01_35, lower_limit_intconf_matrix_5_01_35, upper_limit_intconf_matrix_5_01_35),
    10: (mean_distance_matrix_10_01_35, lower_limit_intconf_matrix_10_01_35, upper_limit_intconf_matrix_10_01_35),
    20: (mean_distance_matrix_20_01_35, lower_limit_intconf_matrix_20_01_35, upper_limit_intconf_matrix_20_01_35),
    30: (mean_distance_matrix_30_01_35, lower_limit_intconf_matrix_30_01_35, upper_limit_intconf_matrix_30_01_35),
    50: (mean_distance_matrix_50_01_35, lower_limit_intconf_matrix_50_01_35, upper_limit_intconf_matrix_50_01_35),
    100: (mean_distance_matrix_100_01_35, lower_limit_intconf_matrix_100_01_35, upper_limit_intconf_matrix_100_01_35),
}

width = 0.15  # Bar width

# Iterate over each cluster as the base cluster
for base_cluster in clusters:

    # Row of the base cluster in every matrix, with the self-distance entry removed
    data = {
        n_nb: {
            "mean": np.delete(mean_matrix[base_cluster], base_cluster),
            "lower": np.delete(lower_matrix[base_cluster], base_cluster),
            "upper": np.delete(upper_matrix[base_cluster], base_cluster),
        }
        for n_nb, (mean_matrix, lower_matrix, upper_matrix) in ci_matrices.items()
    }

    # Define clusters to be compared against (excluding the base cluster)
    compare_clusters = np.delete(clusters, base_cluster)

    # Plotting
    fig, ax = plt.subplots(figsize=(16, 8))

    x = np.arange(len(compare_clusters))  # X positions for clusters

    for idx, (n_neighbors, values) in enumerate(data.items()):
        # Bars are drawn centred on their position, so the offsets must be symmetric
        # around zero for the group to sit on its tick: idx - (n_bars - 1) / 2
        x_positions = x + (idx - (len(data) - 1) / 2) * width

        # Plot bars for the mean distances
        ax.bar(
            x_positions,
            values["mean"],  # Mean distances
            yerr=[
                values["mean"] - values["lower"],  # Lower error
                values["upper"] - values["mean"]   # Upper error
            ],
            width=width,
            color=colors[n_neighbors],
            alpha=0.7,
            label=f"n={n_neighbors}",
            capsize=5
        )

    # Add labels, title, and legend
    ax.set_xlabel("Clusters", fontsize=14)
    ax.set_ylabel("Distance", fontsize=14)
    ax.set_title(f"Confidence Intervals of Distances from Cluster {base_cluster} to Other Clusters", fontsize=16)
    ax.set_xticks(x)
    ax.set_xticklabels([f"{i}" for i in compare_clusters], fontsize=12)
    ax.legend(title="n_neighbors", fontsize=10)
    ax.grid(axis="y", linestyle="--", alpha=0.7)

    plt.tight_layout()
    plt.show()

In [None]:
# Define a function to plot MST for a given normalized distance matrix
def plot_mst(matrix, title, ax, color='red'):
    """Draw the minimum spanning tree of a distance matrix on a Matplotlib axis.

    Parameters
    ----------
    matrix : square array of pairwise distances; rounded to 3 decimals before
        being used as the weighted adjacency matrix of the graph.
    title : title set on ``ax``.
    ax : Matplotlib axis that receives the drawing.
    color : colour of the tree edges.
    """
    # Weighted graph from the rounded matrix, reduced to its spanning tree
    tree = nx.minimum_spanning_tree(nx.from_numpy_array(np.round(matrix, 3)))

    # Seeded layout so the node placement is repeatable
    layout = nx.spring_layout(tree, seed=42)

    nx.draw(
        tree,
        layout,
        with_labels=True,
        node_color='lightblue',
        edge_color=color,
        node_size=500,
        font_size=10,
        width=2,
        ax=ax,
    )

    # Annotate every tree edge with its weight (the rounded distance)
    nx.draw_networkx_edge_labels(
        tree,
        layout,
        edge_labels=nx.get_edge_attributes(tree, 'weight'),
        font_size=8,
        label_pos=0.3,
        ax=ax,
    )

    ax.set_title(title)

# Normalized mean / lower / upper distance matrices for every n_neighbors (min_dist=0.8)
matrices = {
    5: {
        "mean": normalized_mean_distance_matrix_5_08_35,
        "lower": norm_lower_limit_intconf_matrix_5_08_35,
        "upper": norm_upper_limit_intconf_matrix_5_08_35,
    },
    10: {
        "mean": normalized_mean_distance_matrix_10_08_35,
        "lower": norm_lower_limit_intconf_matrix_10_08_35,
        "upper": norm_upper_limit_intconf_matrix_10_08_35,
    },
    20: {
        "mean": normalized_mean_distance_matrix_20_08_35,
        "lower": norm_lower_limit_intconf_matrix_20_08_35,
        "upper": norm_upper_limit_intconf_matrix_20_08_35,
    },
    30: {
        "mean": normalized_mean_distance_matrix_30_08_35,
        "lower": norm_lower_limit_intconf_matrix_30_08_35,
        "upper": norm_upper_limit_intconf_matrix_30_08_35,
    },
    50: {
        "mean": normalized_mean_distance_matrix_50_08_35,
        "lower": norm_lower_limit_intconf_matrix_50_08_35,
        "upper": norm_upper_limit_intconf_matrix_50_08_35,
    },
    100: {
        "mean": normalized_mean_distance_matrix_100_08_35,
        "lower": norm_lower_limit_intconf_matrix_100_08_35,
        "upper": norm_upper_limit_intconf_matrix_100_08_35,
    },
}

# (matrix key, title fragment, axis column, edge colour) in drawing order
mst_panels = (
    ("mean", "Mean Distances", 1, 'red'),
    ("lower", "Lower Limit", 0, 'blue'),
    ("upper", "Upper Limit", 2, 'green'),
)

# One PDF page per n_neighbors, each with the three MSTs side by side
with PdfPages('MST_UMAP_Comparisons min_dis=0.8.pdf') as pdf:
    for n_neighbors, matrix_set in matrices.items():
        fig, axes = plt.subplots(1, 3, figsize=(18, 6))

        for panel_key, panel_label, panel_column, panel_color in mst_panels:
            plot_mst(
                matrix_set[panel_key],
                f"MST UMAP - {panel_label} (n_neighbors={n_neighbors}, min_dis=0.8)",
                axes[panel_column],
                color=panel_color,
            )

        # Tighten the spacing, append the page and release the figure
        plt.tight_layout()
        pdf.savefig(fig)
        plt.close(fig)

print("PDF with MST UMAP Comparisons has been successfully created.")

-----

### MDS vs UMAP Sammon's stress

The following analysis uses the indices and results of the 2-component MDS from the section "Exploration and comparison of multiple algorithms - MNIST", together with the UMAP projections for min_dist=0.1 and n_neighbours=10, n_neighbours=50 and n_neighbours=100.

In [None]:
# Step 1: Downsample the Dataset Consistently
def downsample_mnist_consistent(x_data, y_labels, sample_fraction=0.1, random_state=None):
    """
    Draw a per-class random subset of sample indices.

    Indices (rather than data) are returned so that the caller can select the
    same points from every representation of the dataset.

    Parameters
    ----------
    x_data : array-like
        Not used by the sampling itself; kept so existing calls keep working.
    y_labels : array-like
        Class label of every sample; sampling is done separately per label.
    sample_fraction : float, default 0.1
        Fraction of each class to keep (the per-class count is truncated to int).
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for the draw. None keeps the previous unseeded
        behaviour; pass an int to make the selected indices reproducible.

    Returns
    -------
    numpy.ndarray
        Integer indices into the dataset, grouped by class label.
    """
    # Accept plain lists too: `list == label` would otherwise compare as a scalar
    y_labels = np.asarray(y_labels)

    # One generator shared by all classes, so a single seed fixes the whole draw
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    sampled_indices = []
    for label in np.unique(y_labels):
        # Positions of every sample carrying the current label
        label_indices = np.where(y_labels == label)[0]
        n_keep = int(len(label_indices) * sample_fraction)
        # Sample without replacement so no point is selected twice
        sampled_indices.extend(
            resample(label_indices, n_samples=n_keep, replace=False, random_state=rng)
        )
    # Explicit integer dtype keeps the result usable as an index even when empty
    return np.array(sampled_indices, dtype=np.intp)

# Draw the shared index subset once; every space below is sliced with it
sampled_indices = downsample_mnist_consistent(x_train_flattened, y_train, sample_fraction=0.1)

# Step 2: Use the Sampled Indices to Extract Points from Both Spaces
# Original high-dimensional points and their labels
x_sampled = x_train_flattened[sampled_indices]
y_sampled = y_train[sampled_indices]

# Mean UMAP projections (keyed by n_neighbors), restricted to the same points
umap_projections_downsampled = {
    neighbors: np.load(projection_path)[sampled_indices]
    for neighbors, projection_path in (
        (10, "mean_projection_10_01_35.npy"),
        (50, "mean_projection_50_01_35.npy"),
        (100, "mean_projection_100_01_35.npy"),
    )
}

# Report shapes as a sanity check that all spaces hold the same number of points
print(f"x_sampled shape: {x_sampled.shape}")
print(f"y_sampled shape: {y_sampled.shape}")
for n_neighbors, projection in umap_projections_downsampled.items():
    print(f"UMAP (n_neighbors={n_neighbors}) downsampled shape: {projection.shape}")

In [None]:
# Pairwise Euclidean distances between the sampled points in the original
# high-dimensional space; used below as the reference ("true") distances.
# NOTE(review): this binds the name `pairwise_distances` to a distance matrix;
# a later cell imports sklearn's function under the same name and rebinds it.
pairwise_distances = sklearn_pairwise_distances(x_sampled, metric='euclidean')

# Initialize and fit MDS on the precomputed distance matrix.
# The resulting 2-D embedding is intended as a baseline that preserves the
# global structure of the pairwise distances.
mds_model = MDS(n_components=2, dissimilarity='precomputed', random_state=42)
mds_embedding = mds_model.fit_transform(pairwise_distances)

def sammons_stress(original_distances, embedding_distances):
    """
    Return the normalized Sammon's stress between two distance matrices.

    Each squared difference between a reference distance and the matching
    embedding distance is divided by the reference distance; the total is then
    divided by the sum of all reference distances. Lower values mean the
    embedding distances are closer to the reference distances.
    """
    floor = 1e-9  # smallest admissible reference distance (guards the division)
    reference = np.clip(original_distances, floor, None)
    residual = reference - embedding_distances
    weighted_error = np.square(residual) / reference
    return weighted_error.sum() / reference.sum()

# Above eq. quantifies the degree to which the low-dimensional embedding preserves the pairwise distances from the original space. Lower stress indicates better preservation.

# Calculate Sammon's stress for UMAP embeddings
# Evaluates how well each UMAP embedding preserves global structures compared to the original distances
stress_results = {}  # n_neighbors -> Sammon's stress (one scalar per setting)
# Reference distances: the high-dimensional distance matrix computed earlier in this cell
original_distances = pairwise_distances
for n_neighbors, umap_embedding in umap_projections_downsampled.items():
    # Compute pairwise distances for the UMAP embedding
    umap_distances = sklearn_pairwise_distances(umap_embedding, metric='euclidean')
    # Calculate Sammon's stress
    # NOTE(review): the two distance matrices are compared without rescaling,
    # so the value depends on the absolute scale of the embedding.
    stress = sammons_stress(original_distances, umap_distances)
    stress_results[n_neighbors] = stress

-----

In [None]:
# Load sampled indices (train / test) stored as .npy files
# presumably these are the indices that were used to build the MDS subsets — TODO confirm
sampled_indices_train_mds= np.load("sampled_indices_train_mds.npy")
sampled_indices_test_mds= np.load("sampled_indices_test_mds.npy")

# Load downsampled dataset (features and labels for train / test)
x_train_sampled_mds= np.load("x_train_sampled_mds.npy")
y_train_sampled_mds= np.load("y_train_sampled_mds.npy")
x_test_sampled_mds= np.load("x_test_sampled_mds.npy")
y_test_sampled_mds= np.load("y_test_sampled_mds.npy")

# Load MDS embeddings
# presumably "c2" stands for 2 components — TODO confirm
x_train_mds_c2= np.load("x_train_mds_c2.npy")
x_test_mds_c2= np.load("x_test_mds_c2.npy")

In [None]:
# Load the mean UMAP projections and keep only the rows selected by
# sampled_indices_train_mds, so they can be compared with the MDS embedding.
# Keys are the n_neighbors values (taken from the file names).
umap_projections_downsampled = {
    10: np.load("mean_projection_10_01_35.npy")[sampled_indices_train_mds],  
    50: np.load("mean_projection_50_01_35.npy")[sampled_indices_train_mds],  
    100: np.load("mean_projection_100_01_35.npy")[sampled_indices_train_mds],
}

In [None]:
# Print the shape of every downsampled projection as a quick sanity check
for n_neighbors, projection in umap_projections_downsampled.items():
    print(f"UMAP (n_neighbors={n_neighbors}) downsampled shape: {projection.shape}")

#### Sammon's stress

In [None]:
# Function to calculate Sammon's stress
def sammons_stress(original_distances, embedding_distances):
    """
    Compute Sammon's stress, normalized by the total reference distance.

    Reference distances are floored at a tiny positive value before use, so
    zero entries (e.g. the diagonal) cannot cause a division by zero.
    """
    eps = 1e-9  # lower bound applied to the reference distances
    d_ref = np.maximum(eps, original_distances)
    squared_error = (d_ref - embedding_distances) ** 2
    # Weighted squared error divided by the sum of all reference distances
    return np.sum(squared_error / d_ref) / np.sum(d_ref)

# Compute pairwise distances for the original MDS embedding.
# Here the reference distances come from the 2-D MDS embedding (x_train_mds_c2),
# so the stress computed against them measures agreement with MDS.
original_distances = sklearn_pairwise_distances(x_train_mds_c2, metric='euclidean')
print(f"Original distances shape: {original_distances.shape}")  # Should be (5996, 5996)

In [None]:
# Calculate Sammon's stress for UMAP embeddings against the reference distances
stress_results = {}  # n_neighbors -> Sammon's stress (one scalar per setting)
for n_neighbors, umap_embedding in umap_projections_downsampled.items():
    # Compute pairwise distances for the UMAP embedding
    umap_distances = sklearn_pairwise_distances(umap_embedding, metric='euclidean')
    print(f"UMAP (n_neighbors={n_neighbors}) distances shape: {umap_distances.shape}")  # Should be (5996, 5996)

    # Calculate Sammon's stress (both matrices must have the same shape)
    stress = sammons_stress(original_distances, umap_distances)
    stress_results[n_neighbors] = stress

# Print the Sammon's stress results
for n_neighbors, stress in stress_results.items():
    print(f"Sammon's stress for UMAP (n_neighbors={n_neighbors}): {stress}")

In [None]:
# Step 3: Visualize Results
# Collect (title, 2-D coordinates) for every panel: MDS first, then one UMAP panel per setting
panels = [("MDS Embedding", x_train_mds_c2)]
panels += [
    (f"UMAP Embedding (n_neighbors={k})", umap_projections_downsampled[k])
    for k in (10, 50, 100)
]

plt.figure(figsize=(12, 8))

# Fill the 2x2 grid in reading order; every panel is coloured by the digit label
for position, (panel_title, coords) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.scatter(coords[:, 0], coords[:, 1], c=y_train_sampled_mds, cmap='Spectral', s=5)
    plt.title(panel_title)
    plt.colorbar(label="Digit Label")

plt.tight_layout()
plt.show()

Sammon's stress and run-to-run variability, using Student's t-distribution (because the number of runs, 10, is below 30)

In [None]:
# Iterate over stress results
# NOTE(review): this loop only rebinds the local name `run_stress_values`;
# nothing is written back to `stress_results`, so it has no lasting effect
# beyond leaving the last converted value in that variable.
for n_neighbors, run_stress_values in stress_results.items():
    # Ensure run_stress_values is an array
    run_stress_values = np.array(run_stress_values)

    if run_stress_values.ndim == 0:  # Handle scalar case (not iterable)
        run_stress_values = np.array([run_stress_values])  # Convert scalar to array

In [None]:
# Import necessary libraries
from sklearn.metrics import pairwise_distances
from scipy.stats import t
import numpy as np

# Step 1: Take the first 10 runs of each in-memory UMAP projection stack,
# restricted to the points selected by sampled_indices_train_mds
umap_projections_dict = {
    10: umap_projections_10_01_35[:10, sampled_indices_train_mds, :],  # First 10 runs for n_neighbors=10
    50: umap_projections_50_01_35[:10, sampled_indices_train_mds, :],  # First 10 runs for n_neighbors=50
    100: umap_projections_100_01_35[:10, sampled_indices_train_mds, :]  # First 10 runs for n_neighbors=100
}

# Step 2: Calculate Sammon's Stress for Each Run
sammon_results = {}  # n_neighbors -> list of (run_number, stress)
for n_neighbors, projections in umap_projections_dict.items():
    stress_values = []
    for run_number, run_projection in enumerate(projections, start=1):
        try:
            # Compute pairwise distances for the UMAP embedding
            embedding_distances = pairwise_distances(run_projection, metric='euclidean')
            # Calculate Sammon's stress against the reference distances
            stress = sammons_stress(original_distances, embedding_distances)
        except Exception as e:
            # Best effort: report the failing run and carry on with the others
            print(f"Error calculating stress for n_neighbors={n_neighbors}, run={run_number}: {e}")
            continue
        stress_values.append((run_number, stress))
    sammon_results[n_neighbors] = stress_values

# Step 3: Print the Results for the First 10 Runs
print("Sammon's Stress Results for the First 10 Runs:")
for n_neighbors, stress_values in sammon_results.items():
    print(f"\nUMAP (n_neighbors={n_neighbors}):")
    for run_number, stress in stress_values:
        print(f"  Run {run_number}: Sammon's Stress = {stress:.6f}")

# Step 4: Update Variability Computation with Multiple Runs
final_results = {}
run_variability = {}

for n_neighbors, stress_values in sammon_results.items():
    # Keep the true run numbers so a failed run does not shift the labels later on
    run_numbers = [run_number for run_number, _ in stress_values]
    run_stress_values = [stress for _, stress in stress_values]
    n_valid_runs = len(run_stress_values)

    if n_valid_runs == 0:
        # Every run failed: report NaN explicitly instead of averaging an empty list
        print(f"Warning: no successful runs for n_neighbors={n_neighbors}; statistics are NaN.")
        mean_stress = float("nan")
        std_stress = float("nan")
        confidence_interval = (mean_stress, mean_stress)
    elif n_valid_runs == 1:
        # The sample standard deviation (ddof=1) is undefined for a single value
        mean_stress = np.mean(run_stress_values)
        std_stress = float("nan")
        confidence_interval = (mean_stress, mean_stress)
    else:
        mean_stress = np.mean(run_stress_values)
        std_stress = np.std(run_stress_values, ddof=1)  # Use ddof=1 for sample standard deviation
        # Two-sided 95% confidence interval using Student's t-distribution
        t_score = t.ppf(0.975, df=n_valid_runs - 1)
        margin_of_error = t_score * (std_stress / np.sqrt(n_valid_runs))
        confidence_interval = (mean_stress - margin_of_error, mean_stress + margin_of_error)

    # Store final results
    final_results[n_neighbors] = {
        "mean": mean_stress,
        "std": std_stress,
        "95% CI": confidence_interval,
        "run_values": run_stress_values,
        "run_numbers": run_numbers
    }
    # Store standard deviation for run-to-run variability
    run_variability[n_neighbors] = std_stress

# Step 5: Print Final Results with Variability
print("\nSammon's Stress Results with Variability (10 Runs):")
for n_neighbors, stats in final_results.items():
    print(f"n_neighbors={n_neighbors}:")
    print(f"  Mean Stress: {stats['mean']:.4f}")
    print(f"  Standard Deviation Across Runs: {stats['std']:.4f}")
    print(f"  95% Confidence Interval: {stats['95% CI']}")
    # Label each value with the run it actually came from
    for run_idx, stress in zip(stats['run_numbers'], stats['run_values']):
        print(f"    Run {run_idx}: Stress={stress:.4f}")


#### Procrustes Distance

In [None]:
from scipy.spatial import procrustes

# Calculate Procrustes distance between MDS and UMAP embeddings
# The MDS embedding is the fixed reference configuration for every comparison
mds_embedding = x_train_mds_c2
procrustes_results = {}
for n_neighbors, umap_embedding in umap_projections_downsampled.items():
    # procrustes returns the standardized reference, the aligned second matrix, and the disparity
    _, umap_aligned, disparity = procrustes(mds_embedding, umap_embedding)
    # Only the disparity is kept as the distance for this setting
    procrustes_results[n_neighbors] = disparity

# Report one rounded distance per n_neighbors setting
for n_neighbors, distance in procrustes_results.items():
    print(f"Procrustes Distance for UMAP (n_neighbors={n_neighbors}): {np.round(distance,3)}")

--------

-------------

# UMAP Methodology in image-based facial emotion recognition

## FER 2013 Import and images preparation

In [None]:
# Read the FER2013 CSV into a DataFrame and print its column summary.
# Later cells use the columns "pixels", "emotion" and "Usage".
df = pd.read_csv("fer2013.csv")
df.info()

In [None]:
# Sample 15 random rows from the DataFrame (fixed seed, so the same rows are drawn each time)
random_samples = df.sample(n=15, random_state=42)
random_samples

In [None]:
# Loop through the sampled rows and decode each pixel string
# NOTE(review): despite the original intent to display images, nothing is
# plotted here; each iteration overwrites `pixels_array`, so only the last
# decoded image remains after the loop.
for index, row in random_samples.iterrows():
    # Convert the space-separated pixel string into a 48x48 uint8 array
    pixels_array = np.array(list(map(int, row["pixels"].split())), dtype=np.uint8).reshape(48, 48)

In [None]:
# Show the 15 sampled faces in a 3 x 5 grid
fig, axes = plt.subplots(nrows=3, ncols=5, figsize=(15, 9))
axes = axes.flatten()  # 1-D view of the grid so it can be paired with the rows

for ax, (row_label, row) in zip(axes, random_samples.iterrows()):
    # Decode the space-separated pixel string into a 48x48 grayscale image
    pixel_values = [int(value) for value in row["pixels"].split()]
    pixels_array = np.array(pixel_values, dtype=np.uint8).reshape(48, 48)

    ax.imshow(pixels_array, cmap="gray")
    # Title: usage split, row index in the DataFrame, and emotion code
    ax.set_title(f"{row['Usage']}[{row_label}] = {row['emotion']}")
    ax.axis('off')  # hide ticks and frame

# Keep titles from overlapping neighbouring panels
plt.tight_layout()
plt.show()

In [None]:
# Split into training and testing sets based on 'Usage' column
train_fer2013 = df[df.Usage == "Training"]
# Every row whose Usage is not "Training" goes to the test set
test_fer2013 = df[df.Usage != "Training"]  # Assuming all non-training rows are for testing

In [None]:
# Step 1: Convert 'pixels' column to numerical arrays
def convert_pixels(pixels_str):
    """Parse a whitespace-separated string of integers into a 1-D uint8 array."""
    intensities = list(map(int, pixels_str.split()))
    return np.array(intensities, dtype=np.uint8)

# StandardScaler is not part of the notebook's main import cell; import it
# here so this cell does not depend on an import made elsewhere.
from sklearn.preprocessing import StandardScaler

# Apply conversion: one uint8 row per image, stacked into a 2-D array
train_fer2013_pixels = np.vstack(train_fer2013["pixels"].apply(convert_pixels).values)
test_fer2013_pixels = np.vstack(test_fer2013["pixels"].apply(convert_pixels).values)

# Step 2: Reshape to flatten into 1D vectors (2304 features)
x_train_fer2013 = train_fer2013_pixels.reshape(-1, 48 * 48)  # Shape: (28709, 2304)
x_test_fer2013 = test_fer2013_pixels.reshape(-1, 48 * 48)    # Shape: (7178, 2304)

# Step 3: Normalize the flattened data
# The scaler is fitted on the training set only and then applied to the test set.
# NOTE(review): the UMAP/PCA cells further down use the unscaled
# x_train_fer2013 / x_test_fer2013 arrays, not these scaled versions — confirm intent.
scaler = StandardScaler()
x_train_fer2013_scaled = scaler.fit_transform(x_train_fer2013)
x_test_fer2013_scaled = scaler.transform(x_test_fer2013)

# Verify shapes
print("Shape of x_train_scaled:", x_train_fer2013_scaled.shape)  # Should be (28709, 2304)
print("Shape of x_test_scaled:", x_test_fer2013_scaled.shape)    # Should be (7178, 2304)

In [None]:
# Extract the "emotion" column of each split as a 1D array of labels
y_train_fer2013 = train_fer2013["emotion"].values  # Shape: (28709,)
y_test_fer2013 = test_fer2013["emotion"].values    # Shape: (7178,)

# Verify the shapes
print("Shape of y_train_fer2013:", y_train_fer2013.shape)
print("Shape of y_test_fer2013:", y_test_fer2013.shape)

In [None]:
# Shapes of the flattened feature arrays and label arrays.
# Note: these are the flattened, unscaled arrays; the standardized versions
# are x_train_fer2013_scaled / x_test_fer2013_scaled.
print("Shape of x_train_fer2013:", x_train_fer2013.shape)
print("Shape of x_test_fer2013:", x_test_fer2013.shape)
print("Shape of y_train_fer2013:", y_train_fer2013.shape)
print("Shape of y_test_fer2013:", y_test_fer2013.shape)

------

### UMAP

#### Supervised UMAP 10 runs

In [None]:
# Load the supervised-UMAP projections, mean, and standard deviation for the training set.
# The file names match those written by the np.save calls in the run cell below.
fer_sup_umap_projections_train_10_01= np.load('fer_sup_umap_projections_train_10_01.npy')
fer_mean_sup_umap_projection_train_10_01= np.load('fer_mean_sup_umap_projection_train_10_01.npy')
fer_std_sup_umap_projection_train_10_01= np.load('fer_std_sup_umap_projection_train_10_01.npy')

# Load the projections, mean, and standard deviation for the test set
fer_sup_umap_projections_test_10_01= np.load('fer_sup_umap_projections_test_10_01.npy')
fer_mean_sup_umap_projection_test_10_01= np.load('fer_mean_sup_umap_projection_test_10_01.npy')
fer_std_sup_umap_projection_test_10_01= np.load('fer_std_sup_umap_projection_test_10_01.npy')

In [None]:
# UMAP settings shared by every run
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one run per random seed 0..n_runs-1

# Per-run projections, collected as lists and converted to arrays afterwards
fer_sup_umap_projections_train_10_01 = []
fer_sup_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running Supervised UMAP - Iteration {run + 1}/{n_runs}...")

    # A fresh model per run; the run index doubles as the random seed
    umap_model = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=run,
    )

    # Supervised fit: the training labels are passed together with the features
    projection_train = umap_model.fit_transform(x_train_fer2013, y_train_fer2013)
    fer_sup_umap_projections_train_10_01.append(projection_train)

    # The test set is only transformed with the model fitted above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_fer2013)
    fer_sup_umap_projections_test_10_01.append(projection_test)

# Stack the runs along a new leading axis
fer_sup_umap_projections_train_10_01 = np.asarray(fer_sup_umap_projections_train_10_01)
fer_sup_umap_projections_test_10_01 = np.asarray(fer_sup_umap_projections_test_10_01)

# Mean and standard deviation of every point's coordinates across runs
fer_mean_sup_umap_projection_train_10_01 = fer_sup_umap_projections_train_10_01.mean(axis=0)
fer_std_sup_umap_projection_train_10_01 = fer_sup_umap_projections_train_10_01.std(axis=0)

fer_mean_sup_umap_projection_test_10_01 = fer_sup_umap_projections_test_10_01.mean(axis=0)
fer_std_sup_umap_projection_test_10_01 = fer_sup_umap_projections_test_10_01.std(axis=0)

# Persist everything (training files first, then test files)
for target_path, payload in (
    ('fer_sup_umap_projections_train_10_01.npy', fer_sup_umap_projections_train_10_01),
    ('fer_mean_sup_umap_projection_train_10_01.npy', fer_mean_sup_umap_projection_train_10_01),
    ('fer_std_sup_umap_projection_train_10_01.npy', fer_std_sup_umap_projection_train_10_01),
    ('fer_sup_umap_projections_test_10_01.npy', fer_sup_umap_projections_test_10_01),
    ('fer_mean_sup_umap_projection_test_10_01.npy', fer_mean_sup_umap_projection_test_10_01),
    ('fer_std_sup_umap_projection_test_10_01.npy', fer_std_sup_umap_projection_test_10_01),
):
    np.save(target_path, payload)

# Output confirmation
print("Supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Load the projections, mean, and standard deviation for the training set
fer_sup_umap_projections_train_10_01= np.load('fer_sup_umap_projections_train_10_01.npy')
fer_mean_sup_umap_projection_train_10_01= np.load('fer_mean_sup_umap_projection_train_10_01.npy')
fer_std_sup_umap_projection_train_10_01= np.load('fer_std_sup_umap_projection_train_10_01.npy')

# Load the projections, mean, and standard deviation for the test set
fer_sup_umap_projections_test_10_01= np.load('fer_sup_umap_projections_test_10_01.npy')
fer_mean_sup_umap_projection_test_10_01= np.load('fer_mean_sup_umap_projection_test_10_01.npy')
fer_std_sup_umap_projection_test_10_01= np.load('fer_std_sup_umap_projection_test_10_01.npy')

In [None]:
# Discrete colormap with one colour per distinct emotion label.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
unique_labels = np.unique(y_train_fer2013)
cmap = plt.get_cmap("tab10", len(unique_labels))

# Scatter plot of the mean (across runs) supervised UMAP projection, coloured by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(fer_mean_sup_umap_projection_train_10_01[:, 0], fer_mean_sup_umap_projection_train_10_01[:, 1], c=y_train_fer2013, cmap=cmap, s=5, alpha=0.8)

# Add title and labels
plt.title("Supervised UMAP Projection of FER2013 Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(0, 7))  # One tick per emotion code 0..6
cbar.set_ticklabels([f"Emotion {label}" for label in range(0, 7)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Predict test labels with a 1-nearest-neighbour classifier fitted on the mean
# training projection. The classifier is fitted once and its predictions are
# reused for both scores below (previously it was fitted and run twice).
knn_fer_umap_sup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    fer_mean_sup_umap_projection_train_10_01, y_train_fer2013
)
y_test_pred_fer_umap_sup_10_01 = knn_fer_umap_sup_10_01.predict(fer_mean_sup_umap_projection_test_10_01)

# ARI between the true test labels and the 1-NN predictions
ari_fer_umap_sup_10_01 = adjusted_rand_score(y_test_fer2013, y_test_pred_fer_umap_sup_10_01)
print(f"ARI: {ari_fer_umap_sup_10_01:.4f}")

# Silhouette Score of the test projection, grouped by the predicted labels
silhouette_fer_umap_sup_10_01 = silhouette_score(fer_mean_sup_umap_projection_test_10_01, y_test_pred_fer_umap_sup_10_01)
print(f"Silhouette Score: {silhouette_fer_umap_sup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train_fer2013))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(fer_mean_sup_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
fer_sup_umap_projection_10_01_db_score = davies_bouldin_score(
    fer_mean_sup_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {fer_sup_umap_projection_10_01_db_score:.2f}")

In [None]:
# Step 1: Load the saved mean supervised UMAP projection of the FER2013 training set.
# The previous file name ('mean_sup_projection_10_01_35.npy') follows the MNIST
# naming scheme; the per-class masks below are built from y_train_fer2013, so the
# projection must have one row per FER2013 training sample.
mean_projection = np.load('fer_mean_sup_umap_projection_train_10_01.npy')  # Shape: (n_samples, 2)
if mean_projection.shape[0] != len(y_train_fer2013):
    raise ValueError(
        f"Projection has {mean_projection.shape[0]} rows but there are "
        f"{len(y_train_fer2013)} training labels."
    )

# Step 2: Separate the mean projection by class
classes = np.unique(y_train_fer2013)
class_gaussians = {}

# Calculate the mean and covariance for each class
for c in classes:
    class_points = mean_projection[y_train_fer2013 == c]  # Filter by class
    mean = np.mean(class_points, axis=0)
    cov = np.cov(class_points, rowvar=False)  # rows are samples, columns are the two components
    class_gaussians[c] = {"mean": mean, "cov": cov}

# Step 3: Visualize Gaussian distributions
plt.figure(figsize=(10, 8))

# Plot UMAP embeddings for each class
for c in classes:
    class_points = mean_projection[y_train_fer2013 == c]
    plt.scatter(class_points[:, 0], class_points[:, 1], label=f"Class {c}", alpha=0.5, s=10)

    # Plot Gaussian contours on a 100x100 grid spanning +/-3 units around the class mean
    mean = class_gaussians[c]["mean"]
    cov = class_gaussians[c]["cov"]
    x, y = np.meshgrid(
        np.linspace(mean[0] - 3, mean[0] + 3, 100), 
        np.linspace(mean[1] - 3, mean[1] + 3, 100)
    )
    pos = np.dstack((x, y))
    rv = multivariate_normal(mean, cov)
    plt.contour(x, y, rv.pdf(pos), levels=5, alpha=0.8)

plt.title("UMAP Mean Projections with Gaussian Distributions per Class")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.legend()
plt.show()

# Step 4: Evaluate the density of every class Gaussian at an example point
random_point = np.array([0, 0])  # Example point in UMAP space
likelihoods = {c: multivariate_normal(class_gaussians[c]["mean"], class_gaussians[c]["cov"]).pdf(random_point)
               for c in classes}

print("Likelihoods for Random Point:", likelihoods)


#### Unsupervised UMAP 10 runs

In [None]:
# Load the unsupervised-UMAP projections, mean, and standard deviation for the training set.
# The file names match those written by the np.save calls in the run cell below.
fer_unsup_umap_projections_train_10_01= np.load('fer_unsup_umap_projections_train_10_01.npy')
fer_mean_unsup_umap_projection_train_10_01= np.load('fer_mean_unsup_umap_projection_train_10_01.npy')
fer_std_unsup_umap_projection_train_10_01= np.load('fer_std_unsup_umap_projection_train_10_01.npy')

# Load the projections, mean, and standard deviation for the test set
fer_unsup_umap_projections_test_10_01= np.load('fer_unsup_umap_projections_test_10_01.npy')
fer_mean_unsup_umap_projection_test_10_01= np.load('fer_mean_unsup_umap_projection_test_10_01.npy')
fer_std_unsup_umap_projection_test_10_01= np.load('fer_std_unsup_umap_projection_test_10_01.npy')

In [None]:
# UMAP settings shared by every run
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one run per random seed 0..n_runs-1

# Per-run projections, collected as lists and converted to arrays afterwards
fer_unsup_umap_projections_train_10_01 = []
fer_unsup_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # A fresh model per run; the run index doubles as the random seed
    umap_model = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=run,
    )

    # Unsupervised fit: only the training features are passed
    projection_train = umap_model.fit_transform(x_train_fer2013)
    fer_unsup_umap_projections_train_10_01.append(projection_train)

    # The test set is only transformed with the model fitted above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_fer2013)
    fer_unsup_umap_projections_test_10_01.append(projection_test)

# Stack the runs along a new leading axis
fer_unsup_umap_projections_train_10_01 = np.asarray(fer_unsup_umap_projections_train_10_01)
fer_unsup_umap_projections_test_10_01 = np.asarray(fer_unsup_umap_projections_test_10_01)

# Mean and standard deviation of every point's coordinates across runs
fer_mean_unsup_umap_projection_train_10_01 = fer_unsup_umap_projections_train_10_01.mean(axis=0)
fer_std_unsup_umap_projection_train_10_01 = fer_unsup_umap_projections_train_10_01.std(axis=0)

fer_mean_unsup_umap_projection_test_10_01 = fer_unsup_umap_projections_test_10_01.mean(axis=0)
fer_std_unsup_umap_projection_test_10_01 = fer_unsup_umap_projections_test_10_01.std(axis=0)

# Persist everything (training files first, then test files)
for target_path, payload in (
    ('fer_unsup_umap_projections_train_10_01.npy', fer_unsup_umap_projections_train_10_01),
    ('fer_mean_unsup_umap_projection_train_10_01.npy', fer_mean_unsup_umap_projection_train_10_01),
    ('fer_std_unsup_umap_projection_train_10_01.npy', fer_std_unsup_umap_projection_train_10_01),
    ('fer_unsup_umap_projections_test_10_01.npy', fer_unsup_umap_projections_test_10_01),
    ('fer_mean_unsup_umap_projection_test_10_01.npy', fer_mean_unsup_umap_projection_test_10_01),
    ('fer_std_unsup_umap_projection_test_10_01.npy', fer_std_unsup_umap_projection_test_10_01),
):
    np.save(target_path, payload)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with one colour per distinct emotion label.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
unique_labels = np.unique(y_train_fer2013)
cmap = plt.get_cmap("tab10", len(unique_labels))

# Scatter plot of the mean (across runs) unsupervised UMAP projection, coloured by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    fer_mean_unsup_umap_projection_train_10_01[:, 0],
    fer_mean_unsup_umap_projection_train_10_01[:, 1],
    c=y_train_fer2013, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("Unsupervised UMAP Projection of FER2013 Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(0, 7))  # One tick per emotion code 0..6
cbar.set_ticklabels([f"Emotion {label}" for label in range(0, 7)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Predict test labels with a 1-nearest-neighbour classifier fitted on the mean
# training projection. The classifier is fitted once and its predictions are
# reused for both scores below (previously it was fitted and run twice).
knn_fer_umap_unsup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    fer_mean_unsup_umap_projection_train_10_01, y_train_fer2013
)
y_test_pred_fer_umap_unsup_10_01 = knn_fer_umap_unsup_10_01.predict(fer_mean_unsup_umap_projection_test_10_01)

# ARI between the true test labels and the 1-NN predictions
ari_fer_umap_unsup_10_01 = adjusted_rand_score(y_test_fer2013, y_test_pred_fer_umap_unsup_10_01)
print(f"ARI: {ari_fer_umap_unsup_10_01:.4f}")

# Silhouette Score of the test projection, grouped by the predicted labels
silhouette_fer_umap_unsup_10_01 = silhouette_score(fer_mean_unsup_umap_projection_test_10_01, y_test_pred_fer_umap_unsup_10_01)
print(f"Silhouette Score: {silhouette_fer_umap_unsup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train_fer2013))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(fer_mean_unsup_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
fer_unsup_umap_projection_10_01_db_score = davies_bouldin_score(
    fer_mean_unsup_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {fer_unsup_umap_projection_10_01_db_score:.2f}")

-----

### PCA + UMAP

In [None]:
# Apply PCA: fit on the training features, then project the test features
# with the same fitted components.
# NOTE(review): 0.95 is passed as n_components — presumably a target fraction
# of explained variance; confirm against the scikit-learn PCA documentation.
# NOTE(review): PCA is applied to the unscaled x_train_fer2013, not the
# standardized *_scaled arrays — confirm this is intended.
pca = PCA(0.95)
x_train_fer2013_pca_emotions = pca.fit_transform(x_train_fer2013)
x_test_fer2013_pca_emotions = pca.transform(x_test_fer2013)

# Report how many features remain after the reduction
print(f"Original number of features: {x_train_fer2013.shape[1]}")
print(f"Reduced number of features: {x_train_fer2013_pca_emotions.shape[1]}")

In [None]:
# Save the PCA-transformed training and test features to .npy files
np.save('x_train_fer2013_pca_emotions.npy', x_train_fer2013_pca_emotions)
np.save('x_test_fer2013_pca_emotions.npy', x_test_fer2013_pca_emotions)

#### PCA + UMAP Unsupervised 10 runs

In [None]:
# Load the PCA + unsupervised-UMAP projections, mean, and standard deviation for the training set.
# The file names match those written by the np.save calls in the run cell below.
fer_unsup_pca_umap_projections_train_10_01= np.load('fer_unsup_pca_umap_projections_train_10_01.npy')
fer_mean_unsup_pca_umap_projection_train_10_01= np.load('fer_mean_unsup_pca_umap_projection_train_10_01.npy')
fer_std_unsup_pca_umap_projection_train_10_01= np.load('fer_std_unsup_pca_umap_projection_train_10_01.npy')

# Load the projections, mean, and standard deviation for the test set
fer_unsup_pca_umap_projections_test_10_01= np.load('fer_unsup_pca_umap_projections_test_10_01.npy')
fer_mean_unsup_pca_umap_projection_test_10_01= np.load('fer_mean_unsup_pca_umap_projection_test_10_01.npy')
fer_std_unsup_pca_umap_projection_test_10_01= np.load('fer_std_unsup_pca_umap_projection_test_10_01.npy')

In [None]:
# UMAP settings shared by every run
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one run per random seed 0..n_runs-1

# Per-run projections, collected as lists and converted to arrays afterwards
fer_unsup_pca_umap_projections_train_10_01 = []
fer_unsup_pca_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # A fresh model per run; the run index doubles as the random seed
    umap_model = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        random_state=run,
    )

    # Unsupervised fit on the PCA-reduced training features
    projection_train = umap_model.fit_transform(x_train_fer2013_pca_emotions)
    fer_unsup_pca_umap_projections_train_10_01.append(projection_train)

    # The PCA-reduced test set is only transformed with the model fitted above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_fer2013_pca_emotions)
    fer_unsup_pca_umap_projections_test_10_01.append(projection_test)

# Stack the runs along a new leading axis
fer_unsup_pca_umap_projections_train_10_01 = np.asarray(fer_unsup_pca_umap_projections_train_10_01)
fer_unsup_pca_umap_projections_test_10_01 = np.asarray(fer_unsup_pca_umap_projections_test_10_01)

# Mean and standard deviation of every point's coordinates across runs
fer_mean_unsup_pca_umap_projection_train_10_01 = fer_unsup_pca_umap_projections_train_10_01.mean(axis=0)
fer_std_unsup_pca_umap_projection_train_10_01 = fer_unsup_pca_umap_projections_train_10_01.std(axis=0)

fer_mean_unsup_pca_umap_projection_test_10_01 = fer_unsup_pca_umap_projections_test_10_01.mean(axis=0)
fer_std_unsup_pca_umap_projection_test_10_01 = fer_unsup_pca_umap_projections_test_10_01.std(axis=0)

# Persist everything (training files first, then test files)
for target_path, payload in (
    ('fer_unsup_pca_umap_projections_train_10_01.npy', fer_unsup_pca_umap_projections_train_10_01),
    ('fer_mean_unsup_pca_umap_projection_train_10_01.npy', fer_mean_unsup_pca_umap_projection_train_10_01),
    ('fer_std_unsup_pca_umap_projection_train_10_01.npy', fer_std_unsup_pca_umap_projection_train_10_01),
    ('fer_unsup_pca_umap_projections_test_10_01.npy', fer_unsup_pca_umap_projections_test_10_01),
    ('fer_mean_unsup_pca_umap_projection_test_10_01.npy', fer_mean_unsup_pca_umap_projection_test_10_01),
    ('fer_std_unsup_pca_umap_projection_test_10_01.npy', fer_std_unsup_pca_umap_projection_test_10_01),
):
    np.save(target_path, payload)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with one colour per distinct emotion label.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
unique_labels = np.unique(y_train_fer2013)
cmap = plt.get_cmap("tab10", len(unique_labels))

# Scatter plot of the mean (across runs) PCA + unsupervised UMAP projection, coloured by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    fer_mean_unsup_pca_umap_projection_train_10_01[:, 0],
    fer_mean_unsup_pca_umap_projection_train_10_01[:, 1],
    c=y_train_fer2013, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("PCA + Unsupervised UMAP Projection of FER2013 Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(0, 7))  # One tick per emotion code 0..6
cbar.set_ticklabels([f"Emotion {label}" for label in range(0, 7)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Label the test embedding with a 1-NN classifier fitted on the training
# embedding. The prediction is computed once and reused by both scores below.
y_test_pred_fer_pca_umap_unsup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    fer_mean_unsup_pca_umap_projection_train_10_01, y_train_fer2013
).predict(fer_mean_unsup_pca_umap_projection_test_10_01)

# ARI between the true test labels and the 1-NN predictions
ari_fer_pca_umap_unsup_10_01 = adjusted_rand_score(y_test_fer2013, y_test_pred_fer_pca_umap_unsup_10_01)
print(f"ARI: {ari_fer_pca_umap_unsup_10_01:.4f}")
# Silhouette Score of the test embedding, grouped by the 1-NN predictions
silhouette_fer_pca_umap_unsup_10_01 = silhouette_score(fer_mean_unsup_pca_umap_projection_test_10_01, y_test_pred_fer_pca_umap_unsup_10_01)
print(f"Silhouette Score: {silhouette_fer_pca_umap_unsup_10_01:.2f}")

# Use KMeans for clustering. The estimator is created here (as in the other
# metric cells) so this cell does not depend on a `kmeans` left over from an
# earlier cell.
n_clusters = len(np.unique(y_train_fer2013))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(fer_mean_unsup_pca_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
fer_unsup_pca_projection_10_01_db_score = davies_bouldin_score(
    fer_mean_unsup_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {fer_unsup_pca_projection_10_01_db_score:.2f}")

----------

#### PCA + UMAP Supervised 10 runs

In [None]:
# Reload the cached supervised PCA + UMAP results for the training set:
# the per-run projections, their mean and their standard deviation.
fer_sup_pca_umap_projections_train_10_01 = np.load('fer_sup_pca_umap_projections_train_10_01.npy')
fer_mean_sup_pca_umap_projection_train_10_01 = np.load('fer_mean_sup_pca_umap_projection_train_10_01.npy')
fer_std_sup_pca_umap_projection_train_10_01 = np.load('fer_std_sup_pca_umap_projection_train_10_01.npy')

# Same three arrays for the test set.
fer_sup_pca_umap_projections_test_10_01 = np.load('fer_sup_pca_umap_projections_test_10_01.npy')
fer_mean_sup_pca_umap_projection_test_10_01 = np.load('fer_mean_sup_pca_umap_projection_test_10_01.npy')
fer_std_sup_pca_umap_projection_test_10_01 = np.load('fer_std_sup_pca_umap_projection_test_10_01.npy')

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings are collected here and stacked after the loop
fer_sup_pca_umap_projections_train_10_01 = []
fer_sup_pca_umap_projections_test_10_01 = []

# Every run uses its own seed (random_state=run)
for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Supervised fit: the training labels are passed as the target
    projection_train = umap_model.fit_transform(x_train_fer2013_pca_emotions, y_train_fer2013)
    fer_sup_pca_umap_projections_train_10_01.append(projection_train)

    # The test data is projected with the model fitted on the training data
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_fer2013_pca_emotions)
    fer_sup_pca_umap_projections_test_10_01.append(projection_test)

# Stack the per-run embeddings; axis 0 indexes the run
fer_sup_pca_umap_projections_train_10_01 = np.array(fer_sup_pca_umap_projections_train_10_01)
fer_sup_pca_umap_projections_test_10_01 = np.array(fer_sup_pca_umap_projections_test_10_01)

# Mean and standard deviation over the run axis
fer_mean_sup_pca_umap_projection_train_10_01 = np.mean(fer_sup_pca_umap_projections_train_10_01, axis=0)
fer_mean_sup_pca_umap_projection_test_10_01 = np.mean(fer_sup_pca_umap_projections_test_10_01, axis=0)
fer_std_sup_pca_umap_projection_train_10_01 = np.std(fer_sup_pca_umap_projections_train_10_01, axis=0)
fer_std_sup_pca_umap_projection_test_10_01 = np.std(fer_sup_pca_umap_projections_test_10_01, axis=0)

# Persist everything: training-set arrays first, then test-set arrays
for _npy_name, _npy_array in (
    ('fer_sup_pca_umap_projections_train_10_01.npy', fer_sup_pca_umap_projections_train_10_01),
    ('fer_mean_sup_pca_umap_projection_train_10_01.npy', fer_mean_sup_pca_umap_projection_train_10_01),
    ('fer_std_sup_pca_umap_projection_train_10_01.npy', fer_std_sup_pca_umap_projection_train_10_01),
    ('fer_sup_pca_umap_projections_test_10_01.npy', fer_sup_pca_umap_projections_test_10_01),
    ('fer_mean_sup_pca_umap_projection_test_10_01.npy', fer_mean_sup_pca_umap_projection_test_10_01),
    ('fer_std_sup_pca_umap_projection_test_10_01.npy', fer_std_sup_pca_umap_projection_test_10_01),
):
    np.save(_npy_name, _npy_array)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# One discrete colour per emotion class present in the training labels.
unique_labels = np.unique(y_train_fer2013)
# plt.get_cmap is used instead of plt.cm.get_cmap: matplotlib.cm.get_cmap was
# deprecated in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("tab10", len(unique_labels))

# Scatter the run-averaged 2-D embedding, coloured by the true emotion label.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    fer_mean_sup_pca_umap_projection_train_10_01[:, 0],
    fer_mean_sup_pca_umap_projection_train_10_01[:, 1],
    c=y_train_fer2013, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("PCA + Supervised UMAP Projection of FER2013 Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar ticks follow the labels actually present instead of a hard-coded 0..6.
cbar = plt.colorbar(scatter)
cbar.set_ticks(unique_labels)
cbar.set_ticklabels([f"Emotion {int(label)}" for label in unique_labels])
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Label the test embedding with a 1-NN classifier fitted on the training
# embedding. The prediction is computed once and reused by both scores below.
y_test_pred_fer_pca_umap_sup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    fer_mean_sup_pca_umap_projection_train_10_01, y_train_fer2013
).predict(fer_mean_sup_pca_umap_projection_test_10_01)

# ARI between the true test labels and the 1-NN predictions
ari_fer_pca_umap_sup_10_01 = adjusted_rand_score(y_test_fer2013, y_test_pred_fer_pca_umap_sup_10_01)
print(f"ARI: {ari_fer_pca_umap_sup_10_01:.4f}")
# Silhouette Score of the test embedding, grouped by the 1-NN predictions
silhouette_fer_pca_umap_sup_10_01 = silhouette_score(fer_mean_sup_pca_umap_projection_test_10_01, y_test_pred_fer_pca_umap_sup_10_01)
print(f"Silhouette Score: {silhouette_fer_pca_umap_sup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train_fer2013))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(fer_mean_sup_pca_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
fer_sup_umap_pca_projection_10_01_db_score = davies_bouldin_score(
    fer_mean_sup_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {fer_sup_umap_pca_projection_10_01_db_score:.2f}")

---------

### Gabor filters + PCA + UMAP

#### Gabor filters + PCA + Unsupervised UMAP 10 runs

In [None]:
# Reload the cached unsupervised Gabor + PCA + UMAP results for the training
# set: the per-run projections, their mean and their standard deviation.
fer_unsup_gabor_pca_umap_projections_train_10_01 = np.load('fer_unsup_gabor_pca_umap_projections_train_10_01.npy')
fer_mean_unsup_gabor_pca_umap_projection_train_10_01 = np.load('fer_mean_unsup_gabor_pca_umap_projection_train_10_01.npy')
fer_std_unsup_gabor_pca_umap_projection_train_10_01 = np.load('fer_std_unsup_gabor_pca_umap_projection_train_10_01.npy')

# Same three arrays for the test set.
fer_unsup_gabor_pca_umap_projections_test_10_01 = np.load('fer_unsup_gabor_pca_umap_projections_test_10_01.npy')
fer_mean_unsup_gabor_pca_umap_projection_test_10_01 = np.load('fer_mean_unsup_gabor_pca_umap_projection_test_10_01.npy')
fer_std_unsup_gabor_pca_umap_projection_test_10_01 = np.load('fer_std_unsup_gabor_pca_umap_projection_test_10_01.npy')

In [None]:
# Create Gabor Kernels
def create_gabor_kernels(n_orientations=4, ksize=31, sigma=4.0, lambd=10.0, gamma=0.5, psi=0):
    """Generate a bank of Gabor kernels that differ only in orientation.

    The orientations are evenly spaced over [0, pi). With the defaults this
    yields 4 kernels at 0, pi/4, pi/2 and 3*pi/4, all sharing one wavelength.

    Parameters:
        n_orientations (int): Number of orientations (kernels) to generate.
        ksize (int): Side length of the square kernel, in pixels.
        sigma (float): Standard deviation of the Gaussian envelope.
        lambd (float): Wavelength of the sinusoidal factor.
        gamma (float): Spatial aspect ratio.
        psi (float): Phase offset of the sinusoidal factor.

    Returns:
        list: One float32 kernel (as returned by cv2.getGaborKernel) per
        orientation, ordered by increasing angle.

    Raises:
        ValueError: If n_orientations is smaller than 1.
    """
    if n_orientations < 1:
        raise ValueError("n_orientations must be at least 1")

    # Angles are built as index * step rather than with a float-step arange,
    # so exactly n_orientations angles are produced for any n_orientations.
    step = np.pi / n_orientations
    kernels = []
    for index in range(n_orientations):
        theta = float(index * step)
        kernel = cv2.getGaborKernel((ksize, ksize), sigma, theta, lambd, gamma, psi=psi, ktype=cv2.CV_32F)
        kernels.append(kernel)
    return kernels

# Apply Gabor Filters
def apply_gabor_filters(images, kernels, image_shape=(48, 48)):
    """Apply every Gabor kernel to every image and flatten the responses.

    Parameters:
        images: Iterable of arrays, each reshapeable to image_shape
            (for example flattened 48x48 images).
        kernels: Iterable of 2-D filter kernels.
        image_shape (tuple): (height, width) each image is reshaped to before
            filtering. Defaults to (48, 48).

    Returns:
        np.array: One row per image. Each row is the concatenation, in kernel
        order, of the flattened filter responses for that image.
    """
    # Materialise once so a generator of kernels is not exhausted by the first image.
    kernels = list(kernels)
    gabor_features = []
    for image in images:
        image_2d = image.reshape(image_shape)  # Restore the 2-D layout
        responses = [
            cv2.filter2D(image_2d, cv2.CV_32F, kernel).flatten()
            for kernel in kernels
        ]
        gabor_features.append(np.concatenate(responses))
    return np.array(gabor_features)

In [None]:
# Build the Gabor filter bank once and report how many kernels it contains.
gabor_kernels = create_gabor_kernels()
print(f"Generated {len(gabor_kernels)} Gabor kernels.")

In [None]:
# Apply the same Gabor filter bank to the FER2013 training and test images.
fer_x_train_gabor = apply_gabor_filters(x_train_fer2013, gabor_kernels)
fer_x_test_gabor = apply_gabor_filters(x_test_fer2013, gabor_kernels)

# Report the resulting feature-matrix shapes.
print(f"Train Gabor feature shape: {fer_x_train_gabor.shape}")
print(f"Test Gabor feature shape: {fer_x_test_gabor.shape}")

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the Gabor features. The scaler is fitted on the training set
# only and then reused, unchanged, to transform the test set.
scaler = StandardScaler()
fer_x_train_gabor = scaler.fit_transform(fer_x_train_gabor)
fer_x_test_gabor = scaler.transform(fer_x_test_gabor)

Applying PCA to the Gabor filter features

In [None]:
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction (here 95%) of the variance.
pca = PCA(0.95)
# Fit on the training features only; project the test features with the same model.
x_fer_train_gabor_pca = pca.fit_transform(fer_x_train_gabor)
x_fer_test_gabor_pca = pca.transform(fer_x_test_gabor)

# Report the reduced feature-matrix shapes.
print(f"Reduced train shape: {x_fer_train_gabor_pca.shape}")
print(f"Reduced test shape: {x_fer_test_gabor_pca.shape}")

In [None]:
# Cache the PCA-reduced Gabor features of FER2013. The files carry the `fer`
# prefix: the previous `x_raf_*` names were a copy-paste slip that would
# collide with the RAF-DB artefacts of the same name.
np.save('x_fer_train_gabor_pca.npy', x_fer_train_gabor_pca)
np.save('x_fer_test_gabor_pca.npy', x_fer_test_gabor_pca)

Applying UMAP to PCA results from Gabor Filters

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings are collected here and stacked after the loop
fer_unsup_gabor_pca_umap_projections_train_10_01 = []
fer_unsup_gabor_pca_umap_projections_test_10_01 = []

# Every run uses its own seed (random_state=run)
for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Unsupervised fit on the PCA-reduced Gabor features of the training set
    projection_train = umap_model.fit_transform(x_fer_train_gabor_pca)
    fer_unsup_gabor_pca_umap_projections_train_10_01.append(projection_train)

    # The test data is projected with the model fitted on the training data
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_fer_test_gabor_pca)
    fer_unsup_gabor_pca_umap_projections_test_10_01.append(projection_test)

# Stack the per-run embeddings; axis 0 indexes the run
fer_unsup_gabor_pca_umap_projections_train_10_01 = np.array(fer_unsup_gabor_pca_umap_projections_train_10_01)
fer_unsup_gabor_pca_umap_projections_test_10_01 = np.array(fer_unsup_gabor_pca_umap_projections_test_10_01)

# Mean and standard deviation over the run axis
fer_mean_unsup_gabor_pca_umap_projection_train_10_01 = np.mean(fer_unsup_gabor_pca_umap_projections_train_10_01, axis=0)
fer_mean_unsup_gabor_pca_umap_projection_test_10_01 = np.mean(fer_unsup_gabor_pca_umap_projections_test_10_01, axis=0)
fer_std_unsup_gabor_pca_umap_projection_train_10_01 = np.std(fer_unsup_gabor_pca_umap_projections_train_10_01, axis=0)
fer_std_unsup_gabor_pca_umap_projection_test_10_01 = np.std(fer_unsup_gabor_pca_umap_projections_test_10_01, axis=0)

# Persist everything: training-set arrays first, then test-set arrays
for _npy_name, _npy_array in (
    ('fer_unsup_gabor_pca_umap_projections_train_10_01.npy', fer_unsup_gabor_pca_umap_projections_train_10_01),
    ('fer_mean_unsup_gabor_pca_umap_projection_train_10_01.npy', fer_mean_unsup_gabor_pca_umap_projection_train_10_01),
    ('fer_std_unsup_gabor_pca_umap_projection_train_10_01.npy', fer_std_unsup_gabor_pca_umap_projection_train_10_01),
    ('fer_unsup_gabor_pca_umap_projections_test_10_01.npy', fer_unsup_gabor_pca_umap_projections_test_10_01),
    ('fer_mean_unsup_gabor_pca_umap_projection_test_10_01.npy', fer_mean_unsup_gabor_pca_umap_projection_test_10_01),
    ('fer_std_unsup_gabor_pca_umap_projection_test_10_01.npy', fer_std_unsup_gabor_pca_umap_projection_test_10_01),
):
    np.save(_npy_name, _npy_array)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# One discrete colour per emotion class present in the training labels.
unique_labels = np.unique(y_train_fer2013)
# plt.get_cmap is used instead of plt.cm.get_cmap: matplotlib.cm.get_cmap was
# deprecated in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("tab10", len(unique_labels))

# Scatter the run-averaged 2-D embedding, coloured by the true emotion label.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    fer_mean_unsup_gabor_pca_umap_projection_train_10_01[:, 0],
    fer_mean_unsup_gabor_pca_umap_projection_train_10_01[:, 1],
    c=y_train_fer2013, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("Gabor + PCA + Unsupervised UMAP Projection of FER2013 Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar ticks follow the labels actually present instead of a hard-coded 0..6.
cbar = plt.colorbar(scatter)
cbar.set_ticks(unique_labels)
cbar.set_ticklabels([f"Emotion {int(label)}" for label in unique_labels])
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Label the test embedding with a 1-NN classifier fitted on the training
# embedding. The prediction is computed once and reused by both scores below.
y_test_pred_fer_gabor_pca_umap_unsup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    fer_mean_unsup_gabor_pca_umap_projection_train_10_01, y_train_fer2013
).predict(fer_mean_unsup_gabor_pca_umap_projection_test_10_01)

# ARI between the true test labels and the 1-NN predictions
ari_fer_gabor_pca_umap_unsup_10_01 = adjusted_rand_score(y_test_fer2013, y_test_pred_fer_gabor_pca_umap_unsup_10_01)
print(f"ARI: {ari_fer_gabor_pca_umap_unsup_10_01:.4f}")
# Silhouette Score of the test embedding, grouped by the 1-NN predictions
silhouette_fer_gabor_pca_umap_unsup_10_01 = silhouette_score(fer_mean_unsup_gabor_pca_umap_projection_test_10_01, y_test_pred_fer_gabor_pca_umap_unsup_10_01)
print(f"Silhouette Score: {silhouette_fer_gabor_pca_umap_unsup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train_fer2013))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(fer_mean_unsup_gabor_pca_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
fer_unsup_gabor_pca_umap_projection_10_01_db_score = davies_bouldin_score(
    fer_mean_unsup_gabor_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {fer_unsup_gabor_pca_umap_projection_10_01_db_score:.2f}")

#### Gabor filters + PCA + UMAP Supervised 10 runs

In [None]:
# Reload the cached supervised Gabor + PCA + UMAP results for the training
# set: the per-run projections, their mean and their standard deviation.
fer_sup_gabor_pca_umap_projections_train_10_01 = np.load('fer_sup_gabor_pca_umap_projections_train_10_01.npy')
fer_mean_sup_gabor_pca_umap_projection_train_10_01 = np.load('fer_mean_sup_gabor_pca_umap_projection_train_10_01.npy')
fer_std_sup_gabor_pca_umap_projection_train_10_01 = np.load('fer_std_sup_gabor_pca_umap_projection_train_10_01.npy')

# Same three arrays for the test set.
fer_sup_gabor_pca_umap_projections_test_10_01 = np.load('fer_sup_gabor_pca_umap_projections_test_10_01.npy')
fer_mean_sup_gabor_pca_umap_projection_test_10_01 = np.load('fer_mean_sup_gabor_pca_umap_projection_test_10_01.npy')
fer_std_sup_gabor_pca_umap_projection_test_10_01 = np.load('fer_std_sup_gabor_pca_umap_projection_test_10_01.npy')

Here UMAP is run in supervised mode: the training labels are passed to UMAP when it is fitted on the Gabor filters + PCA features.

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings are collected here and stacked after the loop
fer_sup_gabor_pca_umap_projections_train_10_01 = []
fer_sup_gabor_pca_umap_projections_test_10_01 = []

# Every run uses its own seed (random_state=run)
for run in range(n_runs):
    print(f"Running Supervised UMAP - Iteration {run + 1}/{n_runs}...")

    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=run)

    # Supervised fit: the training labels are passed as the target
    projection_train = umap_model.fit_transform(x_fer_train_gabor_pca, y_train_fer2013)
    fer_sup_gabor_pca_umap_projections_train_10_01.append(projection_train)

    # The test data is projected with the model fitted on the training data
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_fer_test_gabor_pca)
    fer_sup_gabor_pca_umap_projections_test_10_01.append(projection_test)

# Stack the per-run embeddings; axis 0 indexes the run
fer_sup_gabor_pca_umap_projections_train_10_01 = np.array(fer_sup_gabor_pca_umap_projections_train_10_01)
fer_sup_gabor_pca_umap_projections_test_10_01 = np.array(fer_sup_gabor_pca_umap_projections_test_10_01)

# Mean and standard deviation over the run axis
fer_mean_sup_gabor_pca_umap_projection_train_10_01 = np.mean(fer_sup_gabor_pca_umap_projections_train_10_01, axis=0)
fer_mean_sup_gabor_pca_umap_projection_test_10_01 = np.mean(fer_sup_gabor_pca_umap_projections_test_10_01, axis=0)
fer_std_sup_gabor_pca_umap_projection_train_10_01 = np.std(fer_sup_gabor_pca_umap_projections_train_10_01, axis=0)
fer_std_sup_gabor_pca_umap_projection_test_10_01 = np.std(fer_sup_gabor_pca_umap_projections_test_10_01, axis=0)

# Persist everything: training-set arrays first, then test-set arrays
for _npy_name, _npy_array in (
    ('fer_sup_gabor_pca_umap_projections_train_10_01.npy', fer_sup_gabor_pca_umap_projections_train_10_01),
    ('fer_mean_sup_gabor_pca_umap_projection_train_10_01.npy', fer_mean_sup_gabor_pca_umap_projection_train_10_01),
    ('fer_std_sup_gabor_pca_umap_projection_train_10_01.npy', fer_std_sup_gabor_pca_umap_projection_train_10_01),
    ('fer_sup_gabor_pca_umap_projections_test_10_01.npy', fer_sup_gabor_pca_umap_projections_test_10_01),
    ('fer_mean_sup_gabor_pca_umap_projection_test_10_01.npy', fer_mean_sup_gabor_pca_umap_projection_test_10_01),
    ('fer_std_sup_gabor_pca_umap_projection_test_10_01.npy', fer_std_sup_gabor_pca_umap_projection_test_10_01),
):
    np.save(_npy_name, _npy_array)

# Output confirmation
print("Supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# One discrete colour per emotion class present in the training labels.
unique_labels = np.unique(y_train_fer2013)
# plt.get_cmap is used instead of plt.cm.get_cmap: matplotlib.cm.get_cmap was
# deprecated in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("tab10", len(unique_labels))

# Scatter the run-averaged 2-D embedding, coloured by the true emotion label.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    fer_mean_sup_gabor_pca_umap_projection_train_10_01[:, 0],
    fer_mean_sup_gabor_pca_umap_projection_train_10_01[:, 1],
    c=y_train_fer2013, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("Gabor + PCA + Supervised UMAP Projection of FER2013 Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar ticks follow the labels actually present instead of a hard-coded 0..6.
cbar = plt.colorbar(scatter)
cbar.set_ticks(unique_labels)
cbar.set_ticklabels([f"Emotion {int(label)}" for label in unique_labels])
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Label the test embedding with a 1-NN classifier fitted on the training
# embedding. The prediction is computed once and reused by both scores below.
y_test_pred_fer_gabor_pca_umap_sup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    fer_mean_sup_gabor_pca_umap_projection_train_10_01, y_train_fer2013
).predict(fer_mean_sup_gabor_pca_umap_projection_test_10_01)

# ARI between the true test labels and the 1-NN predictions
ari_fer_gabor_pca_umap_sup_10_01 = adjusted_rand_score(y_test_fer2013, y_test_pred_fer_gabor_pca_umap_sup_10_01)
print(f"ARI: {ari_fer_gabor_pca_umap_sup_10_01:.4f}")
# Silhouette Score of the test embedding, grouped by the 1-NN predictions
silhouette_fer_gabor_pca_umap_sup_10_01 = silhouette_score(fer_mean_sup_gabor_pca_umap_projection_test_10_01, y_test_pred_fer_gabor_pca_umap_sup_10_01)
print(f"Silhouette Score: {silhouette_fer_gabor_pca_umap_sup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train_fer2013))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(fer_mean_sup_gabor_pca_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
fer_sup_gabor_pca_umap_projection_10_01_db_score = davies_bouldin_score(
    fer_mean_sup_gabor_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {fer_sup_gabor_pca_umap_projection_10_01_db_score:.2f}")

--------

### LLE + UMAP

In [None]:
def downsample_fer_consistent(x_data, y_labels, sample_fraction=0.75):
    """
    Pick a per-label subsample and return the selected row indices.

    Returning indices (rather than data) lets the caller apply the same
    selection to the samples and to their labels.

    Parameters:
        x_data (np.array): Input data to downsample. Not read by this
            function; kept so the call signature stays unchanged.
        y_labels (array-like): Label of each sample.
        sample_fraction (float): Fraction of samples to retain per label.
            At least one sample, and at most all samples, is kept per label.

    Returns:
        np.array: Integer array of selected indices, grouped by label in the
        order given by np.unique. Empty (but still integer-typed, so it can
        be used for indexing) when y_labels is empty.
    """
    # Accept lists as well as arrays: `list == label` would not broadcast.
    y_labels = np.asarray(y_labels)
    sampled_indices = []

    for label in np.unique(y_labels):
        # Positions of the samples that carry the current label
        label_indices = np.where(y_labels == label)[0]

        # Keep at least one sample, never more than are available
        n_samples = min(
            len(label_indices),
            max(1, int(len(label_indices) * sample_fraction)),
        )

        # Sampling without replacement with a fixed seed keeps the selection
        # reproducible between calls.
        sampled_indices.extend(
            resample(label_indices, n_samples=n_samples, replace=False, random_state=42)
        )

    # An explicit integer dtype keeps an empty result usable as an index
    # (np.array([]) would default to float64).
    return np.array(sampled_indices, dtype=np.intp)

# Downsample the training data: draw the per-label indices once and apply the
# same indices to the images and to their labels so the two stay aligned.
fer_sampled_indices_train = downsample_fer_consistent(x_train_fer2013, y_train_fer2013, sample_fraction=0.75)
x_train_fer_emotions_sampled = x_train_fer2013[fer_sampled_indices_train]
y_train_fer_emotions_sampled = y_train_fer2013[fer_sampled_indices_train]

# Downsample the test data the same way, with its own set of indices.
fer_sampled_indices_test = downsample_fer_consistent(x_test_fer2013, y_test_fer2013, sample_fraction=0.75)
x_test_fer_emotions_sampled = x_test_fer2013[fer_sampled_indices_test]
y_test_fer_emotions_sampled = y_test_fer2013[fer_sampled_indices_test]

# Report the sizes of the reduced sets.
print(f"Training set reduced to {len(x_train_fer_emotions_sampled)} samples.")
print(f"Test set reduced to {len(x_test_fer_emotions_sampled)} samples.")

In [None]:
# LLE hyper-parameters
n_neighbors = 10
n_components = 253  # per the original note: same dimensionality as the PCA reduction

# Standard LLE with a fixed seed
lle = LocallyLinearEmbedding(
    n_neighbors=n_neighbors,
    n_components=n_components,
    method='standard',
    random_state=42,
)

# Learn the embedding on the downsampled training images
print("Running LLE on the training set...")
x_train_fer_lle = lle.fit_transform(x_train_fer_emotions_sampled)
print("LLE transformation on training set completed.")

# Map the downsampled test images with the model fitted above
print("Running LLE on the test set...")
x_test_fer_lle = lle.transform(x_test_fer_emotions_sampled)
print("LLE transformation on test set completed.")

# Report the embedded shapes
print(f"Shape of LLE-transformed training data: {x_train_fer_lle.shape}")
print(f"Shape of LLE-transformed test data: {x_test_fer_lle.shape}")

# Cache the embeddings so the LLE step does not have to be repeated
np.save('x_train_lle.npy', x_train_fer_lle)
np.save('x_test_lle.npy', x_test_fer_lle)

# Output confirmation
print("LLE-transformed data has been saved.")

#### LLE + UMAP Unsupervised 10 runs

In [None]:
# Reload the cached unsupervised LLE + UMAP results for the training set:
# the per-run projections, their mean and their standard deviation.
fer_unsup_lle_umap_projections_train_10_01 = np.load('fer_unsup_lle_umap_projections_train_10_01.npy')
fer_mean_unsup_lle_umap_projection_train_10_01 = np.load('fer_mean_unsup_lle_umap_projection_train_10_01.npy')
fer_std_unsup_lle_umap_projection_train_10_01 = np.load('fer_std_unsup_lle_umap_projection_train_10_01.npy')

# Same three arrays for the test set.
fer_unsup_lle_umap_projections_test_10_01 = np.load('fer_unsup_lle_umap_projections_test_10_01.npy')
fer_mean_unsup_lle_umap_projection_test_10_01 = np.load('fer_mean_unsup_lle_umap_projection_test_10_01.npy')
fer_std_unsup_lle_umap_projection_test_10_01 = np.load('fer_std_unsup_lle_umap_projection_test_10_01.npy')

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings are collected here and stacked after the loop
fer_unsup_lle_umap_projections_train_10_01 = []
fer_unsup_lle_umap_projections_test_10_01 = []

# Every run uses its own seed (random_state=run)
for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Unsupervised fit on the LLE embedding of the training set
    projection_train = umap_model.fit_transform(x_train_fer_lle)
    fer_unsup_lle_umap_projections_train_10_01.append(projection_train)

    # The test data is projected with the model fitted on the training data
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_fer_lle)
    fer_unsup_lle_umap_projections_test_10_01.append(projection_test)

# Stack the per-run embeddings; axis 0 indexes the run
fer_unsup_lle_umap_projections_train_10_01 = np.array(fer_unsup_lle_umap_projections_train_10_01)
fer_unsup_lle_umap_projections_test_10_01 = np.array(fer_unsup_lle_umap_projections_test_10_01)

# Mean and standard deviation over the run axis
fer_mean_unsup_lle_umap_projection_train_10_01 = np.mean(fer_unsup_lle_umap_projections_train_10_01, axis=0)
fer_mean_unsup_lle_umap_projection_test_10_01 = np.mean(fer_unsup_lle_umap_projections_test_10_01, axis=0)
fer_std_unsup_lle_umap_projection_train_10_01 = np.std(fer_unsup_lle_umap_projections_train_10_01, axis=0)
fer_std_unsup_lle_umap_projection_test_10_01 = np.std(fer_unsup_lle_umap_projections_test_10_01, axis=0)

# Persist everything: training-set arrays first, then test-set arrays
for _npy_name, _npy_array in (
    ('fer_unsup_lle_umap_projections_train_10_01.npy', fer_unsup_lle_umap_projections_train_10_01),
    ('fer_mean_unsup_lle_umap_projection_train_10_01.npy', fer_mean_unsup_lle_umap_projection_train_10_01),
    ('fer_std_unsup_lle_umap_projection_train_10_01.npy', fer_std_unsup_lle_umap_projection_train_10_01),
    ('fer_unsup_lle_umap_projections_test_10_01.npy', fer_unsup_lle_umap_projections_test_10_01),
    ('fer_mean_unsup_lle_umap_projection_test_10_01.npy', fer_mean_unsup_lle_umap_projection_test_10_01),
    ('fer_std_unsup_lle_umap_projection_test_10_01.npy', fer_std_unsup_lle_umap_projection_test_10_01),
):
    np.save(_npy_name, _npy_array)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# One discrete colour per emotion class present in the downsampled training labels.
unique_labels = np.unique(y_train_fer_emotions_sampled)
# plt.get_cmap is used instead of plt.cm.get_cmap: matplotlib.cm.get_cmap was
# deprecated in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("tab10", len(unique_labels))

# Scatter the run-averaged 2-D embedding, coloured by the true emotion label.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    fer_mean_unsup_lle_umap_projection_train_10_01[:, 0],
    fer_mean_unsup_lle_umap_projection_train_10_01[:, 1],
    c=y_train_fer_emotions_sampled, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("LLE + Unsupervised UMAP Projection of FER2013 Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar ticks follow the labels actually present instead of a hard-coded 0..6.
cbar = plt.colorbar(scatter)
cbar.set_ticks(unique_labels)
cbar.set_ticklabels([f"Emotion {int(label)}" for label in unique_labels])
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Label the test embedding with a 1-NN classifier fitted on the training
# embedding. The prediction is computed once and reused by both scores below.
y_test_pred_fer_lle_umap_unsup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    fer_mean_unsup_lle_umap_projection_train_10_01, y_train_fer_emotions_sampled
).predict(fer_mean_unsup_lle_umap_projection_test_10_01)

# ARI between the true (downsampled) test labels and the 1-NN predictions
ari_fer_lle_umap_unsup_10_01 = adjusted_rand_score(y_test_fer_emotions_sampled, y_test_pred_fer_lle_umap_unsup_10_01)
print(f"ARI: {ari_fer_lle_umap_unsup_10_01:.2f}")
# Silhouette Score of the test embedding, grouped by the 1-NN predictions
silhouette_fer_lle_umap_unsup_10_01 = silhouette_score(fer_mean_unsup_lle_umap_projection_test_10_01, y_test_pred_fer_lle_umap_unsup_10_01)
print(f"Silhouette Score: {silhouette_fer_lle_umap_unsup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train_fer2013))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(fer_mean_unsup_lle_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
fer_lle_umap_unsup_10_01_db_score = davies_bouldin_score(
    fer_mean_unsup_lle_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {fer_lle_umap_unsup_10_01_db_score:.2f}")

-----

#### LLE + UMAP Supervised 10 runs

In [None]:
# Reload the cached supervised LLE + UMAP results for the training set:
# the per-run projections, their mean and their standard deviation.
fer_sup_lle_umap_projections_train_10_01 = np.load('fer_sup_lle_umap_projections_train_10_01.npy')
fer_mean_sup_lle_umap_projection_train_10_01 = np.load('fer_mean_sup_lle_umap_projection_train_10_01.npy')
fer_std_sup_lle_umap_projection_train_10_01 = np.load('fer_std_sup_lle_umap_projection_train_10_01.npy')

# Same three arrays for the test set.
fer_sup_lle_umap_projections_test_10_01 = np.load('fer_sup_lle_umap_projections_test_10_01.npy')
fer_mean_sup_lle_umap_projection_test_10_01 = np.load('fer_mean_sup_lle_umap_projection_test_10_01.npy')
fer_std_sup_lle_umap_projection_test_10_01 = np.load('fer_std_sup_lle_umap_projection_test_10_01.npy')

In [None]:
# UMAP hyper-parameters for this experiment
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings are collected here and stacked after the loop
fer_sup_lle_umap_projections_train_10_01 = []
fer_sup_lle_umap_projections_test_10_01 = []

# Every run uses its own seed (random_state=run)
for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Supervised fit: the downsampled training labels are passed as the target
    projection_train = umap_model.fit_transform(x_train_fer_lle, y_train_fer_emotions_sampled)
    fer_sup_lle_umap_projections_train_10_01.append(projection_train)

    # The test data is projected with the model fitted on the training data
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_fer_lle)
    fer_sup_lle_umap_projections_test_10_01.append(projection_test)

# Stack the per-run embeddings; axis 0 indexes the run
fer_sup_lle_umap_projections_train_10_01 = np.array(fer_sup_lle_umap_projections_train_10_01)
fer_sup_lle_umap_projections_test_10_01 = np.array(fer_sup_lle_umap_projections_test_10_01)

# Mean and standard deviation over the run axis
fer_mean_sup_lle_umap_projection_train_10_01 = np.mean(fer_sup_lle_umap_projections_train_10_01, axis=0)
fer_mean_sup_lle_umap_projection_test_10_01 = np.mean(fer_sup_lle_umap_projections_test_10_01, axis=0)
fer_std_sup_lle_umap_projection_train_10_01 = np.std(fer_sup_lle_umap_projections_train_10_01, axis=0)
fer_std_sup_lle_umap_projection_test_10_01 = np.std(fer_sup_lle_umap_projections_test_10_01, axis=0)

# Persist everything: training-set arrays first, then test-set arrays
for _npy_name, _npy_array in (
    ('fer_sup_lle_umap_projections_train_10_01.npy', fer_sup_lle_umap_projections_train_10_01),
    ('fer_mean_sup_lle_umap_projection_train_10_01.npy', fer_mean_sup_lle_umap_projection_train_10_01),
    ('fer_std_sup_lle_umap_projection_train_10_01.npy', fer_std_sup_lle_umap_projection_train_10_01),
    ('fer_sup_lle_umap_projections_test_10_01.npy', fer_sup_lle_umap_projections_test_10_01),
    ('fer_mean_sup_lle_umap_projection_test_10_01.npy', fer_mean_sup_lle_umap_projection_test_10_01),
    ('fer_std_sup_lle_umap_projection_test_10_01.npy', fer_std_sup_lle_umap_projection_test_10_01),
):
    np.save(_npy_name, _npy_array)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# One discrete colour per emotion class present in the downsampled training labels.
unique_labels = np.unique(y_train_fer_emotions_sampled)
# plt.get_cmap is used instead of plt.cm.get_cmap: matplotlib.cm.get_cmap was
# deprecated in Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap("tab10", len(unique_labels))

# Scatter the run-averaged 2-D embedding, coloured by the true emotion label.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    fer_mean_sup_lle_umap_projection_train_10_01[:, 0],
    fer_mean_sup_lle_umap_projection_train_10_01[:, 1],
    c=y_train_fer_emotions_sampled, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("LLE + Supervised UMAP Projection of FER2013 Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar ticks follow the labels actually present instead of a hard-coded 0..6.
cbar = plt.colorbar(scatter)
cbar.set_ticks(unique_labels)
cbar.set_ticklabels([f"Emotion {int(label)}" for label in unique_labels])
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Label the test embedding with a 1-NN classifier fitted on the training
# embedding. The prediction is computed once and reused by both scores below.
y_test_pred_fer_lle_umap_sup_10_01 = KNeighborsClassifier(n_neighbors=1).fit(
    fer_mean_sup_lle_umap_projection_train_10_01, y_train_fer_emotions_sampled
).predict(fer_mean_sup_lle_umap_projection_test_10_01)

# ARI between the true (downsampled) test labels and the 1-NN predictions
ari_fer_lle_umap_sup_10_01 = adjusted_rand_score(y_test_fer_emotions_sampled, y_test_pred_fer_lle_umap_sup_10_01)
print(f"ARI: {ari_fer_lle_umap_sup_10_01:.2f}")
# Silhouette Score of the test embedding, grouped by the 1-NN predictions
silhouette_fer_lle_umap_sup_10_01 = silhouette_score(fer_mean_sup_lle_umap_projection_test_10_01, y_test_pred_fer_lle_umap_sup_10_01)
print(f"Silhouette Score: {silhouette_fer_lle_umap_sup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train_fer2013))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(fer_mean_sup_lle_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
fer_lle_umap_sup_10_01_db_score = davies_bouldin_score(
    fer_mean_sup_lle_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {fer_lle_umap_sup_10_01_db_score:.2f}")

------

## RAF-DB Import and images preparation

Load images and labels

In [4]:
class EmotionsDataloader(object):
    """Load aligned face images and their labels into train/test arrays.

    Parameters
    ----------
    image_dir : str
        Directory containing the ``<name>_aligned.jpg`` image files.
    label_file : str
        Text file with one ``<image_name> <integer_label>`` pair per line.
    """

    def __init__(self, image_dir, label_file):
        self.image_dir = image_dir
        self.label_file = label_file

    @staticmethod
    def _load_grayscale_48(image_path):
        """Return the image at ``image_path`` as a 48x48 grayscale array."""
        # Imported locally under an alias: the notebook's import cell also runs
        # `from IPython.display import SVG, Image`, which rebinds the global
        # name `Image` to a class that has no `open`.
        from PIL import Image as PILImage

        # The context manager closes the underlying file once the pixels
        # have been copied into the array.
        with PILImage.open(image_path) as image:
            return np.array(image.convert('L').resize((48, 48)))

    def read_images_labels(self):
        """Read the label file and load every listed image that exists.

        Lines that do not split into exactly two whitespace-separated fields
        are ignored. An entry goes to the training split when its name
        contains "train", to the test split when it contains "test", and is
        dropped otherwise. Missing image files are reported and skipped.

        Returns
        -------
        tuple
            ``((train_images, train_labels), (test_images, test_labels))``,
            each element a NumPy array.

        Raises
        ------
        ValueError
            If the label field of a two-field line is not an integer.
        """
        train_images, train_labels = [], []
        test_images, test_labels = [], []

        with open(self.label_file, 'r') as file:
            for line in file:
                parts = line.strip().split()
                if len(parts) != 2:
                    continue

                image_name, label = parts[0], int(parts[1])
                aligned_image_name = f"{image_name.split('.')[0]}_aligned.jpg"
                image_path = os.path.join(self.image_dir, aligned_image_name)

                if not os.path.exists(image_path):
                    print(f"Image not found: {aligned_image_name}")
                    continue

                # Pick the destination split before decoding, so images that
                # belong to neither split are never opened.
                if "train" in image_name:
                    images, labels = train_images, train_labels
                elif "test" in image_name:
                    images, labels = test_images, test_labels
                else:
                    continue

                images.append(self._load_grayscale_48(image_path))
                labels.append(label)

        print(f"Loaded {len(train_images)} training images and {len(test_images)} test images.")
        print(f"Loaded {len(train_labels)} training labels and {len(test_labels)} test labels.")

        return (
            (np.array(train_images), np.array(train_labels)),
            (np.array(test_images), np.array(test_labels))
        )

    def load_data(self):
        """Return the result of :meth:`read_images_labels`."""
        return self.read_images_labels()

In [None]:
# Locations of the extracted RAF-DB images and of the partition/label list
input_path = 'C:/Users/Lorenzo/OneDrive/Documents/DTU/Python/2024 Fall/MSc Thesis'
image_dir = os.path.join(input_path, 'extracted_data/Image/aligned')
label_file = os.path.join(input_path, 'extracted_data/EmoLabel/list_patition_label.txt')

# Build the loader, then read both splits in a single pass over the label file
emotions_dataloader = EmotionsDataloader(image_dir, label_file)
_raf_train_split, _raf_test_split = emotions_dataloader.load_data()
x_train, y_train = _raf_train_split
x_test, y_test = _raf_test_split

# Report the array shapes of each split
print(f"Training set shape: {x_train.shape}, {y_train.shape}")
print(f"Testing set shape: {x_test.shape}, {y_test.shape}")

# Display some random train and test images
def show_images(images, title_texts):
    """Draw ``images`` on a 5-column grid, each with its title above it.

    Parameters
    ----------
    images : sequence
        Images accepted by ``plt.imshow``; drawn with the gray colormap.
    title_texts : sequence of str
        One title per image. Pairs are formed with ``zip``, so surplus
        entries on either side are ignored.
    """
    cols = 5
    # Ceiling division gives exactly enough rows. The previous
    # int(n / cols) + 1 added an empty row whenever n was a multiple of cols.
    rows = max(1, -(-len(images) // cols))
    plt.figure(figsize=(15, 10))
    for position, (image, title) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, position)
        plt.imshow(image, cmap=plt.cm.gray)
        plt.title(title, fontsize=12)
        plt.axis('off')

# Draw 10 random training images followed by 5 random test images
# (with replacement) and title each with its split, index and label.
images_to_show = []
titles_to_show = []

for _split_name, _split_images, _split_labels, _n_draws in (
    ("Train", x_train, y_train, 10),
    ("Test", x_test, y_test, 5),
):
    for _draw in range(_n_draws):
        idx = np.random.randint(0, len(_split_images))
        images_to_show.append(_split_images[idx])
        titles_to_show.append(f"{_split_name}[{idx}] = {_split_labels[idx]}")

show_images(images_to_show, titles_to_show)

In [6]:
### YES ###

# Every image must have exactly one label, in both splits
for _split_images, _split_labels, _mismatch_message in (
    (x_train, y_train, "Mismatch in training images and labels!"),
    (x_test, y_test, "Mismatch in test images and labels!"),
):
    assert len(_split_images) == len(_split_labels), _mismatch_message

In [None]:
### YES ###

from sklearn.preprocessing import StandardScaler

# Step 1: Flatten each image into a 1D feature vector (one row per image)
x_train_flattened = x_train.reshape(x_train.shape[0], -1)  # Flatten to (num_samples, 2304)
x_test_flattened = x_test.reshape(x_test.shape[0], -1)    # Flatten to (num_samples, 2304)

# Verify shapes
print("Shape of x_train_flattened:", x_train_flattened.shape)  # (num_train_samples, 2304)
print("Shape of x_test_flattened:", x_test_flattened.shape)    # (num_test_samples, 2304)

# Step 2: Standardize every feature. The scaler is fitted on the training data
# only and then applied unchanged to the test data.
scaler = StandardScaler()
x_train_emotion_norm = scaler.fit_transform(x_train_flattened)
x_test_emotion_norm = scaler.transform(x_test_flattened)

# Verify normalization on the training set. Both statistics are taken from
# x_train_emotion_norm; the std line used to read the test array by mistake.
print("x_train_norma mean:", x_train_emotion_norm.mean(axis=0).mean())  # ~0
print("x_train_norma std:", x_train_emotion_norm.std(axis=0).mean())    # ~1

# Print final shapes
print("Final shape of x_train_norm:", x_train_emotion_norm.shape)
print("Final shape of x_test_norm:", x_test_emotion_norm.shape)

In [None]:
### YES ###

# Count NaN entries in each standardized split
_train_nan_count = np.isnan(x_train_emotion_norm).sum()
_test_nan_count = np.isnan(x_test_emotion_norm).sum()
print(f"Missing values in x_train_emotion_norm: {_train_nan_count}")
print(f"Missing values in x_test_emotion_norm: {_test_nan_count}")

-----------------

### UMAP

#### Unsupervised UMAP 10 runs

In [2]:
# Reload the cached unsupervised UMAP results for the training set:
# all runs, their mean and their standard deviation
(
    raf_unsup_umap_projections_train_10_01,
    raf_mean_unsup_umap_projection_train_10_01,
    raf_std_unsup_umap_projection_train_10_01,
) = (
    np.load('raf_unsup_umap_projections_train_10_01.npy'),
    np.load('raf_mean_unsup_umap_projection_train_10_01.npy'),
    np.load('raf_std_unsup_umap_projection_train_10_01.npy'),
)

# Same three arrays for the test set
(
    raf_unsup_umap_projections_test_10_01,
    raf_mean_unsup_umap_projection_test_10_01,
    raf_std_unsup_umap_projection_test_10_01,
) = (
    np.load('raf_unsup_umap_projections_test_10_01.npy'),
    np.load('raf_mean_unsup_umap_projection_test_10_01.npy'),
    np.load('raf_std_unsup_umap_projection_test_10_01.npy'),
)

In [None]:
# Hyper-parameters shared by every UMAP run in this cell
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings, collected as lists and stacked once the loop is done
_train_runs, _test_runs = [], []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # The run index doubles as the seed, so the runs differ but are repeatable
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Learn the embedding from the standardized training data only
    projection_train = umap_model.fit_transform(x_train_emotion_norm)
    _train_runs.append(projection_train)

    # Place the test data into the embedding learnt above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_emotion_norm)
    _test_runs.append(projection_test)

# Stack the runs along a new leading axis (axis 0 indexes the run)
raf_unsup_umap_projections_train_10_01 = np.stack(_train_runs)
raf_unsup_umap_projections_test_10_01 = np.stack(_test_runs)

# Per-point mean and standard deviation over the runs
# NOTE(review): embeddings from different seeds are averaged coordinate-wise;
# confirm the runs are comparable without aligning them first.
raf_mean_unsup_umap_projection_train_10_01 = raf_unsup_umap_projections_train_10_01.mean(axis=0)
raf_std_unsup_umap_projection_train_10_01 = raf_unsup_umap_projections_train_10_01.std(axis=0)

raf_mean_unsup_umap_projection_test_10_01 = raf_unsup_umap_projections_test_10_01.mean(axis=0)
raf_std_unsup_umap_projection_test_10_01 = raf_unsup_umap_projections_test_10_01.std(axis=0)

# Persist all six arrays, training set first, then test set
for _filename, _values in (
    ('raf_unsup_umap_projections_train_10_01.npy', raf_unsup_umap_projections_train_10_01),
    ('raf_mean_unsup_umap_projection_train_10_01.npy', raf_mean_unsup_umap_projection_train_10_01),
    ('raf_std_unsup_umap_projection_train_10_01.npy', raf_std_unsup_umap_projection_train_10_01),
    ('raf_unsup_umap_projections_test_10_01.npy', raf_unsup_umap_projections_test_10_01),
    ('raf_mean_unsup_umap_projection_test_10_01.npy', raf_mean_unsup_umap_projection_test_10_01),
    ('raf_std_unsup_umap_projection_test_10_01.npy', raf_std_unsup_umap_projection_test_10_01),
):
    np.save(_filename, _values)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [107]:
# Read back the unsupervised UMAP arrays saved for the training set:
# all runs, their mean and their standard deviation
(
    raf_unsup_umap_projections_train_10_01,
    raf_mean_unsup_umap_projection_train_10_01,
    raf_std_unsup_umap_projection_train_10_01,
) = (
    np.load('raf_unsup_umap_projections_train_10_01.npy'),
    np.load('raf_mean_unsup_umap_projection_train_10_01.npy'),
    np.load('raf_std_unsup_umap_projection_train_10_01.npy'),
)

# Same three arrays for the test set
(
    raf_unsup_umap_projections_test_10_01,
    raf_mean_unsup_umap_projection_test_10_01,
    raf_std_unsup_umap_projection_test_10_01,
) = (
    np.load('raf_unsup_umap_projections_test_10_01.npy'),
    np.load('raf_mean_unsup_umap_projection_test_10_01.npy'),
    np.load('raf_std_unsup_umap_projection_test_10_01.npy'),
)

In [None]:
# Discrete colormap with exactly 7 colors, one per emotion label.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean training projection, colored by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_unsup_umap_projection_train_10_01[:, 0],
    raf_mean_unsup_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8,
    # Labels are taken to be the integers 1..7 (as the ticks below imply).
    # Fixing the color range to [0.5, 7.5] keeps the label-to-color mapping
    # and centers every integer tick inside its own color band.
    vmin=0.5, vmax=7.5,
)

# Add title and labels
plt.title("Unsupervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # One tick per label, 1..7
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Predict test labels with a 1-nearest-neighbour classifier fitted on the mean
# training projection. The prediction is computed once and shared by both
# metrics below (the identical classifier used to be trained twice).
y_test_pred_raf_umap_unsup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_unsup_umap_projection_train_10_01, y_train)
    .predict(raf_mean_unsup_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_umap_unsup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_umap_unsup_10_01)
print(f"ARI: {ari_raf_umap_unsup_10_01:.4f}")

# Silhouette score of the test projection, grouped by the predicted labels
# NOTE(review): this groups by predicted rather than true labels - confirm that is intended.
silhouette_raf_umap_unsup_10_01 = silhouette_score(
    raf_mean_unsup_umap_projection_test_10_01, y_test_pred_raf_umap_unsup_10_01
)
print(f"Silhouette Score: {silhouette_raf_umap_unsup_10_01:.2f}")

# KMeans with one cluster per distinct training label
n_clusters = len(np.unique(y_train))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Cluster the training-set mean projection
predicted_labels = kmeans.fit_predict(raf_mean_unsup_umap_projection_train_10_01)

# Davies-Bouldin index of that KMeans partition
raf_mean_unsup_10_01_db_score = davies_bouldin_score(
    raf_mean_unsup_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_mean_unsup_10_01_db_score:.2f}")

#### Supervised UMAP 10 runs

In [11]:
# Reload the cached supervised UMAP results for the training set:
# all runs, their mean and their standard deviation
(
    raf_sup_umap_projections_train_10_01,
    raf_mean_sup_umap_projection_train_10_01,
    raf_std_sup_umap_projection_train_10_01,
) = (
    np.load('raf_sup_umap_projections_train_10_01.npy'),
    np.load('raf_mean_sup_umap_projection_train_10_01.npy'),
    np.load('raf_std_sup_umap_projection_train_10_01.npy'),
)

# Same three arrays for the test set
(
    raf_sup_umap_projections_test_10_01,
    raf_mean_sup_umap_projection_test_10_01,
    raf_std_sup_umap_projection_test_10_01,
) = (
    np.load('raf_sup_umap_projections_test_10_01.npy'),
    np.load('raf_mean_sup_umap_projection_test_10_01.npy'),
    np.load('raf_std_sup_umap_projection_test_10_01.npy'),
)

In [None]:
# Hyper-parameters shared by every UMAP run in this cell
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings, collected as lists and stacked once the loop is done
_train_runs, _test_runs = [], []

for run in range(n_runs):
    print(f"Running Supervised UMAP - Iteration {run + 1}/{n_runs}...")

    # The run index doubles as the seed, so the runs differ but are repeatable
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=run)

    # Supervised fit: the training labels are passed alongside the features
    projection_train = umap_model.fit_transform(x_train_emotion_norm, y_train)
    _train_runs.append(projection_train)

    # The test data is embedded without labels, using the fitted model
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_emotion_norm)
    _test_runs.append(projection_test)

# Stack the runs along a new leading axis (axis 0 indexes the run)
raf_sup_umap_projections_train_10_01 = np.stack(_train_runs)
raf_sup_umap_projections_test_10_01 = np.stack(_test_runs)

# Per-point mean and standard deviation over the runs
# NOTE(review): embeddings from different seeds are averaged coordinate-wise;
# confirm the runs are comparable without aligning them first.
raf_mean_sup_umap_projection_train_10_01 = raf_sup_umap_projections_train_10_01.mean(axis=0)
raf_std_sup_umap_projection_train_10_01 = raf_sup_umap_projections_train_10_01.std(axis=0)

raf_mean_sup_umap_projection_test_10_01 = raf_sup_umap_projections_test_10_01.mean(axis=0)
raf_std_sup_umap_projection_test_10_01 = raf_sup_umap_projections_test_10_01.std(axis=0)

# Persist all six arrays, training set first, then test set
for _filename, _values in (
    ('raf_sup_umap_projections_train_10_01.npy', raf_sup_umap_projections_train_10_01),
    ('raf_mean_sup_umap_projection_train_10_01.npy', raf_mean_sup_umap_projection_train_10_01),
    ('raf_std_sup_umap_projection_train_10_01.npy', raf_std_sup_umap_projection_train_10_01),
    ('raf_sup_umap_projections_test_10_01.npy', raf_sup_umap_projections_test_10_01),
    ('raf_mean_sup_umap_projection_test_10_01.npy', raf_mean_sup_umap_projection_test_10_01),
    ('raf_std_sup_umap_projection_test_10_01.npy', raf_std_sup_umap_projection_test_10_01),
):
    np.save(_filename, _values)

# Output confirmation
print("Supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with exactly 7 colors, one per emotion label.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean training projection, colored by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_sup_umap_projection_train_10_01[:, 0],
    raf_mean_sup_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8,
    # Labels are taken to be the integers 1..7 (as the ticks below imply).
    # Fixing the color range to [0.5, 7.5] keeps the label-to-color mapping
    # and centers every integer tick inside its own color band.
    vmin=0.5, vmax=7.5,
)

# Add title and labels
plt.title("Supervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # One tick per label, 1..7
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Predict test labels with a 1-nearest-neighbour classifier fitted on the mean
# training projection. The prediction is computed once and shared by both
# metrics below (the identical classifier used to be trained twice).
y_test_pred_raf_umap_sup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_sup_umap_projection_train_10_01, y_train)
    .predict(raf_mean_sup_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_umap_sup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_umap_sup_10_01)
print(f"ARI: {ari_raf_umap_sup_10_01:.4f}")

# Silhouette score of the test projection, grouped by the predicted labels
# NOTE(review): this groups by predicted rather than true labels - confirm that is intended.
silhouette_raf_umap_sup_10_01 = silhouette_score(
    raf_mean_sup_umap_projection_test_10_01, y_test_pred_raf_umap_sup_10_01
)
print(f"Silhouette Score: {silhouette_raf_umap_sup_10_01:.2f}")

# KMeans with one cluster per distinct training label
n_clusters = len(np.unique(y_train))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Cluster the training-set mean projection
predicted_labels = kmeans.fit_predict(raf_mean_sup_umap_projection_train_10_01)

# Davies-Bouldin index of that KMeans partition
raf_mean_sup_10_01_db_score = davies_bouldin_score(
    raf_mean_sup_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_mean_sup_10_01_db_score:.2f}")

In [None]:
# # Step 1: Load the saved mean UMAP projections
# mean_projection = np.load('raf_mean_sup_umap_projection_10_01.npy')  # Shape: (n_samples, 2)

# # Step 2: Separate the mean projection by class
# classes = np.unique(y_train)
# class_gaussians = {}

# # Calculate the mean and covariance for each class
# for c in classes:
#     class_points = mean_projection[y_train == c]  # Filter by class
#     mean = np.mean(class_points, axis=0)
#     cov = np.cov(class_points, rowvar=False)
#     class_gaussians[c] = {"mean": mean, "cov": cov}

# # Step 3: Visualize Gaussian distributions
# plt.figure(figsize=(10, 8))

# # Plot UMAP embeddings for each class
# for c in classes:
#     class_points = mean_projection[y_train == c]
#     plt.scatter(class_points[:, 0], class_points[:, 1], label=f"Class {c}", alpha=0.5, s=10)

#     # Plot Gaussian contours
#     mean = class_gaussians[c]["mean"]
#     cov = class_gaussians[c]["cov"]
#     x, y = np.meshgrid(
#         np.linspace(mean[0] - 3, mean[0] + 3, 100), 
#         np.linspace(mean[1] - 3, mean[1] + 3, 100)
#     )
#     pos = np.dstack((x, y))
#     rv = multivariate_normal(mean, cov)
#     plt.contour(x, y, rv.pdf(pos), levels=5, alpha=0.8)

# plt.title("UMAP Mean Projections with Gaussian Distributions per Class")
# plt.xlabel("UMAP Component 1")
# plt.ylabel("UMAP Component 2")
# plt.legend()
# plt.show()

# # Step 4: Evaluate likelihood for a random point
# random_point = np.array([0, 0])  # Example point in UMAP space
# likelihoods = {c: multivariate_normal(class_gaussians[c]["mean"], class_gaussians[c]["cov"]).pdf(random_point)
#                for c in classes}

# print("Likelihoods for Random Point:", likelihoods)

------------------

### PCA + UMAP

In [None]:
# Fit PCA on the standardized training data. A fractional n_components asks
# scikit-learn to keep enough components to explain that share of the variance.
pca = PCA(n_components=0.95)
x_train_pca_emotions = pca.fit_transform(x_train_emotion_norm)

# Project the test data onto the components learnt from the training data
x_test_pca_emotions = pca.transform(x_test_emotion_norm)

In [None]:
# Number of columns in the standardized training matrix (before PCA)
_n_original_features = x_train_emotion_norm.shape[1]
print(f"Original number of features: {_n_original_features}")

In [35]:
# Cache the PCA-transformed training and test features on disk
for _filename, _features in (
    ('x_train_raf_pca_emotions.npy', x_train_pca_emotions),
    ('x_test_raf_pca_emotions.npy', x_test_pca_emotions),
):
    np.save(_filename, _features)

#### PCA + UMAP Unsupervised 10 runs

In [13]:
# Reload the cached PCA + unsupervised UMAP results for the training set:
# all runs, their mean and their standard deviation
(
    raf_unsup_pca_umap_projections_train_10_01,
    raf_mean_unsup_pca_umap_projection_train_10_01,
    raf_std_unsup_pca_umap_projection_train_10_01,
) = (
    np.load('raf_unsup_pca_umap_projections_train_10_01.npy'),
    np.load('raf_mean_unsup_pca_umap_projection_train_10_01.npy'),
    np.load('raf_std_unsup_pca_umap_projection_train_10_01.npy'),
)

# Same three arrays for the test set
(
    raf_unsup_pca_umap_projections_test_10_01,
    raf_mean_unsup_pca_umap_projection_test_10_01,
    raf_std_unsup_pca_umap_projection_test_10_01,
) = (
    np.load('raf_unsup_pca_umap_projections_test_10_01.npy'),
    np.load('raf_mean_unsup_pca_umap_projection_test_10_01.npy'),
    np.load('raf_std_unsup_pca_umap_projection_test_10_01.npy'),
)

In [None]:
# Hyper-parameters shared by every UMAP run in this cell
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings, collected as lists and stacked once the loop is done
_train_runs, _test_runs = [], []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # The run index doubles as the seed, so the runs differ but are repeatable
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Learn the embedding from the PCA-reduced training data only
    projection_train = umap_model.fit_transform(x_train_pca_emotions)
    _train_runs.append(projection_train)

    # Place the PCA-reduced test data into the embedding learnt above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_pca_emotions)
    _test_runs.append(projection_test)

# Stack the runs along a new leading axis (axis 0 indexes the run)
raf_unsup_pca_umap_projections_train_10_01 = np.stack(_train_runs)
raf_unsup_pca_umap_projections_test_10_01 = np.stack(_test_runs)

# Per-point mean and standard deviation over the runs
# NOTE(review): embeddings from different seeds are averaged coordinate-wise;
# confirm the runs are comparable without aligning them first.
raf_mean_unsup_pca_umap_projection_train_10_01 = raf_unsup_pca_umap_projections_train_10_01.mean(axis=0)
raf_std_unsup_pca_umap_projection_train_10_01 = raf_unsup_pca_umap_projections_train_10_01.std(axis=0)

raf_mean_unsup_pca_umap_projection_test_10_01 = raf_unsup_pca_umap_projections_test_10_01.mean(axis=0)
raf_std_unsup_pca_umap_projection_test_10_01 = raf_unsup_pca_umap_projections_test_10_01.std(axis=0)

# Persist all six arrays, training set first, then test set
for _filename, _values in (
    ('raf_unsup_pca_umap_projections_train_10_01.npy', raf_unsup_pca_umap_projections_train_10_01),
    ('raf_mean_unsup_pca_umap_projection_train_10_01.npy', raf_mean_unsup_pca_umap_projection_train_10_01),
    ('raf_std_unsup_pca_umap_projection_train_10_01.npy', raf_std_unsup_pca_umap_projection_train_10_01),
    ('raf_unsup_pca_umap_projections_test_10_01.npy', raf_unsup_pca_umap_projections_test_10_01),
    ('raf_mean_unsup_pca_umap_projection_test_10_01.npy', raf_mean_unsup_pca_umap_projection_test_10_01),
    ('raf_std_unsup_pca_umap_projection_test_10_01.npy', raf_std_unsup_pca_umap_projection_test_10_01),
):
    np.save(_filename, _values)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with exactly 7 colors, one per emotion label.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean training projection, colored by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_unsup_pca_umap_projection_train_10_01[:, 0],
    raf_mean_unsup_pca_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8,
    # Labels are taken to be the integers 1..7 (as the ticks below imply).
    # Fixing the color range to [0.5, 7.5] keeps the label-to-color mapping
    # and centers every integer tick inside its own color band.
    vmin=0.5, vmax=7.5,
)

# Add title and labels
plt.title("PCA + Unsupervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # One tick per label, 1..7
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Predict test labels with a 1-nearest-neighbour classifier fitted on the mean
# training projection. The prediction is computed once and shared by both
# metrics below (the identical classifier used to be trained twice).
y_test_pred_raf_pca_umap_unsup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_unsup_pca_umap_projection_train_10_01, y_train)
    .predict(raf_mean_unsup_pca_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_pca_umap_unsup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_pca_umap_unsup_10_01)
print(f"ARI: {ari_raf_pca_umap_unsup_10_01:.2f}")

# Silhouette score of the test projection, grouped by the predicted labels
# NOTE(review): this groups by predicted rather than true labels - confirm that is intended.
silhouette_raf_pca_umap_unsup_10_01 = silhouette_score(
    raf_mean_unsup_pca_umap_projection_test_10_01, y_test_pred_raf_pca_umap_unsup_10_01
)
print(f"Silhouette Score: {silhouette_raf_pca_umap_unsup_10_01:.2f}")

# KMeans with one cluster per distinct training label
n_clusters = len(np.unique(y_train))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Cluster the training-set mean projection
predicted_labels = kmeans.fit_predict(raf_mean_unsup_pca_umap_projection_train_10_01)

# Davies-Bouldin index of that KMeans partition
raf_mean_unsup_pca_umap_10_01_db_score = davies_bouldin_score(
    raf_mean_unsup_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_mean_unsup_pca_umap_10_01_db_score:.2f}")

#### PCA + UMAP Supervised 10 runs

In [15]:
# Reload the cached PCA + supervised UMAP results for the training set:
# all runs, their mean and their standard deviation
(
    raf_sup_pca_umap_projections_train_10_01,
    raf_mean_sup_pca_umap_projection_train_10_01,
    raf_std_sup_pca_umap_projection_train_10_01,
) = (
    np.load('raf_sup_pca_umap_projections_train_10_01.npy'),
    np.load('raf_mean_sup_pca_umap_projection_train_10_01.npy'),
    np.load('raf_std_sup_pca_umap_projection_train_10_01.npy'),
)

# Same three arrays for the test set
(
    raf_sup_pca_umap_projections_test_10_01,
    raf_mean_sup_pca_umap_projection_test_10_01,
    raf_std_sup_pca_umap_projection_test_10_01,
) = (
    np.load('raf_sup_pca_umap_projections_test_10_01.npy'),
    np.load('raf_mean_sup_pca_umap_projection_test_10_01.npy'),
    np.load('raf_std_sup_pca_umap_projection_test_10_01.npy'),
)

In [None]:
# Hyper-parameters shared by every UMAP run in this cell
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings, collected as lists and stacked once the loop is done
_train_runs, _test_runs = [], []

for run in range(n_runs):
    print(f"Running Supervised UMAP - Iteration {run + 1}/{n_runs}...")

    # The run index doubles as the seed, so the runs differ but are repeatable
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=run)

    # Supervised fit on the PCA-reduced training data, with its labels
    projection_train = umap_model.fit_transform(x_train_pca_emotions, y_train)
    _train_runs.append(projection_train)

    # The PCA-reduced test data is embedded without labels, using the fitted model
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_pca_emotions)
    _test_runs.append(projection_test)

# Stack the runs along a new leading axis (axis 0 indexes the run)
raf_sup_pca_umap_projections_train_10_01 = np.stack(_train_runs)
raf_sup_pca_umap_projections_test_10_01 = np.stack(_test_runs)

# Per-point mean and standard deviation over the runs
# NOTE(review): embeddings from different seeds are averaged coordinate-wise;
# confirm the runs are comparable without aligning them first.
raf_mean_sup_pca_umap_projection_train_10_01 = raf_sup_pca_umap_projections_train_10_01.mean(axis=0)
raf_std_sup_pca_umap_projection_train_10_01 = raf_sup_pca_umap_projections_train_10_01.std(axis=0)

raf_mean_sup_pca_umap_projection_test_10_01 = raf_sup_pca_umap_projections_test_10_01.mean(axis=0)
raf_std_sup_pca_umap_projection_test_10_01 = raf_sup_pca_umap_projections_test_10_01.std(axis=0)

# Persist all six arrays, training set first, then test set
for _filename, _values in (
    ('raf_sup_pca_umap_projections_train_10_01.npy', raf_sup_pca_umap_projections_train_10_01),
    ('raf_mean_sup_pca_umap_projection_train_10_01.npy', raf_mean_sup_pca_umap_projection_train_10_01),
    ('raf_std_sup_pca_umap_projection_train_10_01.npy', raf_std_sup_pca_umap_projection_train_10_01),
    ('raf_sup_pca_umap_projections_test_10_01.npy', raf_sup_pca_umap_projections_test_10_01),
    ('raf_mean_sup_pca_umap_projection_test_10_01.npy', raf_mean_sup_pca_umap_projection_test_10_01),
    ('raf_std_sup_pca_umap_projection_test_10_01.npy', raf_std_sup_pca_umap_projection_test_10_01),
):
    np.save(_filename, _values)

# Output confirmation
print("Supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete colormap with exactly 7 colors, one per emotion label.
# plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in Matplotlib 3.7
# and removed in 3.9.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean training projection, colored by label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_sup_pca_umap_projection_train_10_01[:, 0],
    raf_mean_sup_pca_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8,
    # Labels are taken to be the integers 1..7 (as the ticks below imply).
    # Fixing the color range to [0.5, 7.5] keeps the label-to-color mapping
    # and centers every integer tick inside its own color band.
    vmin=0.5, vmax=7.5,
)

# Add title and labels
plt.title("PCA + Supervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Add and configure the colorbar
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # One tick per label, 1..7
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Predict test labels with a 1-nearest-neighbour classifier fitted on the mean
# training projection. The prediction is computed once and shared by both
# metrics below (the identical classifier used to be trained twice).
y_test_pred_raf_pca_umap_sup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_sup_pca_umap_projection_train_10_01, y_train)
    .predict(raf_mean_sup_pca_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_pca_umap_sup_10_01 = adjusted_rand_score(y_test, y_test_pred_raf_pca_umap_sup_10_01)
print(f"ARI: {ari_raf_pca_umap_sup_10_01:.2f}")

# Silhouette score of the test projection, grouped by the predicted labels
# NOTE(review): this groups by predicted rather than true labels - confirm that is intended.
silhouette_raf_pca_umap_sup_10_01 = silhouette_score(
    raf_mean_sup_pca_umap_projection_test_10_01, y_test_pred_raf_pca_umap_sup_10_01
)
print(f"Silhouette Score: {silhouette_raf_pca_umap_sup_10_01:.2f}")

# KMeans with one cluster per distinct training label
n_clusters = len(np.unique(y_train))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Cluster the training-set mean projection
predicted_labels = kmeans.fit_predict(raf_mean_sup_pca_umap_projection_train_10_01)

# Davies-Bouldin index of that KMeans partition
raf_mean_sup_pca_umap_10_01_db_score = davies_bouldin_score(
    raf_mean_sup_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_mean_sup_pca_umap_10_01_db_score:.2f}")

------------

### LLE + UMAP

In [None]:
# LLE hyper-parameters
n_neighbors = 10
n_components = 147  # NOTE(review): origin of 147 is not shown in this cell - confirm

# Standard LLE with a fixed seed
lle = LocallyLinearEmbedding(
    n_neighbors=n_neighbors,
    n_components=n_components,
    method='standard',
    random_state=42,
)

# Learn the embedding from the standardized training data
print("Running LLE on the training set...")
x_train_raf_lle = lle.fit_transform(x_train_emotion_norm)
print("LLE transformation on training set completed.")

# Map the test data with the model fitted above
print("Running LLE on the test set...")
x_test_raf_lle = lle.transform(x_test_emotion_norm)
print("LLE transformation on test set completed.")

# Report the shape of each transformed split
for _split_name, _embedded in (("training", x_train_raf_lle), ("test", x_test_raf_lle)):
    print(f"Shape of LLE-transformed {_split_name} data: {_embedded.shape}")

# Cache both transformed splits on disk for later use
# NOTE(review): these file names carry no dataset tag - confirm they cannot
# collide with LLE output saved for another dataset.
for _filename, _embedded in (('x_train_lle.npy', x_train_raf_lle), ('x_test_lle.npy', x_test_raf_lle)):
    np.save(_filename, _embedded)

# Output confirmation
print("LLE-transformed data has been saved.")

#### LLE + UMAP Unsupervised 10 runs

In [17]:
# Reload the cached LLE + unsupervised UMAP results for the training set:
# all runs, their mean and their standard deviation
(
    raf_unsup_lle_umap_projections_train_10_01,
    raf_mean_unsup_lle_umap_projection_train_10_01,
    raf_std_unsup_lle_umap_projection_train_10_01,
) = (
    np.load('raf_unsup_lle_umap_projections_train_10_01.npy'),
    np.load('raf_mean_unsup_lle_umap_projection_train_10_01.npy'),
    np.load('raf_std_unsup_lle_umap_projection_train_10_01.npy'),
)

# Same three arrays for the test set
(
    raf_unsup_lle_umap_projections_test_10_01,
    raf_mean_unsup_lle_umap_projection_test_10_01,
    raf_std_unsup_lle_umap_projection_test_10_01,
) = (
    np.load('raf_unsup_lle_umap_projections_test_10_01.npy'),
    np.load('raf_mean_unsup_lle_umap_projection_test_10_01.npy'),
    np.load('raf_std_unsup_lle_umap_projection_test_10_01.npy'),
)

In [None]:
# Hyper-parameters shared by every UMAP run in this cell
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10  # one embedding per seed

# Per-run embeddings, collected as lists and stacked once the loop is done
_train_runs, _test_runs = [], []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # The run index doubles as the seed, so the runs differ but are repeatable
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Learn the embedding from the LLE-transformed training data only
    projection_train = umap_model.fit_transform(x_train_raf_lle)
    _train_runs.append(projection_train)

    # Place the LLE-transformed test data into the embedding learnt above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_raf_lle)
    _test_runs.append(projection_test)

# Stack the runs along a new leading axis (axis 0 indexes the run)
raf_unsup_lle_umap_projections_train_10_01 = np.stack(_train_runs)
raf_unsup_lle_umap_projections_test_10_01 = np.stack(_test_runs)

# Per-point mean and standard deviation over the runs
# NOTE(review): embeddings from different seeds are averaged coordinate-wise;
# confirm the runs are comparable without aligning them first.
raf_mean_unsup_lle_umap_projection_train_10_01 = raf_unsup_lle_umap_projections_train_10_01.mean(axis=0)
raf_std_unsup_lle_umap_projection_train_10_01 = raf_unsup_lle_umap_projections_train_10_01.std(axis=0)

raf_mean_unsup_lle_umap_projection_test_10_01 = raf_unsup_lle_umap_projections_test_10_01.mean(axis=0)
raf_std_unsup_lle_umap_projection_test_10_01 = raf_unsup_lle_umap_projections_test_10_01.std(axis=0)

# Persist all six arrays, training set first, then test set
for _filename, _values in (
    ('raf_unsup_lle_umap_projections_train_10_01.npy', raf_unsup_lle_umap_projections_train_10_01),
    ('raf_mean_unsup_lle_umap_projection_train_10_01.npy', raf_mean_unsup_lle_umap_projection_train_10_01),
    ('raf_std_unsup_lle_umap_projection_train_10_01.npy', raf_std_unsup_lle_umap_projection_train_10_01),
    ('raf_unsup_lle_umap_projections_test_10_01.npy', raf_unsup_lle_umap_projections_test_10_01),
    ('raf_mean_unsup_lle_umap_projection_test_10_01.npy', raf_mean_unsup_lle_umap_projection_test_10_01),
    ('raf_std_unsup_lle_umap_projection_test_10_01.npy', raf_std_unsup_lle_umap_projection_test_10_01),
):
    np.save(_filename, _values)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete palette with exactly 7 colors.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9; plt.get_cmap accepts the same (name, lut) arguments.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean (over runs) training embedding, colored by training label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_unsup_lle_umap_projection_train_10_01[:, 0],
    raf_mean_unsup_lle_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("LLE + Unsupervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar with one tick per label value 1..7
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # Ensure ticks align with labels
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Predict test labels with a 1-nearest-neighbour classifier trained on the mean
# training embedding. The classifier is fitted once and its predictions are
# reused for both ARI and silhouette (previously it was fitted twice).
y_test_pred_lle_umap_unsup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_unsup_lle_umap_projection_train_10_01, y_train)
    .predict(raf_mean_unsup_lle_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_lle_umap_unsup_10_01 = adjusted_rand_score(y_test, y_test_pred_lle_umap_unsup_10_01)
print(f"ARI: {ari_raf_lle_umap_unsup_10_01:.2f}")

# Silhouette score of the mean test embedding, grouped by the predicted labels
silhouette_raf_lle_umap_unsup_10_01 = silhouette_score(
    raf_mean_unsup_lle_umap_projection_test_10_01,
    y_test_pred_lle_umap_unsup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_lle_umap_unsup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_mean_unsup_lle_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
raf_lle_umap_unsup_10_01_db_score = davies_bouldin_score(
    raf_mean_unsup_lle_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_lle_umap_unsup_10_01_db_score:.2f}")

#### LLE + UMAP Supervised 10 runs

In [19]:
# Reload the supervised LLE + UMAP results from disk, grouped by statistic.

# All runs
raf_sup_lle_umap_projections_train_10_01 = np.load('raf_sup_lle_umap_projections_train_10_01.npy')
raf_sup_lle_umap_projections_test_10_01 = np.load('raf_sup_lle_umap_projections_test_10_01.npy')

# Mean over runs
raf_mean_sup_lle_umap_projection_train_10_01 = np.load('raf_mean_sup_lle_umap_projection_train_10_01.npy')
raf_mean_sup_lle_umap_projection_test_10_01 = np.load('raf_mean_sup_lle_umap_projection_test_10_01.npy')

# Standard deviation over runs
raf_std_sup_lle_umap_projection_train_10_01 = np.load('raf_std_sup_lle_umap_projection_train_10_01.npy')
raf_std_sup_lle_umap_projection_test_10_01 = np.load('raf_std_sup_lle_umap_projection_test_10_01.npy')

In [None]:
# UMAP hyper-parameters and the number of independently seeded repetitions
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 10

# One embedding per run is collected for each split
raf_sup_lle_umap_projections_train_10_01 = []
raf_sup_lle_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # The run index is used as the seed, so every repetition is reproducible
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Supervised fit: the training labels are passed alongside the LLE-transformed data
    projection_train = umap_model.fit_transform(x_train_raf_lle, y_train)
    raf_sup_lle_umap_projections_train_10_01.append(projection_train)

    # Project the test data (no labels) with the model fitted in this iteration
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_test_raf_lle)
    raf_sup_lle_umap_projections_test_10_01.append(projection_test)

# Stack the per-run embeddings; the leading axis indexes the run
raf_sup_lle_umap_projections_train_10_01 = np.array(raf_sup_lle_umap_projections_train_10_01)
raf_sup_lle_umap_projections_test_10_01 = np.array(raf_sup_lle_umap_projections_test_10_01)

# Per-point mean and standard deviation over the run axis
raf_mean_sup_lle_umap_projection_train_10_01 = raf_sup_lle_umap_projections_train_10_01.mean(axis=0)
raf_std_sup_lle_umap_projection_train_10_01 = raf_sup_lle_umap_projections_train_10_01.std(axis=0)

raf_mean_sup_lle_umap_projection_test_10_01 = raf_sup_lle_umap_projections_test_10_01.mean(axis=0)
raf_std_sup_lle_umap_projection_test_10_01 = raf_sup_lle_umap_projections_test_10_01.std(axis=0)

# Persist all runs plus the summary statistics (training files first, then test files)
for _fname, _arr in (
    ('raf_sup_lle_umap_projections_train_10_01.npy', raf_sup_lle_umap_projections_train_10_01),
    ('raf_mean_sup_lle_umap_projection_train_10_01.npy', raf_mean_sup_lle_umap_projection_train_10_01),
    ('raf_std_sup_lle_umap_projection_train_10_01.npy', raf_std_sup_lle_umap_projection_train_10_01),
    ('raf_sup_lle_umap_projections_test_10_01.npy', raf_sup_lle_umap_projections_test_10_01),
    ('raf_mean_sup_lle_umap_projection_test_10_01.npy', raf_mean_sup_lle_umap_projection_test_10_01),
    ('raf_std_sup_lle_umap_projection_test_10_01.npy', raf_std_sup_lle_umap_projection_test_10_01),
):
    np.save(_fname, _arr)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete palette with exactly 7 colors.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9; plt.get_cmap accepts the same (name, lut) arguments.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean (over runs) training embedding, colored by training label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_sup_lle_umap_projection_train_10_01[:, 0],
    raf_mean_sup_lle_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("LLE + Supervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar with one tick per label value 1..7
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # Ensure ticks align with labels
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Predict test labels with a 1-nearest-neighbour classifier trained on the mean
# training embedding. The classifier is fitted once and its predictions are
# reused for both ARI and silhouette (previously it was fitted twice).
y_test_pred_lle_umap_sup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_sup_lle_umap_projection_train_10_01, y_train)
    .predict(raf_mean_sup_lle_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_lle_umap_sup_10_01 = adjusted_rand_score(y_test, y_test_pred_lle_umap_sup_10_01)
print(f"ARI: {ari_raf_lle_umap_sup_10_01:.2f}")

# Silhouette score of the mean test embedding, grouped by the predicted labels
silhouette_raf_lle_umap_sup_10_01 = silhouette_score(
    raf_mean_sup_lle_umap_projection_test_10_01,
    y_test_pred_lle_umap_sup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_lle_umap_sup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_mean_sup_lle_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
raf_lle_umap_sup_10_01_db_score = davies_bouldin_score(
    raf_mean_sup_lle_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_lle_umap_sup_10_01_db_score:.2f}")

-----

### Gabor Filters + PCA + UMAP

#### Gabor filters + PCA + Unsupervised UMAP

In [9]:
# Reload the unsupervised Gabor + PCA + UMAP results from disk, grouped by statistic.

# All runs
raf_unsup_gabor_pca_umap_projections_train_10_01 = np.load('raf_unsup_gabor_pca_umap_projections_train_10_01.npy')
raf_unsup_gabor_pca_umap_projections_test_10_01 = np.load('raf_unsup_gabor_pca_umap_projections_test_10_01.npy')

# Mean over runs
raf_mean_unsup_gabor_pca_umap_projection_train_10_01 = np.load('raf_mean_unsup_gabor_pca_umap_projection_train_10_01.npy')
raf_mean_unsup_gabor_pca_umap_projection_test_10_01 = np.load('raf_mean_unsup_gabor_pca_umap_projection_test_10_01.npy')

# Standard deviation over runs
raf_std_unsup_gabor_pca_umap_projection_train_10_01 = np.load('raf_std_unsup_gabor_pca_umap_projection_train_10_01.npy')
raf_std_unsup_gabor_pca_umap_projection_test_10_01 = np.load('raf_std_unsup_gabor_pca_umap_projection_test_10_01.npy')

In [51]:
# Create Gabor Kernels
def create_gabor_kernels(ksize=31, sigma=4.0, lambd=10.0, gamma=0.5, n_orientations=4):
    """Build a bank of Gabor kernels that differ only in orientation.

    Every kernel shares the same size, Gaussian width, wavelength and aspect
    ratio; only the orientation ``theta`` varies, evenly spaced over [0, pi).
    The defaults reproduce the previously hard-coded bank: 4 kernels at
    0, pi/4, pi/2 and 3*pi/4 (not 8 orientations, and a single wavelength).

    Args:
        ksize: Side length of the square kernel.
        sigma: Standard deviation of the Gaussian envelope.
        lambd: Wavelength of the sinusoidal factor.
        gamma: Spatial aspect ratio.
        n_orientations: Number of evenly spaced orientations in [0, pi).

    Returns:
        A list of ``n_orientations`` kernels produced by ``cv2.getGaborKernel``
        with ``ktype=cv2.CV_32F``.

    Raises:
        ValueError: If ``n_orientations`` is smaller than 1.
    """
    if n_orientations < 1:
        raise ValueError("n_orientations must be at least 1")

    # Multiplying an integer index by the step (instead of a float-stepped
    # arange up to pi) guarantees exactly n_orientations angles.
    thetas = np.arange(n_orientations) * (np.pi / n_orientations)
    return [
        cv2.getGaborKernel((ksize, ksize), sigma, theta, lambd, gamma, psi=0, ktype=cv2.CV_32F)
        for theta in thetas
    ]

# Apply Gabor Filters
def apply_gabor_filters(images, kernels, image_shape=(48, 48)):
    """Filter every image with every kernel and return one feature row per image.

    Args:
        images: Iterable of array-like images; each one is reshaped to
            ``image_shape`` before filtering.
        kernels: Sequence of 2-D filter kernels (e.g. from
            ``create_gabor_kernels``).
        image_shape: (height, width) used to restore each image to 2-D.
            Defaults to (48, 48), the value that was previously hard-coded.

    Returns:
        A NumPy array with one row per image, holding the flattened response
        to each kernel concatenated in kernel order. An empty ``images``
        input yields an empty array.
    """
    gabor_features = []
    for image in images:
        image_2d = image.reshape(image_shape)  # Restore the 2-D layout expected by filter2D
        # One flattened CV_32F response per kernel, concatenated in kernel order
        responses = [
            cv2.filter2D(image_2d, cv2.CV_32F, kernel).flatten()
            for kernel in kernels
        ]
        gabor_features.append(np.concatenate(responses))
    return np.array(gabor_features)

In [None]:
# Build the Gabor filter bank once so the same kernels are reused for every image,
# then report how many kernels it contains
gabor_kernels = create_gabor_kernels()
print(f"Generated {len(gabor_kernels)} Gabor kernels.")

In [None]:
# Apply Gabor filters to the training and test sets
# NOTE(review): the inputs are named *_emotion_norm while the outputs feed the
# RAF-DB pipeline below; confirm these arrays hold the RAF-DB images.
x_train_gabor = apply_gabor_filters(x_train_emotion_norm, gabor_kernels)
x_test_gabor = apply_gabor_filters(x_test_emotion_norm, gabor_kernels)

# Report the resulting feature-matrix shapes
print(f"Train Gabor feature shape: {x_train_gabor.shape}")
print(f"Test Gabor feature shape: {x_test_gabor.shape}")

In [56]:
# StandardScaler is not listed in the notebook's top-level import cell, so it is
# imported here to keep this cell runnable on its own (harmless if already imported).
from sklearn.preprocessing import StandardScaler

# Standardise the Gabor features: the scaler is fitted on the training
# features only and then applied unchanged to the test features.
scaler = StandardScaler()
x_raf_train_gabor = scaler.fit_transform(x_train_gabor)
x_raf_test_gabor = scaler.transform(x_test_gabor)

Applying PCA to Gabor filters

In [None]:
# A float n_components in (0, 1) asks PCA to keep as many components as needed
# to explain that fraction (here 95%) of the training variance.
pca = PCA(n_components=0.95)

# Fit on the scaled training features, then reuse the fitted projection for the test set
x_raf_train_pca_gabor = pca.fit_transform(x_raf_train_gabor)
x_raf_test_pca_gabor = pca.transform(x_raf_test_gabor)

# Report the reduced dimensionality
print(f"Reduced train shape: {x_raf_train_pca_gabor.shape}")
print(f"Reduced test shape: {x_raf_test_pca_gabor.shape}")

In [58]:
# Persist the PCA-reduced Gabor features so later sessions can skip the filtering and PCA steps
np.save('x_raf_train_gabor_pca.npy', x_raf_train_pca_gabor)
np.save('x_raf_test_gabor_pca.npy', x_raf_test_pca_gabor)

In [None]:
# UMAP hyper-parameters and the number of independently seeded repetitions
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 10

# One embedding per run is collected for each split
raf_unsup_gabor_pca_umap_projections_train_10_01 = []
raf_unsup_gabor_pca_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running UMAP on Training Set - Iteration {run + 1}/{n_runs}...")
    # The run index is used as the seed, so every repetition is reproducible
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Unsupervised fit on the PCA-reduced Gabor training features
    projection_train = umap_model.fit_transform(x_raf_train_pca_gabor)
    raf_unsup_gabor_pca_umap_projections_train_10_01.append(projection_train)

    # Project the test features with the model fitted in this iteration
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_raf_test_pca_gabor)
    raf_unsup_gabor_pca_umap_projections_test_10_01.append(projection_test)

# Stack the per-run embeddings; the leading axis indexes the run
raf_unsup_gabor_pca_umap_projections_train_10_01 = np.array(raf_unsup_gabor_pca_umap_projections_train_10_01)
raf_unsup_gabor_pca_umap_projections_test_10_01 = np.array(raf_unsup_gabor_pca_umap_projections_test_10_01)

# Per-point mean and standard deviation over the run axis
raf_mean_unsup_gabor_pca_umap_projection_train_10_01 = raf_unsup_gabor_pca_umap_projections_train_10_01.mean(axis=0)
raf_std_unsup_gabor_pca_umap_projection_train_10_01 = raf_unsup_gabor_pca_umap_projections_train_10_01.std(axis=0)

raf_mean_unsup_gabor_pca_umap_projection_test_10_01 = raf_unsup_gabor_pca_umap_projections_test_10_01.mean(axis=0)
raf_std_unsup_gabor_pca_umap_projection_test_10_01 = raf_unsup_gabor_pca_umap_projections_test_10_01.std(axis=0)

# Persist all runs plus the summary statistics (training files first, then test files)
for _fname, _arr in (
    ('raf_unsup_gabor_pca_umap_projections_train_10_01.npy', raf_unsup_gabor_pca_umap_projections_train_10_01),
    ('raf_mean_unsup_gabor_pca_umap_projection_train_10_01.npy', raf_mean_unsup_gabor_pca_umap_projection_train_10_01),
    ('raf_std_unsup_gabor_pca_umap_projection_train_10_01.npy', raf_std_unsup_gabor_pca_umap_projection_train_10_01),
    ('raf_unsup_gabor_pca_umap_projections_test_10_01.npy', raf_unsup_gabor_pca_umap_projections_test_10_01),
    ('raf_mean_unsup_gabor_pca_umap_projection_test_10_01.npy', raf_mean_unsup_gabor_pca_umap_projection_test_10_01),
    ('raf_std_unsup_gabor_pca_umap_projection_test_10_01.npy', raf_std_unsup_gabor_pca_umap_projection_test_10_01),
):
    np.save(_fname, _arr)

# Output confirmation
print("UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete palette with exactly 7 colors.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9; plt.get_cmap accepts the same (name, lut) arguments.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean (over runs) training embedding, colored by training label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_unsup_gabor_pca_umap_projection_train_10_01[:, 0],
    raf_mean_unsup_gabor_pca_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("Gabor + PCA + Unsupervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar with one tick per label value 1..7
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # Ensure ticks align with labels
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()


In [None]:
# Predict test labels with a 1-nearest-neighbour classifier trained on the mean
# training embedding. The classifier is fitted once and its predictions are
# reused for both ARI and silhouette (previously it was fitted twice).
y_test_pred_gabor_pca_umap_unsup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_unsup_gabor_pca_umap_projection_train_10_01, y_train)
    .predict(raf_mean_unsup_gabor_pca_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_gabor_pca_umap_unsup_10_01 = adjusted_rand_score(y_test, y_test_pred_gabor_pca_umap_unsup_10_01)
print(f"ARI: {ari_raf_gabor_pca_umap_unsup_10_01:.2f}")

# Silhouette score of the mean test embedding, grouped by the predicted labels
silhouette_raf_gabor_pca_umap_unsup_10_01 = silhouette_score(
    raf_mean_unsup_gabor_pca_umap_projection_test_10_01,
    y_test_pred_gabor_pca_umap_unsup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_gabor_pca_umap_unsup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_mean_unsup_gabor_pca_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
raf_unsup_gabor_pca_10_01_db_score = davies_bouldin_score(
    raf_mean_unsup_gabor_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_unsup_gabor_pca_10_01_db_score:.2f}")

#### Gabor filters + PCA + UMAP Supervised 10 runs

In [14]:
# Reload the supervised Gabor + PCA + UMAP results from disk, grouped by statistic.

# All runs
raf_sup_gabor_pca_umap_projections_train_10_01 = np.load('raf_sup_gabor_pca_umap_projections_train_10_01.npy')
raf_sup_gabor_pca_umap_projections_test_10_01 = np.load('raf_sup_gabor_pca_umap_projections_test_10_01.npy')

# Mean over runs
raf_mean_sup_gabor_pca_umap_projection_train_10_01 = np.load('raf_mean_sup_gabor_pca_umap_projection_train_10_01.npy')
raf_mean_sup_gabor_pca_umap_projection_test_10_01 = np.load('raf_mean_sup_gabor_pca_umap_projection_test_10_01.npy')

# Standard deviation over runs
raf_std_sup_gabor_pca_umap_projection_train_10_01 = np.load('raf_std_sup_gabor_pca_umap_projection_train_10_01.npy')
raf_std_sup_gabor_pca_umap_projection_test_10_01 = np.load('raf_std_sup_gabor_pca_umap_projection_test_10_01.npy')

In [None]:
# UMAP hyper-parameters and the number of independently seeded repetitions
n_neighbors, min_dist, n_components = 10, 0.1, 2
n_runs = 10

# One embedding per run is collected for each split
raf_sup_gabor_pca_umap_projections_train_10_01 = []
raf_sup_gabor_pca_umap_projections_test_10_01 = []

for run in range(n_runs):
    print(f"Running Supervised UMAP - Iteration {run + 1}/{n_runs}...")

    # The run index is used as the seed, so every repetition is reproducible
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=run)

    # Supervised fit: the training labels are passed alongside the PCA-reduced Gabor features
    projection_train = umap_model.fit_transform(x_raf_train_pca_gabor, y_train)
    raf_sup_gabor_pca_umap_projections_train_10_01.append(projection_train)

    # Project the test features (no labels) with the model fitted in this iteration
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(x_raf_test_pca_gabor)
    raf_sup_gabor_pca_umap_projections_test_10_01.append(projection_test)

# Stack the per-run embeddings; the leading axis indexes the run
raf_sup_gabor_pca_umap_projections_train_10_01 = np.array(raf_sup_gabor_pca_umap_projections_train_10_01)
raf_sup_gabor_pca_umap_projections_test_10_01 = np.array(raf_sup_gabor_pca_umap_projections_test_10_01)

# Per-point mean and standard deviation over the run axis
raf_mean_sup_gabor_pca_umap_projection_train_10_01 = raf_sup_gabor_pca_umap_projections_train_10_01.mean(axis=0)
raf_std_sup_gabor_pca_umap_projection_train_10_01 = raf_sup_gabor_pca_umap_projections_train_10_01.std(axis=0)

raf_mean_sup_gabor_pca_umap_projection_test_10_01 = raf_sup_gabor_pca_umap_projections_test_10_01.mean(axis=0)
raf_std_sup_gabor_pca_umap_projection_test_10_01 = raf_sup_gabor_pca_umap_projections_test_10_01.std(axis=0)

# Persist all runs plus the summary statistics (training files first, then test files)
for _fname, _arr in (
    ('raf_sup_gabor_pca_umap_projections_train_10_01.npy', raf_sup_gabor_pca_umap_projections_train_10_01),
    ('raf_mean_sup_gabor_pca_umap_projection_train_10_01.npy', raf_mean_sup_gabor_pca_umap_projection_train_10_01),
    ('raf_std_sup_gabor_pca_umap_projection_train_10_01.npy', raf_std_sup_gabor_pca_umap_projection_train_10_01),
    ('raf_sup_gabor_pca_umap_projections_test_10_01.npy', raf_sup_gabor_pca_umap_projections_test_10_01),
    ('raf_mean_sup_gabor_pca_umap_projection_test_10_01.npy', raf_mean_sup_gabor_pca_umap_projection_test_10_01),
    ('raf_std_sup_gabor_pca_umap_projection_test_10_01.npy', raf_std_sup_gabor_pca_umap_projection_test_10_01),
):
    np.save(_fname, _arr)

# Output confirmation
print("Supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
# Discrete palette with exactly 7 colors.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in Matplotlib 3.7
# and removed in 3.9; plt.get_cmap accepts the same (name, lut) arguments.
cmap = plt.get_cmap("tab10", 7)

# Scatter the mean (over runs) training embedding, colored by training label
plt.figure(figsize=(10, 8))
scatter = plt.scatter(
    raf_mean_sup_gabor_pca_umap_projection_train_10_01[:, 0],
    raf_mean_sup_gabor_pca_umap_projection_train_10_01[:, 1],
    c=y_train, cmap=cmap, s=5, alpha=0.8
)

# Add title and labels
plt.title("Gabor + PCA + Supervised UMAP Projection of RAFDB Training Data (10 Runs)")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Colorbar with one tick per label value 1..7
cbar = plt.colorbar(scatter)
cbar.set_ticks(range(1, 8))  # Ensure ticks align with labels
cbar.set_ticklabels([f"Emotion {label}" for label in range(1, 8)])  # Customize labels
cbar.set_label("Emotion Labels")

plt.show()

In [None]:
# Predict test labels with a 1-nearest-neighbour classifier trained on the mean
# training embedding. The classifier is fitted once and its predictions are
# reused for both ARI and silhouette (previously it was fitted twice).
y_test_pred_gabor_pca_umap_sup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_mean_sup_gabor_pca_umap_projection_train_10_01, y_train)
    .predict(raf_mean_sup_gabor_pca_umap_projection_test_10_01)
)

# ARI between the true test labels and the 1-NN predictions
ari_raf_gabor_pca_umap_sup_10_01 = adjusted_rand_score(y_test, y_test_pred_gabor_pca_umap_sup_10_01)
print(f"ARI: {ari_raf_gabor_pca_umap_sup_10_01:.2f}")

# Silhouette score of the mean test embedding, grouped by the predicted labels
silhouette_raf_gabor_pca_umap_sup_10_01 = silhouette_score(
    raf_mean_sup_gabor_pca_umap_projection_test_10_01,
    y_test_pred_gabor_pca_umap_sup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_gabor_pca_umap_sup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_mean_sup_gabor_pca_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
raf_sup_gabor_pca_10_01_db_score = davies_bouldin_score(
    raf_mean_sup_gabor_pca_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_sup_gabor_pca_10_01_db_score:.2f}")

---------

-----

## Analysis of RAF-DB & FER2013 datasets combined

In [None]:
# RAF-DB images were loaded again under different variable names (*_RAF)

# Set file paths
# NOTE(review): input_path is an absolute path on the author's machine; adjust it before running elsewhere.
input_path = 'C:/Users/Lorenzo/OneDrive/Documents/DTU/Python/2024 Fall/MSc Thesis'
image_dir = os.path.join(input_path, 'extracted_data/Image/aligned')
# NOTE(review): 'list_patition_label.txt' presumably mirrors the file name shipped with the dataset; verify before "correcting" the spelling.
label_file = os.path.join(input_path, 'extracted_data/EmoLabel/list_patition_label.txt')

# Instantiate the loader and unpack the (images, labels) pairs for the train and test splits
emotions_dataloader = EmotionsDataloader(image_dir, label_file)
(x_train_RAF, y_train_RAF), (x_test_RAF, y_test_RAF) = emotions_dataloader.load_data()

# Print dataset shapes
print(f"Training set shape: {x_train_RAF.shape}, {y_train_RAF.shape}")
print(f"Testing set shape: {x_test_RAF.shape}, {y_test_RAF.shape}")

# Display a grid of images with their titles
def show_images(images, title_texts):
    """Plot images in a 5-column grayscale grid, each with its own title.

    Args:
        images: Sequence of images accepted by ``plt.imshow``.
        title_texts: Sequence of titles, paired with ``images`` by position.
    """
    cols = 5
    # Ceiling division: exactly enough rows for all images. The previous
    # int(len / cols) + 1 reserved an empty extra row whenever the number of
    # images was a multiple of cols. At least one row keeps the grid valid.
    rows = max(1, -(-len(images) // cols))
    plt.figure(figsize=(15, 10))
    for position, (image, title) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, position)
        plt.imshow(image, cmap=plt.cm.gray)
        plt.title(title, fontsize=12)
        plt.axis('off')

# Show some random train and test images
images_to_show = []
titles_to_show = []

# Ten training images drawn uniformly at random (indices may repeat);
# each title records the sampled index and its label
for i in range(10):
    idx = np.random.randint(0, len(x_train_RAF))
    images_to_show.append(x_train_RAF[idx])
    titles_to_show.append(f"Train[{idx}] = {y_train_RAF[idx]}")

# Five test images drawn the same way
for i in range(5):
    idx = np.random.randint(0, len(x_test_RAF))
    images_to_show.append(x_test_RAF[idx])
    titles_to_show.append(f"Test[{idx}] = {y_test_RAF[idx]}")

# Render all 15 sampled images in one grid
show_images(images_to_show, titles_to_show)

In [None]:
def _raf_split_frame(images, labels, usage):
    """Return one RAF split as a DataFrame with emotion, pixels, Dataset and Usage columns.

    Each image is flattened and stored as a single space-separated string.
    """
    pixel_strings = [" ".join(str(value) for value in image.flatten()) for image in images]
    return pd.DataFrame({
        "emotion": labels,
        "pixels": pixel_strings,
        "Dataset": "RAF",
        "Usage": usage,
    })


# Training and testing splits of RAF, tagged through the 'Usage' column
train_raf_df = _raf_split_frame(x_train_RAF, y_train_RAF, "Training")
test_raf_df = _raf_split_frame(x_test_RAF, y_test_RAF, "Testing")

# Stack both splits into one frame with a fresh index
raf_combined_df = pd.concat([train_raf_df, test_raf_df], ignore_index=True)

# Quick structural check of the combined RAF frame
print(raf_combined_df.head())
print("RAF Combined Shape:", raf_combined_df.shape)

In [None]:
# Combine RAF and FER2013 datasets
# NOTE(review): `df` is defined outside this section; presumably the FER2013 DataFrame
# with matching 'emotion', 'pixels', 'Dataset' and 'Usage' columns — verify.
combined_df = pd.concat([raf_combined_df, df], ignore_index=True)

# Verify the combined DataFrame
print("Combined Dataset Shape:", combined_df.shape)
print(combined_df.head())

### Combining both datasets RAF-DB & FER2013

In [None]:
def _pixels_from_text(pixel_text):
    """Parse a whitespace-separated string of integers into a 1-D NumPy array."""
    return np.array([int(token) for token in pixel_text.split()])


# Replace every pixel string in the 'pixels' column by its numeric array
combined_df["pixels"] = combined_df["pixels"].apply(_pixels_from_text)

# Check the shape of the first row to confirm it is flattened
first_image = combined_df["pixels"].iloc[0]
print("Shape of a single image (pixels):", first_image.shape)  # Expected: (2304,)

In [None]:
# Check mean and standard deviation of pixel values
# Stack the per-row pixel arrays into one 2-D matrix (one row per image);
# the statistics below are computed over all entries of that matrix.
pixel_values = np.vstack(combined_df["pixels"].values)
print("Mean of pixel values:", pixel_values.mean())
print("Standard deviation of pixel values:", pixel_values.std())

In [None]:
# StandardScaler is not listed in the notebook's top-level import cell, so it is
# imported here to keep this cell runnable on its own (harmless if already imported).
from sklearn.preprocessing import StandardScaler

# Normalize the pixel values (each pixel position is standardised independently)
# NOTE(review): pixel_values is built from every row of combined_df, i.e. training
# and testing rows together, so the scaler statistics include test data — confirm
# this is intended.
scaler = StandardScaler()
normalized_pixels = scaler.fit_transform(pixel_values)  # Normalize pixel values

# Update the 'pixels' column with normalized values (one 1-D array per row)
combined_df["pixels"] = list(normalized_pixels)

In [None]:
# Relabel both 'PrivateTest' and 'PublicTest' as a single 'Testing' split
_test_usage_mask = combined_df["Usage"].isin(["PrivateTest", "PublicTest"])
combined_df.loc[_test_usage_mask, "Usage"] = "Testing"

# Show the resulting split sizes to confirm the relabelling
print(combined_df["Usage"].value_counts())


In [None]:
# Encode the 'Dataset' column as 0 (FER2013) and 1 (RAF)
# NOTE: values not present in the mapping become NaN, so re-running this cell on the
# already-encoded column would turn every entry into NaN.
combined_df["Dataset"] = combined_df["Dataset"].map({"FER2013": 0, "RAF": 1})

In [None]:
# Sanity check: a non-zero count means some 'Dataset' values were not covered by the encoding
print(combined_df["Dataset"].isna().sum())  # Check for NaN values in the 'Dataset' column

In [None]:
# Sanity check: list the distinct codes left in the 'Dataset' column after encoding
print("Unique values in Dataset column after encoding:", combined_df["Dataset"].unique())

In [None]:
# Partition the rows by 'Usage': "Training" rows versus everything else
_usage = combined_df["Usage"]
merged_train_data = combined_df[_usage == "Training"]
merged_test_data = combined_df[_usage != "Training"]

# Features are the stacked pixel arrays; the target is the dataset-of-origin code
merged_x_train = np.vstack(merged_train_data["pixels"].values)
merged_x_test = np.vstack(merged_test_data["pixels"].values)

merged_y_train = merged_train_data["Dataset"].values
merged_y_test = merged_test_data["Dataset"].values

# Verify shapes
print("merged_x_train shape:", merged_x_train.shape)  # Expected: (num_train_samples, 2304)
print("merged_y_train shape:", merged_y_train.shape)  # Expected: (num_train_samples,)
print("merged_x_test shape:", merged_x_test.shape)    # Expected: (num_test_samples, 2304)
print("merged_y_test shape:", merged_y_test.shape)    # Expected: (num_test_samples,)

------

#### Supervised UMAP

##### n_neighbors = 50

In [None]:
# UMAP hyper-parameters and the number of independently seeded repetitions
n_neighbors, min_dist, n_components = 50, 0.1, 2
n_runs = 10

# One embedding per run is collected for each split
raf_fer_sup_umap_train_50_01 = []
raf_fer_sup_umap_test_50_01 = []

for run in range(n_runs):
    print(f"Running supervised UMAP - Iteration {run + 1}/{n_runs}...")

    # The run index is used as the seed, so every repetition is reproducible
    umap_model = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=run)

    # Supervised fit: the dataset-of-origin codes are passed as the target
    projection_train = umap_model.fit_transform(merged_x_train, merged_y_train)
    raf_fer_sup_umap_train_50_01.append(projection_train)

    # Project the test rows (no labels) with the model fitted in this iteration
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(merged_x_test)
    raf_fer_sup_umap_test_50_01.append(projection_test)

# Stack the per-run embeddings; the leading axis indexes the run
raf_fer_sup_umap_train_50_01 = np.array(raf_fer_sup_umap_train_50_01)
raf_fer_sup_umap_test_50_01 = np.array(raf_fer_sup_umap_test_50_01)

# Per-point mean and standard deviation over the run axis
raf_fer_mean_sup_umap_projection_train_50_01 = raf_fer_sup_umap_train_50_01.mean(axis=0)
raf_fer_std_sup_umap_projection_train_50_01 = raf_fer_sup_umap_train_50_01.std(axis=0)

raf_fer_mean_sup_umap_projection_test_50_01 = raf_fer_sup_umap_test_50_01.mean(axis=0)
raf_fer_std_sup_umap_projection_test_50_01 = raf_fer_sup_umap_test_50_01.std(axis=0)

# Persist all runs plus the summary statistics (training files first, then test files)
for _fname, _arr in (
    ('raf_fer_sup_umap_train_50_01.npy', raf_fer_sup_umap_train_50_01),
    ('raf_fer_mean_sup_umap_projection_train_50_01.npy', raf_fer_mean_sup_umap_projection_train_50_01),
    ('raf_fer_std_sup_umap_projection_train_50_01.npy', raf_fer_std_sup_umap_projection_train_50_01),
    ('raf_fer_sup_umap_test_50_01.npy', raf_fer_sup_umap_test_50_01),
    ('raf_fer_mean_sup_umap_projection_test_50_01.npy', raf_fer_mean_sup_umap_projection_test_50_01),
    ('raf_fer_std_sup_umap_projection_test_50_01.npy', raf_fer_std_sup_umap_projection_test_50_01),
):
    np.save(_fname, _arr)

# Output confirmation
print("supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
from matplotlib.colors import ListedColormap

# Two-colour map for the binary source label:
# light green for the lower label value, pink for the higher one
custom_cmap = ListedColormap(["lightgreen", "pink"])

# Draw the run-averaged supervised UMAP training embedding on an explicit Axes,
# colouring every point by the dataset it originally came from
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    raf_fer_mean_sup_umap_projection_train_50_01[:, 0],
    raf_fer_mean_sup_umap_projection_train_50_01[:, 1],
    c=merged_y_train,
    cmap=custom_cmap,
    s=5,
    alpha=0.8,
)

# Title and axis names
ax.set_title("n_neighbours=50 Supervised UMAP Projection of RAFDB-FER2013 Merged Training Data (10 Runs)")
ax.set_xlabel("UMAP Component 1")
ax.set_ylabel("UMAP Component 2")

# Colorbar with exactly one tick (and one readable name) per source label
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_ticks([0, 1])
cbar.set_ticklabels(["Original source 0 - (FER2013)", "Original source 1 - (RAF-DB)"])
cbar.set_label("Image Original Source Labels")

plt.show()

In [None]:
# Quality metrics for the run-averaged supervised UMAP embedding (n_neighbors=50).

# Predict the test labels ONCE with a 1-nearest-neighbour classifier fitted on
# the mean training projection; ARI and the silhouette score both reuse them
# (previously the identical classifier was fitted and evaluated twice).
y_test_pred_umap_sup_50_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_fer_mean_sup_umap_projection_train_50_01, merged_y_train)
    .predict(raf_fer_mean_sup_umap_projection_test_50_01)
)

# ARI: agreement between the true test labels and the 1-NN predictions
ari_raf_fer_umap_sup_50_01 = adjusted_rand_score(merged_y_test, y_test_pred_umap_sup_50_01)
print(f"ARI: {ari_raf_fer_umap_sup_50_01:.4f}")

# Silhouette Score: mean test projection grouped by the predicted labels
silhouette_raf_fer_umap_sup_50_01 = silhouette_score(
    raf_fer_mean_sup_umap_projection_test_50_01,
    y_test_pred_umap_sup_50_01,
)
print(f"Silhouette Score: {silhouette_raf_fer_umap_sup_50_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(merged_y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_fer_mean_sup_umap_projection_train_50_01)

# Compute Davies-Bouldin Index
raf_fer_sup_umap_projection_50_01_db_score = davies_bouldin_score(
    raf_fer_mean_sup_umap_projection_train_50_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_fer_sup_umap_projection_50_01_db_score:.2f}")

----------

##### n_neighbors = 10

In [None]:
# Reload the previously saved supervised UMAP results (n_neighbors=10) from the
# working directory: per-run projections, their mean and their standard deviation.

# Training set
(
    raf_fer_sup_umap_train_10_01,
    raf_fer_mean_sup_umap_projection_train_10_01,
    raf_fer_std_sup_umap_projection_train_10_01,
) = (
    np.load('raf_fer_sup_umap_train_10_01.npy'),
    np.load('raf_fer_mean_sup_umap_projection_train_10_01.npy'),
    np.load('raf_fer_std_sup_umap_projection_train_10_01.npy'),
)

# Test set
(
    raf_fer_sup_umap_test_10_01,
    raf_fer_mean_sup_umap_projection_test_10_01,
    raf_fer_std_sup_umap_projection_test_10_01,
) = (
    np.load('raf_fer_sup_umap_test_10_01.npy'),
    np.load('raf_fer_mean_sup_umap_projection_test_10_01.npy'),
    np.load('raf_fer_std_sup_umap_projection_test_10_01.npy'),
)

In [None]:
# Supervised UMAP settings and the number of repeated runs
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10

# Per-run embeddings are accumulated here and stacked once the loop is done
raf_fer_sup_umap_train_10_01 = []
raf_fer_sup_umap_test_10_01 = []

for run in range(n_runs):
    print(f"Running Supervised UMAP - Iteration {run + 1}/{n_runs}...")

    # The run index doubles as the random seed of this run
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Label-guided fit on the training split; keep the resulting embedding
    projection_train = umap_model.fit_transform(merged_x_train, merged_y_train)
    raf_fer_sup_umap_train_10_01.append(projection_train)

    # Project the test split with the model fitted above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(merged_x_test)
    raf_fer_sup_umap_test_10_01.append(projection_test)

# Stack the per-run embeddings into single arrays (run index on axis 0)
raf_fer_sup_umap_train_10_01 = np.stack(raf_fer_sup_umap_train_10_01)
raf_fer_sup_umap_test_10_01 = np.stack(raf_fer_sup_umap_test_10_01)

# Element-wise mean and standard deviation over the runs.
# NOTE(review): every run uses a different seed, so the embeddings may not share
# an orientation/scale - confirm that plain element-wise averaging is intended.
raf_fer_mean_sup_umap_projection_train_10_01 = raf_fer_sup_umap_train_10_01.mean(axis=0)
raf_fer_std_sup_umap_projection_train_10_01 = raf_fer_sup_umap_train_10_01.std(axis=0)

raf_fer_mean_sup_umap_projection_test_10_01 = raf_fer_sup_umap_test_10_01.mean(axis=0)
raf_fer_std_sup_umap_projection_test_10_01 = raf_fer_sup_umap_test_10_01.std(axis=0)

# Persist everything (training arrays first, then test arrays)
for _file_name, _values in (
    ('raf_fer_sup_umap_train_10_01.npy', raf_fer_sup_umap_train_10_01),
    ('raf_fer_mean_sup_umap_projection_train_10_01.npy', raf_fer_mean_sup_umap_projection_train_10_01),
    ('raf_fer_std_sup_umap_projection_train_10_01.npy', raf_fer_std_sup_umap_projection_train_10_01),
    ('raf_fer_sup_umap_test_10_01.npy', raf_fer_sup_umap_test_10_01),
    ('raf_fer_mean_sup_umap_projection_test_10_01.npy', raf_fer_mean_sup_umap_projection_test_10_01),
    ('raf_fer_std_sup_umap_projection_test_10_01.npy', raf_fer_std_sup_umap_projection_test_10_01),
):
    np.save(_file_name, _values)

# Output confirmation
print("Supervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
from matplotlib.colors import ListedColormap

# Two-colour map for the binary source label:
# light green for the lower label value, blue for the higher one
custom_cmap = ListedColormap(["lightgreen", "blue"])

# Draw the run-averaged supervised UMAP training embedding on an explicit Axes,
# colouring every point by the dataset it originally came from
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    raf_fer_mean_sup_umap_projection_train_10_01[:, 0],
    raf_fer_mean_sup_umap_projection_train_10_01[:, 1],
    c=merged_y_train,
    cmap=custom_cmap,
    s=5,
    alpha=0.8,
)

# Title and axis names
ax.set_title("Supervised UMAP Projection of RAFDB-FER2013 Merged Training Data (10 Runs)")
ax.set_xlabel("UMAP Component 1")
ax.set_ylabel("UMAP Component 2")

# Colorbar with exactly one tick (and one readable name) per source label
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_ticks([0, 1])
cbar.set_ticklabels(["Original source 0 - (FER2013)", "Original source 1 - (RAF-DB)"])
cbar.set_label("Image Original Source Labels")

plt.show()


In [None]:
# Quality metrics for the run-averaged supervised UMAP embedding (n_neighbors=10).

# Predict the test labels ONCE with a 1-nearest-neighbour classifier fitted on
# the mean training projection; ARI and the silhouette score both reuse them
# (previously the identical classifier was fitted and evaluated twice).
y_test_pred_umap_sup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_fer_mean_sup_umap_projection_train_10_01, merged_y_train)
    .predict(raf_fer_mean_sup_umap_projection_test_10_01)
)

# ARI: agreement between the true test labels and the 1-NN predictions
ari_raf_fer_umap_sup_10_01 = adjusted_rand_score(merged_y_test, y_test_pred_umap_sup_10_01)
print(f"ARI: {ari_raf_fer_umap_sup_10_01:.4f}")

# Silhouette Score: mean test projection grouped by the predicted labels
silhouette_raf_fer_umap_sup_10_01 = silhouette_score(
    raf_fer_mean_sup_umap_projection_test_10_01,
    y_test_pred_umap_sup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_fer_umap_sup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(merged_y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_fer_mean_sup_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
raf_fer_sup_umap_projection_10_01_db_score = davies_bouldin_score(
    raf_fer_mean_sup_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_fer_sup_umap_projection_10_01_db_score:.2f}")

-------

#### Unsupervised UMAP

##### n_neighbors = 50

In [None]:
# Unsupervised UMAP settings and the number of repeated runs
n_neighbors = 50
min_dist = 0.1
n_components = 2
n_runs = 10

# Per-run embeddings are accumulated here and stacked once the loop is done
raf_fer_unsup_umap_train_50_01 = []
raf_fer_unsup_umap_test_50_01 = []

for run in range(n_runs):
    print(f"Running unsupervised UMAP - Iteration {run + 1}/{n_runs}...")

    # The run index doubles as the random seed of this run
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Fit on the training split only (no labels are passed); keep the embedding
    projection_train = umap_model.fit_transform(merged_x_train)
    raf_fer_unsup_umap_train_50_01.append(projection_train)

    # Project the test split with the model fitted above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(merged_x_test)
    raf_fer_unsup_umap_test_50_01.append(projection_test)

# Stack the per-run embeddings into single arrays (run index on axis 0)
raf_fer_unsup_umap_train_50_01 = np.stack(raf_fer_unsup_umap_train_50_01)
raf_fer_unsup_umap_test_50_01 = np.stack(raf_fer_unsup_umap_test_50_01)

# Element-wise mean and standard deviation over the runs.
# NOTE(review): every run uses a different seed, so the embeddings may not share
# an orientation/scale - confirm that plain element-wise averaging is intended.
raf_fer_mean_unsup_umap_projection_train_50_01 = raf_fer_unsup_umap_train_50_01.mean(axis=0)
raf_fer_std_unsup_umap_projection_train_50_01 = raf_fer_unsup_umap_train_50_01.std(axis=0)

raf_fer_mean_unsup_umap_projection_test_50_01 = raf_fer_unsup_umap_test_50_01.mean(axis=0)
raf_fer_std_unsup_umap_projection_test_50_01 = raf_fer_unsup_umap_test_50_01.std(axis=0)

# Persist everything (training arrays first, then test arrays)
for _file_name, _values in (
    ('raf_fer_unsup_umap_train_50_01.npy', raf_fer_unsup_umap_train_50_01),
    ('raf_fer_mean_unsup_umap_projection_train_50_01.npy', raf_fer_mean_unsup_umap_projection_train_50_01),
    ('raf_fer_std_unsup_umap_projection_train_50_01.npy', raf_fer_std_unsup_umap_projection_train_50_01),
    ('raf_fer_unsup_umap_test_50_01.npy', raf_fer_unsup_umap_test_50_01),
    ('raf_fer_mean_unsup_umap_projection_test_50_01.npy', raf_fer_mean_unsup_umap_projection_test_50_01),
    ('raf_fer_std_unsup_umap_projection_test_50_01.npy', raf_fer_std_unsup_umap_projection_test_50_01),
):
    np.save(_file_name, _values)

# Output confirmation
print("unsupervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
from matplotlib.colors import ListedColormap

# Two-colour map for the binary source label:
# light green for the lower label value, pink for the higher one
custom_cmap = ListedColormap(["lightgreen", "pink"])

# Draw the run-averaged unsupervised UMAP training embedding on an explicit Axes,
# colouring every point by the dataset it originally came from
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    raf_fer_mean_unsup_umap_projection_train_50_01[:, 0],
    raf_fer_mean_unsup_umap_projection_train_50_01[:, 1],
    c=merged_y_train,
    cmap=custom_cmap,
    s=5,
    alpha=0.8,
)

# Title and axis names
ax.set_title("n_neighbours=50 Unsupervised UMAP Projection of RAFDB-FER2013 Merged Training Data (10 Runs)")
ax.set_xlabel("UMAP Component 1")
ax.set_ylabel("UMAP Component 2")

# Colorbar with exactly one tick (and one readable name) per source label
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_ticks([0, 1])
cbar.set_ticklabels(["Original source 0 - (FER2013)", "Original source 1 - (RAF-DB)"])
cbar.set_label("Image Original Source Labels")

plt.show()

In [None]:
# Quality metrics for the run-averaged unsupervised UMAP embedding (n_neighbors=50).

# Predict the test labels ONCE with a 1-nearest-neighbour classifier fitted on
# the mean training projection; ARI and the silhouette score both reuse them
# (previously the identical classifier was fitted and evaluated twice).
y_test_pred_umap_unsup_50_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_fer_mean_unsup_umap_projection_train_50_01, merged_y_train)
    .predict(raf_fer_mean_unsup_umap_projection_test_50_01)
)

# ARI: agreement between the true test labels and the 1-NN predictions
ari_raf_fer_umap_unsup_50_01 = adjusted_rand_score(merged_y_test, y_test_pred_umap_unsup_50_01)
print(f"ARI: {ari_raf_fer_umap_unsup_50_01:.4f}")

# Silhouette Score: mean test projection grouped by the predicted labels
silhouette_raf_fer_umap_unsup_50_01 = silhouette_score(
    raf_fer_mean_unsup_umap_projection_test_50_01,
    y_test_pred_umap_unsup_50_01,
)
print(f"Silhouette Score: {silhouette_raf_fer_umap_unsup_50_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(merged_y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_fer_mean_unsup_umap_projection_train_50_01)

# Compute Davies-Bouldin Index
raf_fer_unsup_umap_projection_50_01_db_score = davies_bouldin_score(
    raf_fer_mean_unsup_umap_projection_train_50_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_fer_unsup_umap_projection_50_01_db_score:.2f}")

-------

##### n_neighbors = 10

In [None]:
# Reload the previously saved unsupervised UMAP results (n_neighbors=10) from the
# working directory: per-run projections, their mean and their standard deviation.

# Training set
(
    raf_fer_unsup_umap_train_10_01,
    raf_fer_mean_unsup_umap_projection_train_10_01,
    raf_fer_std_unsup_umap_projection_train_10_01,
) = (
    np.load('raf_fer_unsup_umap_train_10_01.npy'),
    np.load('raf_fer_mean_unsup_umap_projection_train_10_01.npy'),
    np.load('raf_fer_std_unsup_umap_projection_train_10_01.npy'),
)

# Test set
(
    raf_fer_unsup_umap_test_10_01,
    raf_fer_mean_unsup_umap_projection_test_10_01,
    raf_fer_std_unsup_umap_projection_test_10_01,
) = (
    np.load('raf_fer_unsup_umap_test_10_01.npy'),
    np.load('raf_fer_mean_unsup_umap_projection_test_10_01.npy'),
    np.load('raf_fer_std_unsup_umap_projection_test_10_01.npy'),
)

In [None]:
# Unsupervised UMAP settings and the number of repeated runs
n_neighbors = 10
min_dist = 0.1
n_components = 2
n_runs = 10

# Per-run embeddings are accumulated here and stacked once the loop is done
raf_fer_unsup_umap_train_10_01 = []
raf_fer_unsup_umap_test_10_01 = []

for run in range(n_runs):
    print(f"Running unsupervised UMAP - Iteration {run + 1}/{n_runs}...")

    # The run index doubles as the random seed of this run
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=run,
    )

    # Fit on the training split only (no labels are passed); keep the embedding
    projection_train = umap_model.fit_transform(merged_x_train)
    raf_fer_unsup_umap_train_10_01.append(projection_train)

    # Project the test split with the model fitted above
    print(f"Running UMAP on Test Set - Iteration {run + 1}/{n_runs}...")
    projection_test = umap_model.transform(merged_x_test)
    raf_fer_unsup_umap_test_10_01.append(projection_test)

# Stack the per-run embeddings into single arrays (run index on axis 0)
raf_fer_unsup_umap_train_10_01 = np.stack(raf_fer_unsup_umap_train_10_01)
raf_fer_unsup_umap_test_10_01 = np.stack(raf_fer_unsup_umap_test_10_01)

# Element-wise mean and standard deviation over the runs.
# NOTE(review): every run uses a different seed, so the embeddings may not share
# an orientation/scale - confirm that plain element-wise averaging is intended.
raf_fer_mean_unsup_umap_projection_train_10_01 = raf_fer_unsup_umap_train_10_01.mean(axis=0)
raf_fer_std_unsup_umap_projection_train_10_01 = raf_fer_unsup_umap_train_10_01.std(axis=0)

raf_fer_mean_unsup_umap_projection_test_10_01 = raf_fer_unsup_umap_test_10_01.mean(axis=0)
raf_fer_std_unsup_umap_projection_test_10_01 = raf_fer_unsup_umap_test_10_01.std(axis=0)

# Persist everything (training arrays first, then test arrays)
for _file_name, _values in (
    ('raf_fer_unsup_umap_train_10_01.npy', raf_fer_unsup_umap_train_10_01),
    ('raf_fer_mean_unsup_umap_projection_train_10_01.npy', raf_fer_mean_unsup_umap_projection_train_10_01),
    ('raf_fer_std_unsup_umap_projection_train_10_01.npy', raf_fer_std_unsup_umap_projection_train_10_01),
    ('raf_fer_unsup_umap_test_10_01.npy', raf_fer_unsup_umap_test_10_01),
    ('raf_fer_mean_unsup_umap_projection_test_10_01.npy', raf_fer_mean_unsup_umap_projection_test_10_01),
    ('raf_fer_std_unsup_umap_projection_test_10_01.npy', raf_fer_std_unsup_umap_projection_test_10_01),
):
    np.save(_file_name, _values)

# Output confirmation
print("unsupervised UMAP projections for training and test sets, mean, and standard deviations have been saved.")

In [None]:
from matplotlib.colors import ListedColormap

# Two-colour map for the binary source label:
# light green for the lower label value, blue for the higher one
custom_cmap = ListedColormap(["lightgreen", "blue"])

# Draw the run-averaged unsupervised UMAP training embedding on an explicit Axes,
# colouring every point by the dataset it originally came from
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    raf_fer_mean_unsup_umap_projection_train_10_01[:, 0],
    raf_fer_mean_unsup_umap_projection_train_10_01[:, 1],
    c=merged_y_train,
    cmap=custom_cmap,
    s=5,
    alpha=0.8,
)

# Title and axis names
ax.set_title("Unsupervised UMAP Projection of RAFDB-FER2013 Merged Training Data (10 Runs)")
ax.set_xlabel("UMAP Component 1")
ax.set_ylabel("UMAP Component 2")

# Colorbar with exactly one tick (and one readable name) per source label
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_ticks([0, 1])
cbar.set_ticklabels(["Original source 0 - (FER2013)", "Original source 1 - (RAF-DB)"])
cbar.set_label("Image Original Source Labels")

plt.show()

In [None]:
# Quality metrics for the run-averaged unsupervised UMAP embedding (n_neighbors=10).

# Predict the test labels ONCE with a 1-nearest-neighbour classifier fitted on
# the mean training projection; ARI and the silhouette score both reuse them
# (previously the identical classifier was fitted and evaluated twice).
y_test_pred_umap_unsup_10_01 = (
    KNeighborsClassifier(n_neighbors=1)
    .fit(raf_fer_mean_unsup_umap_projection_train_10_01, merged_y_train)
    .predict(raf_fer_mean_unsup_umap_projection_test_10_01)
)

# ARI: agreement between the true test labels and the 1-NN predictions
ari_raf_fer_umap_unsup_10_01 = adjusted_rand_score(merged_y_test, y_test_pred_umap_unsup_10_01)
print(f"ARI: {ari_raf_fer_umap_unsup_10_01:.4f}")

# Silhouette Score: mean test projection grouped by the predicted labels
silhouette_raf_fer_umap_unsup_10_01 = silhouette_score(
    raf_fer_mean_unsup_umap_projection_test_10_01,
    y_test_pred_umap_unsup_10_01,
)
print(f"Silhouette Score: {silhouette_raf_fer_umap_unsup_10_01:.2f}")

# Use KMeans for clustering
n_clusters = len(np.unique(merged_y_train))  # Number of clusters = number of unique labels
kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# Fit KMeans on the training set UMAP projections
predicted_labels = kmeans.fit_predict(raf_fer_mean_unsup_umap_projection_train_10_01)

# Compute Davies-Bouldin Index
raf_fer_unsup_umap_projection_10_01_db_score = davies_bouldin_score(
    raf_fer_mean_unsup_umap_projection_train_10_01,  # Training set mean projections
    predicted_labels  # Labels from KMeans clustering
)

print(f"Davies-Bouldin Index: {raf_fer_unsup_umap_projection_10_01_db_score:.2f}")

In [None]:
# Side-by-side comparison of the run-averaged unsupervised (left) and supervised
# (right) UMAP training embeddings for n_neighbors=10, coloured by source label.

# Imported here, like in the other plotting cells, so this cell also runs on its
# own (e.g. after a kernel restart) instead of failing with a NameError.
from matplotlib.colors import ListedColormap

# Define the custom colormap with green and blue
custom_cmap = ListedColormap(["lightgreen", "blue"])

# Create a figure with two subplots side by side
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Plot the unsupervised UMAP
scatter1 = axes[0].scatter(
    raf_fer_mean_unsup_umap_projection_train_10_01[:, 0],
    raf_fer_mean_unsup_umap_projection_train_10_01[:, 1],
    c=merged_y_train,
    cmap=custom_cmap,
    s=5,
    alpha=0.8
)
axes[0].set_title("Unsupervised UMAP Projection of RAFDB-FER2013 Merged Training Data (10 Runs)")
axes[0].set_xlabel("UMAP Component 1")
axes[0].set_ylabel("UMAP Component 2")

# Plot the supervised UMAP
scatter2 = axes[1].scatter(
    raf_fer_mean_sup_umap_projection_train_10_01[:, 0],
    raf_fer_mean_sup_umap_projection_train_10_01[:, 1],
    c=merged_y_train,
    cmap=custom_cmap,
    s=5,
    alpha=0.8
)
axes[1].set_title("Supervised UMAP Projection of RAFDB-FER2013 Merged Training Data (10 Runs)")
axes[1].set_xlabel("UMAP Component 1")
axes[1].set_ylabel("UMAP Component 2")

# A single colorbar, attached to the right-hand panel; both panels use the same
# colormap and the same label array, so it describes the left panel as well
cbar2 = fig.colorbar(scatter2, ax=axes[1], orientation="vertical")
cbar2.set_ticks([0, 1])  # Ensure ticks align with both labels
cbar2.set_ticklabels(["Original source 0 - (FER2013)", "Original source 1 - (RAF-DB)"])  # Customize labels
cbar2.set_label("Image Original Source Labels")

# Adjust layout
plt.tight_layout()
plt.show()

---------

------