## Running the MIDAA on the Beta Cells dataset

In [10]:

import midaa as maa
import scanpy as sc
import pyro
import numpy as np
import magic
import scprep
from sklearn import decomposition
import torch
from scipy.spatial.distance import squareform, pdist



### Load the dataset

In [11]:

# 1. Load Data
print("Loading single-cell dataset...")
adata_beta_hfd = sc.read_h5ad('data/beta_cells_hfd.h5ad')

# show size of dataset
print(f"Dataset shape: {adata_beta_hfd.X.shape}")

adata_beta_hfd

Loading single-cell dataset...
Dataset shape: (3887, 16483)


AnnData object with n_obs × n_vars = 3887 × 16483
    obs: 'sample'
    var: 'human_name', 'ids'

### Subsample

In [12]:
# Dataset is too big, subsample for faster testing
adata_beta_hfd = adata_beta_hfd[:1000, :]
print(f"Subsampled dataset shape: {adata_beta_hfd.X.shape}")

Subsampled dataset shape: (1000, 16483)


### Apply Magic
Result: hfd_magic → imputed gene expression matrix with fewer zeros.

In [13]:
hfd_magic_op = magic.MAGIC(random_state=42, t=10)
hfd_magic = hfd_magic_op.fit_transform(adata_beta_hfd.to_df())

Calculating MAGIC...
  Running MAGIC on 1000 cells and 16483 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...




    Calculated PCA in 3.72 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.02 seconds.
    Calculating affinities...
    Calculated affinities in 0.05 seconds.
  Calculated graph and diffusion operator in 3.80 seconds.
  Running MAGIC with `solver='exact'` on 16483-dimensional data may take a long time. Consider denoising specific genes with `genes=<list-like>` or using `solver='approximate'`.
  Calculating imputation...
  Calculated imputation in 0.40 seconds.
Calculated MAGIC in 4.25 seconds.


In [14]:
hfd_pc_op = decomposition.PCA(n_components=20, random_state=42)
hfd_magic_pc = hfd_pc_op.fit_transform(hfd_magic)

# Normalize for AAnet model
hfd_magic_pc_norm = hfd_magic_pc / np.std(hfd_magic_pc[:, 0])

# Print everything to debug
print(f"Magic Imputed Data Shape: {hfd_magic.shape}")
print(f"PCA Reduced Data Shape: {hfd_magic_pc.shape}")
print(f"Normalized Data Shape: {hfd_magic_pc_norm.shape}")

Magic Imputed Data Shape: (1000, 16483)
PCA Reduced Data Shape: (1000, 20)
Normalized Data Shape: (1000, 20)


In [15]:


# hfd_magic_pc_norm is (n_cells x n_pcs)
adata_pca = sc.AnnData(X=hfd_magic_pc_norm)
print(f"AnnData object shape: {adata_pca.X.shape}")

# Tell MIDAA to treat it as normalized Gaussian data
input_matrix, norm_factors, input_distribution = maa.get_input_params_adata(adata_pca)

# Override distribution if needed
input_distribution = ["G"]  


AnnData object shape: (1000, 20)


In [None]:
import torch
import numpy as np

N_RUNS = 5
N_ARCHETYPES = 4

S_list = []  # Will store: archetype × samples (membership weights transposed)
C_list = []  # Will store: samples × archetypes (membership weights)
Archetypes_list = []  # Will store: archetypes × PCs (archetype coordinates)
L_list = []  # ELBO list

for run in range(N_RUNS):
    print(f"Run {run+1}/{N_RUNS}")

    # Fit MIDAA with correct parameters
    aa_result = maa.fit_MIDAA(
        input_matrix,  # Use the variable you already created
        norm_factors,  # Use the variable you already created
        input_distribution,  # Use the variable you already created
        narchetypes=N_ARCHETYPES,
        torch_seed=42 + run,
        steps=2000,
        # Remove nlatent - MIDAA infers this from input_matrix
    )

    # Extract the inferred quantities
    iq = aa_result["inferred_quantities"]
    
    # Get the membership matrix (A in MIDAA notation)
    # This should be samples × archetypes
    A_np = iq["A"]  # Should already be numpy array
    
    # Get the archetype coordinates
    C_archetypes = iq["archetypes_inferred"]  # archetypes × features
    
    # For stability metrics format:
    # C: samples × archetypes (membership matrix)
    C = A_np
    
    # S: archetypes × samples (transposed membership)
    S = C.T
    
    # Append to lists
    C_list.append(C)
    S_list.append(S)
    Archetypes_list.append(C_archetypes)
    L_list.append(aa_result.get("elbo_list", None))  # May not exist
    
    print(f"Run {run+1} - C shape: {C.shape}, S shape: {S.shape}, Archetypes shape: {C_archetypes.shape}")

Run 1/5


ELBO: 21327.84180  : 100%|██████████| 2000/2000 [02:17<00:00, 14.57it/s]


Run 1 - C shape: (1000, 4), S shape: (4, 1000), Archetypes shape: (4, 3)
Run 2/5


ELBO: 46625.48438  : 100%|██████████| 2000/2000 [02:29<00:00, 13.38it/s]


Run 2 - C shape: (1000, 4), S shape: (4, 1000), Archetypes shape: (4, 3)
Run 3/5


ELBO: 26475.56836  : 100%|██████████| 2000/2000 [02:24<00:00, 13.80it/s]


Run 3 - C shape: (1000, 4), S shape: (4, 1000), Archetypes shape: (4, 3)
Run 4/5


ELBO: 24286.65430  : 100%|██████████| 2000/2000 [02:28<00:00, 13.43it/s]


Run 4 - C shape: (1000, 4), S shape: (4, 1000), Archetypes shape: (4, 3)
Run 5/5


ELBO: 29243.59180  : 100%|██████████| 2000/2000 [02:35<00:00, 12.90it/s]


Run 5 - C shape: (1000, 4), S shape: (4, 1000), Archetypes shape: (4, 3)


RuntimeError: Parent directory results/beta_cells does not exist.

In [17]:
SAVE_PATH = "/Users/joaomata/Desktop/DTU/DeepLearning/ProjectDL/Midaa/results/midaa_aligned_5runs.pth"

#save all runs
torch.save({
    'C': C_list, 
    'S': S_list, 
    'Archetypes': Archetypes_list,
    'L': L_list,
    'X': hfd_magic_pc_norm
}, SAVE_PATH)

print(f"\nSaved 5-run MIDAA results to {SAVE_PATH}")

# Print summary
print("\nSummary of saved results:")
print(f"Number of runs: {len(C_list)}")
print(f"C[0] shape (samples × archetypes): {C_list[0].shape}")
print(f"S[0] shape (archetypes × samples): {S_list[0].shape}")
print(f"Archetypes[0] shape (archetypes × features): {Archetypes_list[0].shape}")



Saved 5-run MIDAA results to /Users/joaomata/Desktop/DTU/DeepLearning/ProjectDL/Midaa/results/midaa_aligned_5runs.pth

Summary of saved results:
Number of runs: 5
C[0] shape (samples × archetypes): (1000, 4)
S[0] shape (archetypes × samples): (4, 1000)
Archetypes[0] shape (archetypes × features): (4, 3)


## Visualize the results

In [20]:
# Open the saved results to verify
results = torch.load(SAVE_PATH, weights_only=False)
print("Saved Results Keys:", results.keys())
print("Number of Runs Saved:", len(results['C']))

# Print shapes if the first run to verify
print("Shape of C in first run:", results['C'][0].shape)
print("Shape of S in first run:", results['S'][0].shape)


Saved Results Keys: dict_keys(['C', 'S', 'Archetypes', 'L', 'X'])
Number of Runs Saved: 5
Shape of C in first run: (1000, 4)
Shape of S in first run: (4, 1000)


In [21]:
# Add this helper function (same as in your AAnet notebook)
def to_numpy(tensor):
    if isinstance(tensor, torch.Tensor):
        return tensor.detach().cpu().numpy()
    return np.array(tensor) if not isinstance(tensor, np.ndarray) else tensor


def preprocess(X):
    meanX = np.mean(X, axis=0)
    X_centered = X - meanX
    mSST = np.sum(np.mean(X_centered**2, axis=0))
    return X_centered, mSST


def calcMI(z1, z2):
    eps = 10e-16
    P = z1 @ z2.T
    PXY = P / P.sum()
    PXPY = np.outer(np.expand_dims(PXY.sum(1), axis=0), np.expand_dims(PXY.sum(0), axis=1))
    ind = np.nonzero(PXY > 0)
    MI = np.sum(PXY * np.log(eps + PXY / (eps + PXPY)))
    return MI

def calcNMI(z1, z2):
    NMI = (2 * calcMI(z1, z2)) / (calcMI(z1, z1) + calcMI(z2, z2))
    return NMI

def ArchetypeConsistency(XC1, XC2, mSST):
    D = squareform(pdist(np.hstack((XC1, XC2)).T, 'euclidean'))**2
    D = D[:XC1.shape[1], XC1.shape[1]:]
    # Greedy matching
    i = []
    j = []
    v = []
    K = XC1.shape[1]
    for k in range(K):
        min_index = np.unravel_index(np.argmin(D, axis=None), D.shape)
        i.append(min_index[0])
        j.append(min_index[1])
        v.append(D[i[-1], j[-1]])
        D[i[-1], :] = np.inf
        D[:, j[-1]] = np.inf
    consistency = 1 - np.mean(v) / mSST
    D2 = np.abs(np.corrcoef(np.hstack((XC1, XC2)).T))
    D2 = D2[:K, K:]
    ISI = 1 / (2 * K * (K - 1)) * (np.sum(D2 / np.max(D2, axis=1, keepdims=True) + D2 / np.max(D2, axis=0, keepdims=True)) - 2 * K)
    return consistency, ISI

# Load and prepare data
results = torch.load('results/midaa_SC_matrices_5runs.pth', weights_only=False)
C_matrices = [to_numpy(C) for C in results['C']]
S_matrices = [to_numpy(S) for S in results['S']]

# Your raw data (same preprocessing as AAnet)
X_raw = to_numpy(hfd_magic_pc_norm)  # Shape: (1000, 20)

# Now use the same stability functions from AAnet notebook
from itertools import combinations

def compute_stability_metrics(S_list, C_list, X_raw, N_RUNS):
    """Same function from AAnet notebook"""
    X_centered, mSST_val = preprocess(X_raw)

    nmi_matrix = np.zeros((N_RUNS, N_RUNS))
    consistency_matrix = np.zeros((N_RUNS, N_RUNS))
    isi_matrix = np.zeros((N_RUNS, N_RUNS))

    for i, j in combinations(range(N_RUNS), 2):
        S_i, S_j = S_list[i], S_list[j]
        C_i, C_j = C_list[i], C_list[j]

        nmi_matrix[i, j] = calcNMI(S_i, S_j)
        consistency_matrix[i, j], isi_matrix[i, j] = ArchetypeConsistency(C_i, C_j, mSST_val)

    # Fill symmetric and diagonal
    nmi_matrix += nmi_matrix.T
    consistency_matrix += consistency_matrix.T
    isi_matrix += isi_matrix.T
    np.fill_diagonal(nmi_matrix, 1)
    np.fill_diagonal(consistency_matrix, 1)
    np.fill_diagonal(isi_matrix, 1)

    return nmi_matrix, consistency_matrix, isi_matrix

# Compute metrics
nmi_midaa, consistency_midaa, isi_midaa = compute_stability_metrics(
    S_matrices, C_matrices, X_raw, N_RUNS
)

print("---")
print("MIDAA 5-run stability:")
print("---")
print("Mean NMI:", np.mean(nmi_midaa[np.triu_indices(N_RUNS, 1)]))
print("Mean Archetype Consistency:", np.mean(consistency_midaa[np.triu_indices(N_RUNS, 1)]))
print("Mean ISI:", np.mean(isi_midaa[np.triu_indices(N_RUNS, 1)]))

---
MIDAA 5-run stability:
---
Mean NMI: 0.4210680165500024
Mean Archetype Consistency: -100.7338648653396
Mean ISI: 0.4489291919328295
