In [1]:
# Imports
import warnings
# Silence every warning for the rest of the session. This keeps cell output
# compact, but it also hides deprecation/convergence warnings raised by the
# libraries imported below.
warnings.filterwarnings("ignore")

import sys
# Append the parent directory to the module search path; presumably this is
# what makes the project-local `utils` package importable below -- confirm
# against the repository layout.
sys.path.append('..')

import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import numpy as np
import MENDER as mender

# Project-local helper used to fetch/load the dataset.
from utils.data_loader import DataLoader

In [2]:
# Fetch the MERSCOPE dataset from SODB through the project loader.
loader = DataLoader("Dataset13_MS_raw")
adata_dict = loader.load()

# `load()` returns a mapping of AnnData-like objects (presumably keyed by
# experiment name); keep the first entry as the working object.
experiments = list(adata_dict.values())
adata = experiments[0]

Loading dataset: Dataset13_MS_raw
download experiment[Dataset13] in dataset[Dataset13_MS_raw]


100%|██████████| 1.64G/1.64G [22:23<00:00, 1.31MB/s]  


load experiment[Dataset13] in dataset[Dataset13_MS_raw]
Dataset loaded successfully.


In [5]:
# Show each distinct (batch, slice_id) combination found in the cell metadata
adata.obs.loc[:, ['batch', 'slice_id']].drop_duplicates()


Unnamed: 0,batch,slice_id
110883424764611924400221639916314253469-0,0,R1S1
277373730858255322904479591336292143718-1,1,R2S1
139968683432966769265787739231843442191-2,2,R3S1
149164679103246548309819743981609972453-3,3,R1S2
100442548580636641738686294721955425236-4,4,R2S2
158338042824236264719696604356349910479-5,5,R3S2
156852667528872626811117292962470921390-6,6,R1S3
222213390088484216253925626300058690969-7,7,R2S3
102664563492900048462363937849459428087-8,8,R3S3


In [6]:
# Work on one slice only (R1S1) so that differences between slices
# (batch effects) cannot leak into the analysis.
is_r1s1 = adata.obs['slice_id'] == 'R1S1'
adata_slice = adata[is_r1s1].copy()

In [9]:
from MENDER import MENDER
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Look at one slice at a time (prevent batch effect)
slice_ids = ['R1S1', 'R2S1', 'R3S1']

# Numbers of principal components to try when building the neighbour graph
# (and the UMAP embedding derived from it)
n_components_list = [10, 20, 30, 40, 50]


def _preprocess_counts(adata_raw):
    """Return a normalised, log-transformed copy of ``adata_raw``.

    Highly variable genes are flagged first, because the ``seurat_v3`` flavor
    is documented to expect count data (assumes ``adata_raw.X`` still holds
    raw counts -- TODO confirm for this dataset). The copy is then
    library-size normalised and log1p-transformed. ``adata_raw`` itself is
    not modified.
    """
    adata_pre = adata_raw.copy()
    sc.pp.highly_variable_genes(adata_pre, flavor="seurat_v3", n_top_genes=4000)
    sc.pp.normalize_total(adata_pre, inplace=True)
    sc.pp.log1p(adata_pre)
    return adata_pre


for slice_id in slice_ids:
    print("Processing slice: ", slice_id)
    adata_slice = adata[adata.obs['slice_id'] == slice_id].copy()

    # Preprocessing does not depend on the number of components, so it is done
    # once per slice and copied, instead of being recomputed for the baseline
    # and again for every entry of `n_components_list`.
    adata_pre = _preprocess_counts(adata_slice)

    # PCA baseline: scanpy's default PCA size and all computed PCs for the
    # neighbour graph. With the default of 50 PCs this is the same
    # configuration as the 50-component run below, which is why that run
    # reports ARI/NMI of 1.000 against the baseline.
    adata_pca = adata_pre.copy()
    sc.pp.pca(adata_pca)
    sc.pp.neighbors(adata_pca)
    sc.tl.leiden(adata_pca, resolution=2.0, key_added='ct_pca')
    adata_pca.obs['ct_pca'] = adata_pca.obs['ct_pca'].astype('category')

    for n_components in n_components_list:
        print("### UMAP with ", n_components, " components ###")

        adata_umap = adata_pre.copy()
        sc.pp.pca(adata_umap, n_comps=n_components)
        sc.pp.neighbors(adata_umap, n_pcs=n_components)
        # NOTE: Leiden clusters the neighbour graph built above, not the UMAP
        # coordinates, so this call does not influence 'ct_umap'. It is kept
        # so the embedding of the last run stays available in
        # `adata_umap.obsm` for any later plotting.
        sc.tl.umap(adata_umap)
        sc.tl.leiden(adata_umap, resolution=2.0, key_added='ct_umap')
        adata_umap.obs['ct_umap'] = adata_umap.obs['ct_umap'].astype('category')

        # Compare the clustering with the PCA baseline
        ari = adjusted_rand_score(adata_pca.obs['ct_pca'], adata_umap.obs['ct_umap'])
        nmi = normalized_mutual_info_score(adata_pca.obs['ct_pca'], adata_umap.obs['ct_umap'])
        print(f"Cell group ARI (vs PCA): {ari:.3f}")
        print(f"Cell group NMI (vs PCA): {nmi:.3f}")

        # Compare the clustering with the annotated labels stored in obs['ct']
        ari_gt = adjusted_rand_score(adata_umap.obs['ct'], adata_umap.obs['ct_umap'])
        nmi_gt = normalized_mutual_info_score(adata_umap.obs['ct'], adata_umap.obs['ct_umap'])
        print(f"CT vs Ground Truth ARI: {ari_gt:.3f}")
        print(f"CT vs Ground Truth NMI: {nmi_gt:.3f}")

     


Processing slice:  R1S1
### UMAP with  10  components ###
Cell group ARI (vs PCA): 0.675
Cell group NMI (vs PCA): 0.766
CT vs Ground Truth ARI: 0.612
CT vs Ground Truth NMI: 0.737
### UMAP with  20  components ###
Cell group ARI (vs PCA): 0.753
Cell group NMI (vs PCA): 0.847
CT vs Ground Truth ARI: 0.682
CT vs Ground Truth NMI: 0.811
### UMAP with  30  components ###
Cell group ARI (vs PCA): 0.820
Cell group NMI (vs PCA): 0.880
CT vs Ground Truth ARI: 0.737
CT vs Ground Truth NMI: 0.830
### UMAP with  40  components ###
Cell group ARI (vs PCA): 0.888
Cell group NMI (vs PCA): 0.916
CT vs Ground Truth ARI: 0.779
CT vs Ground Truth NMI: 0.848
### UMAP with  50  components ###
Cell group ARI (vs PCA): 1.000
Cell group NMI (vs PCA): 1.000
CT vs Ground Truth ARI: 0.762
CT vs Ground Truth NMI: 0.841
Processing slice:  R2S1
### UMAP with  10  components ###
Cell group ARI (vs PCA): 0.635
Cell group NMI (vs PCA): 0.763
CT vs Ground Truth ARI: 0.586
CT vs Ground Truth NMI: 0.731
### UMAP with  2