In [1]:
import numpy as np
import scanpy as sc

from scib_metrics.benchmark import Benchmarker, BioConservation, BatchCorrection

%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the lung atlas; `backup_url` gives scanpy a location to fetch the file
# from when it is not already present at the given path.
adata = sc.read(
    "data/lung_atlas.h5ad",
    backup_url="https://figshare.com/ndownloader/files/24539942",
)
# Flag the top 2000 highly variable genes, using the "batch" column as batch key.
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="cell_ranger", batch_key="batch")
# `use_highly_variable=True` is deprecated in scanpy and emits a warning;
# `mask_var="highly_variable"` is the supported way to restrict the PCA to the
# genes flagged above.
sc.tl.pca(adata, n_comps=30, mask_var="highly_variable")
# Keep only the highly variable genes; `.copy()` turns the view into a real object.
adata = adata[:, adata.var.highly_variable].copy()
# Expose the plain PCA embedding under a second key as the no-integration baseline.
adata.obsm["Unintegrated"] = adata.obsm["X_pca"]
# NOTE(review): this writes to the same path that was read above, so the
# downloaded file is replaced by the gene-subset version and re-running this
# cell will start from the already-processed data. The path is kept because a
# later cell reloads the atlas from it.
adata.write("data/lung_atlas.h5ad")

  mask_var_param, mask_var = _handle_mask_var(adata, mask_var, use_highly_variable)


In [6]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd
import yaml
import os

# Prefer the copy of the atlas saved on disk; when the file is missing, the
# `adata` object already living in the kernel is used unchanged.
atlas_path = "data/lung_atlas.h5ad"
if os.path.exists(atlas_path):
    adata = sc.read(atlas_path)

# Extra keyword options for the Harmony run, gathered in one place so they are
# easy to scan and tune.
harmony_options = dict(
    max_iter_harmony=20,
    theta=None,
    lamb=None,
    sigma=0.1,
    nclust=None,
    tau=0,
    block_size=0.05,
    max_iter_kmeans=20,
    epsilon_cluster=1e-5,
    epsilon_harmony=1e-4,
)

# Integrate across 'batch' starting from the PCA embedding; the corrected
# embedding is requested under the 'X_harmony' key.
sce.pp.harmony_integrate(
    adata=adata,
    key=['batch'],
    basis='X_pca',
    adjusted_basis='X_harmony',
    **harmony_options,
)

# Persist the updated object back to the same file.
adata.write(atlas_path)

2025-07-17 12:45:22,315 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...
2025-07-17 12:45:23,544 - harmonypy - INFO - sklearn.KMeans initialization complete.
2025-07-17 12:45:23,590 - harmonypy - INFO - Iteration 1 of 20
2025-07-17 12:45:26,577 - harmonypy - INFO - Iteration 2 of 20
2025-07-17 12:45:29,599 - harmonypy - INFO - Iteration 3 of 20
2025-07-17 12:45:32,561 - harmonypy - INFO - Iteration 4 of 20
2025-07-17 12:45:35,493 - harmonypy - INFO - Iteration 5 of 20
2025-07-17 12:45:38,386 - harmonypy - INFO - Iteration 6 of 20
2025-07-17 12:45:41,421 - harmonypy - INFO - Iteration 7 of 20
2025-07-17 12:45:43,601 - harmonypy - INFO - Converged after 7 iterations


In [7]:
import scvi

# Model training is stochastic; pin the scvi-tools global seed before the model
# is built so that a fresh Restart & Run All gives a reproducible embedding.
scvi.settings.seed = 0

# Register the "counts" layer and the "batch" column with scvi-tools.
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="batch")
# scVI model with a negative-binomial gene likelihood, 2 layers, 30 latent dims.
vae = scvi.model.SCVI(adata, gene_likelihood="nb", n_layers=2, n_latent=30)
vae.train()
# Store the learned latent representation alongside the other embeddings.
adata.obsm["scVI"] = vae.get_latent_representation()

  self.validate_field(adata)
  accelerator, lightning_devices, device = parse_device_args(
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/anaconda3/envs/bioinfo/lib/python3.12/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/opt/anaconda3/envs/bioinfo/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 2/246:   0%|          | 1/246 [00:14<1:00:04, 14.71s/it, v_num=1, train_loss_step=572, train_loss_epoch=651]


Detected KeyboardInterrupt, attempting graceful shutdown ...


In [None]:
# Build a scANVI model on top of the scVI model from the previous cell, using
# the "cell_type" column as labels and "Unknown" as the unlabeled category.
lvae = scvi.model.SCANVI.from_scvi_model(
    vae, adata=adata, labels_key="cell_type", unlabeled_category="Unknown"
)

# Training settings for the scANVI model.
scanvi_train_options = {"max_epochs": 20, "n_samples_per_label": 100}
lvae.train(**scanvi_train_options)

# Store the scANVI latent representation next to the other embeddings.
adata.obsm["scANVI"] = lvae.get_latent_representation()

In [None]:
# Benchmark configuration: which obs columns identify batch and cell type,
# which metric groups to compute, and which embeddings in `adata.obsm` to score.
benchmark_options = dict(
    batch_key="batch",
    label_key="cell_type",
    bio_conservation_metrics=BioConservation(),
    batch_correction_metrics=BatchCorrection(),
    embedding_obsm_keys=["X_pca", "X_harmony"],
    n_jobs=6,
)

bm = Benchmarker(adata, **benchmark_options)
bm.benchmark()