In [None]:
import scanpy as sc
import numpy as np
import wcd_vae
from wcd_vae.scCRAFT.model import train_integration_model, obtain_embeddings
from wcd_vae.scCRAFT.utils import multi_resolution_cluster
import scvi
import scib 
import harmonypy as hm
import pandas as pd
import scanorama
import time
import bbknn
import scDML
from scDML import scDMLModel
from scDML.utils import print_dataset_information
import imap
from scib.utils import *
import torch
import scib

In [None]:
# Seed the random number generators once, up front, so that re-running the
# notebook from a fresh kernel gives repeatable results. NumPy's global RNG is
# seeded as well as torch's, because library code may draw from either one.
SEED = 42
np.random.seed(SEED)
# Kept as the last expression so the cell still displays torch's Generator.
torch.manual_seed(SEED)

In [None]:
# Load the pancreas dataset and restrict it to two technologies.
adata = sc.read_h5ad("/workspaces/data/human_pancreas_norm_complexBatch.h5ad")
adata = adata[adata.obs["tech"].isin(["celseq", "inDrop1"])].copy()

# Keep a cell type if it reaches at least 100 cells in at least one of the
# remaining technologies; cell types below 100 cells in every technology are
# dropped. `observed=False` spells out the grouping behaviour that was
# previously implied, so empty (celltype, tech) combinations count as 0.
celltype_counts = adata.obs.groupby(["celltype", "tech"], observed=False).size()
valid_celltypes = celltype_counts[celltype_counts >= 100].index.get_level_values(0).unique()
adata = adata[adata.obs["celltype"].isin(valid_celltypes)].copy()

# Snapshot the unprocessed matrix before normalisation: once in `.raw` and once
# as the "counts" layer that later cells refer to.
# NOTE(review): the file name contains "norm" - confirm that X really holds raw
# counts at this point, otherwise the "counts" layer is mislabelled.
adata.raw = adata
adata.layers["counts"] = adata.X.copy()

# Quality filters, library-size normalisation to 1e4, log1p, and selection of
# 2000 highly variable genes computed per technology.
sc.pp.filter_cells(adata, min_genes=300)
sc.pp.filter_genes(adata, min_cells=5)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000, batch_key='tech')
# Materialise the HVG subset (as the other subsets above do) so that later
# cells write into a real AnnData object rather than into a view.
adata = adata[:, adata.var['highly_variable']].copy()

In [None]:
# scCRAFT run with the critic enabled (critic=True, d_coef=1, disc_iter=10).
multi_resolution_cluster(adata, resolution1 = 1, method = 'Leiden')
VAE = train_integration_model(adata, batch_key = 'tech', z_dim=256, d_coef = 1, epochs=1000, critic=True, disc_iter=10)

# Use the first GPU when one is available; otherwise stay on the CPU instead of
# failing on a hard-coded "cuda:0".
device = "cuda:0" if torch.cuda.is_available() else "cpu"
obtain_embeddings(adata, VAE.to(device))

# Neighbour graph and UMAP on the scCRAFT embedding, coloured by technology and
# by cell type.
sc.pp.neighbors(adata, use_rep="X_scCRAFT")
sc.tl.umap(adata, min_dist=0.5)
sc.pl.umap(adata, color=["tech", "celltype"], frameon=False, ncols=1)

In [None]:
# Cell-type silhouette score of the scCRAFT embedding (critic run), with scib's
# scaling switched on.
scib.me.silhouette(
    adata,
    label_key="celltype",
    embed="X_scCRAFT",
    scale=True,
)

In [None]:
# Batch silhouette score of the scCRAFT embedding (critic run): technology is
# the batch variable and cell type is the label, with scib's scaling on.
scib.me.silhouette_batch(
    adata,
    batch_key="tech",
    label_key="celltype",
    embed="X_scCRAFT",
    scale=True,
)

In [None]:
# scCRAFT run with the critic disabled (critic=False, d_coef=0.2, disc_iter=1).
# The embedding is written to the same "X_scCRAFT" slot that the following
# cells read, so it replaces the one from the critic run.
VAE = train_integration_model(adata, batch_key = 'tech', z_dim=256, d_coef = 0.2, epochs=1000, critic=False, disc_iter=1)

# Use the first GPU when one is available; otherwise stay on the CPU instead of
# failing on a hard-coded "cuda:0".
device = "cuda:0" if torch.cuda.is_available() else "cpu"
obtain_embeddings(adata, VAE.to(device))

# Neighbour graph and UMAP on the scCRAFT embedding, coloured by technology and
# by cell type.
sc.pp.neighbors(adata, use_rep="X_scCRAFT")
sc.tl.umap(adata, min_dist=0.5)
sc.pl.umap(adata, color=["tech", "celltype"], frameon=False, ncols=1)

In [None]:
# Cell-type silhouette score of the scCRAFT embedding (run without critic),
# with scib's scaling switched on.
scib.me.silhouette(
    adata,
    label_key="celltype",
    embed="X_scCRAFT",
    scale=True,
)

In [None]:
# Batch silhouette score of the scCRAFT embedding (run without critic):
# technology is the batch variable and cell type is the label, scaling on.
scib.me.silhouette_batch(
    adata,
    batch_key="tech",
    label_key="celltype",
    embed="X_scCRAFT",
    scale=True,
)

In [None]:
# scVI
adata = adata.copy()

# scVI is configured with a negative-binomial likelihood, which models counts.
# At this point adata.X has already been library-size normalised and
# log1p-transformed, so it must not be copied into the "counts" layer. The
# layer saved during preprocessing (before normalisation) is used as is.
if "counts" not in adata.layers:
    raise KeyError(
        'adata.layers["counts"] is missing; run the preprocessing cell that '
        "stores the un-normalised matrix before training scVI"
    )
scvi.model.SCVI.setup_anndata(adata, layer="counts", batch_key="tech")
vae = scvi.model.SCVI(adata, n_layers=2, n_latent=50, gene_likelihood="nb")
vae.train()

# Store the latent representation and visualise it, coloured by technology and
# by cell type.
adata.obsm["X_scVI"] = vae.get_latent_representation()
sc.pp.neighbors(adata, use_rep="X_scVI")
sc.tl.umap(adata, min_dist=0.5)
sc.pl.umap(adata, color=["tech", "celltype"], frameon=False, ncols=1)

In [None]:
# Harmony: adjust the first 50 principal components for the `tech` covariate.
sc.tl.pca(adata, n_comps=50)
data_mat, meta_data = adata.obsm['X_pca'], adata.obs

# Covariate(s) Harmony should correct for.
vars_use = ['tech']

# Time only the Harmony call itself.
start_time = time.time()
ho = hm.run_harmony(data_mat, meta_data, vars_use)
end_time = time.time()
training_time = end_time - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Wrap the corrected matrix in a DataFrame with columns labelled X1, X2, ...
res = pd.DataFrame(ho.Z_corr)
res.columns = [f"X{idx}" for idx in range(1, res.shape[1] + 1)]

# The matrix is transposed before it is stored in `.obsm`.
# NOTE(review): this presumes Z_corr is laid out as components x cells -
# confirm for the installed harmonypy version.
adata.obsm['X_harmony'] = res.to_numpy().T

# Neighbour graph and UMAP on the Harmony-corrected components.
sc.pp.neighbors(adata, use_rep="X_harmony")
sc.tl.umap(adata, min_dist=0.5)
sc.pl.umap(adata, color=["tech", "celltype"], frameon=False, ncols=1)

In [None]:
# Scanorama
# The batch and label columns of this dataset are `tech` and `celltype`, as in
# every earlier cell of the notebook.

# Remember the cell order so the corrected data can be aligned with `adata`.
original_order = adata.obs_names.copy()

# Start timer
start_time = time.time()

# Split by technology, let Scanorama correct the pieces, then stitch them back
# together. AnnData is reached through scanpy (`sc.AnnData`) so the cell does
# not rely on a name that is only visible through a star import.
split, categories = split_batches(adata.copy(), 'tech', return_categories=True)
corrected = scanorama.correct_scanpy(split, return_dimred=True)
corrected = sc.AnnData.concatenate(
    # 'batch' here only names the label column of the temporary merged object.
    *corrected, batch_key='batch', batch_categories=categories, index_unique=None
)

# Reorder corrected data to match original order
corrected = corrected[original_order]

# End timer
end_time = time.time()
training_time = end_time - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Copy the Scanorama embedding of the re-ordered result onto `adata`.
adatas = corrected.copy()
adata.obsm['X_scanorama'] = adatas.obsm['X_scanorama']

# Neighbour graph (first 30 dimensions of the embedding), UMAP and plot.
sc.pp.neighbors(adata, n_pcs=30, use_rep="X_scanorama")
sc.tl.umap(adata)
sc.pl.umap(adata, color=["tech", "celltype"], frameon=False, ncols=1)

In [None]:
# BBKNN: build a batch-balanced neighbour graph across technologies. The batch
# and label columns of this dataset are `tech` and `celltype`.
start_time = time.time()
bbknn.bbknn(adata, batch_key='tech')
end_time = time.time()
training_time = end_time - start_time
print(f"Training completed in {training_time:.2f} seconds")

# UMAP on the BBKNN graph, coloured by technology and by cell type.
sc.tl.umap(adata, min_dist=0.5)
sc.pl.umap(adata, color=["tech", "celltype"], frameon=False, ncols=1)

# Keep the UMAP coordinates as this method's embedding. A copy is stored so the
# entry does not share memory with `X_umap`, which later cells recompute.
adata.obsm['X_bbknn'] = adata.obsm['X_umap'].copy()

In [None]:
#iMAP
# The batch and label columns of this dataset are `tech` and `celltype`.

# Densify X when it is not already a NumPy array. isinstance() also accepts
# ndarray subclasses, which have no .toarray() method.
if not isinstance(adata.X, np.ndarray):
    adata.X = adata.X.toarray()

start_time = time.time()
### Stage I
EC, ec_data = imap.stage1.iMAP_fast(adata, key='tech', n_epochs=50)
### Stage II
output_results = imap.stage2.integrate_data(adata, ec_data, key='tech', n_epochs=40)
end_time = time.time()
print('total time talken', end_time-start_time)

# Put the integrated matrix into a copy of `adata`, then embed and plot it.
adata_int = adata.copy()
adata_int.X = output_results

sc.tl.pca(adata_int, n_comps=50)
sc.pp.neighbors(adata_int, use_rep="X_pca")
sc.tl.umap(adata_int, min_dist=0.5)
sc.pl.umap(adata_int, color=["tech", "celltype"], frameon=False, ncols=1)

# Store the PCA of the integrated matrix as the iMAP embedding on `adata`.
adata.obsm['imap'] = adata_int.obsm['X_pca']

In [None]:
#scDML
# The batch and label columns of this dataset are `tech` and `celltype`.

start_time = time.time()
# The number of distinct annotated cell types is used as the target number of
# clusters.
ncluster = adata.obs['celltype'].nunique()
scdml=scDMLModel()
# Work on a copy so scDML's preprocessing does not modify `adata`.
adata_int = adata.copy()
adata_int=scdml.preprocess(adata_int, cluster_method="louvain",resolution=3.0,batch_key = 'tech')
scdml.integrate(adata_int,batch_key='tech',ncluster_list=[ncluster],
               expect_num_cluster=ncluster,merge_rule="rule2", out_dim=50)
end_time = time.time()
print('time taken to run :', end_time - start_time)

# Copy the scDML embedding back onto `adata`, then build the graph and plot.
adata.obsm['scDML'] = adata_int.obsm['X_emb']
sc.pp.neighbors(adata, use_rep='scDML')
sc.tl.umap(adata)
sc.pl.umap(adata, color=["tech", "celltype"], frameon=False, ncols=1)

In [None]:
# Seurat R pipeline
```R
# Packages used by the Seurat workflow and by the h5ad -> h5seurat conversion.
library(Seurat)
library(anndata)
library(reticulate)
library(SeuratWrappers)
library(SeuratDisk)

# Convert the h5ad file to h5seurat, load it as a Seurat object and cache it
# as an RDS file.
# NOTE(review): the paths are placeholders ("/path/...") and name a Lung atlas
# file - confirm they point at the dataset used in the Python cells.
Convert('/path/Lung_atlas_raw.h5ad', "h5seurat", assay = "RNA",
        overwrite = T, verbose = T)
seurat_obj <- LoadH5Seurat("/path/Lung_atlas_raw.h5seurat", assay = "RNA", meta.data = T)
saveRDS(seurat_obj, file = "/path/Lung_atlas_raw.rds")  

# Reload the cached object and remember the original cell (column) order so
# the exported embedding can be written back in that order.
seurat_obj = readRDS("/path/Lung_atlas_raw.rds")
original_cell_order <- colnames(seurat_obj@assays$RNA@counts)
# Split the RNA assay by the `batch` metadata column, then run SCTransform and
# a 50-component PCA before integrating with RPCAIntegration.
seurat_obj[["RNA"]] <- split(seurat_obj[["RNA"]], f = seurat_obj$batch)
seurat_obj <- SCTransform(seurat_obj)
seurat_obj <- RunPCA(seurat_obj, npcs = 50, verbose = F)
seurat_obj <- IntegrateLayers(
  object = seurat_obj, method = RPCAIntegration,
  new.reduction = "integrated.rpca", normalization.method = "SCT",
  verbose = FALSE
)

# Take the first 50 columns of the "integrated.rpca" reduction, reorder the
# rows to the original cell order, and write them to CSV with the cell names
# as row names.
integrated_rpca_embeddings <- Embeddings(object = seurat_obj, reduction = "integrated.rpca")
pca_embeddings <- integrated_rpca_embeddings[, 1:50]
pca_embeddings_ordered <- pca_embeddings[match(original_cell_order, rownames(pca_embeddings)), ]
write.csv(pca_embeddings_ordered, file = "/path/Lung_atlas_seurat.csv", row.names = TRUE)
```

In [None]:
# Load the embedding exported by the Seurat R pipeline (cell names are the CSV
# index).
# NOTE(review): the path is a placeholder and names a Lung atlas file - confirm
# it matches the dataset held in `adata`.
pca_embeddings = pd.read_csv('/path/Lung_atlas_seurat.csv', index_col=0)

if adata.obs_names.isin(pca_embeddings.index).all():
    # Every cell of `adata` is present in the CSV: align rows by cell name so
    # the embedding cannot end up attached to the wrong cells.
    pca_embeddings = pca_embeddings.loc[adata.obs_names]
elif len(pca_embeddings) != adata.n_obs:
    # Names do not match and the row count differs, so a positional assignment
    # would be meaningless.
    raise ValueError(
        f"Seurat embedding has {len(pca_embeddings)} rows but adata has "
        f"{adata.n_obs} cells, and the cell names do not match"
    )
# Otherwise the row count matches but the names differ; fall back to the
# positional assignment, which assumes the CSV rows are in `adata`'s order.
adata.obsm['X_seurat'] = pca_embeddings.values

# Neighbour graph and UMAP on the Seurat embedding, coloured by the dataset's
# batch (`tech`) and label (`celltype`) columns.
sc.pp.neighbors(adata, use_rep="X_seurat")
sc.tl.umap(adata, min_dist=0.5)
sc.pl.umap(adata, color=["tech", "celltype"], frameon=False, ncols=1)