In [1]:
import scanpy as sc
import anndata as ad
import scanpy.external as sce
import pandas as pd
import matplotlib.pyplot as plt
import scvi
import torch
import scipy.sparse as sp

from rich import print
import warnings
warnings.filterwarnings("ignore")
import os

# Output locations for this batch: `outdir` receives the written .h5ad files,
# `sc.settings.figdir` is where scanpy writes figures requested via `save=`.
outdir = "/Users/jessica/Documents/GitHub/Jessica_notebooks/write/02_batch_annotation/PICA0001-PICA0007"
os.makedirs(outdir, exist_ok=True)
sc.settings.figdir = "/Users/jessica/Documents/GitHub/Jessica_notebooks/figures/PICA0001-PICA0007/02_annotation"

# scVI training is stochastic; pin the seed so that the latent space (and the
# neighbors graph, UMAP and Leiden clusters derived from it) is reproducible
# across kernel restarts.
scvi.settings.seed = 0



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the preprocessed AnnData for this batch.
preprocessed_h5ad = "/Users/jessica/Documents/GitHub/Jessica_notebooks/write/02_batch_annotation/PICA0001-PICA0007/PICA0001-PICA0007_preprocessed.h5ad"
raw_adata_cd4 = sc.read_h5ad(preprocessed_h5ad)

# Make gene names unique before they are used as lookup keys.
raw_adata_cd4.var_names_make_unique()
print(raw_adata_cd4)

In [3]:
# Number of cells carrying each existing `cell_type` label.
cell_type_counts = raw_adata_cd4.obs['cell_type'].value_counts()
cell_type_counts

cell_type
Naive/Central memory CD4 T cell    9527
Regulatory T cell                   267
CCR6+ memory CD4 T cell             222
Cytotoxic CD4 T cell                 43
Name: count, dtype: int64

#### 1. Batch integration with scVI

In [4]:
# Register the data with scVI: model the "counts" layer and treat `pica_id`
# as the batch covariate.
scvi.model.SCVI.setup_anndata(
    raw_adata_cd4,
    layer="counts",
    batch_key="pica_id",
)

In [5]:
# scVI architecture / likelihood settings used for this integration.
scvi_hyperparams = dict(n_layers=2, n_latent=30, gene_likelihood="nb")
model = scvi.model.SCVI(raw_adata_cd4, **scvi_hyperparams)

In [None]:
# Fit the scVI model with its default training settings.
# NOTE(review): the captured progress bar for this cell shows ~31 s per epoch
# over 400 epochs, and the following cell's captured output is
# "Trying to query inferred values from an untrained model" -- this cell has to
# run to completion before the latent representation can be queried.
model.train()

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Epoch 26/400:   6%|▋         | 25/400 [09:23<3:15:36, 31.30s/it, v_num=1, train_loss_step=8.32e+3, train_loss_epoch=8.15e+3]

In [6]:
# Store the per-cell scVI latent representation alongside the data.
latent = model.get_latent_representation()
raw_adata_cd4.obsm["X_scVI"] = latent

# Build the neighbor graph on the scVI latent space, then embed with UMAP.
sc.pp.neighbors(raw_adata_cd4, use_rep="X_scVI")
sc.tl.umap(raw_adata_cd4)

# Color by batch (`pica_id`) and by the existing annotation to inspect the integration.
integration_colors = ["pica_id", 'cell_type']
sc.pl.umap(raw_adata_cd4, color=integration_colors, save="_scvi_integrated_umap.png")

RuntimeError: Trying to query inferred values from an untrained model. Please train the model first.

In [None]:
# Overlay per-cell QC metrics on the UMAP.
qc_metrics = ["log1p_total_counts", "pct_counts_mt", "log1p_n_genes_by_counts"]
sc.pl.umap(raw_adata_cd4, color=qc_metrics, wspace=0.5, ncols=2, save="_scvi_qc.png")

In [None]:
# Save the scVI integrated batch adata.
# The path is built from `outdir` (defined in the first cell) rather than
# repeating the absolute path, which also removes the stray "//" it contained.
scvi_h5ad = os.path.join(outdir, "PICA0001-PICA0007_scVI.h5ad")
raw_adata_cd4.write_h5ad(scvi_h5ad, compression='gzip')

#### 2. Clustering
- Run Leiden clustering at several resolutions
- Plot known marker genes to see whether they align with the clusters (and their sizes)

In [None]:
# Leiden clustering at several resolutions.
# A higher resolution yields more clusters, a lower one yields fewer.
leiden_resolutions = (0.5, 1.0, 1.5, 2.0)
for res in leiden_resolutions:
    cluster_key = f"leiden_res_{res:3.1f}"
    sc.tl.leiden(raw_adata_cd4, resolution=res, key_added=cluster_key, flavor="igraph")

In [None]:
# Compare the cluster assignments of every tested resolution side by side.
leiden_keys = ["leiden_res_0.5", "leiden_res_1.0", "leiden_res_1.5", "leiden_res_2.0"]
sc.pl.umap(raw_adata_cd4, color=leiden_keys, legend_loc="on data", legend_fontsize=15)

In [None]:
# Persist the clustered object so later cells can restart from this point.
clustered_h5ad = "/Users/jessica/Documents/GitHub/Jessica_notebooks/write/02_batch_annotation/PICA0001-PICA0007/PICA0001-PICA0007_scVI_cluster.h5ad"
raw_adata_cd4.write_h5ad(clustered_h5ad, compression='gzip')
print(raw_adata_cd4)

In [None]:
# Reload the clustered object from disk (checkpoint written by the previous cell).
clustered_h5ad = "/Users/jessica/Documents/GitHub/Jessica_notebooks/write/02_batch_annotation/PICA0001-PICA0007/PICA0001-PICA0007_scVI_cluster.h5ad"
raw_adata_cd4 = sc.read_h5ad(clustered_h5ad)
print(raw_adata_cd4)

#### 3. CD4 subset validation

In [None]:
# Broad lineage marker panel: cell-type label -> list of marker names.
# A gene may appear under several cell types, but each list holds a gene only
# once (the B cell list previously contained "CD79A" twice).
# NOTE(review): "CD123", "CD41" and "CD61" are CD-antigen names rather than
# gene symbols (IL3RA, ITGA2B, ITGB3); presumably they will not match var_names
# -- confirm against the dataset.
broad_marker_genes = {
    "B cell": ["BLNK", "CD19", "CD79A", "CD79B", "MS4A1", "CD37"],
    "Plasmablast": ["CD19", "CD24", "CD38"],  # experiment
    "Dendritic cell": ["CLEC4C", "IL3RA", "NRP1", "CD1C", "CST3", "FCER1A", "CD123", "GZMB"],
    "Endothelial cell": ["CDH5", "SELE", "VWF"],
    "Fibroblast": ["COL1A1", "COL3A1", "FAP", "THY1"],
    "Neutrophils": ["S100A8", "S100A9", "CEACAM8", "ELANE", "CSF3R", "MPO"],  # CEACAM8 is highly specific neutrophil marker
    "Eosinophils": ["IL5RA", "CCR3", "EPX", "PRG2", "GATA1", "CLC"],
    "Basophils": ["FCER1A", "KIT", "CPA3", "HPGDS", "ENPP3"],
    "Classical monocyte": ["CD14", "FCGR3A", "LYZ", "FCGR1A", "CCR2", "S100A8", "S100A9"],  # CD14⁺ CD16/FCGR3A⁻
    "Intermediate monocyte": ["CD14", "FCGR3A", "HLA-DRA", "CCL2", "ITGAX"],  # CD14⁺ CD16⁺
    "Non-classical monocyte": ["CD14", "FCGR3A", "TREML4", "NR4A1", "CX3CR1", "HLA-DPB1"],  # CD14⁻CD16⁺
    "Natural killer cell": ["CCL3", "CD160", "CD247", "GNLY", "GZMB", "NKG7", "FCGR3A", "FCGR3B", "KLRB1", "KLRC1", "KLRD1", "KLRF1", "KLRK1", "NCAM1"],
    "Platelet": ["CD41", "ITGA2B", "CD34", "CD61", "PF4", "PLA2G12A", "PPBP"],  # No megakaryocyte in blood
    "HSPC": ["CD34", "CD38", "THY1", "KIT", "PROM1", "SLAMF1", "ITGA6"],
    "T cell": ["CD3D", "CD3E", "CD3G", "TRAC", "TRBC1", "TRDC"],
    "CD4+ memory T cell": ["CCR7", "CD27", "IL7R"],
    "CD4+ T cell": ["CD4", "CTLA4", "FOXP3", "IL2RA"],
    "CD8+ T cell": ["CD4", "CD8A", "CD8B", "GZMK"],
    "Naive T cell": ["CCR7", "CD27", "CD8A", "CD8B"],
    "Natural killer T (NKT) cell": ["CD8A", "CD8B", "ZNF683"],
}

In [None]:
# Materialise `.raw` as its own AnnData; presumably this carries the full
# (unfiltered) gene set -- TODO confirm.
adata_full = raw_adata_cd4.raw.to_adata()
available_genes = adata_full.var_names

# Keep, per cell type, only the markers that exist in `adata_full`, preserving
# the original order. Cell types with no matching marker keep an empty list.
present_marker_genes = {
    celltype: [marker for marker in markers if marker in available_genes]
    for celltype, markers in broad_marker_genes.items()
}
print(present_marker_genes)

In [None]:
# Marker panel per Leiden cluster (resolution 1.0), each gene scaled across clusters.
sc.pl.dotplot(
    raw_adata_cd4,
    present_marker_genes,
    groupby="leiden_res_1.0",
    standard_scale="var",
    save="_all_markers.png",
)

In [None]:
# Focused look at B-lineage genes across the resolution-1.0 clusters.
b_lineage_genes = ["CD19", "MS4A1", "CD79A", "CD79B", "IGHM", "JCHAIN", "TNFRSF13B"]
sc.pl.dotplot(
    raw_adata_cd4,
    b_lineage_genes,
    groupby="leiden_res_1.0",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# Cluster 11 looks like B cell: rank genes for that cluster only (Wilcoxon test,
# scanpy's default reference group).
sc.tl.rank_genes_groups(raw_adata_cd4, groupby="leiden_res_1.0", groups=['11'], method="wilcoxon")

# The plotting function selects clusters through `groups`; it has no `groupby`
# parameter (the grouping is already stored in the test result), so the stray
# `groupby=` keyword that was silently ignored here is replaced by an explicit
# `groups` selection.
sc.pl.rank_genes_groups(raw_adata_cd4, groups=['11'], n_genes=10)

In [None]:
# Names of the 30 top-ranked genes for cluster 11.
cluster_11_ranked = sc.get.rank_genes_groups_df(raw_adata_cd4, group='11')
print(cluster_11_ranked.head(30)["names"])

In [None]:
# Core marker expression overlays restricted to the cells of cluster 11.
in_cluster_11 = raw_adata_cd4.obs['leiden_res_1.0'] == '11'
cluster_11_cells = raw_adata_cd4[in_cluster_11]

overlay_cd4_t_markers = ["CD4", "CD3D", "IL7R"]                       # CD4 T markers
overlay_b_markers = ['CD19', "MS4A1", "CD79A", "CD79B", "CD74"]       # B cell markers
overlay_cytotoxic_markers = ["NKG7", "GNLY", "GZMB"]                  # NK / cytotoxic markers

sc.pl.umap(
    cluster_11_cells,
    color=["leiden_res_1.0", *overlay_cd4_t_markers, *overlay_b_markers, *overlay_cytotoxic_markers],
    vmax="p99",
    cmap="Reds",
    size=30,
    save="_cluster11_marker_overlays.png",
)

In [None]:
# QC metric distributions per resolution-1.0 cluster.
qc_columns = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(
    raw_adata_cd4,
    qc_columns,
    groupby='leiden_res_1.0',
    jitter=0.4,
    multi_panel=True,
)

In [None]:
# Keep a copy of the existing annotation under a dedicated column name.
raw_adata_cd4.obs['cell_type_cd4_res_1'] = raw_adata_cd4.obs['cell_type'].copy()

# Sub-cluster only cluster 11, on a copy so the main object is left untouched.
adata_cd4_k11 = raw_adata_cd4.copy()
sc.tl.leiden(
    adata_cd4_k11,
    resolution=1.0,
    restrict_to=('leiden_res_1.0', ['11']),
    key_added='leiden_res_1.0_k11',
)

subcluster_colors = ['leiden_res_1.0', 'leiden_res_1.0_k11', 'cell_type_cd4_res_1']
sc.pl.umap(adata_cd4_k11, color=subcluster_colors, legend_loc="on data")


In [None]:
# Marker panel across the clusters, with cluster 11 split into its sub-clusters.
sc.pl.dotplot(
    adata_cd4_k11,
    present_marker_genes,
    groupby="leiden_res_1.0_k11",
    standard_scale="var",
)

Cluster 11 exhibited co-expression of canonical B-cell markers (MS4A1, CD79A) and pan-T-cell markers (CD3D, CD3E), suggesting the presence of T/B doublets. This hybrid signature, together with its distinct position on the UMAP, indicates technical doublets rather than a true biological subset. Consequently, this cluster was excluded from downstream analysis.

In [None]:
# Compare cluster labels as plain strings, then keep every cell that is not in
# cluster 11. `.copy()` turns the filtered view into an independent object.
raw_adata_cd4.obs['leiden_res_1.0'] = raw_adata_cd4.obs['leiden_res_1.0'].astype(str)
not_cluster_11 = raw_adata_cd4.obs['leiden_res_1.0'] != '11'
raw_adata_cd4 = raw_adata_cd4[not_cluster_11, :].copy()


In [None]:
# Cells per annotation label after removing cluster 11.
remaining_counts = raw_adata_cd4.obs['cell_type_cd4_res_1'].value_counts()
remaining_counts

In [None]:
# Batch composition on the existing embedding, drawn with small points.
sc.pl.umap(raw_adata_cd4, color="pica_id", size=2)

In [None]:
# save data without cluster 11
raw_adata_cd4.write_h5ad("/Users/jessica/Documents/GitHub/Jessica_notebooks/write/02_batch_annotation/PICA0001-PICA0007/PICA0001-PICA0007_preprocessed.h5ad", compression='gzip')


In [None]:
adata_cd4= sc.read_h5ad("/Users/jessica/Documents/GitHub/Jessica_notebooks/write/02_batch_annotation/PICA0001-PICA0007/PICA0001-PICA0007_preprocessed.h5ad")
adata_cd4.var_names_make_unique()
print(adata_cd4)

In [None]:
# Second scVI pass on `adata_cd4`: same registration ("counts" layer,
# `pica_id` as batch) and the same model settings as the first pass.
scvi.model.SCVI.setup_anndata(
    adata_cd4,
    layer="counts",
    batch_key="pica_id",
)
scvi_hyperparams = dict(n_layers=2, n_latent=30, gene_likelihood="nb")
model = scvi.model.SCVI(adata_cd4, **scvi_hyperparams)

In [None]:
# Train the second-pass scVI model (built on `adata_cd4`) with default settings.
# This has to run to completion before the next cell queries the latent
# representation.
model.train()

In [None]:
# Store the latent representation of the retrained model.
latent = model.get_latent_representation()
adata_cd4.obsm["X_scVI"] = latent

# Recompute the neighbor graph and UMAP from the new latent space.
sc.pp.neighbors(adata_cd4, use_rep="X_scVI")
sc.tl.umap(adata_cd4)

# Color by batch (`pica_id`) and by the existing annotation.
integration_colors = ["pica_id", 'cell_type']
sc.pl.umap(adata_cd4, color=integration_colors, save="_scvi_integrated_umap.png")

In [None]:
# Overlay per-cell QC metrics on the recomputed UMAP.
qc_metrics = ["log1p_total_counts", "pct_counts_mt", "log1p_n_genes_by_counts"]
sc.pl.umap(adata_cd4, color=qc_metrics, wspace=0.5, ncols=2, save="_scvi_qc.png")

In [None]:
# Save the scVI integrated batch adata (second pass).
# The path is built from `outdir` (defined in the first cell) rather than
# repeating the absolute path, which also removes the stray "//" it contained.
scvi_h5ad = os.path.join(outdir, "PICA0001-PICA0007_scVI.h5ad")
adata_cd4.write_h5ad(scvi_h5ad, compression='gzip')

In [None]:
# Leiden clustering of the re-integrated data at the same set of resolutions.
leiden_resolutions = (0.5, 1.0, 1.5, 2.0)
for res in leiden_resolutions:
    cluster_key = f"leiden_res_{res:3.1f}"
    sc.tl.leiden(adata_cd4, resolution=res, key_added=cluster_key, flavor="igraph")

In [None]:
# Compare the cluster assignments of every tested resolution side by side.
leiden_keys = ["leiden_res_0.5", "leiden_res_1.0", "leiden_res_1.5", "leiden_res_2.0"]
sc.pl.umap(adata_cd4, color=leiden_keys, legend_loc="on data", legend_fontsize=15)

In [None]:
# Persist the re-clustered object.
clustered_h5ad = "/Users/jessica/Documents/GitHub/Jessica_notebooks/write/02_batch_annotation/PICA0001-PICA0007/PICA0001-PICA0007_scVI_cluster.h5ad"
adata_cd4.write_h5ad(clustered_h5ad, compression='gzip')
print(adata_cd4)

In [None]:
# Marker panel per resolution-1.0 cluster of the re-integrated data.
# Relies on `present_marker_genes` computed in an earlier cell.
sc.pl.dotplot(
    adata_cd4,
    present_marker_genes,
    groupby="leiden_res_1.0",
    standard_scale="var",
    save="_all_markers.png",
)

In [None]:
# Final output of this notebook.
# The path is built from `outdir` (defined in the first cell) rather than
# repeating the absolute path, which also removes the stray "//" it contained.
annotations_h5ad = os.path.join(outdir, "PICA0001-PICA0007_cd4_annotations.h5ad")
adata_cd4.write_h5ad(annotations_h5ad, compression='gzip')
print(adata_cd4)