In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [3]:
import sys
# Append the parent directory (relative to the kernel's working directory) to
# the module search path so modules located there can be imported.
sys.path.append('..')

In [4]:
# Human tonsil snRNA-seq from the SlideTags dataset.
# Paper: https://www.nature.com/articles/s41586-023-06837-4

# Gene names ending in ".<digits>" are dropped from the analysis.
dotted_numeric_suffix = re.compile(r'\.\d+$')

adata = sc.read_h5ad('/Users/koush/Downloads/slidetags_human_tonsil/human_tonsil.h5ad')
kept_genes = [
    gene_name
    for gene_name in adata.var_names
    if dotted_numeric_suffix.search(gene_name) is None
]
# Subsetting the columns by name yields a view of the loaded object.
adata = adata[:, kept_genes]
adata

View of AnnData object with n_obs × n_vars = 5778 × 19171
    obs: 'cell_type'
    obsm: 'spatial'

In [5]:
# Clear the unstructured annotations (.uns) of the loaded object.
del adata.uns

In [6]:
# Hand-picked genes that must be retained regardless of variability.
# Each name is listed once; np.unique sorts them and returns a NumPy array.
extra_gene_names = [
    'FOXO1', 'BATF', 'BACH2', 'PRDM1', 'BCL6', 'FOXP3', 'SATB1', 'ID2', 'PAX5',
    'CXCR4', 'CD83', 'CD86', 'AICDA', 'BCL2A1', 'BCL2', 'LMO2', 'CXCL13',
    'CD80', 'TRAF3', 'CCL19', 'CCR7', 'CCL21', 'CD40LG', 'CD40', 'IRF4', 'IRF8',
    'ITGA5', 'ITGB1', 'ITGAM', 'ITGB2', 'CCR6', 'CD19',
    'IL2', 'IL4', 'SDF4', 'SDF2', 'SDF2L1',
    'S1PR1', 'BMS1P14', 'S1PR3', 'S1PR4', 'S1PR2', 'EBI3', 'CD28',
    'MICOS10', 'ICOS', 'MICOS13', 'ICOSLG', 'PDCD11', 'PDCD1', 'CD274',
    'PDCD1LG2', 'TICAM2', 'ICAM2', 'TICAM1', 'EPCAM', 'ICAM1', 'ICAM3',
    'VCAM1', 'CXCR5', 'CXCL14', 'CXCL12', 'CR2', 'NFKBIZ',
    'NFKB1', 'NFKBIL1', 'NFKBIE', 'NFKB2', 'NFKBIA', 'NFKBID', 'NFKBIB',
    'IL6R', 'IL6ST', 'EGR1', 'EGR3', 'EGR2',
]
extra_genes = np.unique(extra_gene_names)

len(extra_genes)

76

In [7]:
# [i for i in adata.var_names if 'EBI' in i]

In [8]:
# %matplotlib inline
# plt.rcParams['figure.figsize'] = (5, 5)
# plt.rcParams['font.size'] = 14
# plt.rcParams['figure.dpi'] = 100
# sc.pl.spatial(adata, color=['IL2', 'IL4'], ncols=10,
#     spot_size=50, frameon=False, linewidth=0.2, edgecolor='black')

In [9]:
import os, sys 
# Append ../../src to the module search path.
# NOTE(review): presumably this is where the `spaceoracle` package imported
# further down lives — confirm. `os` is not used in the visible cells.
sys.path.append('../../src')

In [10]:
import celloracle as co
# Load CellOracle's prebuilt human promoter base GRN (the loader reports
# version hg19_gimmemotifsv5_fpr2). `base_GRN` is not referenced again in the
# cells shown here.
base_GRN = co.data.load_human_promoter_base_GRN()

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Loading prebuilt promoter base-GRN. Version: hg19_gimmemotifsv5_fpr2


In [11]:
from spaceoracle.tools.network import expand_paired_interactions
import commot as ct

# Fetch the human CellChat "Secreted Signaling" ligand/receptor table through
# COMMOT, give its four columns explicit names, then pass it through
# expand_paired_interactions.
ligrec_raw = ct.pp.ligand_receptor_database(
    database='CellChat',
    species='human',
    signaling_type="Secreted Signaling",
)
ligrec_raw.columns = ['ligand', 'receptor', 'pathway', 'signaling']
df_ligrec = expand_paired_interactions(ligrec_raw)

# Keep rows whose receptor name contains "CXCR" or whose ligand name contains "CXCL".
coms_df = df_ligrec.query('receptor.str.contains("CXCR") or ligand.str.contains("CXCL")')

# Sorted unique ligand/receptor names from those rows, followed by the
# manually curated extra genes (names present in both appear twice).
ligrec_gene_names = coms_df.ligand.unique().tolist() + coms_df.receptor.unique().tolist()
genes_of_interest = [*np.unique(ligrec_gene_names), *extra_genes]
len(genes_of_interest)

100

In [12]:
# Thresholds consumed by the QC / feature-selection cell that follows.
n_top_genes = 3500  # passed to sc.pp.highly_variable_genes as n_top_genes
min_cells = 50      # passed to sc.pp.filter_genes as min_cells
min_counts = 200    # passed to sc.pp.filter_cells as min_counts

In [13]:
# QC, gene filtering and feature selection on the loaded tonsil object.
adata.var_names_make_unique()

# Flag mitochondrial genes by their "MT-" name prefix and compute QC metrics
# (adds pct_counts_MT and related columns to .obs / .var).
adata.var["MT"] = adata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["MT"], inplace=True)

# Drop cells below the total-count threshold, then cells whose mitochondrial
# fraction is 20% or higher.
sc.pp.filter_cells(adata, min_counts=min_counts)
adata = adata[adata.obs["pct_counts_MT"] < 20].copy()

# Remove the mitochondrial genes themselves. The subset is materialised with
# .copy() because the scanpy calls below modify `adata` in place; on a view
# that only works through anndata's implicit copy-on-modify (with a warning).
adata = adata[:, ~adata.var["MT"]].copy()

sc.pp.filter_genes(adata, min_cells=min_cells)

# NOTE(review): normalisation and log1p are disabled, so X is used exactly as
# stored in the input file. scanpy documents flavor="seurat" as expecting
# logarithmized data — confirm the file is already log-normalised.
# adata.layers["raw_count"] = adata.X.copy()
# sc.pp.normalize_total(adata, inplace=True)
# sc.pp.log1p(adata)
sc.pp.highly_variable_genes(
    adata, flavor="seurat", n_top_genes=n_top_genes)

# Keep the highly variable genes plus any gene of interest that survived the
# filters above. Copy so the layer assignment below writes to a real object
# rather than to a view.
keep_gene = adata.var.highly_variable | adata.var_names.isin(genes_of_interest)
adata = adata[:, keep_gene].copy()

# Store the current X (converted through a DataFrame to a NumPy array) as a layer.
adata.layers["normalized_count"] = adata.to_df().values.copy()
adata

AnnData object with n_obs × n_vars = 5778 × 3544
    obs: 'cell_type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_MT', 'log1p_total_counts_MT', 'pct_counts_MT', 'n_counts'
    var: 'MT', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'
    obsm: 'spatial'
    layers: 'normalized_count'

In [15]:
# Store the cell-type labels with pandas' categorical dtype.
adata.obs['cell_type'] = adata.obs['cell_type'].astype('category')

In [16]:
# Number of cells per annotated cell type, most frequent first.
adata.obs['cell_type'].value_counts()

cell_type
B_germinal_center      1848
B_naive                1067
T_CD4                   779
B_memory                516
T_follicular_helper     294
plasma                  262
FDC                     255
T_CD8                   238
NK                      170
mDC                     137
myeloid                 102
pDC                      64
T_double_neg             46
Name: count, dtype: int64

In [17]:
# Integer encoding of the cell type: each cell gets the position of its label
# in the categorical's category list (pandas uses -1 for missing values).
adata.obs['cell_type_int'] = pd.Categorical(adata.obs['cell_type']).codes

In [18]:
# Persist the preprocessed object to disk.
# NOTE(review): absolute, machine-specific path — will not resolve on other machines.
adata.write_h5ad('/Users/koush/Projects/SpaceOracle/data/slidetags/snrna_human_tonsil.h5ad')

In [19]:
import scanpy as sc 
# Read the saved file back from the same path it was written to and display
# its summary, as a check that the object round-trips through disk.
adata = sc.read_h5ad('/Users/koush/Projects/SpaceOracle/data/slidetags/snrna_human_tonsil.h5ad')
adata

AnnData object with n_obs × n_vars = 5778 × 3544
    obs: 'cell_type', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_MT', 'log1p_total_counts_MT', 'pct_counts_MT', 'n_counts', 'cell_type_int'
    var: 'MT', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg'
    obsm: 'spatial'
    layers: 'normalized_count'