In [1]:
import sys
sys.path.append('../../src')
import anndata
import scanpy as sc
import numpy as np
from spaceoracle import SpaceOracle
import commot as ct
import pandas as pd

In [2]:
# Load the input AnnData object from disk; later cells filter, normalize and
# subset it before it is written back out for training.
adata = sc.read_h5ad('../../data/slideseq/day3_1.h5ad')

In [3]:
def expand_paired_interactions(df):
    """Expand multi-subunit ligand/receptor entries into single-gene pairs.

    Each value in the ``ligand`` and ``receptor`` columns may contain several
    gene names joined by ``'_'``. Every input row is replaced by one row per
    (ligand, receptor) combination, i.e. the Cartesian product of its
    ligand parts and receptor parts. All other columns are carried over
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain string columns ``'ligand'`` and ``'receptor'``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order. Rows are ordered
        by original row, then ligand part, then receptor part. The original
        index label is repeated for every row derived from the same input
        row. An empty input yields an empty frame that still has the input
        columns. The input frame is not modified.
    """
    # Split both columns into lists of gene names without touching `df`.
    split_parts = df.assign(
        ligand=df['ligand'].str.split('_'),
        receptor=df['receptor'].str.split('_'),
    )
    # Exploding one column after the other yields the Cartesian product of
    # ligand parts x receptor parts for each original row (ligand outermost).
    return split_parts.explode('ligand').explode('receptor')

In [4]:
# Fetch the ligand-receptor table from commot, restricted to the CellChat
# mouse entries annotated as secreted signaling.
df_ligrec = ct.pp.ligand_receptor_database(
    species='mouse',
    signaling_type="Secreted Signaling",
    database='CellChat',
)

# Give the four returned columns explicit names, then split multi-subunit
# entries so that every row holds exactly one ligand gene and one receptor gene.
df_ligrec.columns = ['ligand', 'receptor', 'pathway', 'signaling']
df_ligrec = expand_paired_interactions(df_ligrec)


In [5]:
# Select every ligand-receptor pair in which either partner's name contains "Il2".
# NOTE(review): this is a substring match, so genes such as Il20, Il21, Il22,
# Il23a, Il24, Il25 and Il27 are selected as well as Il2 / Il2r* -- confirm
# this breadth is intended.
ilr2_df = df_ligrec[
    df_ligrec['receptor'].str.contains("Il2") | df_ligrec['ligand'].str.contains("Il2")
]

# Sorted, de-duplicated union of all ligands and receptors in the selected pairs.
genes_of_interest = list(np.unique([
    *ilr2_df['ligand'].unique(),
    *ilr2_df['receptor'].unique(),
]))

# Additional hand-picked genes that are always kept alongside the pairs above.
genes_of_interest.extend(['Bach2', 'Prdm1', 'Bcl6', 'Foxp3', 'Satb1', 'Id2'])

In [6]:
# Preprocessing thresholds consumed by the QC / feature-selection cell below.
n_top_genes = 4000  # passed to sc.pp.highly_variable_genes(n_top_genes=...)
min_cells = 10      # passed to sc.pp.filter_genes(min_cells=...)
min_counts = 250    # passed to sc.pp.filter_cells(min_counts=...)

In [7]:
# --- Quality control -------------------------------------------------------
adata.var_names_make_unique()
# Flag genes whose name starts with "mt-" so QC metrics are computed for them.
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
sc.pp.filter_cells(adata, min_counts=min_counts)
# Drop cells with >= 20% of counts in the flagged genes, then drop those genes.
# Each subset is materialized with .copy() so later in-place steps never
# operate on an AnnData view.
adata = adata[adata.obs["pct_counts_mt"] < 20].copy()
adata = adata[:, ~adata.var["mt"]].copy()
sc.pp.filter_genes(adata, min_cells=min_cells)

# --- Normalization ---------------------------------------------------------
# Snapshot the counts BEFORE normalization. The explicit copy matters:
# assigning adata.X directly would store a reference to the same matrix, and
# normalize_total / log1p may modify that matrix in place, which would
# silently turn "raw_count" into normalized values.
adata.layers["raw_count"] = adata.X.copy()
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)

# --- Feature selection -----------------------------------------------------
sc.pp.highly_variable_genes(
    adata, flavor="seurat", n_top_genes=n_top_genes)

# Keep highly variable genes plus every gene of interest that is still present.
keep_genes = adata.var.highly_variable | adata.var_names.isin(genes_of_interest)
adata = adata[:, keep_genes].copy()
# Dense copy of the log-normalized expression matrix.
adata.layers["normalized_count"] = adata.to_df().values

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [8]:
# Report, per gene of interest, whether it is still present in adata.var_names
# after filtering (True rows first).
# np.isin replaces np.in1d, which is deprecated as of NumPy 2.0; for 1-D input
# the result is identical.
pd.Series(
    np.isin(genes_of_interest, adata.var_names),
    index=genes_of_interest,
).sort_values(ascending=False)

Ebi3        True
Il10rb      True
Satb1       True
Foxp3       True
Bcl6        True
Prdm1       True
Bach2       True
Il7         True
Il6st       True
Il4         True
Il2rg       True
Il2rb       True
Il2ra       True
Il27ra      True
Il24        True
Id2         True
Il12b       True
Il12rb1     True
Il15        True
Il21r       True
Il21        True
Il20rb      True
Il17ra      True
Il17rb      True
Il2         True
Il23a      False
Il9        False
Il20       False
Il19       False
Il22ra1    False
Il20ra     False
Il27       False
Il25       False
Il22       False
Il23r      False
dtype: bool

In [9]:
# Display the full list: sorted genes from the selected ligand-receptor pairs,
# followed by the manually appended genes.
genes_of_interest

['Ebi3',
 'Il10rb',
 'Il12b',
 'Il12rb1',
 'Il15',
 'Il17ra',
 'Il17rb',
 'Il19',
 'Il2',
 'Il20',
 'Il20ra',
 'Il20rb',
 'Il21',
 'Il21r',
 'Il22',
 'Il22ra1',
 'Il23a',
 'Il23r',
 'Il24',
 'Il25',
 'Il27',
 'Il27ra',
 'Il2ra',
 'Il2rb',
 'Il2rg',
 'Il4',
 'Il6st',
 'Il7',
 'Il9',
 'Bach2',
 'Prdm1',
 'Bcl6',
 'Foxp3',
 'Satb1',
 'Id2']

In [10]:
%%time

# Run SpaceOracle's PCA helper on the filtered data and feed its result to the
# kNN imputation helper (recorded run: ~1min 45s wall time).
# NOTE(review): knn_imputation returns nothing that is used here; presumably it
# writes the 'imputed_count' layer seen in the AnnData summary further down --
# confirm against the spaceoracle source.
pcs = SpaceOracle.perform_PCA(adata)
SpaceOracle.knn_imputation(adata, pcs)

CPU times: user 9min 32s, sys: 14.7 s, total: 9min 47s
Wall time: 1min 45s


In [11]:
# Strip QC bookkeeping so the saved object only carries what training needs.
# The cell is written to be safely re-runnable: columns / keys that have
# already been removed are skipped instead of raising KeyError.
qc_obs_columns = [
    'n_genes_by_counts',
    'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
    'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
    'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes',
    'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts',
]
adata.obs = adata.obs.drop(columns=qc_obs_columns, errors='ignore')

# Remove every per-gene annotation column; only the gene index is kept.
adata.var = adata.var.drop(columns=adata.var.columns)

# Remove unstructured entries that are not needed downstream.
for stale_key in ('rctd_cluster_colors', 'hvg', 'cluster_colors'):
    if stale_key in adata.uns:
        del adata.uns[stale_key]

In [12]:
import json

# Read the lookup table that translates RCTD cluster ids into cell-type names.
with open('../../data/celltype_assign.json', 'r') as mapping_file:
    cell_type_mapping = json.load(mapping_file)

# JSON object keys are strings, so the cluster ids are cast to str before the
# lookup. Ids missing from the mapping become NaN.
rctd_cluster_ids = adata.obs['rctd_cluster'].astype(str)
adata.obs['rctd_celltypes'] = rctd_cluster_ids.map(cell_type_mapping)

In [13]:
# Show the final AnnData summary (shape, obs columns, layers) before saving.
adata

AnnData object with n_obs × n_vars = 11567 × 4018
    obs: 'cluster', 'rctd_cluster', 'rctd_celltypes'
    uns: 'log1p'
    obsm: 'X_spatial', 'rctd_results', 'spatial'
    layers: 'raw_count', 'normalized_count', 'imputed_count'

In [14]:
# Persist the processed AnnData to the cache location (path is relative to
# the notebook's working directory).
adata.write_h5ad('../.cache/adata_train_new.h5ad')