In [21]:
import json
import sys

sys.path.append('../../src')

import anndata
import commot as ct
import numpy as np
import pandas as pd
import scanpy as sc

from spaceoracle import SpaceOracle

In [3]:
# Load the Slide-seq AnnData object for sample `day3_1` from the shared data directory.
adata = sc.read_h5ad('../../data/slideseq/day3_1.h5ad')

In [22]:
def expand_paired_interactions(df):
    """Split multi-subunit ligand/receptor entries into single-gene pairs.

    The ``ligand`` and ``receptor`` values are treated as ``'_'``-separated
    lists of names. Every input row is replaced by one row per
    (ligand part, receptor part) combination; all other columns are copied
    unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with string columns ``'ligand'`` and ``'receptor'``.

    Returns
    -------
    pandas.DataFrame
        Expanded table with the same columns, in the same order, as ``df``.
        Rows derived from the same input row keep that row's index label,
        so the index may contain duplicates.
    """
    expanded_rows = []
    for _, row in df.iterrows():
        ligands = row['ligand'].split('_')
        receptors = row['receptor'].split('_')

        for ligand in ligands:
            for receptor in receptors:
                new_row = row.copy()
                new_row['ligand'] = ligand
                new_row['receptor'] = receptor
                expanded_rows.append(new_row)

    # Pass the columns explicitly: with no rows, pd.DataFrame([]) would
    # otherwise return a frame without any columns and break downstream
    # lookups of 'ligand' / 'receptor'.
    return pd.DataFrame(expanded_rows, columns=df.columns)

In [23]:
# Build the ligand-receptor table in one chain: fetch the CellChat mouse
# "Secreted Signaling" entries through COMMOT, label the four columns, then
# expand '_'-joined ligand/receptor names into single-gene pairs.
df_ligrec = (
    ct.pp.ligand_receptor_database(
        species='mouse',
        database='CellChat',
        signaling_type="Secreted Signaling",
    )
    .set_axis(['ligand', 'receptor', 'pathway', 'signaling'], axis=1)
    .pipe(expand_paired_interactions)
)


In [18]:
# Keep every interaction whose ligand or receptor name contains the substring
# "Il2", then collect the sorted, de-duplicated gene names from both columns.
# NOTE(review): a substring match on "Il2" also matches names such as Il20/Il21 —
# confirm that this breadth is intended.
il2_in_receptor = df_ligrec.receptor.str.contains("Il2")
il2_in_ligand = df_ligrec.ligand.str.contains("Il2")
ilr2_df = df_ligrec[il2_in_receptor | il2_in_ligand]
genes_of_interest = np.union1d(ilr2_df.ligand.unique(), ilr2_df.receptor.unique())
len(genes_of_interest)

29

In [24]:
# Preprocessing thresholds consumed by the QC / feature-selection cell below.
n_top_genes = 5000  # passed to sc.pp.highly_variable_genes(n_top_genes=...)
min_cells = 10  # passed to sc.pp.filter_genes(min_cells=...)
min_counts = 100  # passed to sc.pp.filter_cells(min_counts=...)

In [25]:
# --- Quality control ---------------------------------------------------------
adata.var_names_make_unique()
# Flag mitochondrial genes by their "mt-" name prefix so QC metrics cover them.
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
sc.pp.filter_cells(adata, min_counts=min_counts)
# Keep cells with < 20% mitochondrial counts, then drop the mitochondrial genes.
# Each subset is materialised with .copy() so the in-place steps below never
# operate on an AnnData view.
adata = adata[adata.obs["pct_counts_mt"] < 20].copy()
adata = adata[:, ~adata.var["mt"]].copy()
sc.pp.filter_genes(adata, min_cells=min_cells)

# --- Normalisation -----------------------------------------------------------
# Snapshot the counts BEFORE normalising. A copy is required: storing adata.X
# itself would alias the matrix, and the in-place normalize_total / log1p calls
# below could then silently overwrite the "raw" layer.
adata.layers["raw_count"] = adata.X.copy()
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(
    adata, flavor="seurat", n_top_genes=n_top_genes)

# --- Feature selection -------------------------------------------------------
# Keep the highly variable genes plus the ligand/receptor genes of interest,
# even when the latter were not selected as highly variable.
adata = adata[:, (adata.var.highly_variable | adata.var_names.isin(genes_of_interest))].copy()
adata.layers["normalized_count"] = adata.to_df().values

# --- Imputation --------------------------------------------------------------
pcs = SpaceOracle.perform_PCA(adata)
SpaceOracle.knn_imputation(adata, pcs)

In [67]:
# Strip QC bookkeeping so only the annotations needed downstream are saved.
# errors='ignore' and uns.pop(..., None) keep this cell re-runnable: a second
# execution finds nothing left to remove instead of raising KeyError.
adata.obs = adata.obs.drop(
    columns=[
        'n_genes_by_counts', 'log1p_n_genes_by_counts',
        'total_counts', 'log1p_total_counts',
        'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
        'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes',
        'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt',
        'n_counts',
    ],
    errors='ignore',
)

# Remove every per-gene annotation column; the gene index itself is kept.
adata.var = adata.var.drop(columns=adata.var.columns)

for _stale_uns_key in ('rctd_cluster_colors', 'hvg', 'cluster_colors'):
    adata.uns.pop(_stale_uns_key, None)

In [68]:
# Display the AnnData summary to check which obs/uns/obsm/layers entries remain.
adata

AnnData object with n_obs × n_vars = 15870 × 5010
    obs: 'cluster', 'rctd_cluster'
    uns: 'log1p'
    obsm: 'X_spatial', 'rctd_results', 'spatial'
    layers: 'imputed_count', 'normalized_count', 'raw_count'

In [69]:
# Persist the trimmed AnnData object to the local cache directory.
# NOTE(review): this runs before the following cell adds obs['rctd_celltypes'],
# so the saved file will not contain that column — confirm this is intended.
adata.write_h5ad('../.cache/adata_train.h5ad')

In [71]:
# Translate RCTD cluster ids into cell-type names using the JSON lookup table.
# Ids are cast to str before mapping because JSON object keys are always
# strings; ids missing from the mapping end up as NaN.
with open('../../data/celltype_assign.json', 'r', encoding='utf-8') as f:
    cell_type_mapping = json.load(f)
adata.obs['rctd_celltypes'] = adata.obs['rctd_cluster'].astype(str).map(cell_type_mapping)