In [1]:
# Enable IPython's autoreload extension (mode 2) so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../src')
import scanpy as sc
import numpy as np
import commot as ct
import pandas as pd
from spaceoracle.oracles import BaseTravLR
import matplotlib.pyplot as plt
import celloracle as co

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [3]:
# Load the sample's AnnData object from its .h5ad file.
adata = sc.read_h5ad('../../data/slideseq/day3_1.h5ad')

In [7]:
import os

In [4]:
def expand_paired_interactions(df):
    """Expand multi-gene ligand/receptor entries into one row per gene pair.

    Values in the ``ligand`` and ``receptor`` columns may name several genes
    joined by ``'_'``. Each input row is replaced by the Cartesian product of
    its ligand parts and its receptor parts; every other column is carried
    over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``'ligand'`` and ``'receptor'``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, in the same order, as ``df``.
        Rows are ordered by input row, then ligand part, then receptor part.
        The index label of each input row is repeated on all rows derived
        from it (the index is not reset). An empty input yields an empty
        frame that still has all of ``df``'s columns.
    """
    # Split-then-explode is vectorised and, unlike building a frame from a
    # list of per-row copies, keeps the column schema when `df` has no rows.
    # Exploding ligands first and receptors second reproduces the
    # "for ligand: for receptor:" ordering.
    by_ligand = df.assign(ligand=df['ligand'].str.split('_')).explode('ligand')
    return by_ligand.assign(
        receptor=by_ligand['receptor'].str.split('_')
    ).explode('receptor')

In [5]:
# Fetch the mouse CellChat ligand-receptor table via COMMOT, restricted to
# the "Secreted Signaling" category.
df_ligrec = ct.pp.ligand_receptor_database(
        database='CellChat', 
        species='mouse', 
        signaling_type="Secreted Signaling"
    )
            
# Rename the columns positionally; this assumes the table has exactly these
# four columns in this order.
df_ligrec.columns = ['ligand', 'receptor', 'pathway', 'signaling']
# Split '_'-joined multi-gene entries into one row per ligand/receptor pair.
df_ligrec = expand_paired_interactions(df_ligrec)


In [6]:
# Keep interactions whose ligand or receptor name contains "Il2".
# NOTE(review): this is a substring match, so any gene name that merely
# contains "Il2" (e.g. an "Il21"/"Il22"-style name) is also kept - confirm
# that is intended.
ilr2_df =df_ligrec.query('receptor.str.contains("Il2") or ligand.str.contains("Il2")')
# Sorted, de-duplicated gene names from either side of those interactions.
genes_of_interest = list(np.unique(ilr2_df.ligand.unique().tolist() + ilr2_df.receptor.unique().tolist()))
# Append a hand-picked gene list as-is (no de-duplication against the above).
genes_of_interest = genes_of_interest + ['Bach2', 'Prdm1', 'Bcl6', 'Foxp3', 'Satb1', 'Id2']

In [7]:
# Thresholds consumed by the filtering / highly-variable-gene steps.
n_top_genes = 3000  # passed to sc.pp.highly_variable_genes
min_cells = 10      # passed to sc.pp.filter_genes
min_counts = 250    # passed to sc.pp.filter_cells

In [8]:
# --- Quality control ---------------------------------------------------
adata.var_names_make_unique()
# Flag mitochondrial genes by their "mt-" name prefix and compute QC metrics
# (this adds, among others, obs["pct_counts_mt"]).
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
sc.pp.filter_cells(adata, min_counts=min_counts)
# Keep cells with < 20% mitochondrial counts, then drop the mt genes.
# Every subset is materialised with .copy(): the statements that follow
# modify `adata` in place, which should not be done on an AnnData view.
adata = adata[adata.obs["pct_counts_mt"] < 20].copy()
adata = adata[:, ~adata.var["mt"]].copy()

# --- Normalisation and gene selection ------------------------------------
sc.pp.filter_genes(adata, min_cells=min_cells)
# Snapshot the counts before X is normalised and log-transformed.
adata.layers["raw_count"] = adata.X.copy()
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(
    adata, flavor="seurat", n_top_genes=n_top_genes)

# Retain highly variable genes plus the explicitly requested genes, again as
# a real copy because a layer is written to the result immediately.
adata = adata[:, (adata.var.highly_variable | adata.var_names.isin(genes_of_interest))].copy()
adata.layers["normalized_count"] = adata.to_df().values

In [9]:
# (n_obs, n_vars) of the object after filtering and gene selection.
adata.shape

(11567, 3020)

In [10]:
# Largest value in the log-normalised matrix. Sparse matrices implement
# .max() directly, so X is not densified just to read one scalar.
adata.X.max()

5.3231463

In [11]:
# Column sums (axis 0) of the raw-count layer, i.e. one total per gene.
adata.layers["raw_count"].toarray().sum(0)

array([509.,  13., 630., ...,  33.,  36., 242.], dtype=float32)

In [12]:
%%time

# Compute principal components, then run BaseTravLR's KNN imputation on them.
# The captured output of this cell reports a MAGIC run; presumably this step
# is what adds the 'imputed_count' layer - verify in BaseTravLR.
pcs = BaseTravLR.perform_PCA(adata)
BaseTravLR.knn_imputation(adata, pcs)

Calculating MAGIC...
  Running MAGIC on 11567 cells and 3020 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 2.09 seconds.
    Calculating KNN search...
    Calculated KNN search in 7.61 seconds.
    Calculating affinities...
    Calculated affinities in 7.54 seconds.
  Calculated graph and diffusion operator in 17.25 seconds.
  Calculating imputation...
  Calculated imputation in 1.03 seconds.
Calculated MAGIC in 18.30 seconds.
CPU times: user 1min 53s, sys: 9.74 s, total: 2min 3s
Wall time: 30.5 s


In [13]:
# Strip QC bookkeeping before saving. errors='ignore' and pop(..., None)
# make this cell safe to re-run: a second execution finds nothing left to
# remove instead of raising KeyError.
adata.obs = adata.obs.drop(
    columns=['n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
       'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes',
       'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts'],
    errors='ignore',
)

# Remove every per-gene annotation column, keeping only the gene index.
adata.var = adata.var.drop(columns=adata.var.columns)

# Drop unstructured entries that are not needed in the saved file.
for uns_key in ('rctd_cluster_colors', 'hvg', 'cluster_colors'):
    adata.uns.pop(uns_key, None)

In [14]:
import json
# Load the lookup used to label clusters (presumably RCTD cluster id ->
# cell-type name; verify against the JSON file).
with open('../../data/celltype_assign.json', 'r') as f:
    cell_type_mapping = json.load(f)
# JSON object keys are strings, so cluster values are cast to str before the
# lookup. Values missing from the mapping become NaN.
adata.obs['rctd_celltypes'] = adata.obs['rctd_cluster'].astype(str).map(cell_type_mapping)

In [15]:
# Display the summary of the cleaned object (obs / uns / obsm / layers).
adata

AnnData object with n_obs × n_vars = 11567 × 3020
    obs: 'cluster', 'rctd_cluster', 'rctd_celltypes'
    uns: 'log1p'
    obsm: 'X_spatial', 'rctd_results', 'spatial'
    layers: 'raw_count', 'normalized_count', 'imputed_count'

In [16]:
# List the layers stored on the object.
adata.layers

Layers with keys: raw_count, normalized_count, imputed_count

In [17]:
# Maximum value of each layer (imputed, normalised, raw) as a quick check
# that the three layers are on the expected scales.
adata.layers['imputed_count'].max(), adata.layers['normalized_count'].max(), adata.layers['raw_count'].max()

(3.7572141070762726, 5.3231463, 521.0)

In [18]:
# Persist the preprocessed object. Create the cache directory first so the
# write does not fail when '../.cache' does not exist yet (e.g. on a fresh
# checkout).
os.makedirs('../.cache', exist_ok=True)
adata.write_h5ad('../.cache/day3_lymph_rep_1.h5ad')

... storing 'rctd_celltypes' as categorical
