In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../src')
import scanpy as sc
import numpy as np
import commot as ct
import pandas as pd
from spaceoracle.oracles import BaseTravLR
import matplotlib.pyplot as plt
import celloracle as co

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [46]:
# adata = sc.read_h5ad('../../data/slideseq/day3_1.h5ad')
adata = sc.read_h5ad('../.cache/10X_tonsil.h5ad')

In [47]:
import os

In [48]:
def expand_paired_interactions(df):
    expanded_rows = []
    for _, row in df.iterrows():
        ligands = row['ligand'].split('_')
        receptors = row['receptor'].split('_')
        
        for ligand in ligands:
            for receptor in receptors:
                new_row = row.copy()
                new_row['ligand'] = ligand
                new_row['receptor'] = receptor
                expanded_rows.append(new_row)
    
    return pd.DataFrame(expanded_rows)

In [49]:
df_ligrec = ct.pp.ligand_receptor_database(
        database='CellChat', 
        species='human',
        signaling_type="Secreted Signaling"
    )
            
df_ligrec.columns = ['ligand', 'receptor', 'pathway', 'signaling']
df_ligrec = expand_paired_interactions(df_ligrec)


In [50]:
df_ligrec

Unnamed: 0,ligand,receptor,pathway,signaling
0,TGFB1,TGFBR1,TGFb,Secreted Signaling
0,TGFB1,TGFBR2,TGFb,Secreted Signaling
1,TGFB2,TGFBR1,TGFb,Secreted Signaling
1,TGFB2,TGFBR2,TGFb,Secreted Signaling
2,TGFB3,TGFBR1,TGFb,Secreted Signaling
...,...,...,...,...
1194,UTS2B,UTS2R,UROTENSIN,Secreted Signaling
1195,UTS2B,SSTR5,UROTENSIN,Secreted Signaling
1196,BAG6,NCR3,BAG,Secreted Signaling
1197,LGALS9,HAVCR2,GALECTIN,Secreted Signaling


In [51]:
# ilr2_df =df_ligrec.query('receptor.str.contains("Il2") or ligand.str.contains("Il2")')
# genes_of_interest = list(np.unique(ilr2_df.ligand.unique().tolist() + ilr2_df.receptor.unique().tolist()))
# genes_of_interest = genes_of_interest + ['Bach2', 'Prdm1', 'Bcl6', 'Foxp3', 'Satb1', 'Id2']

ilr2_df =df_ligrec.query('receptor.str.contains("IL2") or ligand.str.contains("IL2")')
genes_of_interest = list(np.unique(ilr2_df.ligand.unique().tolist() + ilr2_df.receptor.unique().tolist()))
genes_of_interest = genes_of_interest + ['PAX5', 'CD74', 'BACH2', 'PRDM1', 'BCL6', 'FOXP3', 'SATB1', 'ID2']


In [52]:
n_top_genes = 5000
min_cells = 5
min_counts = 100

In [53]:
adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
sc.pp.filter_cells(adata, min_counts=min_counts)
adata = adata[adata.obs["pct_counts_mt"] < 20].copy()
adata = adata[:, ~adata.var["mt"]]

sc.pp.filter_genes(adata, min_cells=min_cells)
adata.layers["raw_count"] = adata.X.copy()
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(
    adata, flavor="seurat", n_top_genes=n_top_genes)

adata = adata[:, (adata.var.highly_variable | adata.var_names.isin(genes_of_interest))]
adata.layers["normalized_count"] = adata.to_df().values

In [54]:
adata.shape

(4194, 5014)

In [55]:
adata.X.toarray().max()

9.495499

In [56]:
adata.layers["raw_count"].toarray().sum(0)

array([  445.,  1545.,  1996., ..., 16152., 12785.,   353.], dtype=float32)

In [57]:
%%time

pcs = BaseTravLR.perform_PCA(adata)
BaseTravLR.knn_imputation(adata, pcs)

Calculating MAGIC...
  Running MAGIC on 4194 cells and 5014 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 0.95 seconds.
    Calculating KNN search...
    Calculated KNN search in 0.71 seconds.
    Calculating affinities...
    Calculated affinities in 0.61 seconds.
  Calculated graph and diffusion operator in 2.30 seconds.
  Calculating imputation...
  Calculated imputation in 2.40 seconds.
Calculated MAGIC in 4.71 seconds.
CPU times: user 1min 54s, sys: 9.95 s, total: 2min 4s
Wall time: 17.6 s


In [58]:
adata.obs = adata.obs.drop(columns=['n_genes_by_counts',
       'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts',
       'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
       'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes',
       'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts'])

adata.var.drop(columns=adata.var.columns, inplace=True)

# del adata.uns['rctd_cluster_colors']
# del adata.uns['hvg']
# del adata.uns['cluster_colors']

In [59]:
# import json
# with open('../../data/celltype_assign.json', 'r') as f:
#     cell_type_mapping = json.load(f)
# adata.obs['rctd_celltypes'] = adata.obs['rctd_cluster'].astype(str).map(cell_type_mapping)

In [60]:
adata

AnnData object with n_obs × n_vars = 4194 × 5014
    obs: 'celltype'
    uns: 'spatial', 'log1p', 'hvg'
    obsm: 'spatial'
    layers: 'counts', 'raw_count', 'normalized_count', 'imputed_count'

In [61]:
adata.layers

Layers with keys: counts, raw_count, normalized_count, imputed_count

In [62]:
adata.layers['imputed_count'].max(), adata.layers['normalized_count'].max(), adata.layers['raw_count'].max()

(8.790769632761492, 9.495499, 21025.0)

In [63]:
# adata.write_h5ad('../.cache/day3_lymph_rep_1.h5ad')

adata.write_h5ad('../.cache/10X_tonsil_processed.h5ad')