In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local source are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [98]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import sys 
import commot as ct

sys.path.append('../../src')
sys.path.append('..')

from spaceoracle.tools.network import expand_paired_interactions
from spaceoracle.tools.network import get_mouse_housekeeping_genes
from spaceoracle.oracles import BaseTravLR
from spaceoracle.tools.network import encode_labels

import celloracle as co

In [93]:
# Read the AnnData file and attach the per-spot annotation table to obs
# (the CSV name suggests RCTD output -- TODO confirm).
adata = sc.read_h5ad('/Users/koush/Projects/SpaceOracle/data/slideseq/mLND3_1.h5ad')
rctd_df = pd.read_csv(
    '/Users/koush/Projects/SpaceOracle/data/slideseq/day3_1_rctd.csv',
    index_col=0,
)
adata.obs = adata.obs.join(rctd_df)

# Keep only spots whose spot_class is 'singlet' or 'doublet_certain'.
# Note: this slice yields a *view* of the original object, not a copy.
keep_spot = adata.obs.spot_class.isin(['singlet', 'doublet_certain'])
adata = adata[keep_spot]
adata

View of AnnData object with n_obs × n_vars = 16280 × 24649
    obs: 'cluster', 'spot_class', 'first_type', 'second_type', 'first_class', 'second_class', 'min_score', 'singlet_score', 'conv_all', 'conv_doublet', '1', '2', '3', '4', '5', '6', '7'
    uns: 'cluster_colors'
    obsm: 'X_spatial', 'spatial'

In [94]:
# Load CellOracle's prebuilt mouse promoter base GRN.
# NOTE(review): base_GRN is not referenced by any other cell visible in this notebook.
base_GRN = co.data.load_mouse_promoter_base_GRN()

Loading prebuilt promoter base-GRN. Version: mm10_gimmemotifsv5_fpr2


In [95]:
# Fetch the CellChat mouse "Secreted Signaling" ligand-receptor table via COMMOT
# and give its four columns explicit names.
df_ligrec = ct.pp.ligand_receptor_database(
    database='CellChat',
    species='mouse',
    signaling_type="Secreted Signaling",
)
df_ligrec.columns = ['ligand', 'receptor', 'pathway', 'signaling']

# Project helper; presumably splits multi-subunit entries into one row per
# ligand/receptor pair -- TODO confirm against spaceoracle.tools.network.
df_ligrec = expand_paired_interactions(df_ligrec)

# Interactions whose ligand or receptor name contains the substring "Cxcr".
coms_df = df_ligrec.query('receptor.str.contains("Cxcr") or ligand.str.contains("Cxcr")')

# Unique (sorted) gene names appearing on either side of those interactions.
cxcr_names = coms_df.ligand.unique().tolist() + coms_df.receptor.unique().tolist()
genes_of_interest = list(np.unique(cxcr_names))
len(genes_of_interest)

19

In [96]:
# Interactions whose ligand or receptor name contains the substring "Il2"
# (a substring match, so names such as "Il21..." also qualify).
# NOTE(review): this rebinds genes_of_interest, so the "Cxcr"-derived list
# built from coms_df is discarded -- confirm that is intended.
ilr2_df = df_ligrec.query('receptor.str.contains("Il2") or ligand.str.contains("Il2")')

il2_names = ilr2_df.ligand.unique().tolist() + ilr2_df.receptor.unique().tolist()
genes_of_interest = list(np.unique(il2_names))

# Append a hand-picked set of additional genes to keep.
genes_of_interest = genes_of_interest + ['Pax5', 'Bach2', 'Prdm1', 'Bcl6', 'Foxp3', 'Satb1', 'Id2']

In [99]:
# Load the mouse housekeeping-gene table (project helper); its 'Gene' column
# is used during preprocessing to exclude those genes from adata.
housekeeping_db = get_mouse_housekeeping_genes()

In [100]:
# Preprocessing thresholds consumed by the QC / feature-selection cell.
n_top_genes = 4000  # n_top_genes for sc.pp.highly_variable_genes
min_cells = 10  # min_cells for sc.pp.filter_genes
min_counts = 100  # min_counts for sc.pp.filter_cells

In [102]:
# Inspect the housekeeping table (one row per transcript; gene names repeat in 'Gene').
housekeeping_db

Unnamed: 0,Ensembl ID,Gene,Chromosome_scaffold,Transcript start,Transcript end
0,ENSMUST00000007921,0610009B22Rik,11,51685386,51688653
1,ENSMUST00000077783,0610030E20Rik,6,72347334,72353148
2,ENSMUST00000192144,0610030E20Rik,CHR_MG184_PATCH,72347333,72353147
3,ENSMUST00000125107,0610037L13Rik,4,107889899,107897802
4,ENSMUST00000106727,0610037L13Rik,4,107889863,107897118
...,...,...,...,...,...
3979,ENSMUST00000101388,Zxdb,X,94724569,94730187
3980,ENSMUST00000113539,Zxdc,6,90369492,90385084
3981,ENSMUST00000203493,Zxdc,6,90369613,90383103
3982,ENSMUST00000075117,Zxdc,6,90369494,90403490


In [103]:
# QC, gene filtering, normalization and feature selection.
#
# NOTE: this cell rebinds `adata` and normalizes / log-transforms `adata.X`
# in place, so it is not idempotent -- reload `adata` before re-running it.

# `adata` can arrive here as a view (it is produced by row-slicing). Own the
# data explicitly so the in-place writes below do not rely on anndata's
# implicit copy-on-write of views.
if adata.is_view:
    adata = adata.copy()

adata.var_names_make_unique()

# Flag genes whose names start with "mt-" and compute QC metrics for them;
# this is what creates obs['pct_counts_mt'] used below.
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)

# Cell-level filters: minimum total counts, then < 20% counts in "mt" genes.
sc.pp.filter_cells(adata, min_counts=min_counts)
adata = adata[adata.obs["pct_counts_mt"] < 20].copy()

# Gene-level exclusions: drop the flagged "mt" genes and every gene listed in
# the housekeeping table, in a single slice. Copy so that filter_genes and the
# layer assignment operate on a real object rather than a view.
is_housekeeping = adata.var_names.isin(housekeeping_db['Gene'].unique())
drop_gene = adata.var["mt"].to_numpy() | is_housekeeping
adata = adata[:, ~drop_gene].copy()

sc.pp.filter_genes(adata, min_cells=min_cells)

# Keep the unnormalized counts before X is normalized and log-transformed.
adata.layers["raw_count"] = adata.X.copy()
sc.pp.normalize_total(adata, inplace=True)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(
    adata, flavor="seurat", n_top_genes=n_top_genes)

# Retain highly variable genes plus any gene explicitly listed in
# genes_of_interest (even if it was not selected as highly variable).
keep_gene = adata.var.highly_variable | adata.var_names.isin(genes_of_interest)
adata = adata[:, keep_gene].copy()

# Store the normalized, log-transformed matrix (densified via to_df) as a layer.
adata.layers["normalized_count"] = adata.to_df().values.copy()
adata

AnnData object with n_obs × n_vars = 16136 × 4016
    obs: 'cluster', 'spot_class', 'first_type', 'second_type', 'first_class', 'second_class', 'min_score', 'singlet_score', 'conv_all', 'conv_doublet', '1', '2', '3', '4', '5', '6', '7', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_counts'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'cluster_colors', 'log1p', 'hvg'
    obsm: 'X_spatial', 'spatial'
    layers: 'raw_count', 'normalized_count'

In [91]:
# Remove the QC and highly-variable-gene bookkeeping columns from obs / var.
#
# The QC step in this notebook names its flag "mt", so scanpy's derived
# columns are "*_mt" (see the recorded obs/var listing). The previous key
# lists only contained the "MT" spelling, which meant those columns were
# silently left in place. Both spellings are handled here.
mt_spellings = ('mt', 'MT')

obs_qc_columns = [
    'n_genes_by_counts', 'log1p_n_genes_by_counts',
    'total_counts', 'log1p_total_counts',
    'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes',
    'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes',
    'n_counts',
]
obs_qc_columns += [
    f'{stem}_{mt}'
    for mt in mt_spellings
    for stem in ('total_counts', 'log1p_total_counts', 'pct_counts')
]

var_qc_columns = [
    'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts',
    'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts',
    'n_cells', 'highly_variable', 'means', 'dispersions',
    'dispersions_norm',
]
var_qc_columns += list(mt_spellings)

# errors='ignore' skips any listed column that is not present, matching the
# original "only pop if the key exists" behavior.
adata.obs = adata.obs.drop(columns=obs_qc_columns, errors='ignore')
adata.var = adata.var.drop(columns=var_qc_columns, errors='ignore')

# Clear the unstructured annotations (uns).
del adata.uns

In [14]:
# NOTE(review): no cell visible in this notebook creates obs['cell_type'];
# it must come from a different session or a removed cell -- confirm the source.
adata.obs['cell_type'] = adata.obs['cell_type'].astype('category')

# Build the label -> integer lookup once instead of calling encode_labels
# inside the lambda on every invocation (assumes encode_labels is
# deterministic for a given input -- TODO confirm).
label_to_int = encode_labels(adata.obs['cell_type'], reverse_dict=True)

adata.obs['cell_type_int'] = adata.obs['cell_type'].apply(
    lambda x: label_to_int[x])

In [22]:
# Show the AnnData summary before imputation.
adata

AnnData object with n_obs × n_vars = 5778 × 3540
    obs: 'cell_type', 'cell_type_int'
    obsm: 'spatial'
    layers: 'normalized_count'

In [23]:
# Run the project's PCA helper, then its KNN imputation with method='MAGIC'.
# Per the recorded outputs, adata gains an 'imputed_count' layer afterwards;
# the helpers' internals are not visible here.
pcs = BaseTravLR.perform_PCA(adata)
BaseTravLR.knn_imputation(adata, pcs, method='MAGIC')

Calculating MAGIC...
  Running MAGIC on 5778 cells and 3540 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 10.15 seconds.
    Calculating KNN search...
    Calculated KNN search in 1.93 seconds.
    Calculating affinities...
    Calculated affinities in 1.62 seconds.
  Calculated graph and diffusion operator in 13.73 seconds.
  Calculating imputation...
  Calculated imputation in 0.55 seconds.
Calculated MAGIC in 14.30 seconds.


In [24]:
# Show the AnnData summary after imputation (expect the new 'imputed_count' layer).
adata

AnnData object with n_obs × n_vars = 5778 × 3540
    obs: 'cell_type', 'cell_type_int'
    obsm: 'spatial'
    layers: 'normalized_count', 'imputed_count'

In [25]:
# Sanity check: the imputed layer should have the same (n_obs, n_vars) shape as adata.
adata.layers['imputed_count'].shape

(5778, 3540)

In [26]:
# Persist the processed object to disk (hard-coded absolute local path).
# NOTE(review): the output name says "snrna_human_tonsil", while the load cell
# reads a mouse Slide-seq file (mLND3_1.h5ad) -- confirm the dataset and path.
adata.write_h5ad('/Users/koush/Desktop/training_data/snrna_human_tonsil.h5ad')