In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [3]:
import sys
sys.path.append('..')

In [47]:
adata = sc.read_h5ad('/Users/koush/Projects/SpaceOracle/data/survey/mouse_kidney_13.h5ad')
adata

AnnData object with n_obs × n_vars = 8779 × 3512
    obs: 'batch', 'rxn', 'n_counts', 'ct1', 'ct2', 'ct3', 'high_q', 'ct4', 'chip', 'exp', 'cond', 'samp-id', 'samp-type', 'medulla_cortex', 'domain', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'cell_type', 'cell_type_int'
    var: 'n_counts', 'n_cells', 'mean', 'std', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'log1p'
    obsm: 'spatial'
    layers: 'imputed_count', 'normalized_count', 'raw_count'
    obsp: 'connectivities', 'distances'

In [48]:
import seaborn as sns

In [49]:
adata.obs['banksy_celltypes'] = pd.read_csv('banksy_celltypes.csv').banksy_celltypes.values
adata.obs['banksy_celltypes'] = adata.obs['banksy_celltypes'].astype('category')

In [50]:
out = pd.DataFrame(
    adata.obsm['spatial'], 
    columns=['x', 'y'], 
    index=adata.obs_names
).join(adata.obs[['domain', 'ct2', 'banksy_celltypes']])

In [55]:
osmogenes = ['Aqp2', 'Aqp4', 'Cryab', 'Elf5', 'Mal', 'Muc1',
        'Pax2', 'Sgk1', 'Slc14a2', 'Slc5a3']
hypoxia = ['Anxa2', 'Btg1', 'Cdkn1b', 'Ddit3', 'Hexa', 'Hoxb9', 'Hspa5', 'Tes']
markers = ['Wt1', 'Nphs2', 'Sord', 'Cyp2e1', 'Lrp2', 'Bst1', 'Slc4a11', 'Slc12a1', 
        'Umod', 'Slc12a3', 'Calb1', 'Aqp2', 'Aqp4']
nephron = ['Cldn1', 'Spp2', 'Lrp2', 'Aqp1', 'Sptssb', 'Slc12a1', 'Slc12a3', 'Calb1']
collecting_duct = ['Hsd11b2', 'Aqp4', 'Aqp2', 'Atp6v1g3', 'Gata3', 'Calb1']
vascular = ['Kdr', 'Cdh5', 'Vegfa', 'Emcn', 'Podxl']
hema = ['Cd52', 'Fcer1g']
immune = ['Thy1', 'Cd79a', 'Pax5']
stroma = ['Col1a1', 'Meis1', 'Postn']
podocyte = ['Pax8', 'Nphs1', 'Podxl', 'Meis1', 'Aldh1a2']

extra_genes = osmogenes + hypoxia + markers + \
    nephron + collecting_duct + vascular + hema + \
    immune + stroma + podocyte

In [56]:
import os, sys 
sys.path.append('../../src')

In [57]:
import celloracle as co
base_GRN = co.data.load_mouse_promoter_base_GRN()

Loading prebuilt promoter base-GRN. Version: mm10_gimmemotifsv5_fpr2


In [58]:
from spaceoracle.tools.network import expand_paired_interactions
import commot as ct

df_ligrec = ct.pp.ligand_receptor_database(
        database='CellChat', 
        species='mouse', 
        signaling_type="Secreted Signaling"
    )
            
df_ligrec.columns = ['ligand', 'receptor', 'pathway', 'signaling']
df_ligrec = expand_paired_interactions(df_ligrec)

coms_df =df_ligrec.query('receptor.str.contains("CXCR") or ligand.str.contains("CXCL")')
genes_of_interest = list(np.unique(coms_df.ligand.unique().tolist() + \
                         coms_df.receptor.unique().tolist()))
# len(genes_of_interest)
genes_of_interest = genes_of_interest + list(extra_genes)
len(genes_of_interest)

63

In [59]:
n_top_genes = 3000
min_cells = 10
min_counts = 200

In [60]:
adata.var_names_make_unique()
adata.var["mt"] = adata.var_names.str.startswith("mt-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], inplace=True)
sc.pp.filter_cells(adata, min_counts=min_counts)
adata = adata[adata.obs["pct_counts_mt"] < 20].copy()
adata = adata[:, ~adata.var["mt"]]

In [61]:
adata = adata[:, ~adata.var_names.str.contains('Rik')]
adata = adata[:, ~adata.var_names.str.contains(r'^Hb\w+-\w+$')]
adata = adata[:, ~adata.var_names.str.contains('Hp')]
adata = adata[:, ~adata.var_names.str.startswith('Rp')]
adata = adata[:, ~adata.var_names.str.startswith('n-R5s')]
adata = adata[:, ~adata.var_names.str.startswith('n-R5')]
adata = adata[:, ~adata.var_names.str.startswith('AA')]
adata = adata[:, ~adata.var_names.str.startswith('AB')]
adata = adata[:, ~adata.var_names.str.startswith('AC')]
adata = adata[:, ~adata.var_names.str.startswith('Gm')]
adata = adata[:, ~adata.var_names.str.startswith('Mir')]
adata = adata[:, adata.var.index.str.len() > 1]
adata = adata[:, [i for i in adata.var_names if not (i[:2].isupper() and i[:2].isalpha())]]
adata = adata[:, [gene for gene in adata.var_names if not gene[-4:].isdigit()]]

In [62]:
adata

View of AnnData object with n_obs × n_vars = 8521 × 3058
    obs: 'batch', 'rxn', 'n_counts', 'ct1', 'ct2', 'ct3', 'high_q', 'ct4', 'chip', 'exp', 'cond', 'samp-id', 'samp-type', 'medulla_cortex', 'domain', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'cell_type', 'cell_type_int', 'banksy_celltypes'
    var: 'n_counts', 'n_cells', 'mean', 'std', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'log1p'
    obsm: 'spatial'
    layers: 'imputed_count', 'normalized_count', 'raw_count'
    obsp: 'connectivities', 'distances'

In [63]:
# sc.pp.filter_genes(adata, min_cells=min_cells)
# adata.layers["raw_count"] = adata.X.copy()
# sc.pp.normalize_total(adata, inplace=True)
# sc.pp.log1p(adata)
# sc.pp.highly_variable_genes(
#     adata, flavor="seurat", n_top_genes=n_top_genes)

# adata = adata[:, (adata.var.highly_variable | adata.var_names.isin(genes_of_interest))]

# adata.layers["normalized_count"] = adata.to_df().values.copy()
# adata

In [64]:
adata.layers["normalized_count"].max()

array(7.434651, dtype=float32)

In [65]:
adata.obs['banksy_celltypes'] = adata.obs['banksy_celltypes'].astype('category')
adata.obs['cell_type'] = adata.obs['banksy_celltypes'].copy()

In [66]:
adata.obs['cell_type'].value_counts()

cell_type
Macrophage                   4650
Endothelial_Capilary         1135
Endothelial_AntiApoptosis     548
Memory_T                      428
B cells                       334
Eosinophil                    306
Endothelial_DVR               300
Epithelial                    230
Principal cells               166
DCs                           149
B_Memory                      114
Mast                           91
Neutrophil                     58
Monocytes                      12
Name: count, dtype: int64

In [67]:
adata.obs['cell_type_int'] = pd.Categorical(adata.obs['cell_type']).codes

In [68]:
adata.var.drop(columns=['mt'], inplace=True)

In [69]:
from spaceoracle.oracles import BaseTravLR


In [70]:
pcs = BaseTravLR.perform_PCA(adata)
BaseTravLR.knn_imputation(adata, pcs, method='MAGIC')

Calculating MAGIC...
  Running MAGIC on 8521 cells and 3058 genes.
  Calculating graph and diffusion operator...
    Calculating PCA...
    Calculated PCA in 7.54 seconds.
    Calculating KNN search...
    Calculated KNN search in 3.01 seconds.
    Calculating affinities...
    Calculated affinities in 2.80 seconds.
  Calculated graph and diffusion operator in 13.38 seconds.
  Calculating imputation...
  Calculated imputation in 0.74 seconds.
Calculated MAGIC in 14.13 seconds.


In [71]:
adata.layers['imputed_count'].shape

(8521, 3058)

In [72]:
adata.write_h5ad('/Users/koush/Projects/SpaceOracle/data/survey/mouse_kidney_13.h5ad')