# CellPhoneDB tentative scoring protocol
**B-cell signaling example**

conda env: p-sample5

In [1]:
# -- Load cellphone ranking functions
from dev_cpdb_functions import *
import scanpy as sc
import pandas as pd

In [2]:
# -- Directory that holds the input AnnData (.h5ad) file.
# -- Keep the trailing slash: the file name is appended by string concatenation.
path_adata = "/nfs/team292/rv6/Datasets/"

### Load scanpy object

In [3]:
# -- Read the AnnData object from the dataset directory
adata = sc.read(f"{path_adata}adata_subset_Bcells_stroma.h5ad")

# -- Show the dimensions as (n_obs, n_vars)
adata.shape

(36445, 33712)

Input files required for the cpdb scoring pipeline. \
The input files I'm using correspond to the `*_input.csv` files generated by `cellphonedb generate`.

In [4]:
# -- CellPhoneDB input tables, read from the local `data/` folder.

# -- Complex table: the first CSV column becomes the row index
inp_complex = pd.read_csv('data/complex_input.csv', sep = ',', index_col = 0)

# -- Gene and interaction tables: default integer row index
inp_genes = pd.read_csv('data/gene_input.csv', sep = ',')
inp_interactions = pd.read_csv('data/interaction_input.csv', sep = ',')

### Downsample clusters
The protocol is not memory-optimized, so you may want to downsample the clusters (or request more memory).

In [5]:
# -- Name of the `obs` column that holds the cluster label of every cell
cluster_id_col = "cell.labels"

# -- Fraction of cells to keep in each cluster.
# -- Despite the name this is a fraction between 0 and 1, not a 0-100 percentage
downsamp_percentage = 1

In [6]:
# -- Downsample each cluster to the specified fraction of its cells.
# -- A fixed seed makes the sampled subset identical across reruns; change it
# -- to draw a different subset.
downsamp_seed = 0
adata_obs = adata.obs.groupby(cluster_id_col).sample(frac = downsamp_percentage,
                                                      random_state = downsamp_seed)

# -- Keep only the sampled cells.
# -- NOTE: this overwrites `adata`, so re-running this cell with a fraction < 1
# -- downsamples the already-downsampled object; re-run the loading cell first.
adata = adata[list(adata_obs.index)]

adata

  res = method(*args, **kwargs)


View of AnnData object with n_obs × n_vars = 36445 × 33712
    obs: 'cell.labels', 'doublets', 'fetal.ids', 'gender', 'is_doublet', 'is_doublet_poptrim', 'is_doublet_wolock', 'lanes', 'nGene', 'nUMI', 'orig.ident', 'percent.mito', 'processing.type', 'scrublet_cluster_score', 'scrublet_score', 'sequencing.type', 'sort.ids', 'april_cell.labels', 'cell.labels_20200708', 'cell.labels_20200713', 'cell.labels_20200718', 'nk_meta', 'mito.threshold'
    var: 'gene_ids-1', 'feature_types-1'
    obsm: 'X_orig_pca', 'X_pca', 'X_umap'

### Convert sparse normalized matrix to dense matrix
This should be optimized to use the sparse matrix rather than dense.

In [7]:
# -- Densify the expression matrix and transpose it so that genes are in rows
# -- and cells are in columns.
# -- `toarray()` returns a plain ndarray, whereas `todense()` returns the
# -- legacy np.matrix type.
norm_matrix = pd.DataFrame(adata.X.toarray(),
                           columns = list(adata.var.index),
                           index = list(adata.obs.index)).transpose()

# -- Per-cell annotations (one row per cell)
metadata = adata.obs

# -- Remove scanpy object to save some memory
del adata

### Apply functions to rank interactions

##### **Step 1**: filter genes expressed in less than min_perc_cell of cells in a given cluster.

In [8]:
# -- Filter genes per cluster.
# -- The cluster column comes from `cluster_id_col` so this step stays in sync
# -- with the column used for downsampling.
# -- NOTE(review): min_perc_cell = 0.1 is presumably a fraction (10% of cells),
# -- not a percentage; confirm against dev_cpdb_functions.
cpdb_f = filter_genes_cluster(matrix = norm_matrix,
                              metadata = metadata,
                              min_perc_cell = 0.1,
                              cell_column_name = cluster_id_col)

##### **Step 2**: calculate the gene's mean expression per cluster.

In [9]:
# -- Mean expression per cluster, computed on the filtered matrix.
# -- The cluster column comes from `cluster_id_col` so this step stays in sync
# -- with the column used for downsampling.
cpdb_fm = mean_expression_cluster(matrix = cpdb_f,
                                  metadata = metadata,
                                  cell_column_name = cluster_id_col)

##### **Step 3**: scale the gene's mean expression across clusters.

In [10]:
# -- Scale the per-cluster means.
# -- NOTE(review): upper_range is presumably the maximum of the scaled range;
# -- confirm against dev_cpdb_functions.
cpdb_fms = scale_expression(cpdb_fm, upper_range = 10)

##### **Step 4**: collapse heteromeric complexes into a single expression value (geometric mean of their subunits).

In [11]:
# -- Combine the scaled expression with the CellPhoneDB gene and complex tables
cpdb_fmsh = heteromer_geometric_expression(
    matrix = cpdb_fms,
    cellphone_genes = inp_genes,
    cellphone_complex = inp_complex,
)

(33712, 28)
(1130, 28)


##### **Step 5**: calculate the ligand-receptor score and cry.

In [12]:
# -- Score every interaction listed in the CellPhoneDB interaction table
cpdb_scoring = score_product(
    matrix = cpdb_fmsh,
    cellphone_genes = inp_genes,
    cellphone_interactions = inp_interactions,
)

### List all cell-pairs comparison
Results are stored as a dictionary of dataframes; each dataframe is named after the pair of cell types being analyzed for cell-cell communication. \
Beware: you will find `cell_A|cell_B` but not `cell_B|cell_A`. Each dataframe contains the partners swapped, so interactions can be compared in both directions.

In [16]:
# -- Preview the first ten cell-pair keys
list(cpdb_scoring)[:10]

['tip EC|tip EC',
 'LMPP|tip EC',
 'muscle|tip EC',
 'pre pro B progenitor|tip EC',
 'arteriolar fibroblast|tip EC',
 'stromal macrophage|tip EC',
 'naive B cell|tip EC',
 'chondrocyte|tip EC',
 'early osteoblast|tip EC',
 'osteoblast|tip EC']

### Example of how to query results
Ordering results by the score

In [17]:
# -- Pick one cell pair and rank its interactions from highest to lowest score
example_table = (
    cpdb_scoring['endosteal fibroblast|osteoclast']
    .sort_values(by = 'Score', ascending = False)
)

In [18]:
# -- Show the 20 top-scoring rows
example_table.iloc[:20]

Unnamed: 0,osteoclast,endosteal fibroblast,Score,id_cp_interaction
3220,ProstaglandinE2_byPTGES2,PTGER4,100.0,1385
3221,ProstaglandinE2_byPTGES2,PTGER2,100.0,1432
1456,WNT3,FZD10_LRP6,100.0,223
3225,ProstaglandinE2_byPTGES3,PTGER2,100.0,1433
2674,Cholesterol_byDHCR24,RORA,100.0,166
2374,ADGRE5,CD55,100.0,1064
3224,ProstaglandinE2_byPTGES3,PTGER4,100.0,1386
3574,integrin_aVb3_complex,FGF2,100.0,1081
1423,TNFRSF12A,TNFSF12,100.0,772
1458,WNT3,FZD1_LRP6,98.076824,256


____