Run Tensor-cell2cell on the outputs of various communication scoring tools

env name: cci_dt


In [9]:
import os
import warnings
warnings.simplefilter('ignore')
from collections import OrderedDict
from tqdm import tqdm
import itertools
import numpy as np
import pandas as pd

import cell2cell as c2c

# Fixed seed: applied to NumPy's global random state here and passed as
# `random_state` to the tensor factorization further down.
seed = 888
np.random.seed(seed)

# Root directory of the revision analyses, and the LIANA score outputs beneath it.
rev_path = '/data3/hratch/tc2c_analyses_1/natcomm_revisions/'
scores_path = os.path.join(rev_path, 'interim/tc2c_external_inputs/liana/liana_outputs/')

# Delimiters; presumably cp_delim joins cell pairs and lr_delim joins
# ligand-receptor pairs -- TODO confirm where they are consumed.
cp_delim, lr_delim = '-', '^'

In [10]:
import sys
# Add the project's notebook directory to the module search path; presumably
# this is where utility.py, subset.py and corrindex.py live -- verify on a new machine.
sys.path.insert(1, '/home/hratch/Projects/CCC/tc2c_analyses_1/notebooks/natcomm_revisions/')
from utility import edgelist_to_communication_matrix, matrix_to_interaction_tensor
from subset import subset_tensor
from corrindex import correlation_index

In [11]:
# List the output directory once and keep only the CSV score files, so that
# stray entries (e.g. checkpoint folders) are not parsed as methods/samples.
score_files = [file_name for file_name in os.listdir(scores_path) if file_name.endswith('.csv')]

# The method is the last '_'-separated token of the file name (without '.csv').
# Sorted lists are used instead of sets: set iteration order over strings can
# change between kernel sessions (hash randomization), which would reorder the
# downstream dictionaries and result tables from run to run.
score_methods = sorted({file_name.split('_')[-1].split('.csv')[0] for file_name in score_files})

# score_methods = {'natmi', 'sca', 'cellchat'}


# The sample is the first '_'-separated token of the file name.
samples = sorted({file_name.split('_')[0] for file_name in score_files})

# Generate Tensor for Each Method

In [12]:
# map the method-specific column names for the edge list
# (role -> column name; the 'score' role comes first in every mapping)
_cell_cols = {'sender': 'source', 'receiver': 'target'}
_complex_lr_cols = {'ligand': 'ligand.complex', 'receptor': 'receptor.complex'}
_plain_lr_cols = {'ligand': 'ligand', 'receptor': 'receptor'}

score_labels = {
    'natmi': {'score': 'edge_specificity', **_cell_cols, **_complex_lr_cols},
    'cellchat': {'score': 'prob', **_cell_cols, **_plain_lr_cols},
    'cellphonedb': {'score': 'lr.mean', **_cell_cols, **_complex_lr_cols},
    'sca': {'score': 'LRscore', **_cell_cols, **_complex_lr_cols},
}
# Currently disabled methods:
#                'connectome': {'score': 'weight_sc', 
#                             'sender': 'source', 'receiver': 'target', 
#                             'ligand': 'ligand.complex', 'receptor': 'receptor.complex'}, 
#                'logfc': {'score': 'logfc_comb', 
#                             'sender': 'source', 'receiver': 'target', 
#                             'ligand': 'ligand.complex', 'receptor': 'receptor.complex'}}

In [13]:
def drop_redundant_complexes(edge_list, score_method, labels=None):
    """Drop rows that repeat the same combination of non-score label columns.

    Some complexes generate redundant rows due to distinct ligands or receptors.
    The communication score must be the same between such redundant rows; once
    that has been checked, only the first row of each combination is kept.

    Parameters
    ----------
    edge_list : pandas.DataFrame
        Edge list containing the columns named in ``labels[score_method]``.
    score_method : str
        Key into ``labels`` selecting which column mapping to use.
    labels : dict, optional
        Mapping ``{method: {role: column_name}}``. The ``'score'`` role names
        the score column; every other role is treated as a key column.
        Defaults to the notebook-level ``score_labels``.

    Returns
    -------
    pandas.DataFrame
        ``edge_list`` without the repeated key combinations (first occurrence
        kept, original index labels retained). The input is not modified.

    Raises
    ------
    ValueError
        If rows sharing a key combination carry different scores.
    """
    if labels is None:
        labels = score_labels

    method_labels = labels[score_method]
    score_col = method_labels['score']
    # Every non-score role is part of the key, however many roles there are.
    key_cols = [col for role, col in method_labels.items() if role != 'score']

    # One pass over the frame: count the distinct scores per key combination.
    # dropna=False keeps NaN as its own value, so NaN next to a number counts
    # as an inconsistency. Working on column values (not index labels) also
    # makes the check independent of whether the index is unique.
    scores_per_key = edge_list.groupby(key_cols, sort=False)[score_col].nunique(dropna=False)
    if (scores_per_key > 1).any():
        raise ValueError('Unexpected inconsistency in communication scores b/w complexs')

    return edge_list.drop_duplicates(subset = key_cols)

In [21]:
# convert LIANA edgelist to communication matrix format for all contexts/methods
# result layout: score_matrices[method][sample] -> communication matrix
score_matrices = dict()

for score_method, sample in itertools.product(score_methods, samples):
    csv_name = sample + '_communication_scores_' + score_method + '.csv'
    edge_list = pd.read_csv(scores_path + csv_name)
    edge_list = drop_redundant_complexes(edge_list, score_method)

    col_names = score_labels[score_method]
    cm = edgelist_to_communication_matrix(edge_list,
                                          score_col = col_names['score'],
                                          sender_cell_col = col_names['sender'],
                                          receiver_cell_col = col_names['receiver'],
                                          ligand_col = col_names['ligand'],
                                          receptor_col = col_names['receptor'],
                                          fillna_val=0)

    # setdefault creates the per-method dict the first time a method is seen
    score_matrices.setdefault(score_method, {})[sample] = cm

In [None]:
# HERE: check on sparsity

In [22]:
# One interaction tensor per scoring method, built from that method's
# per-sample communication matrices ('inner' for both LR pairs and cells).
tensors = {
    method: matrix_to_interaction_tensor(scores=scores, lr_how = 'inner', cell_how='inner')
    for method, scores in score_matrices.items()
}

100%|██████████| 12/12 [00:15<00:00,  1.33s/it]
  exec(code_obj, self.user_global_ns, self.user_ns)
100%|██████████| 12/12 [00:15<00:00,  1.32s/it]
100%|██████████| 12/12 [00:13<00:00,  1.09s/it]


In [23]:
# Take the intersection of LR pairs and cells across all tensors so that the
# comparisons are consistent, and put every tensor's indices in the same order.
context_order = ['C51', 'C52', 'C100', 'C141', 'C142', 'C144', 'C145', 'C143', 'C146', 'C148', 'C149', 'C152']

# Add the cell2cell tensor, relabelled to match the formatting of the other tensors.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
c2c_tensor = c2c.io.load_variable_with_pickle(rev_path + 'processed/Tensor-BALF.pkl')
c2c_tensor.order_names[0] = context_order
# match LIANA formatting (complexes joined by _ rather than &)
c2c_tensor.order_names[1] = [lr.replace('&', '_') for lr in c2c_tensor.order_names[1]]
tensors['cell2cell'] = c2c_tensor

# Names shared by every tensor along the LR axis (1) and the cell axis (2).
inner_lr = set.intersection(*(set(tensor.order_names[1]) for tensor in tensors.values()))
inner_cells = set.intersection(*(set(tensor.order_names[2]) for tensor in tensors.values()))

# Iterate over a snapshot of the items because the dict values are replaced in the loop.
for method, tensor in list(tensors.items()):
    tensors[method] = subset_tensor(
        interaction_tensor = tensor,
        subset_dict = {
            0: context_order,
            1: sorted(inner_lr),
            2: sorted(inner_cells),
            3: sorted(inner_cells),
        },
        original_order=False,
    )

In [24]:
# Sanity check: list the shape of every tensor after subsetting.
[tensor.tensor.shape for tensor in tensors.values()]

[(12, 172, 6, 6), (12, 172, 6, 6), (12, 172, 6, 6), (12, 172, 6, 6)]

In [25]:
# HERE: double-check that the cell2cell LR pairs have good overlap with those of the other methods

# Run decomposition

In [26]:
rank = 10 # same tensor rank as in original BALF

# Factorize every tensor with the same rank, initialization and seed.
for tensor in tqdm(tensors.values()):
    tensor.compute_tensor_factorization(rank=rank, init='svd', random_state=seed)

100%|██████████| 4/4 [00:58<00:00, 14.68s/it]


# Calculate CorrIndex
Assess the consistency between the decompositions obtained from the different scoring methods.

In [27]:
# initialize an empty method-by-method table for the pairwise CorrIndex values
method_names = list(tensors.keys())
corrindex_df = pd.DataFrame(index = method_names, columns = method_names)

In [29]:
# Fill every (row, column) cell with the CorrIndex between the two methods' factors.
for method_1, method_2 in itertools.product(corrindex_df.index, corrindex_df.columns):
    factors_1 = list(tensors[method_1].factors.values())
    factors_2 = list(tensors[method_2].factors.values())
    corrindex_df.loc[method_1, method_2] = correlation_index(factors_1, factors_2,
                                                             method = 'max_score')

In [30]:
# Each method compared with itself gives 0 (see the diagonal of the output).
# NOTE(review): presumably lower values mean more consistent decompositions -- confirm.
corrindex_df # worse with higher thresholding

Unnamed: 0,sca,natmi,cellchat,cell2cell
sca,0.0,0.328983,0.412228,0.203697
natmi,0.328983,0.0,0.615927,0.378826
cellchat,0.412228,0.615927,0.0,0.402552
cell2cell,0.203697,0.378826,0.402552,0.0


In [24]:
# TODO:
# - cellphonedb
# - visualization
# - send Erick an edge list to test on
# - cellchat figure (currently very messy)

Unnamed: 0,cellchat,natmi,sca,cellphonedb,cell2cell
cellchat,0.0,0.248849,0.250414,0.223994,0.211958
natmi,0.248849,0.0,0.112135,0.11906,0.111536
sca,0.250414,0.112135,0.0,0.080795,0.109123
cellphonedb,0.223994,0.11906,0.080795,0.0,0.087406
cell2cell,0.211958,0.111536,0.109123,0.087406,0.0
