Run tensor cell2cell on outputs of various communication scoring tools

env name: cci_dt


In [1]:
import os
from collections import OrderedDict
import itertools
import numpy as np
import pandas as pd

import cell2cell as c2c

seed = 888
np.random.seed(seed)

rev_path = '/data3/hratch/tc2c_analyses_1/natcomm_revisions/'
scores_path = rev_path + 'interim/tc2c_external_inputs/liana/liana_outputs/'

cp_delim = '-'
lr_delim = '&'

In [2]:
import sys
sys.path.insert(1, '/home/hratch/Projects/CCC/tc2c_analyses_1/notebooks/natcomm_revisions/')
from utility import edgelist_to_communication_matrix, matrix_to_interaction_tensor

In [3]:
score_methods = {file_name.split('_')[-1].split('.csv')[0] for file_name in os.listdir(scores_path)}
samples = {file_name.split('_')[0] for file_name in os.listdir(scores_path)}

In [5]:
# map the method-specific column names for the edge list
score_labels = {'natmi': {'score': 'edge_specificity', 
                            'sender': 'source', 'receiver': 'target', 
                            'ligand': 'ligand.complex', 'receptor': 'receptor.complex'}, 
               'cellchat': {'score': 'prob', 
                            'sender': 'source', 'receiver': 'target', 
                            'ligand': 'ligand', 'receptor': 'receptor'}, 
               'cellphonedb': {'score': 'lr.mean', 
                            'sender': 'source', 'receiver': 'target', 
                            'ligand': 'ligand.complex', 'receptor': 'receptor.complex'}, 
               'sca': {'score': 'LRscore', 
                            'sender': 'source', 'receiver': 'target', 
                            'ligand': 'ligand.complex', 'receptor': 'receptor.complex'}
               }
#                'connectome': {'score': 'weight_sc', 
#                             'sender': 'source', 'receiver': 'target', 
#                             'ligand': 'ligand.complex', 'receptor': 'receptor.complex'}, 
#                'logfc': {'score': 'logfc_comb', 
#                             'sender': 'source', 'receiver': 'target', 
#                             'ligand': 'ligand.complex', 'receptor': 'receptor.complex'}}

In [6]:
def drop_redundant_complexes(edge_list, score_method):
    """some complexes generated redundant rows due to distinct ligands or receptors
    double check that the communication score is the same b/w redundant complex rows
    then drop those duplicated scores."""
    
    dup_cols = list(score_labels[score_method].values())
    dup_cols.remove(score_labels[score_method]['score'])
    dup_idx = edge_list[edge_list[dup_cols].duplicated(subset = dup_cols)].index.tolist()

    for i in dup_idx:
        dup_val = edge_list.loc[i, dup_cols]
        unique_scores = edge_list[(edge_list[dup_cols[0]] == dup_val.loc[dup_cols[0]]) & 
                 (edge_list[dup_cols[1]] == dup_val.loc[dup_cols[1]]) & 
                 (edge_list[dup_cols[2]] == dup_val.loc[dup_cols[2]]) & 
                 (edge_list[dup_cols[3]] == dup_val.loc[dup_cols[3]])][score_labels[score_method]['score']].unique()
        if len(unique_scores) > 1:
            raise ValueError('Unexpected inconsistency in communication scores b/w complexs')


    el = edge_list.drop_duplicates(subset = dup_cols) 
    return el

In [None]:
# convert edgelist to communication matrix for all contexts/methods
score_matrices = dict()

for score_method in score_methods:
    for sample in samples:
        edge_list = pd.read_csv(scores_path + ''.join([sample, '_communication_scores_', score_method, '.csv']))
        edge_list = drop_redundant_complexes(edge_list, score_method)


        cm = edgelist_to_communication_matrix(edge_list, 
                                        score_col = score_labels[score_method]['score'], 
                                        sender_cell_col = score_labels[score_method]['sender'], 
                                        receiver_cell_col = score_labels[score_method]['receiver'], 
                                        ligand_col = score_labels[score_method]['ligand'], 
                                        receptor_col = score_labels[score_method]['receptor'])
        if score_method in score_matrices:
            score_matrices[score_method][sample] = cm
        else:
            score_matrices[score_method] = {sample: cm}

In [26]:
tensors = {}
for method, scores in score_matrices.items():
    tensors[method] = matrix_to_interaction_tensor(scores=scores, 
                             lr_how = 'inner', cell_how='inner')

100%|██████████| 12/12 [00:02<00:00,  4.45it/s]
  exec(code_obj, self.user_global_ns, self.user_ns)
100%|██████████| 12/12 [00:02<00:00,  4.60it/s]
100%|██████████| 12/12 [00:00<00:00, 79.83it/s]
100%|██████████| 12/12 [00:02<00:00,  4.43it/s]


In [None]:
# fix matrix_to_interaction_tensor context orders
# take inner of all tensor shapes

In [33]:
[tensor.tensor.shape for tensor in tensors.values()]


[(6, 6, 54, 12), (6, 6, 53, 12), (5, 5, 3, 12), (6, 6, 54, 12)]

In [29]:
set(tensors['cellchat'].order_names[2]).difference(tensors['natmi'].order_names[2])

set()

In [30]:
liana::

dict_keys(['cellphonedb', 'natmi', 'cellchat', 'sca'])

In [None]:
tensors['natmi'].tensor.shape

SyntaxError: invalid syntax (<ipython-input-34-ec21299bde3e>, line 1)

In [None]:
# reshape tensor
# make sure consistent, and with correct order 

Transform into 4D Interaction Tensor
Make all communication matrices consistent -- contain all same LR and CC pairs in the same order

In [26]:
# # get all possible cell type pairs -- union of all LR pairs and CC pairs measured by all methods

# # only necessary bc multiple pipelines, otherwise can feed directly into "matrix_to_communication_tensor" with how = 'outer'
# score_methods = [cellchat, ]
# method_names = ['cellchat', ]


# cell_pairs = []
# lr_pairs = []
# for score_method in score_methods:
#     cp = [df.columns.tolist() for df in score_method.values()]
#     cp = [item for sublist in cp for item in sublist]
#     cell_pairs += cp
    
#     lr = [df.index.tolist() for df in score_method.values()]
#     lr = [item for sublist in lr for item in sublist]
#     lr_pairs += lr

In [27]:
# cell_pairs = sorted(set(cell_pairs))
# lr_pairs = sorted(set(lr_pairs))



In [32]:
# def create_nan_vals_all(df, lr_pairs, cell_pairs):
#     """Adds nan communication scores if pipeline did not include a certain sender-receiver cell pair"""
#     df = pd.concat([df, pd.DataFrame(index = set(lr_pairs).difference(df.index),columns = df.columns)])
#     for col in set(cell_pairs).difference(df.columns):
#         df[col] = float('nan')
    
#     df = df.loc[lr_pairs, cell_pairs] # make the order the same
#     return df

# tensors = {}
# for idx, score_method in enumerate(score_methods):
#     print(method_names[idx])
#     scores = OrderedDict({k: create_nan_vals_all(df=score_method[k], lr_pairs=lr_pairs, cell_pairs=cell_pairs) for k in samples})
#     score_methods[idx] = scores
#     tensors[method_names[idx]] = matrix_to_interaction_tensor(scores, how = 'outer', 
#                                                              cp_delim='-', lr_delim='&')

  0%|          | 0/12 [00:00<?, ?it/s]

cellchat


100%|██████████| 12/12 [00:24<00:00,  2.05s/it]
