In [35]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm
#import igraph as ig
from functions import network_functions, utility_functions
import importlib
from IPython.display import display
from joblib import Parallel, delayed
import itertools

In [7]:
raw_data_dir = '../data/raw/'
gene_exp_dir = '../data/processed/gene_expression/'
org_pairs_dir = '../data/processed/organotropism_pairs/'
intercell_net_dir = '../data/processed/intercell_networks/'

In [19]:
metastasis_datasets = ['autopsy', 'hcmdb']
tissue_datasets = ['gtex', 'consensus']
network_types = ['all', 'curated']

# Computing intercellular interactions between tissue pairs
This part of our work will be the first step towards building tissue specific PPI networks. We will start by looking at 
our hypothesis is that intercellular interactions established between metastasizing cells and the cells pre-metastatic niche are essential for metastasis development. We expect to find more intercell interactions between organotropism pairs than between control pairs.

Workflow:
* import tissue pairs, gene expression calls, intercell interactions data
* select intercell interactions genes
* compute number of intercell interactions between tissue pairs using expression calls
* compute weighted (normalized by the max value) intercell interactions between tissue pairs 

# Intercellular interactions networks with gene calls

## Tissue labels & match

**Tissue id for file naming**
* We will set a index column to create a integer id for each tissue.
* The id is dependent on the tissue database so the same number might not correspond to a similar tissue in both databases.
* This will allow us to name the intercellular network files in an simple and unambiguous way

In [14]:
tissues = pd.read_csv(raw_data_dir+'tissue_match.csv').reset_index().set_index('tissue')
tissues.head()

Unnamed: 0_level_0,index,gtex,consensus
tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
adipose_tissue,0,Adipose - Subcutaneous,adipose tissue
adipose_tissue,1,Adipose - Visceral (Omentum),
adrenal_gland,2,Adrenal Gland,adrenal gland
appendix,3,,appendix
artery,4,Artery - Aorta,


In [25]:
# we will use two intercellular interactions datasets:
# all interactions
# only manually curated interactions

intercell_graph = {}
for net_type in network_types:
    if net_type == 'all':
        label = '_'
    else:
        label = f'_{net_type}_'
    
    intercell_graph[net_type] = pd.read_csv(intercell_net_dir+f'intercell{label}graph.csv')
    print(f'{net_type}: {intercell_graph[net_type].shape[0]} interactions')

all: 10170 interactions
curated: 6190 interactions


## Ungrouped Tissue Networks

In [30]:
# build intercell networks

for tissue_dataset in tqdm(tissue_datasets, desc='tissue_dataset'):
    calls = pd.read_csv(gene_exp_dir+f'{tissue_dataset}/records.csv')

    # choose calls
    calls = calls.pivot_table(values='call_0.4_0.9', index='gene_id', columns='tissue')
    # Transform zeros in NaN
    calls = calls.where(calls==1)

    for net_type in tqdm(network_types, desc='network_type'):

        graph = intercell_graph[net_type]
        directory = intercell_net_dir+f'{net_type}/{tissue_dataset}/ungrouped'
        utility_functions.check_dir(directory)
        
        network_functions.build_intercell_networks(
            tissues[['index', tissue_dataset]].dropna(), 
            calls,
            graph,
            tissue_column=tissue_dataset,
            directory=directory,
            sep='_'
        )

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

## Grouped Tissue Networks

In [32]:
# build intercell networks

for tissue_dataset in tqdm(tissue_datasets, desc='tissue_dataset'):
    calls = pd.read_csv(gene_exp_dir+f'{tissue_dataset}/grouped_records.csv')

    # choose calls
    calls = calls.pivot_table(values='call_0.4_0.9', index='gene_id', columns='tissue')
    # Transform zeros in NaN
    calls = calls.where(calls==1)

    for net_type in tqdm(network_types, desc='network_type'):

        graph = intercell_graph[net_type]
        directory = intercell_net_dir+f'{net_type}/{tissue_dataset}/grouped'
        utility_functions.check_dir(directory)
        
        network_functions.build_grouped_intercell_networks( 
            calls,
            graph,
            directory=directory,
            sep='-'
        )

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

## Controlled comparison network stats
In the controlled comparison we will not be using the grouped tissues networks since each tissue/organ appears the same amount of times in the organotropism vs control groups. That means the number of sub-tissues is balanced between groups and does not skew the results

### Compute number of intercellular interactions for each tissue pair

In [36]:
# compute the number of intercell interactions for each pair

network_stats = []

for net_type in tqdm(network_types, desc='network_type'):
    for tissue_dataset in tqdm(tissue_datasets, desc='tissue_dataset'):
    
        directory = intercell_net_dir+f'{net_type}/{tissue_dataset}/ungrouped'
        stats = network_functions.compute_intercell_interactions(
            directory,
            extra_labels=[
                ('interactions', net_type),
                ('tissue_dataset', tissue_dataset)
            ]
        )
        network_stats.extend(stats)
        
network_stats = pd.DataFrame(network_stats)
network_stats.head(2)

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

networks:   0%|          | 0/1353 [00:00<?, ?it/s]

networks:   0%|          | 0/1417 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

networks:   0%|          | 0/1353 [00:00<?, ?it/s]

networks:   0%|          | 0/1417 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,simple_interactions,directed_interactions,direction,interactions,tissue_dataset
0,Breast - Mammary Tissue,Nerve - Tibial,8796,4055,c_to_m,all,gtex
1,Breast - Mammary Tissue,Nerve - Tibial,8796,4104,m_to_c,all,gtex


In [37]:
# split directed and undirected stats
network_stats_undir = network_stats.drop(['directed_interactions', 'direction'], axis=1)\
    .drop_duplicates(ignore_index=True)
network_stats_dir = network_stats.drop('simple_interactions', axis=1)
network_stats_undir.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,simple_interactions,interactions,tissue_dataset
0,Breast - Mammary Tissue,Nerve - Tibial,8796,all,gtex
1,Colon - Sigmoid,Ovary,7039,all,gtex


In [38]:
# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
network_stats_undir_rev = network_stats_undir.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)

# for directed interactions we also have to change the entry's direction
network_stats_dir_rev = network_stats_dir.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)
display(network_stats_dir_rev.head())
network_stats_dir_rev['direction'] = np.where(network_stats_dir_rev['direction']=='c_to_m', 'm_to_c', 'c_to_m')
network_stats_dir_rev.head()

Unnamed: 0,metastasis_tissue,cancer_tissue,directed_interactions,direction,interactions,tissue_dataset
0,Breast - Mammary Tissue,Nerve - Tibial,4055,c_to_m,all,gtex
1,Breast - Mammary Tissue,Nerve - Tibial,4104,m_to_c,all,gtex
2,Colon - Sigmoid,Ovary,3275,c_to_m,all,gtex
3,Colon - Sigmoid,Ovary,3231,m_to_c,all,gtex
4,Brain - Cortex,Skin - Not Sun Exposed (Suprapubic),2884,c_to_m,all,gtex


Unnamed: 0,metastasis_tissue,cancer_tissue,directed_interactions,direction,interactions,tissue_dataset
0,Breast - Mammary Tissue,Nerve - Tibial,4055,m_to_c,all,gtex
1,Breast - Mammary Tissue,Nerve - Tibial,4104,c_to_m,all,gtex
2,Colon - Sigmoid,Ovary,3275,m_to_c,all,gtex
3,Colon - Sigmoid,Ovary,3231,c_to_m,all,gtex
4,Brain - Cortex,Skin - Not Sun Exposed (Suprapubic),2884,m_to_c,all,gtex


In [44]:
# Concatenate reverse dataframes
network_stats_undir_network = pd.concat(
    [network_stats_undir, network_stats_undir_rev], ignore_index=True)
network_stats_dir_network = pd.concat(
    [network_stats_dir, network_stats_dir_rev], ignore_index=True)

network_stats_dir_network.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,directed_interactions,direction,interactions,tissue_dataset
0,Breast - Mammary Tissue,Nerve - Tibial,4055,c_to_m,all,gtex
1,Breast - Mammary Tissue,Nerve - Tibial,4104,m_to_c,all,gtex


### Compute Jaccard index for each tissue pair

The Jaccard index, also known as the Jaccard similarity coefficient, is a statistic used for gauging the similarity and diversity of sample sets:
$$
J(C,M)=\frac{|C\cap{M}|}{|C\cup{M}|}=\frac{|C\cap{M}|}{|C|+|M|-|C\cap{M}|},
$$

$|C\cap{M}|:$ number of intercellular interactions between cancer (C) and metastasis (C) tissues

$|C\cup{M}|:$ total number of intercellular interactions that cancer (C) and metastasis (C) tissues can form

#### Undirected jaccard

In [46]:
jaccard_records = []

for tissue_dataset in tqdm(tissue_datasets, desc='tissue_dataset'):
    
    calls = pd.read_csv(gene_exp_dir+f'{tissue_dataset}/records.csv')
    calls = calls.pivot_table(values='call_0.4_0.9', index='gene_id', columns='tissue')

    for net_type in tqdm(network_types, desc='network_type'):
        
        graph = intercell_graph[net_type]

        df = network_stats_undir_network[
            (network_stats_undir_network.tissue_dataset==tissue_dataset) & 
            (network_stats_undir_network.interactions==net_type)
            ]
        
        unique_pairs = df[
            ['cancer_tissue', 'metastasis_tissue', 'simple_interactions']].drop_duplicates().values

        for pair in tqdm(unique_pairs, desc='pairs'):
            
            tissue_pair = pair[:2]
            intersection = pair[2]
            jaccard = network_functions.jaccard_index(
                tissue_pair, calls, graph, intersection)
            
            row = dict(
                cancer_tissue=tissue_pair[0],
                metastasis_tissue=tissue_pair[1],
                tissue_dataset=tissue_dataset,
                interactions=net_type,
                jaccard=jaccard)
            jaccard_records.append(row)
            
jaccard = pd.DataFrame(jaccard_records)
jaccard.head()

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

pairs:   0%|          | 0/2706 [00:00<?, ?it/s]

pairs:   0%|          | 0/2706 [00:00<?, ?it/s]

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

pairs:   0%|          | 0/2834 [00:00<?, ?it/s]

pairs:   0%|          | 0/2834 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,tissue_dataset,interactions,jaccard
0,Breast - Mammary Tissue,Nerve - Tibial,gtex,all,0.306973
1,Colon - Sigmoid,Ovary,gtex,all,0.231462
2,Brain - Cortex,Skin - Not Sun Exposed (Suprapubic),gtex,all,0.20643
3,Artery - Coronary,Prostate,gtex,all,0.310907
4,Cervix - Ectocervix,Colon - Sigmoid,gtex,all,0.263154


In [47]:
# merge datasets: 
undir_stats_jaccard = pd.merge(
    network_stats_undir_network,
    jaccard,
    on=['cancer_tissue', 'metastasis_tissue', 'tissue_dataset', 'interactions'], how='left')
undir_stats_jaccard.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,simple_interactions,interactions,tissue_dataset,jaccard
0,Breast - Mammary Tissue,Nerve - Tibial,8796,all,gtex,0.306973
1,Colon - Sigmoid,Ovary,7039,all,gtex,0.231462


In [49]:
undir_stats_jaccard.to_csv(intercell_net_dir + 'undirected_network_stats.csv', index=False)

#### Directed jaccard

In [51]:
jaccard_records = []

for tissue_dataset in tqdm(tissue_datasets, desc='tissue_dataset'):
    
    calls = pd.read_csv(gene_exp_dir+f'{tissue_dataset}/records.csv')
    calls = calls.pivot_table(values='call_0.4_0.9', index='gene_id', columns='tissue')

    for net_type in tqdm(network_types, desc='network_type'):
        
        graph = intercell_graph[net_type]

        for direction in tqdm(network_stats_dir_network.direction.unique(), desc='direction'):
                           
            df = network_stats_dir_network[
                (network_stats_dir_network.tissue_dataset==tissue_dataset) & 
                (network_stats_dir_network.interactions==net_type) &
                (network_stats_dir_network.direction==direction)
            ]
            unique_pairs = df[
                ['cancer_tissue', 'metastasis_tissue', 'directed_interactions']
            ].drop_duplicates().values
        
            for pair in tqdm(unique_pairs):
                tissue_pair = pair[:2]
                intersection = pair[2]
                jaccard = network_functions.jaccard_index(
                    tissue_pair, calls, graph, intersection, direction=direction)

                row = dict(
                    cancer_tissue=tissue_pair[0],
                    metastasis_tissue=tissue_pair[1],
                    tissue_dataset=tissue_dataset,
                    interactions=net_type,
                    direction=direction,
                    jaccard=jaccard)
                jaccard_records.append(row)
            
jaccard = pd.DataFrame(jaccard_records)
jaccard.head()

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

direction:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2706 [00:00<?, ?it/s]

  0%|          | 0/2706 [00:00<?, ?it/s]

direction:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2706 [00:00<?, ?it/s]

  0%|          | 0/2706 [00:00<?, ?it/s]

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

direction:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

direction:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

  0%|          | 0/2834 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,tissue_dataset,interactions,direction,jaccard
0,Breast - Mammary Tissue,Nerve - Tibial,gtex,all,c_to_m,0.296613
1,Colon - Sigmoid,Ovary,gtex,all,c_to_m,0.226628
2,Brain - Cortex,Skin - Not Sun Exposed (Suprapubic),gtex,all,c_to_m,0.194313
3,Artery - Coronary,Prostate,gtex,all,c_to_m,0.301182
4,Cervix - Ectocervix,Colon - Sigmoid,gtex,all,c_to_m,0.257788


In [52]:
dir_stats_jaccard = pd.merge(
    network_stats_dir_network,
    jaccard,
    on=['cancer_tissue', 'metastasis_tissue', 'tissue_dataset', 'interactions', 'direction'], 
    how='left')
dir_stats_jaccard.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,directed_interactions,direction,interactions,tissue_dataset,jaccard
0,Breast - Mammary Tissue,Nerve - Tibial,4055,c_to_m,all,gtex,0.296613
1,Breast - Mammary Tissue,Nerve - Tibial,4104,m_to_c,all,gtex,0.301277


In [53]:
dir_stats_jaccard.to_csv(intercell_net_dir + 'directed_network_stats.csv', index=False)

## Cancer-wise Comparison network stats
In the cancer-wise analysis we need to use the grouped tissues. Since we are correlating network stats with frequency of metastasis, organs with several tissues can influence the final result  

### Compute number of intercellular interactions for each tissue pair

In [55]:
# compute the number of intercell interactions for each pair
importlib.reload(network_functions)
network_stats = []

for net_type in tqdm(network_types, desc='network_type'):
    for tissue_dataset in tqdm(tissue_datasets, desc='tissue_dataset'):
    
        directory = f'{intercell_net_dir}{net_type}/{tissue_dataset}/grouped'

        stats = network_functions.compute_intercell_interactions(
            directory,
            extra_labels=[
                ('interactions', net_type),
                ('tissue_dataset', tissue_dataset)
            ]
        )
        network_stats.extend(stats)
        
network_stats = pd.DataFrame(network_stats)
network_stats.head(2)

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

networks:   0%|          | 0/528 [00:00<?, ?it/s]

networks:   0%|          | 0/861 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

networks:   0%|          | 0/528 [00:00<?, ?it/s]

networks:   0%|          | 0/861 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,simple_interactions,directed_interactions,direction,interactions,tissue_dataset
0,adrenal_gland,fallopian_tube,7405,3447,c_to_m,all,gtex
1,adrenal_gland,fallopian_tube,7405,3452,m_to_c,all,gtex


In [56]:
# split directed and undirected stats
network_stats_undir = network_stats.drop(['directed_interactions', 'direction'], axis=1).drop_duplicates(ignore_index=True)
network_stats_dir = network_stats.drop('simple_interactions', axis=1)
network_stats_undir.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,simple_interactions,interactions,tissue_dataset
0,adrenal_gland,fallopian_tube,7405,all,gtex
1,fibroblasts,stomach,5951,all,gtex


In [61]:
# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
network_stats_undir_rev = network_stats_undir.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)

# for directed interactions we also have to change the entry's direction
network_stats_dir_rev = network_stats_dir.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)
display(network_stats_dir_rev.head())
network_stats_dir_rev['direction'] = np.where(network_stats_dir_rev['direction']=='c_to_m', 'm_to_c', 'c_to_m')
network_stats_dir_rev.head()

Unnamed: 0,metastasis_tissue,cancer_tissue,directed_interactions,direction,interactions,tissue_dataset
0,adrenal_gland,fallopian_tube,3447,c_to_m,all,gtex
1,adrenal_gland,fallopian_tube,3452,m_to_c,all,gtex
2,fibroblasts,stomach,2884,c_to_m,all,gtex
3,fibroblasts,stomach,2651,m_to_c,all,gtex
4,liver,uterus,2676,c_to_m,all,gtex


Unnamed: 0,metastasis_tissue,cancer_tissue,directed_interactions,direction,interactions,tissue_dataset
0,adrenal_gland,fallopian_tube,3447,m_to_c,all,gtex
1,adrenal_gland,fallopian_tube,3452,c_to_m,all,gtex
2,fibroblasts,stomach,2884,m_to_c,all,gtex
3,fibroblasts,stomach,2651,c_to_m,all,gtex
4,liver,uterus,2676,m_to_c,all,gtex


In [64]:
# Concatenate reverse dataframes
network_stats_undir_network = pd.concat(
    [network_stats_undir, network_stats_undir_rev], ignore_index=True)
network_stats_dir_network = pd.concat(
    [network_stats_dir, network_stats_dir_rev], ignore_index=True)

network_stats_dir_network.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,directed_interactions,direction,interactions,tissue_dataset
0,adrenal_gland,fallopian_tube,3447,c_to_m,all,gtex
1,adrenal_gland,fallopian_tube,3452,m_to_c,all,gtex


### Compute Jaccard index for each tissue pair

The Jaccard index, also known as the Jaccard similarity coefficient, is a statistic used for gauging the similarity and diversity of sample sets:
$$
J(C,M)=\frac{|C\cap{M}|}{|C\cup{M}|}=\frac{|C\cap{M}|}{|C|+|M|-|C\cap{M}|},
$$

$|C\cap{M}|:$ number of intercellular interactions between cancer (C) and metastasis (C) tissues

$|C\cup{M}|:$ total number of intercellular interactions that cancer (C) and metastasis (C) tissues can form

#### Undirected jaccard

In [68]:
importlib.reload(network_functions)
jaccard_records = []

for tissue_dataset in tqdm(tissue_datasets, desc='tissue_dataset'):
    
    calls = pd.read_csv(gene_exp_dir+f'{tissue_dataset}/grouped_records.csv')
    calls = calls.pivot_table(values='call_0.4_0.9', index='gene_id', columns='tissue')

    for net_type in tqdm(network_types, desc='network_type'): 

        graph = intercell_graph[net_type]

        df = network_stats_undir_network[
            (network_stats_undir_network.tissue_dataset==tissue_dataset) &
            (network_stats_undir_network.interactions==net_type)]
        
        unique_pairs = df[
            ['cancer_tissue', 'metastasis_tissue', 'simple_interactions']].drop_duplicates().values

        for pair in tqdm(unique_pairs, desc='pairs'):
            
            tissue_pair = pair[:2]
            intersection = pair[2]
            jaccard = network_functions.jaccard_index(
                tissue_pair, calls, graph, intersection)
            
            row = dict(
                cancer_tissue=tissue_pair[0],
                metastasis_tissue=tissue_pair[1],
                tissue_dataset=tissue_dataset,
                interactions=net_type,
                jaccard=jaccard)
            jaccard_records.append(row)
            
jaccard = pd.DataFrame(jaccard_records)
jaccard.head()

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

pairs:   0%|          | 0/1056 [00:00<?, ?it/s]

pairs:   0%|          | 0/1056 [00:00<?, ?it/s]

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

pairs:   0%|          | 0/1722 [00:00<?, ?it/s]

pairs:   0%|          | 0/1722 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,tissue_dataset,interactions,jaccard
0,adrenal_gland,fallopian_tube,gtex,all,0.246464
1,fibroblasts,stomach,gtex,all,0.188927
2,liver,uterus,gtex,all,0.178377
3,nerve,pancreas,gtex,all,0.202711
4,fallopian_tube,ovary,gtex,all,0.260603


In [69]:
# merge datasets: 
undir_stats_jaccard = pd.merge(
    network_stats_undir_network,
    jaccard,
    on=['cancer_tissue', 'metastasis_tissue', 'tissue_dataset', 'interactions'], how='left')
undir_stats_jaccard.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,simple_interactions,interactions,tissue_dataset,jaccard
0,adrenal_gland,fallopian_tube,7405,all,gtex,0.246464
1,fibroblasts,stomach,5951,all,gtex,0.188927


In [71]:
undir_stats_jaccard.to_csv(intercell_net_dir + 'undirected_grouped_network_stats.csv', index=False)

#### Directed jaccard

In [72]:
importlib.reload(network_functions)

jaccard_records = []
for tissue_dataset in tqdm(tissue_datasets, desc='tissue_dataset'):
    
    calls = pd.read_csv(gene_exp_dir+f'{tissue_dataset}/grouped_records.csv')
    calls = calls.pivot_table(values='call_0.4_0.9', index='gene_id', columns='tissue')

    for net_type in tqdm(network_types, desc='network_type'): 

        graph = intercell_graph[net_type]

        for direction in tqdm(network_stats_dir_network.direction.unique(), desc='direction'):
                
            df = network_stats_dir_network[
                (network_stats_dir_network.tissue_dataset==tissue_dataset) & 
                (network_stats_dir_network.interactions==net_type) &
                (network_stats_dir_network.direction==direction)
            ]
            unique_pairs = df[
                ['cancer_tissue', 'metastasis_tissue', 'directed_interactions']
            ].drop_duplicates().values
        
            for pair in tqdm(unique_pairs):
                tissue_pair = pair[:2]
                intersection = pair[2]
                jaccard = network_functions.jaccard_index(
                    tissue_pair, calls, graph, intersection, direction=direction)

                row = dict(
                    cancer_tissue=tissue_pair[0],
                    metastasis_tissue=tissue_pair[1],
                    tissue_dataset=tissue_dataset,
                    interactions=net_type,
                    direction=direction,
                    jaccard=jaccard)
                jaccard_records.append(row)
            
jaccard = pd.DataFrame(jaccard_records)
jaccard.head()

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

direction:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1056 [00:00<?, ?it/s]

  0%|          | 0/1056 [00:00<?, ?it/s]

direction:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1056 [00:00<?, ?it/s]

  0%|          | 0/1056 [00:00<?, ?it/s]

network_type:   0%|          | 0/2 [00:00<?, ?it/s]

direction:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1722 [00:00<?, ?it/s]

  0%|          | 0/1722 [00:00<?, ?it/s]

direction:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1722 [00:00<?, ?it/s]

  0%|          | 0/1722 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,tissue_dataset,interactions,direction,jaccard
0,adrenal_gland,fallopian_tube,gtex,all,c_to_m,0.241403
1,fibroblasts,stomach,gtex,all,c_to_m,0.194313
2,liver,uterus,gtex,all,c_to_m,0.177807
3,nerve,pancreas,gtex,all,c_to_m,0.198918
4,fallopian_tube,ovary,gtex,all,c_to_m,0.255738


In [73]:
dir_stats_jaccard = pd.merge(
    network_stats_dir_network,
    jaccard,
    on=['cancer_tissue', 'metastasis_tissue', 'tissue_dataset', 'interactions', 'direction'], 
    how='left')
dir_stats_jaccard.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,directed_interactions,direction,interactions,tissue_dataset,jaccard
0,adrenal_gland,fallopian_tube,3447,c_to_m,all,gtex,0.241403
1,adrenal_gland,fallopian_tube,3452,m_to_c,all,gtex,0.241838


In [74]:
dir_stats_jaccard.to_csv(intercell_net_dir + 'directed_grouped_network_stats.csv', index=False)

# Random intercellular networks with gene calls

We want to keep the same proportion of source/target genes in the random networks. For that we will use 3 distinct intercellular gene pools:
* genes that can be both source and target
* source only genes
* target only genes

Additionally each gene has a probability of being choosen proportional to the number of times it is expressed in all tissues.

## Undirected interactions

### Build ungrouped intercell networks

In [None]:
importlib.reload(network_functions)
# load intercell network stats
network_stats = pd.read_csv(intercell_net_dir+'undirected_network_stats.csv')

random_network_stats = []

for tissue_dataset in tqdm(tissue_datasets, desc='tissue_dataset'):
    
    calls = pd.read_csv(gene_exp_dir+f'{tissue_dataset}/records.csv')
    calls = calls.pivot_table(values='call_0.4_0.9', index='gene_id', columns='tissue')
    
    for net_type in tqdm(network_types, desc='network_type'):

        graph = intercell_graph[net_type]

        stats = network_functions.build_random_intercell_networks(
            tissues[tissue_dataset].dropna(), 
            calls,
            graph,
            weights=True,
            extra_labels=[
                ('interactions', net_type),
                ('tissue_dataset', tissue_dataset)
            ],
            random_state=42,
            n_jobs=-1
        )

        random_network_stats.extend(stats)

random_network_stats = pd.DataFrame(random_network_stats)

# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
random_net_rev = random_network_stats.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)

# Concatenate reverse dataframes
random_net_full = pd.concat(
    [random_network_stats, random_net_rev], ignore_index=True)

# merge random_net with normal_net and compute z-score
network_stats_update = pd.merge(
    random_net_full,
    network_stats,
    on=['cancer_tissue', 'metastasis_tissue', 'tissue_dataset', 'interactions'])

# compute z-score
network_stats_update['zscore'] =\
    (network_stats_update['simple_interactions']-network_stats_update['mean'])\
        /network_stats_update['std']

# update network stats file excluding the distribution column
network_stats_update.to_csv(
    intercell_net_dir+'undirected_network_stats.csv', 
    index=False,
    columns=network_stats_update.columns.drop('dist'))

### Build grouped intercell networks

In [23]:
importlib.reload(network_functions)
network_stats = []
for inter_kind in tqdm(['all', 'curated']):
    if inter_kind == 'all':
        inter_graph = intercell
    else:
        inter_graph = intercell_curated

    for d in tqdm(['GTEx', 'Consensus']):

        stats = network_functions.build_random_grouped_intercell_networks(
            tissues[d].dropna(),
            grouped_calls[d],
            inter_graph,
            weights=True,
            extra_labels=[
                ('interactions', inter_kind),
                ('tissue_dataset', d)
            ],
            n_jobs=20
        )

        network_stats.extend(stats)

network_stats = pd.DataFrame(network_stats)
network_stats.head()

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,dist,mean,std,interactions,tissue_dataset
0,adipose_tissue,adrenal_gland,"[5038, 4760, 5114, 4726, 4901, 5142, 5212, 497...",5076.164,166.97993,all,GTEx
1,adipose_tissue,artery,"[5890, 5907, 5869, 5789, 5932, 5810, 5830, 562...",5783.81,169.063786,all,GTEx
2,adipose_tissue,bladder,"[5939, 6177, 6186, 6120, 6141, 5780, 6227, 582...",6034.651,175.971137,all,GTEx
3,adipose_tissue,blood,"[3708, 3740, 3781, 3668, 3533, 3322, 3729, 374...",3639.668,149.683018,all,GTEx
4,adipose_tissue,brain,"[4981, 4856, 4648, 4834, 4764, 5064, 4793, 458...",4837.178,167.847432,all,GTEx


In [25]:
network_stats.to_csv(net_dir+'random_grouped_network_stats_weights.csv', index=False)

#### **Compute the z-score for each tissue pair**

In [25]:
# load random network stats
random_net = pd.read_csv(
    net_dir+'random_grouped_network_stats.csv', 
    converters={'dist': functions.str_to_array})
# load random network stats with weights
random_net_weights = pd.read_csv(
    net_dir+'random_grouped_network_stats_weights.csv',
    converters={'dist': functions.str_to_array})
display(random_net.info())
random_net_weights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2778 entries, 0 to 2777
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cancer_tissue      2778 non-null   object 
 1   metastasis_tissue  2778 non-null   object 
 2   dist               2778 non-null   object 
 3   mean               2778 non-null   float64
 4   std                2778 non-null   float64
 5   interactions       2778 non-null   object 
 6   tissue_dataset     2778 non-null   object 
dtypes: float64(2), object(5)
memory usage: 152.0+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2778 entries, 0 to 2777
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cancer_tissue      2778 non-null   object 
 1   metastasis_tissue  2778 non-null   object 
 2   dist               2778 non-null   object 
 3   mean               2778 non-null   float64
 4   std                2778 non-null   float64
 5   interactions       2778 non-null   object 
 6   tissue_dataset     2778 non-null   object 
dtypes: float64(2), object(5)
memory usage: 152.0+ KB


In [26]:
# load intercell network stats
normal_net = pd.read_csv(net_dir+'undirected_grouped_network_stats.csv')
normal_net.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,simple_interactions,interactions,tissue_dataset,jaccard
0,spinal_cord,testis,6956,all,GTEx,0.440337
1,colorectum,salivary_gland,7274,all,GTEx,0.463371


In [27]:
# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
random_net_rev = random_net.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)

random_net_weights_rev = random_net_weights.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)

display(random_net_rev.info())
random_net_weights_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2778 entries, 0 to 2777
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   metastasis_tissue  2778 non-null   object 
 1   cancer_tissue      2778 non-null   object 
 2   dist               2778 non-null   object 
 3   mean               2778 non-null   float64
 4   std                2778 non-null   float64
 5   interactions       2778 non-null   object 
 6   tissue_dataset     2778 non-null   object 
dtypes: float64(2), object(5)
memory usage: 152.0+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2778 entries, 0 to 2777
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   metastasis_tissue  2778 non-null   object 
 1   cancer_tissue      2778 non-null   object 
 2   dist               2778 non-null   object 
 3   mean               2778 non-null   float64
 4   std                2778 non-null   float64
 5   interactions       2778 non-null   object 
 6   tissue_dataset     2778 non-null   object 
dtypes: float64(2), object(5)
memory usage: 152.0+ KB


In [28]:
# Concatenate reverse dataframes
random_net_full = pd.concat(
    [random_net, random_net_rev], ignore_index=True)

random_net_weights_full = pd.concat(
    [random_net_weights, random_net_weights_rev], ignore_index=True)

display(random_net_full.info())
random_net_weights_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5556 entries, 0 to 5555
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cancer_tissue      5556 non-null   object 
 1   metastasis_tissue  5556 non-null   object 
 2   dist               5556 non-null   object 
 3   mean               5556 non-null   float64
 4   std                5556 non-null   float64
 5   interactions       5556 non-null   object 
 6   tissue_dataset     5556 non-null   object 
dtypes: float64(2), object(5)
memory usage: 304.0+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5556 entries, 0 to 5555
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cancer_tissue      5556 non-null   object 
 1   metastasis_tissue  5556 non-null   object 
 2   dist               5556 non-null   object 
 3   mean               5556 non-null   float64
 4   std                5556 non-null   float64
 5   interactions       5556 non-null   object 
 6   tissue_dataset     5556 non-null   object 
dtypes: float64(2), object(5)
memory usage: 304.0+ KB


In [29]:
# merge random_net with normal_net and compute z-score
random_net_zscore = pd.merge(
    random_net_full,
    normal_net,
    on=['cancer_tissue', 'metastasis_tissue', 'tissue_dataset', 'interactions'])

random_net_zscore['zscore'] =\
    (random_net_zscore['simple_interactions']-random_net_zscore['mean'])\
        /random_net_zscore['std']

display(random_net_zscore.info())

# with weights
random_net_weights_zscore = pd.merge(
    random_net_weights_full,
    normal_net,
    on=['cancer_tissue', 'metastasis_tissue', 'tissue_dataset', 'interactions'])

random_net_weights_zscore['zscore'] =\
    (random_net_weights_zscore['simple_interactions']-random_net_weights_zscore['mean'])\
        /random_net_weights_zscore['std']

random_net_weights_zscore.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5556 entries, 0 to 5555
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   cancer_tissue        5556 non-null   object 
 1   metastasis_tissue    5556 non-null   object 
 2   dist                 5556 non-null   object 
 3   mean                 5556 non-null   float64
 4   std                  5556 non-null   float64
 5   interactions         5556 non-null   object 
 6   tissue_dataset       5556 non-null   object 
 7   simple_interactions  5556 non-null   int64  
 8   jaccard              5556 non-null   float64
 9   zscore               5556 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 477.5+ KB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5556 entries, 0 to 5555
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   cancer_tissue        5556 non-null   object 
 1   metastasis_tissue    5556 non-null   object 
 2   dist                 5556 non-null   object 
 3   mean                 5556 non-null   float64
 4   std                  5556 non-null   float64
 5   interactions         5556 non-null   object 
 6   tissue_dataset       5556 non-null   object 
 7   simple_interactions  5556 non-null   int64  
 8   jaccard              5556 non-null   float64
 9   zscore               5556 non-null   float64
dtypes: float64(4), int64(1), object(5)
memory usage: 477.5+ KB


In [30]:
# save network stats excluding the distribution column
random_net_zscore.to_csv(
    net_dir+'undirected_grouped_network_stats_zscore.csv', 
    index=False,
    columns=random_net_zscore.columns.drop('dist'))

random_net_weights_zscore.to_csv(
    net_dir+'undirected_grouped_network_stats_weights_zscore.csv', 
    index=False,
    columns=random_net_zscore.columns.drop('dist'))

In [13]:
# convert random distributions to an array an save
dist = np.array(random_net_zscore.dist.to_list())
with open(net_dir+'random_grouped_dist.npy', 'wb') as f:
    np.save(f, dist)
    
dist = np.array(random_net_weights_zscore.dist.to_list())
with open(net_dir+'random_grouped_dist_weights.npy', 'wb') as f:
    np.save(f, dist)

## Directed interactions

### Build Ungrouped intercell networks

#### **Without gene weights**
In this approach, the genes in each gene pool have the same probability of being picked up to integrate the intercellular interactions network

In [23]:
importlib.reload(network_functions)
network_stats = []
for inter_kind in tqdm(['all', 'curated']):
    if inter_kind == 'all':
        inter_graph = intercell
    else:
        inter_graph = intercell_curated

    for d in tqdm(['GTEx', 'Consensus']):

        stats = network_functions.build_random_intercell_networks(
            tissues[d].dropna(), 
            calls[d],
            inter_graph,
            directed_graph=True,
            weights=False,
            extra_labels=[
                ('interactions', inter_kind),
                ('tissue_dataset', d)
            ],
            n_jobs=20
        )

        network_stats.extend(stats)

network_stats = pd.DataFrame(network_stats)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
network_stats.head()

Unnamed: 0,cancer_tissue,metastasis_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2426, 2338, 2394, 2249, 2495, 2474, 2575, 254...",2445.311,144.224104,all,GTEx
1,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2433, 2258, 2496, 2381, 2524, 2465, 2410, 227...",2424.068,131.671498,all,GTEx
2,Adipose - Subcutaneous,Artery - Aorta,c_to_m,"[2659, 2725, 2657, 2929, 2802, 2789, 2573, 257...",2754.391,140.14581,all,GTEx
3,Adipose - Subcutaneous,Artery - Aorta,m_to_c,"[2937, 2968, 2732, 2564, 2719, 2756, 2918, 247...",2799.521,149.228763,all,GTEx
4,Adipose - Subcutaneous,Artery - Coronary,c_to_m,"[2915, 2984, 3029, 2856, 2910, 2774, 2970, 284...",2887.185,140.398892,all,GTEx


In [26]:
network_stats.to_csv(net_dir+'directed_random_network_stats.csv', index=False)

#### **With gene weights**
In this approach, the genes in each gene pool have a probability of being choosed equal to the number of times each gene is expressed in all tissues divided by the sum of counts of all genes

In [23]:
importlib.reload(network_functions)
network_stats = []
for inter_kind in tqdm(['all', 'curated']):
    if inter_kind == 'all':
        inter_graph = intercell
    else:
        inter_graph = intercell_curated

    for d in tqdm(['GTEx', 'Consensus']):

        stats = network_functions.build_random_intercell_networks(
            tissues[d].dropna(), 
            calls[d],
            inter_graph,
            directed_graph=True,
            weights=True,
            extra_labels=[
                ('interactions', inter_kind),
                ('tissue_dataset', d)
            ],
            n_jobs=20
        )

        network_stats.extend(stats)

network_stats = pd.DataFrame(network_stats)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
network_stats.head()

Unnamed: 0,cancer_tissue,metastasis_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2426, 2338, 2394, 2249, 2495, 2474, 2575, 254...",2445.311,144.224104,all,GTEx
1,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2433, 2258, 2496, 2381, 2524, 2465, 2410, 227...",2424.068,131.671498,all,GTEx
2,Adipose - Subcutaneous,Artery - Aorta,c_to_m,"[2659, 2725, 2657, 2929, 2802, 2789, 2573, 257...",2754.391,140.14581,all,GTEx
3,Adipose - Subcutaneous,Artery - Aorta,m_to_c,"[2937, 2968, 2732, 2564, 2719, 2756, 2918, 247...",2799.521,149.228763,all,GTEx
4,Adipose - Subcutaneous,Artery - Coronary,c_to_m,"[2915, 2984, 3029, 2856, 2910, 2774, 2970, 284...",2887.185,140.398892,all,GTEx


In [26]:
network_stats.to_csv(net_dir+'directed_random_network_stats_weights.csv', index=False)

#### **Compute the zscore for each tissue pair**

In [19]:
# load random network stats
random_net = pd.read_csv(
    net_dir+'directed_random_network_stats.csv',
    converters={'dist': functions.str_to_array}
)
random_net_weights = pd.read_csv(
    net_dir+'directed_random_network_stats_weights.csv',
    converters={'dist': functions.str_to_array}
)
display(random_net.head(2))
random_net_weights.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2426, 2338, 2394, 2249, 2495, 2474, 2575, 254...",2445.311,144.224104,all,GTEx
1,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2433, 2258, 2496, 2381, 2524, 2465, 2410, 227...",2424.068,131.671498,all,GTEx


Unnamed: 0,cancer_tissue,metastasis_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2644, 2359, 2297, 2396, 2526, 2481, 2500, 240...",2430.288,112.323431,all,GTEx
1,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2460, 2308, 2433, 2216, 2460, 2059, 2481, 222...",2392.589,105.672277,all,GTEx


In [20]:
# load intercell network stats
normal_net = pd.read_csv(net_dir+'directed_network_stats.csv')
normal_net.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,directed_interactions,direction,interactions,tissue_dataset,jaccard
0,Pancreas,Brain - Spinal cord (cervical c-1),2413,c_to_m,all,GTEx,0.336213
1,Pancreas,Brain - Spinal cord (cervical c-1),2089,m_to_c,all,GTEx,0.308431


In [21]:
# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
# for directed interactions we also have to change the entry's direction
random_net_rev = random_net.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)
display(random_net_rev.head(2))
random_net_rev['direction'] = np.where(random_net_rev['direction']=='c_to_m', 'm_to_c', 'c_to_m')
display(random_net_rev.head(2))

random_net_weights_rev = random_net_weights.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)
display(random_net_weights_rev.head(2))
random_net_weights_rev['direction'] = np.where(random_net_weights_rev['direction']=='c_to_m', 'm_to_c', 'c_to_m')
random_net_weights_rev.head(2)

Unnamed: 0,metastasis_tissue,cancer_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2426, 2338, 2394, 2249, 2495, 2474, 2575, 254...",2445.311,144.224104,all,GTEx
1,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2433, 2258, 2496, 2381, 2524, 2465, 2410, 227...",2424.068,131.671498,all,GTEx


Unnamed: 0,metastasis_tissue,cancer_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2426, 2338, 2394, 2249, 2495, 2474, 2575, 254...",2445.311,144.224104,all,GTEx
1,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2433, 2258, 2496, 2381, 2524, 2465, 2410, 227...",2424.068,131.671498,all,GTEx


Unnamed: 0,metastasis_tissue,cancer_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2644, 2359, 2297, 2396, 2526, 2481, 2500, 240...",2430.288,112.323431,all,GTEx
1,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2460, 2308, 2433, 2216, 2460, 2059, 2481, 222...",2392.589,105.672277,all,GTEx


Unnamed: 0,metastasis_tissue,cancer_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2644, 2359, 2297, 2396, 2526, 2481, 2500, 240...",2430.288,112.323431,all,GTEx
1,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2460, 2308, 2433, 2216, 2460, 2059, 2481, 222...",2392.589,105.672277,all,GTEx


In [22]:
# Concatenate reverse dataframes
random_net_full = pd.concat(
    [random_net, random_net_rev], ignore_index=True)
print(random_net.shape)
print(random_net_full.shape)

random_net_weights_full = pd.concat(
    [random_net_weights, random_net_weights_rev], ignore_index=True)
print(random_net_weights.shape)
random_net_weights_full.shape

(11080, 8)
(22160, 8)
(11080, 8)


(22160, 8)

In [25]:
# merge random_net with normal_net and compute z-score
random_net_zscore = pd.merge(random_net_full, normal_net)
random_net_zscore['zscore'] =\
    (random_net_zscore['directed_interactions']-random_net_zscore['mean'])\
        /random_net_zscore['std']
display(random_net_zscore.head(2))

random_net_weights_zscore = pd.merge(random_net_weights_full, normal_net)
random_net_weights_zscore['zscore'] =\
    (random_net_weights_zscore['directed_interactions']-random_net_weights_zscore['mean'])\
        /random_net_weights_zscore['std']
random_net_weights_zscore.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,direction,dist,mean,std,interactions,tissue_dataset,directed_interactions,jaccard,zscore
0,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2426, 2338, 2394, 2249, 2495, 2474, 2575, 254...",2445.311,144.224104,all,GTEx,3184,0.416264,5.121814
1,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2433, 2258, 2496, 2381, 2524, 2465, 2410, 227...",2424.068,131.671498,all,GTEx,3130,0.406388,5.361312


Unnamed: 0,cancer_tissue,metastasis_tissue,direction,dist,mean,std,interactions,tissue_dataset,directed_interactions,jaccard,zscore
0,Adipose - Subcutaneous,Adrenal Gland,c_to_m,"[2644, 2359, 2297, 2396, 2526, 2481, 2500, 240...",2430.288,112.323431,all,GTEx,3184,0.416264,6.710194
1,Adipose - Subcutaneous,Adrenal Gland,m_to_c,"[2460, 2308, 2433, 2216, 2460, 2059, 2481, 222...",2392.589,105.672277,all,GTEx,3130,0.406388,6.978283


In [32]:
# save network stats excluding the distribution column
random_net_zscore.to_csv(
    net_dir+'directed_network_stats_zscore.csv', 
    index=False,
    columns=random_net_zscore.columns.drop('dist'))

random_net_weights_zscore.to_csv(
    net_dir+'directed_network_stats_weights_zscore.csv', 
    index=False,
    columns=random_net_zscore.columns.drop('dist'))

In [33]:
# convert random distributions to an array an save
dist = np.array(random_net_zscore.dist.to_list())
with open(net_dir+'directed_random_dist.npy', 'wb') as f:
    np.save(f, dist)
    
dist = np.array(random_net_weights_zscore.dist.to_list())
with open(net_dir+'directed_random_dist_weights.npy', 'wb') as f:
    np.save(f, dist)

### Build grouped intercell networks

#### **Without gene weights**
In this approach, the genes in each gene pool have the same probability of being picked up to integrate the intercellular interactions network

In [11]:
importlib.reload(network_functions)
network_stats = []
for inter_kind in tqdm(['all', 'curated']):
    if inter_kind == 'all':
        inter_graph = intercell
    else:
        inter_graph = intercell_curated

    for d in tqdm(['GTEx', 'Consensus']):

        stats = network_functions.build_random_grouped_intercell_networks(
            tissues[d].dropna(), 
            grouped_calls[d],
            inter_graph,
            directed_graph=True,
            weights=False,
            extra_labels=[
                ('interactions', inter_kind),
                ('tissue_dataset', d)
            ],
            n_jobs=20
        )

        network_stats.extend(stats)

network_stats = pd.DataFrame(network_stats)
network_stats.head()

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,adipose_tissue,adrenal_gland,c_to_m,"[2268, 2438, 2183, 2407, 2572, 2530, 2357, 219...",2334.778,137.263567,all,GTEx
1,adipose_tissue,adrenal_gland,m_to_c,"[2126, 2265, 2161, 2564, 2435, 2377, 2497, 216...",2347.254,129.325579,all,GTEx
2,adipose_tissue,artery,c_to_m,"[2420, 2627, 2649, 2718, 2598, 2413, 2405, 271...",2631.002,146.450906,all,GTEx
3,adipose_tissue,artery,m_to_c,"[2672, 2725, 2428, 2930, 2694, 2599, 2801, 274...",2714.864,144.843079,all,GTEx
4,adipose_tissue,bladder,c_to_m,"[2581, 2801, 2953, 2689, 2663, 2966, 2568, 274...",2774.448,139.996855,all,GTEx


In [13]:
network_stats.to_csv(net_dir+'directed_random_grouped_network_stats.csv', index=False)

#### **With gene weights**
In this approach, the genes in each gene pool have a probability of being choosed equal to the number of times each gene is expressed in all tissues divided by the sum of counts of all genes

In [14]:
importlib.reload(network_functions)
network_stats = []
for inter_kind in tqdm(['all', 'curated']):
    if inter_kind == 'all':
        inter_graph = intercell
    else:
        inter_graph = intercell_curated

    for d in tqdm(['GTEx', 'Consensus']):

        stats = network_functions.build_random_grouped_intercell_networks(
            tissues[d].dropna(), 
            grouped_calls[d],
            inter_graph,
            directed_graph=True,
            weights=True,
            extra_labels=[
                ('interactions', inter_kind),
                ('tissue_dataset', d)
            ],
            n_jobs=20
        )

        network_stats.extend(stats)

network_stats = pd.DataFrame(network_stats)
network_stats.head()

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,direction,dist,mean,std,interactions,tissue_dataset
0,adipose_tissue,adrenal_gland,c_to_m,"[2220, 2383, 2301, 2440, 2414, 2072, 2291, 269...",2342.385,106.064908,all,GTEx
1,adipose_tissue,adrenal_gland,m_to_c,"[2346, 2540, 2414, 2366, 2307, 2209, 2315, 230...",2350.158,104.585606,all,GTEx
2,adipose_tissue,artery,c_to_m,"[2706, 2574, 2795, 2516, 2570, 2395, 2447, 264...",2636.964,106.379917,all,GTEx
3,adipose_tissue,artery,m_to_c,"[2751, 2602, 2676, 2721, 2669, 2554, 2510, 271...",2719.389,108.676712,all,GTEx
4,adipose_tissue,bladder,c_to_m,"[2832, 2833, 2552, 2784, 2806, 2466, 2599, 286...",2768.166,109.169723,all,GTEx


In [15]:
network_stats.to_csv(net_dir+'directed_random_grouped_network_stats_weights.csv', index=False)

#### **Compute the zscore for each tissue pair**

In [15]:
# load random network stats
random_net = pd.read_csv(
    net_dir+'directed_random_grouped_network_stats.csv',
    converters={'dist': functions.str_to_array}
)
random_net_weights = pd.read_csv(
    net_dir+'directed_random_grouped_network_stats_weights.csv',
    converters={'dist': functions.str_to_array}
)
display(random_net.info())
random_net_weights.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5556 entries, 0 to 5555
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cancer_tissue      5556 non-null   object 
 1   metastasis_tissue  5556 non-null   object 
 2   direction          5556 non-null   object 
 3   dist               5556 non-null   object 
 4   mean               5556 non-null   float64
 5   std                5556 non-null   float64
 6   interactions       5556 non-null   object 
 7   tissue_dataset     5556 non-null   object 
dtypes: float64(2), object(6)
memory usage: 347.4+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5556 entries, 0 to 5555
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   cancer_tissue      5556 non-null   object 
 1   metastasis_tissue  5556 non-null   object 
 2   direction          5556 non-null   object 
 3   dist               5556 non-null   object 
 4   mean               5556 non-null   float64
 5   std                5556 non-null   float64
 6   interactions       5556 non-null   object 
 7   tissue_dataset     5556 non-null   object 
dtypes: float64(2), object(6)
memory usage: 347.4+ KB


In [19]:
# load intercell network stats
normal_net = pd.read_csv(net_dir+'directed_grouped_network_stats.csv')
normal_net.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11112 entries, 0 to 11111
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   cancer_tissue          11112 non-null  object 
 1   metastasis_tissue      11112 non-null  object 
 2   directed_interactions  11112 non-null  int64  
 3   direction              11112 non-null  object 
 4   interactions           11112 non-null  object 
 5   tissue_dataset         11112 non-null  object 
 6   jaccard                11112 non-null  float64
dtypes: float64(1), int64(1), object(5)
memory usage: 607.8+ KB


In [20]:
# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
# for directed interactions we also have to change the entry's direction
random_net_rev = random_net.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)
display(random_net_rev.info())
random_net_rev['direction'] = np.where(random_net_rev['direction']=='c_to_m', 'm_to_c', 'c_to_m')
display(random_net_rev.info())

random_net_weights_rev = random_net_weights.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)
display(random_net_weights_rev.info())
random_net_weights_rev['direction'] = np.where(random_net_weights_rev['direction']=='c_to_m', 'm_to_c', 'c_to_m')
random_net_weights_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5556 entries, 0 to 5555
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   metastasis_tissue  5556 non-null   object 
 1   cancer_tissue      5556 non-null   object 
 2   direction          5556 non-null   object 
 3   dist               5556 non-null   object 
 4   mean               5556 non-null   float64
 5   std                5556 non-null   float64
 6   interactions       5556 non-null   object 
 7   tissue_dataset     5556 non-null   object 
dtypes: float64(2), object(6)
memory usage: 347.4+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5556 entries, 0 to 5555
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   metastasis_tissue  5556 non-null   object 
 1   cancer_tissue      5556 non-null   object 
 2   direction          5556 non-null   object 
 3   dist               5556 non-null   object 
 4   mean               5556 non-null   float64
 5   std                5556 non-null   float64
 6   interactions       5556 non-null   object 
 7   tissue_dataset     5556 non-null   object 
dtypes: float64(2), object(6)
memory usage: 347.4+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5556 entries, 0 to 5555
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   metastasis_tissue  5556 non-null   object 
 1   cancer_tissue      5556 non-null   object 
 2   direction          5556 non-null   object 
 3   dist               5556 non-null   object 
 4   mean               5556 non-null   float64
 5   std                5556 non-null   float64
 6   interactions       5556 non-null   object 
 7   tissue_dataset     5556 non-null   object 
dtypes: float64(2), object(6)
memory usage: 347.4+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5556 entries, 0 to 5555
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   metastasis_tissue  5556 non-null   object 
 1   cancer_tissue      5556 non-null   object 
 2   direction          5556 non-null   object 
 3   dist               5556 non-null   object 
 4   mean               5556 non-null   float64
 5   std                5556 non-null   float64
 6   interactions       5556 non-null   object 
 7   tissue_dataset     5556 non-null   object 
dtypes: float64(2), object(6)
memory usage: 347.4+ KB


In [21]:
# Concatenate reverse dataframes
random_net_full = pd.concat(
    [random_net, random_net_rev], ignore_index=True)
print(random_net.shape)
print(random_net_full.shape)

random_net_weights_full = pd.concat(
    [random_net_weights, random_net_weights_rev], ignore_index=True)
print(random_net_weights.shape)
random_net_weights_full.shape

(5556, 8)
(11112, 8)
(5556, 8)


(11112, 8)

In [22]:
# merge random_net with normal_net and compute z-score
random_net_zscore = pd.merge(random_net_full, normal_net)
random_net_zscore['zscore'] =\
    (random_net_zscore['directed_interactions']-random_net_zscore['mean'])\
        /random_net_zscore['std']
display(random_net_zscore.info())

random_net_weights_zscore = pd.merge(random_net_weights_full, normal_net)
random_net_weights_zscore['zscore'] =\
    (random_net_weights_zscore['directed_interactions']-random_net_weights_zscore['mean'])\
        /random_net_weights_zscore['std']
random_net_weights_zscore.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11112 entries, 0 to 11111
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   cancer_tissue          11112 non-null  object 
 1   metastasis_tissue      11112 non-null  object 
 2   direction              11112 non-null  object 
 3   dist                   11112 non-null  object 
 4   mean                   11112 non-null  float64
 5   std                    11112 non-null  float64
 6   interactions           11112 non-null  object 
 7   tissue_dataset         11112 non-null  object 
 8   directed_interactions  11112 non-null  int64  
 9   jaccard                11112 non-null  float64
 10  zscore                 11112 non-null  float64
dtypes: float64(4), int64(1), object(6)
memory usage: 1.0+ MB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11112 entries, 0 to 11111
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   cancer_tissue          11112 non-null  object 
 1   metastasis_tissue      11112 non-null  object 
 2   direction              11112 non-null  object 
 3   dist                   11112 non-null  object 
 4   mean                   11112 non-null  float64
 5   std                    11112 non-null  float64
 6   interactions           11112 non-null  object 
 7   tissue_dataset         11112 non-null  object 
 8   directed_interactions  11112 non-null  int64  
 9   jaccard                11112 non-null  float64
 10  zscore                 11112 non-null  float64
dtypes: float64(4), int64(1), object(6)
memory usage: 1.0+ MB


In [23]:
# save network stats excluding the distribution column
random_net_zscore.to_csv(
    net_dir+'directed_grouped_network_stats_zscore.csv', 
    index=False,
    columns=random_net_zscore.columns.drop('dist'))

random_net_weights_zscore.to_csv(
    net_dir+'directed_grouped_network_stats_weights_zscore.csv', 
    index=False,
    columns=random_net_zscore.columns.drop('dist'))

In [24]:
# convert random distributions to an array an save
dist = np.array(random_net_zscore.dist.to_list())
with open(net_dir+'directed_grouped_random_dist.npy', 'wb') as f:
    np.save(f, dist)
    
dist = np.array(network_stats_weights_zscore.dist.to_list())
with open(net_dir+'directed_grouped_random_dist_weights.npy', 'wb') as f:
    np.save(f, dist)

# Weighted intercellular interactions networks

## Compute weighted intercellular networks

In [38]:
# Compute weights for each gene
gene_weights = {}
for d in gene_expression:
    
    if d == 'Consensus':
        val = 'log2(nTPM)'
    else:
        val = 'log2(TPM)'
    
    x = gene_expression[d].pivot_table(values=val, columns='Tissue', index='Gene name')
    
    gene_weights[d] = dict(
        exp = x,
        max_norm = x.transform(lambda x: x/x.max(), axis=1),
        quantile = (x.rank(axis=1)-1)/(x.shape[1]-1),
        z_score = stats.zscore(x, axis=1),
    )

In [39]:
importlib.reload(network_functions)
for mode in ['undirected', 'directed']:

    if mode == 'directed':
        directions = ['c_to_m', 'm_to_c']
    else:
        directions=False
    
    network_weights = []
    for interactions in ['all', 'curated']:
        
        if interactions == 'all':
            inter = intercell
        else:
            inter = intercell_curated
            
        for tissue_dataset in ['GTEx', 'Consensus']:
            
            tissues_ = tissues[tissue_dataset].dropna()
            pair_ids = itertools.combinations(tissues_.index.unique().to_list(), 2)
            gene_weights_ = gene_weights[tissue_dataset]
            
            for pair in pair_ids:

                t1 = tissues_.loc[[pair[0]]].to_list()
                t2 = tissues_.loc[[pair[1]]].to_list()
                
                for weight_type, weight_values in gene_weights_.items():
                    for t1_ in t1:
                        for t2_ in t2:

                            weights = network_functions.weighted_intercell_network(
                                pair=(t1_, t2_),
                                weights=weight_values,
                                interactions=inter,
                                direction=directions,
                                extra_labels=[
                                    ('tissue_dataset', tissue_dataset),
                                    ('interactions', interactions),
                                    ('gene_weights', weight_type)
                                ]
                            )
                            network_weights.extend(weights)
    
    network_weights = pd.DataFrame(network_weights)
    network_weights.to_csv(f'intercell_networks/{mode}_weighted_networks.csv', index=False)

## Compute organotropism pairs

In [40]:
weighted_undir = pd.read_csv(net_dir+'undirected_weighted_networks.csv')
weighted_dir = pd.read_csv(net_dir+'directed_weighted_networks.csv')
pairs_records = pd.read_csv(org_pairs_dir+'pairs_records.csv')
weighted_undir.head()

Unnamed: 0,tissue1,tissue2,interaction_weight,value,tissue_dataset,interactions,gene_weights
0,Adipose - Subcutaneous,Adrenal Gland,product,158804.321649,GTEx,all,exp
1,Adipose - Subcutaneous,Adrenal Gland,min,27664.16715,GTEx,all,exp
2,Adipose - Visceral (Omentum),Adrenal Gland,product,159162.050487,GTEx,all,exp
3,Adipose - Visceral (Omentum),Adrenal Gland,min,28223.668737,GTEx,all,exp
4,Adipose - Subcutaneous,Adrenal Gland,product,3048.617902,GTEx,all,max_norm


In [41]:
# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
weighted_undir_rev = weighted_undir.copy().rename({
    'tissue1':'tissue2',
    'tissue2':'tissue1',
}, axis=1)

# for directed interactions we also have to change the entry's direction
weighted_dir_rev = weighted_dir.copy().rename({
    'tissue1':'tissue2',
    'tissue2':'tissue1',
}, axis=1)
display(weighted_dir_rev.head())
weighted_dir_rev['direction'] = np.where(weighted_dir_rev['direction']=='c_to_m', 'm_to_c', 'c_to_m')
weighted_dir_rev.head()

Unnamed: 0,tissue2,tissue1,interaction_weight,direction,value,tissue_dataset,interactions,gene_weights
0,Adipose - Subcutaneous,Adrenal Gland,product,c_to_m,74309.645691,GTEx,all,exp
1,Adipose - Subcutaneous,Adrenal Gland,product,m_to_c,73631.905368,GTEx,all,exp
2,Adipose - Subcutaneous,Adrenal Gland,min,c_to_m,12874.984888,GTEx,all,exp
3,Adipose - Subcutaneous,Adrenal Gland,min,m_to_c,12854.877643,GTEx,all,exp
4,Adipose - Visceral (Omentum),Adrenal Gland,product,c_to_m,75491.108825,GTEx,all,exp


Unnamed: 0,tissue2,tissue1,interaction_weight,direction,value,tissue_dataset,interactions,gene_weights
0,Adipose - Subcutaneous,Adrenal Gland,product,m_to_c,74309.645691,GTEx,all,exp
1,Adipose - Subcutaneous,Adrenal Gland,product,c_to_m,73631.905368,GTEx,all,exp
2,Adipose - Subcutaneous,Adrenal Gland,min,m_to_c,12874.984888,GTEx,all,exp
3,Adipose - Subcutaneous,Adrenal Gland,min,c_to_m,12854.877643,GTEx,all,exp
4,Adipose - Visceral (Omentum),Adrenal Gland,product,m_to_c,75491.108825,GTEx,all,exp


In [42]:
# Concatenate reverse dataframes
weighted_undir_network = pd.concat(
    [weighted_undir, weighted_undir_rev], ignore_index=True).rename({
    'tissue1': 'cancer_tissue',
    'tissue2': 'metastasis_tissue'
}, axis=1)
weighted_dir_network = pd.concat(
    [weighted_dir, weighted_dir_rev], ignore_index=True).rename({
    'tissue1': 'cancer_tissue',
    'tissue2': 'metastasis_tissue'
}, axis=1)
weighted_dir_network.head(2)

Unnamed: 0,cancer_tissue,metastasis_tissue,interaction_weight,direction,value,tissue_dataset,interactions,gene_weights
0,Adipose - Subcutaneous,Adrenal Gland,product,c_to_m,74309.645691,GTEx,all,exp
1,Adipose - Subcutaneous,Adrenal Gland,product,m_to_c,73631.905368,GTEx,all,exp


In [43]:
# merge pairs_records with weighted networks
weighted_undir_pairs = pd.merge(
    pairs_records,
    weighted_undir_network,
    on=['cancer_tissue', 'metastasis_tissue', 'tissue_dataset'])
print(weighted_undir_pairs.shape)
weighted_dir_pairs = pd.merge(
    pairs_records,
    weighted_dir_network,
    on=['cancer_tissue', 'metastasis_tissue', 'tissue_dataset'])
print(weighted_dir_pairs.shape)

(120672, 14)
(241344, 15)


In [44]:
# export results
weighted_undir_pairs.to_csv('intercell_networks/undirected_weighted_network_stats.csv', index=False)
weighted_dir_pairs.to_csv('intercell_networks/directed_weighted_network_stats.csv', index=False)

# Random weighted intercellular networks (z-score)

Since the z-score will be used in the cancer-wise analysis, we will use only grouped gene expression.
Weights used:
* **gene** - normalization by the maximum expression value.
* **interaction** - product of gene weights.

In [11]:
# Compute weights for each gene
gene_weights = {}
for d in grouped_gene_expression:
    
    if d == 'Consensus':
        val = 'log2(nTPM)'
    else:
        val = 'log2(TPM)'
    
    x = grouped_gene_expression[d].pivot_table(values=val, columns='Tissue', index='Gene name')
    
    gene_weights[d] = x.transform(lambda x: x/x.max(), axis=1)
    
    display(gene_weights[d].head(2))

Tissue,adipose_tissue,adrenal_gland,artery,bladder,blood,brain,breast,cervix,colorectum,fallopian_tube,...,skeletal_muscle,skin,small_intestine,spinal_cord,spleen,stomach,testis,thyroid,uterus,vagina
Gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.239567,0.146703,0.335357,0.192892,0.16548,0.264271,0.293534,0.428159,0.16481,0.357282,...,0.06346,0.197779,0.223315,0.390092,0.332783,0.142852,0.155928,0.286005,0.364335,0.3126
A1BG-AS1,0.471835,0.325448,0.697551,0.375882,0.380347,0.464646,0.625305,0.900366,0.409012,0.805123,...,0.095016,0.399626,0.578627,0.500254,0.858265,0.321345,0.228431,0.715573,0.915976,0.670038


Tissue,adipose_tissue,adrenal_gland,appendix,bladder,bone,brain,breast,cervix,colorectum,epididymis,...,spinal_cord,spleen,stomach,testis,thymus,thyroid,tongue,tonsil,uterus,vagina
Gene name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.013387,0.013387,0.013387,0.0,0.013387,0.013387,0.013387,0.013387,0.0,0.0,...,0.013387,0.074533,0.013387,0.0,0.0,0.013387,0.0,0.0,0.025609,0.0
A1CF,0.0,0.0,0.094284,0.0,0.0,0.0,0.0,0.0,0.494992,0.0,...,0.0,0.0,0.139047,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Undirected interactions

### Shuffle Interactions

In [12]:
importlib.reload(network_functions)

network_stats = []
for inter_kind in tqdm(['all', 'curated']):
    
    if inter_kind == 'all':
        
        inter_graph = intercell
    else:
        inter_graph = intercell_curated

    for d in tqdm(['GTEx', 'Consensus']):
        
        stats = network_functions.random_grouped_weighted_intercell_networks(
            gene_weights[d],
            inter_graph,
            direction=False,
            iterations=1000,
            n_jobs=18,
            extra_labels=[
                ('interactions', inter_kind),
                ('tissue_dataset', d)
            ],
        )
        
        network_stats.extend(stats)
        
network_stats = pd.DataFrame(network_stats)
network_stats.head()

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,value,dist,mean,std,z_score,interactions,tissue_dataset
0,adipose_tissue,adrenal_gland,3220.720795,"[3108.0946182859775, 3108.0946182859775, 3108....",3108.745322,9.680097,11.567598,all,GTEx
1,adipose_tissue,artery,4295.626281,"[4122.627038679531, 4122.627038679531, 4122.62...",4125.967484,9.323613,18.196679,all,GTEx
2,adipose_tissue,bladder,4116.187029,"[3956.580560824228, 3956.580560824228, 3956.58...",3959.132704,8.210483,19.128512,all,GTEx
3,adipose_tissue,blood,2428.54529,"[2448.566786666233, 2448.566786666233, 2448.56...",2434.136031,13.484003,-0.41462,all,GTEx
4,adipose_tissue,brain,2721.047268,"[2675.930644876973, 2675.930644876973, 2675.93...",2672.585525,11.0356,4.391401,all,GTEx


In [14]:
# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
network_stats_rev = network_stats.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)

network_stats_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2778 entries, 0 to 2777
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   metastasis_tissue  2778 non-null   object 
 1   cancer_tissue      2778 non-null   object 
 2   value              2778 non-null   float64
 3   dist               2778 non-null   object 
 4   mean               2778 non-null   float64
 5   std                2778 non-null   float64
 6   z_score            2778 non-null   float64
 7   interactions       2778 non-null   object 
 8   tissue_dataset     2778 non-null   object 
dtypes: float64(4), object(5)
memory usage: 195.5+ KB


In [16]:
# Concatenate reverse dataframes
network_stats_full = pd.concat(
    [network_stats, network_stats_rev], ignore_index=True)
print(network_stats.shape)
print(network_stats_full.shape)
network_stats_full.head(2)

(2778, 9)
(5556, 9)


Unnamed: 0,cancer_tissue,metastasis_tissue,value,dist,mean,std,z_score,interactions,tissue_dataset
0,adipose_tissue,adrenal_gland,3220.720795,"[3108.0946182859775, 3108.0946182859775, 3108....",3108.745322,9.680097,11.567598,all,GTEx
1,adipose_tissue,artery,4295.626281,"[4122.627038679531, 4122.627038679531, 4122.62...",4125.967484,9.323613,18.196679,all,GTEx


In [21]:
network_stats_full.to_csv(
    net_dir+'undirected_weighted_network_stats_zscore.csv',
    index=False,
    columns=network_stats_full.columns.drop('dist')
)

In [24]:
# convert random distributions to an array an save
dist = np.array(network_stats_full.dist.to_list())
with open(net_dir+'weighted_net_random_grouped_dist.npy', 'wb') as f:
    np.save(f, dist)

### Shuffle gene weights

In [25]:
network_stats = pd.DataFrame([])
dist_arrays = []
for inter_kind in tqdm(['all', 'curated']): #
    
    if inter_kind == 'all':
        
        inter_graph = intercell
    else:
        inter_graph = intercell_curated

    for d in tqdm(['GTEx', 'Consensus']): #
        
        # compute the normal intercellular networks
        normal_net = network_functions.grouped_weighted_intercell_networks(
                        gene_weights[d],
                        inter_graph,
                        direction=False,
                        extra_labels=[
                            ('interactions', inter_kind),
                            ('tissue_dataset', d)
                        ]
        )
        
        random_net = Parallel(n_jobs=18)(
            delayed(network_functions.grouped_weighted_intercell_networks)(
                gene_weights[d],
                inter_graph,
                direction=False,
                extra_labels=[
                    ('interactions', inter_kind),
                    ('tissue_dataset', d)
                ],
                shuffle_weights=True
            ) for i in range(1000))
        
        normal_net = pd.DataFrame(normal_net)
        
        # create an array with the random network distributions for all pairs
        
        dist_array = np.array([i['value'] for i in random_net])
        
        # compute z-score
        normal_net['mean'] = np.mean(dist_array, axis=0)
        normal_net['std'] = np.std(dist_array, axis=0)
        normal_net['z_score'] = (normal_net['value'] - normal_net['mean'])/normal_net['std']
        
        network_stats = pd.concat([network_stats, normal_net])
        dist_arrays.append(dist_array)

network_stats.head()

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,value,interactions,tissue_dataset,mean,std,z_score
0,adipose_tissue,adrenal_gland,3220.720795,all,GTEx,3216.957409,94.19951,0.039951
1,adipose_tissue,artery,4295.626281,all,GTEx,3222.417858,91.265147,11.759236
2,adipose_tissue,bladder,4116.187029,all,GTEx,3216.464407,92.881717,9.686757
3,adipose_tissue,blood,2428.54529,all,GTEx,3218.661602,91.874012,-8.599998
4,adipose_tissue,brain,2721.047268,all,GTEx,3219.202528,93.014273,-5.355686


In [27]:
# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
network_stats_rev = network_stats.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)

network_stats_rev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2778 entries, 0 to 860
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   metastasis_tissue  2778 non-null   object 
 1   cancer_tissue      2778 non-null   object 
 2   value              2778 non-null   float64
 3   interactions       2778 non-null   object 
 4   tissue_dataset     2778 non-null   object 
 5   mean               2778 non-null   float64
 6   std                2778 non-null   float64
 7   z_score            2778 non-null   float64
dtypes: float64(4), object(4)
memory usage: 195.3+ KB


In [28]:
# Concatenate reverse dataframes
network_stats_full = pd.concat(
    [network_stats, network_stats_rev], ignore_index=True)
print(network_stats.shape)
print(network_stats_full.shape)
network_stats_full.head(2)

(2778, 8)
(5556, 8)


Unnamed: 0,cancer_tissue,metastasis_tissue,value,interactions,tissue_dataset,mean,std,z_score
0,adipose_tissue,adrenal_gland,3220.720795,all,GTEx,3216.957409,94.19951,0.039951
1,adipose_tissue,artery,4295.626281,all,GTEx,3222.417858,91.265147,11.759236


In [29]:
network_stats_full.to_csv(
    net_dir+'undirected_weighted_network_stats_zscore_random_gene_weights.csv',
    index=False,
)

## Directed interactions

In [26]:
importlib.reload(network_functions)

network_stats = []
for inter_kind in tqdm(['all', 'curated']):
    
    if inter_kind == 'all':
        
        inter_graph = intercell
    else:
        inter_graph = intercell_curated

    for d in tqdm(['GTEx', 'Consensus']):
        
        stats = network_functions.random_grouped_weighted_intercell_networks(
            gene_weights[d],
            inter_graph,
            direction=['c_to_m', 'm_to_c'],
            iterations=1000,
            n_jobs=18,
            extra_labels=[
                ('interactions', inter_kind),
                ('tissue_dataset', d)
            ],
        )
        
        network_stats.extend(stats)
        
network_stats = pd.DataFrame(network_stats)
network_stats.head()

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,cancer_tissue,metastasis_tissue,direction,value,dist,mean,std,z_score,interactions,tissue_dataset
0,adipose_tissue,adrenal_gland,c_to_m,1524.704003,"[1466.5736678435962, 1466.5736678435962, 1466....",1468.226596,6.745906,8.372101,all,GTEx
1,adipose_tissue,adrenal_gland,m_to_c,1461.773894,"[1390.9400489751824, 1390.9400489751824, 1390....",1395.20521,6.026084,11.046757,all,GTEx
2,adipose_tissue,artery,c_to_m,2033.420016,"[1943.8403861972058, 1943.8403861972058, 1943....",1944.04458,5.82631,15.339972,all,GTEx
3,adipose_tissue,artery,m_to_c,1955.165173,"[1861.3266766007932, 1861.3266766007932, 1861....",1864.220399,5.840688,15.570901,all,GTEx
4,adipose_tissue,bladder,c_to_m,1931.402416,"[1848.6182889906513, 1848.6182889906513, 1848....",1852.273968,5.15999,15.335001,all,GTEx


In [27]:
# to get all tissue pairs, we'll create a dataframe where we reverse the tissue order
# for directed interactions we also have to change the entry's direction
network_stats_rev = network_stats.copy().rename({
    'cancer_tissue':'metastasis_tissue',
    'metastasis_tissue':'cancer_tissue',
}, axis=1)
display(network_stats_rev.head(2))
network_stats_rev['direction'] = np.where(network_stats_rev['direction']=='c_to_m', 'm_to_c', 'c_to_m')
network_stats_rev.head(2)

Unnamed: 0,metastasis_tissue,cancer_tissue,direction,value,dist,mean,std,z_score,interactions,tissue_dataset
0,adipose_tissue,adrenal_gland,c_to_m,1524.704003,"[1466.5736678435962, 1466.5736678435962, 1466....",1468.226596,6.745906,8.372101,all,GTEx
1,adipose_tissue,adrenal_gland,m_to_c,1461.773894,"[1390.9400489751824, 1390.9400489751824, 1390....",1395.20521,6.026084,11.046757,all,GTEx


Unnamed: 0,metastasis_tissue,cancer_tissue,direction,value,dist,mean,std,z_score,interactions,tissue_dataset
0,adipose_tissue,adrenal_gland,m_to_c,1524.704003,"[1466.5736678435962, 1466.5736678435962, 1466....",1468.226596,6.745906,8.372101,all,GTEx
1,adipose_tissue,adrenal_gland,c_to_m,1461.773894,"[1390.9400489751824, 1390.9400489751824, 1390....",1395.20521,6.026084,11.046757,all,GTEx


In [28]:
# Concatenate reverse dataframes
network_stats_full = pd.concat(
    [network_stats, network_stats_rev], ignore_index=True)
print(network_stats.shape)
print(network_stats_full.shape)
network_stats_full.head()

(5556, 10)
(11112, 10)


Unnamed: 0,cancer_tissue,metastasis_tissue,direction,value,dist,mean,std,z_score,interactions,tissue_dataset
0,adipose_tissue,adrenal_gland,c_to_m,1524.704003,"[1466.5736678435962, 1466.5736678435962, 1466....",1468.226596,6.745906,8.372101,all,GTEx
1,adipose_tissue,adrenal_gland,m_to_c,1461.773894,"[1390.9400489751824, 1390.9400489751824, 1390....",1395.20521,6.026084,11.046757,all,GTEx
2,adipose_tissue,artery,c_to_m,2033.420016,"[1943.8403861972058, 1943.8403861972058, 1943....",1944.04458,5.82631,15.339972,all,GTEx
3,adipose_tissue,artery,m_to_c,1955.165173,"[1861.3266766007932, 1861.3266766007932, 1861....",1864.220399,5.840688,15.570901,all,GTEx
4,adipose_tissue,bladder,c_to_m,1931.402416,"[1848.6182889906513, 1848.6182889906513, 1848....",1852.273968,5.15999,15.335001,all,GTEx


In [30]:
# save network stats excluding the distribution column
network_stats_full.to_csv(
    net_dir+'directed_weighted_network_stats_zscore.csv', 
    index=False,
    columns=network_stats_full.columns.drop('dist'))

In [31]:
# convert random distributions to an array an save
dist = np.array(network_stats_full.dist.to_list())
with open(net_dir+'weighted_net_directed_random_grouped_dist.npy', 'wb') as f:
    np.save(f, dist)