In [9]:
import pandas as pd
import numpy as np
from statsmodels.stats import multitest
import igraph as ig
import omnipath as op
from functions import id_mapping, utility_functions
from tqdm.notebook import tqdm
import importlib
from joblib import Parallel, delayed
import plotly.express as px
from IPython.display import display

In [3]:
# Input and output locations, relative to the notebook directory
raw_data_dir = '../data/raw/'
gene_exp_dir = '../data/processed/gene_expression/'
org_pairs_dir = '../data/processed/organotropism_pairs/'
intercell_net_dir = '../data/processed/intercell_networks/'
interactions_dir = '../data/processed/intercell_interactions/'
intracell_dir = '../data/processed/intracell_network/'
enrichment_dir = '../data/processed/enrichment_analysis/'

# Directories this notebook writes to are passed through check_dir first
# (presumably creates them when missing -- confirm in functions/utility_functions)
for output_dir in (intracell_dir, enrichment_dir):
    utility_functions.check_dir(output_dir)

In [3]:
# Labels of the datasets / network variants handled by the project.
# Only tissue_datasets is read in the visible part of this notebook
# (to build the list of expressed genes).
metastasis_datasets = ['autopsy', 'hcmdb']
tissue_datasets = ['gtex', 'consensus']
network_types = ['all', 'curated']

# Download intracellular graph

In [None]:
# Query OmniPath for human interactions from the 'omnipath' and 'dorothea'
# datasets (DoRothEA levels A-D), with gene symbols attached.
omnipath_query = dict(
    include=['omnipath', 'dorothea'],
    dorothea_levels=['A', 'B', 'C', 'D'],
    genesymbols=True,
    organism='human',
)
intracell_graph = op.interactions.AllInteractions.get(**omnipath_query)

# number of downloaded interactions, then a preview
print(len(intracell_graph))
intracell_graph.head()

In [29]:
# Protein complexes appear in many interactions.
# Symbols containing ':' or '_' are treated as complexes; list the
# interactions where both the source and the target are complexes.
complex_pattern = r'[:_]'
source_is_complex = intracell_graph['source_genesymbol'].str.contains(complex_pattern)
target_is_complex = intracell_graph['target_genesymbol'].str.contains(complex_pattern)
intracell_graph.loc[
    source_is_complex & target_is_complex,
    ['source_genesymbol', 'target_genesymbol']
]

Unnamed: 0,source_genesymbol,target_genesymbol
10129,PRKAA1_PRKAA2_PRKAB1_PRKAB2_PRKAG1_PRKAG2_PRKAG3,ATG13_RB1CC1_ULK1
10130,ATG13_RB1CC1_ULK1,PRKAA1_PRKAA2_PRKAB1_PRKAB2_PRKAG1_PRKAG2_PRKAG3
10131,ATG13_RB1CC1_ULK1,AKT1S1_DEPTOR_MLST8_MTOR_RPTOR
10132,AKT1S1_DEPTOR_MLST8_MTOR_RPTOR,ATG13_RB1CC1_ULK1
10143,RRAGA_RRAGC,AKT1S1_DEPTOR_MLST8_MTOR_RPTOR
...,...,...
22638,FRAS1_FREM2_NPNT,ITGA8_ITGB1
23594,CLCF1_CRLF1,CNTFR_IL6ST_LIFR
24479,GP5_GP9,GP1BA_GP1BB
24999,INHBA_INHBB,ACVR1_ACVR2A


In [40]:
# Unfold every complex with the project helper, then shorten the column
# names to 'source' / 'target'.
symbol_columns = ['source_genesymbol', 'target_genesymbol']
intracell_graph_simp = id_mapping.unfold_complexes(
    intracell_graph[symbol_columns],
    columns=symbol_columns
).rename(
    columns={'source_genesymbol': 'source', 'target_genesymbol': 'target'}
)
print(len(intracell_graph_simp))
intracell_graph_simp.head()

317512


Unnamed: 0,source,target
0,CALM3,TRPC1
1,CALM1,TRPC1
2,CALM2,TRPC1
3,CAV1,TRPC1
4,DRD2,TRPC1


In [41]:
# Persist the unfolded edge list (no index column)
network_path = intracell_dir + 'intracell_network.csv'
intracell_graph_simp.to_csv(network_path, index=False)

# Random Walks with Restart (pagerank algorithm)
Genes involved in intercellular interactions can be membrane receptors and emitters, anchoring and adhesion proteins, or genes present in membrane-bound complexes.
Our approach is split in two:
* **Intercellular target genes**: We want to see how a signal from the outside propagates inside the cell, starting from these genes. We can use the regular intracellular interactions network and compute Random Walks with Restart (RWR) for each intercell gene.
* **Intercellular source genes**: We want to see how a signal propagates inside the cell, ending in these genes, which send the signal to the outside. Since we want to evaluate how a signal inside the cell propagates to these genes, we need to invert all interaction directions so that the RWR can start from them. That way, we can find the regions of the intracellular graph that communicate with, and belong to pathways containing, the source genes, i.e., the regions responsible for sending signals to the outside.

## Create Graph

In [4]:
# Load the full intercell graph (edge list with 'source' and 'target' columns)
intercell_path = intercell_net_dir + 'intercell_graph.csv'
intercell = pd.read_csv(intercell_path)
print(len(intercell))
display(intercell.head(2))

10170


Unnamed: 0,source,target
0,APP,GRM7
1,CXCL16,GRM7


In [5]:
# load expression data
# collect every gene id found in the GTEx or Consensus grouped records
unique_gene_ids = set()

for td in tissue_datasets:
    records = pd.read_csv(
        gene_exp_dir+f'{td}/grouped_records.csv',
        usecols=['gene_id']
    )
    unique_gene_ids.update(records['gene_id'].drop_duplicates().to_list())

# the set already holds each gene id once
all_genes_list = list(unique_gene_ids)
print(len(all_genes_list))

25764


In [6]:
# Keep only the intercell interactions whose two genes are both present
# in GTEx or Consensus.
print(intercell.shape[0])
source_expressed = intercell['source'].isin(all_genes_list)
target_expressed = intercell['target'].isin(all_genes_list)
intercell_filtered = intercell[source_expressed & target_expressed]
print(intercell_filtered.shape[0])

# sorted, de-duplicated gene lists for each side of the interactions
source_genes = sorted(intercell_filtered['source'].unique())
target_genes = sorted(intercell_filtered['target'].unique())
print(len(source_genes))
len(target_genes)

10170
9918
1470


1196

In [7]:
# load intracellular interactions graph
# The unfolded network is exported to intracell_dir earlier in this notebook,
# so it is read back from there; reading it from raw_data_dir only works if
# the file was copied by hand, which breaks a fresh Restart & Run All.
intracell = pd.read_csv(intracell_dir+'intracell_network.csv')
intracell.head(2)

Unnamed: 0,source,target
0,CALM3,TRPC1
1,CALM1,TRPC1


In [8]:
# Keep only the intracellular interactions whose two genes are both in the
# GTEx / Consensus gene list.
print(intracell.shape[0])
source_expressed = intracell['source'].isin(all_genes_list)
target_expressed = intracell['target'].isin(all_genes_list)
intracell_graph = intracell[source_expressed & target_expressed]
print(intracell_graph.shape[0])
intracell_graph.head(2)

317512
311021


Unnamed: 0,source,target
0,CALM3,TRPC1
1,CALM1,TRPC1


In [9]:
# intracell genes: every gene that is an endpoint of at least one edge,
# as a single-column frame (column label 0) sorted alphabetically
edge_endpoints = pd.concat([intracell_graph['source'], intracell_graph['target']])
intracell_genes = edge_endpoints.drop_duplicates().to_frame(name=0).sort_values(0)
intracell_genes

Unnamed: 0,0
23443,A1BG
52199,A1CF
14577,A2M
52201,A2ML1
48319,A4GALT
...,...
88254,ZYG11A
62190,ZYG11B
20978,ZYX
46013,ZZEF1


In [10]:
# Two edge lists are derived from the filtered intracellular network.
# Intercell targets: edges keep the direction of the original graph.
target_graph = intracell_graph.loc[:, ['source', 'target']]
print(len(target_graph))
display(target_graph.head(2))

# Intercell sources: every edge is reversed by swapping the column labels;
# the columns are then sorted by label so 'source' comes first again.
swapped_labels = {'source': 'target', 'target': 'source'}
source_graph = intracell_graph.rename(columns=swapped_labels).sort_index(axis=1)
print(len(source_graph))
source_graph.head(2)

311021


Unnamed: 0,source,target
0,CALM3,TRPC1
1,CALM1,TRPC1


311021


Unnamed: 0,source,target
0,TRPC1,CALM3
1,TRPC1,CALM1


In [11]:
# Turn both edge lists into directed igraph graphs that share one vertex table.
# Note: the names target_graph / source_graph are rebound from DataFrames to
# Graph objects here, so this cell cannot be re-run on its own.
graph_options = dict(vertices=intracell_genes, directed=True, use_vids=False)

target_graph = ig.Graph.DataFrame(edges=target_graph, **graph_options)
print(target_graph.ecount())

source_graph = ig.Graph.DataFrame(edges=source_graph, **graph_options)
source_graph.ecount()

311021


311021

In [12]:
# Report whether each graph is already simple before simplifying it
print(target_graph.is_simple())
source_graph.is_simple()

False


False

In [13]:
# Simplify both graphs in place, then report the remaining edge counts
for directed_graph in (target_graph, source_graph):
    directed_graph.simplify()

print(target_graph.ecount())
source_graph.ecount()

311011


311011

In [14]:
# Both graphs must list their vertices in the same order: one exported gene
# list is later used to label the RWR columns of both graphs.
target_graph.vs['name']==source_graph.vs['name']

True

## Compute RWR starting from each intercell gene

In [15]:
# Remove intercell genes (vertices) not present in the intracell graph.
# For each group the label is printed, followed by every gene that is dropped
# and a blank line; gene_lists[k] is replaced by the genes that remain.
gene_lists = {'target': target_genes, 'source': source_genes}
graph_by_label = {'target': target_graph, 'source': source_graph}

for k, genes in gene_lists.items():
    print(k)

    graph = graph_by_label[k]
    # Build the vertex-name lookup once per graph: graph.vs['name'] creates a
    # new list on every access, which made the membership test below rebuild
    # and scan the full vertex list for every single gene.
    vertex_names = set(graph.vs['name'])

    l = []
    for gene in genes:
        if gene in vertex_names:
            l.append(gene)
        else:
            print(gene)
    # only the value of an existing key changes, so iterating the dict stays valid
    gene_lists[k] = l
    print()

target

source
CCL3L3
CCL4L2
CGB1
CGB5
DEFB106A
IFNA13



In [16]:
# Size of each intercell gene list before and after the graph-membership filter
for list_label, unfiltered in (('target', target_genes), ('source', source_genes)):
    print(len(unfiltered))
    print(len(gene_lists[list_label]))

1196
1196
1470
1464


In [17]:
# Random walks with restart: one personalized PageRank vector (damping 0.85)
# per intercell gene, stacked row by row and saved as '<label>_rwr.npy'.
# The pairing relies on gene_lists holding 'target' before 'source'.
graphs = [target_graph, source_graph]

for (label, genes), graph in zip(gene_lists.items(), graphs):

    probs_array = [
        graph.personalized_pagerank(reset_vertices=gene, damping=0.85)
        for gene in tqdm(genes)
    ]

    np.save(intracell_dir+f'{label}_rwr.npy', probs_array)

  0%|          | 0/1196 [00:00<?, ?it/s]

  0%|          | 0/1464 [00:00<?, ?it/s]

In [18]:
# export both graphs as pickles named '<label>_graph.pickle'
for graph_label, graph_obj in (('target', target_graph), ('source', source_graph)):
    graph_obj.write_pickle(intracell_dir+f'{graph_label}_graph.pickle')

In [19]:
# Export the intracell gene list in graph vertex order.
# Source and target graphs share the same nodes (only the edge direction
# differs), so the vertex names of the target graph are used for both.
graph_gene_names = pd.Series(target_graph.vs['name'])
graph_gene_names.to_csv(intracell_dir+'intracell_genes.csv', index=False)

## Create Labels

In [111]:
# Wrap each filtered intercell gene list in a one-column ('gene') DataFrame
intercell_genes = {}
for list_label, gene_names in gene_lists.items():
    intercell_genes[list_label] = pd.DataFrame({'gene': gene_names})

for list_label in ('source', 'target'):
    print(len(intercell_genes[list_label]))

1464
1196


In [106]:
# Load the intercellular interactions associated with metastasis
met_interactions_path = interactions_dir + 'interactions_corr_0.25.csv'
met_interactions = pd.read_csv(met_interactions_path)
met_interactions.head(2)

Unnamed: 0,inter_id,source,target,interaction,tissue_dataset,tau_source,tau_target,inter_tau,pairs_type,network_type,metastasis_dataset,signal,value,stat,pval,pval_corr,cancer_entropy,metastasis_entropy,is_curated
0,422,WNT4,LRP6,9414.0,gtex,0.739,0.348,0.739,frequency,genecalls,autopsy,-1.0,-5756.0,MWU_ratio,0.003759,0.022155,2.9247,3.007392,yes
1,1149,APOC2,LRP1,385.0,gtex,0.862,0.317,0.862,frequency,genecalls,hcmdb,1.0,2706.0,MWU_ratio,0.000344,0.012371,2.709835,2.472963,yes


In [107]:
# Unique (source, target) pairs and their gene lists, stored per side under
# two keys: 'yes' = interactions flagged is_curated == 'yes' only,
# 'no' = all interactions regardless of curation.
genes = {'source': {}, 'target': {}}
pair_columns = ['source', 'target']

for curation in ['yes', 'no']:

    if curation == 'no':
        candidate_pairs = met_interactions[pair_columns]
    else:
        curated_rows = met_interactions['is_curated'] == curation
        candidate_pairs = met_interactions.loc[curated_rows, pair_columns]
    inter_list = candidate_pairs.drop_duplicates()
    print(len(inter_list))

    # sorted unique genes on each side of the selected interactions
    for side in pair_columns:
        genes[side][curation] = sorted(inter_list[side].unique())
    source_sign_genes = genes['source'][curation]
    target_sign_genes = genes['target'][curation]
    print(f'{curation} # of source genes', len(source_sign_genes))
    print(f'{curation} # of target genes', len(target_sign_genes))

535
yes # of source genes 280
yes # of target genes 268
1121
no # of source genes 391
no # of target genes 368


In [115]:
# Build one label table per gene side ('source' / 'target') and save it as
# '<side>_labels.csv' with columns gene, curated_label, label, is_curated.
# NOTE(review): labeled_genes is never filled or read in this cell.
labeled_genes = {}
for k1, d1 in genes.items():
    # one frame per curation key, in the insertion order of genes[k1]
    # ('yes' first, then 'no')
    graphs = []
    for k2, d2 in d1.items():
        inter_genes = intercell_genes[k1].copy()
        # intercell genes that take part in the selected interactions
        sign_genes = inter_genes[inter_genes.gene.isin(d2)].copy()
        if k2 == 'no':
            # all interactions -> 'label' column ('1' if present, '0' otherwise)
            sign_genes['label'] = ['1']*sign_genes.shape[0]
            graphs.append(pd.merge(inter_genes, sign_genes, how='left').fillna('0'))
        else:
            # curated interactions only -> 'curated_label' column
            sign_genes['curated_label'] = ['1']*sign_genes.shape[0]
            graphs.append(pd.merge(inter_genes, sign_genes, how='left').fillna('0'))
    
    # join the two label columns on the shared 'gene' column
    genes_ = pd.merge(graphs[0], graphs[1])
    
    # add column indicating if a gene is present in the curated graph
    # NOTE(review): curated_graph is not defined anywhere in the visible part of
    # this notebook; it is presumably a dict with 'source' / 'target' gene
    # collections from the curated intercell network. Define or load it before
    # this cell, otherwise a fresh kernel raises NameError here.
    curated_genes = genes_[genes_.gene.isin(curated_graph[k1])].copy()
    curated_genes['is_curated'] = [True]*curated_genes.shape[0]
    # genes without a match get is_curated = False
    genes_ = pd.merge(genes_, curated_genes, how='left').fillna(False)
    
    print(f'# of all {k1} genes', genes_.shape[0])
    print(f'# of curated {k1} genes', genes_[genes_.is_curated==True].shape[0])
    display(genes_.head(2))
    genes_.to_csv(intracell_dir+f'{k1}_labels.csv', index=False)

# of all source genes 1464
# of curated source genes 1365


Unnamed: 0,gene,curated_label,label,is_curated
0,A2M,0,0,True
1,ACAN,0,0,True


# of all target genes 1196
# of curated target genes 1077


Unnamed: 0,gene,curated_label,label,is_curated
0,A1BG,0,0,True
1,ABCB1,0,0,False


# Intracell genes analysis

We will try to find significant differences between the metastasis-associated groups and the non-associated group by performing a permutation test for all intracellular genes.

In [116]:
# Sides of the intercell interactions that are analysed
gene_type = ['target', 'source']
# Gene groups; 'none' must stay last because later cells iterate over
# association_type[:-1] to skip it
association_type = ['positive', 'negative', 'neutral', 'none']

## Load datasets

In [141]:
# RWR data: one saved probability array per gene type, keyed by 'target' / 'source'
rwr_probs = {
    gt: np.load(intracell_dir+f'{gt}_rwr.npy')
    for gt in gene_type
}

In [118]:
# Load the intracell gene list; the header row stored in the file is
# replaced by the column name 'gene'.
intracell_genes_path = intracell_dir + 'intracell_genes.csv'
intracell_genes = pd.read_csv(intracell_genes_path, header=0, names=['gene'])
print(len(intracell_genes))
intracell_genes.head(2)

18215


Unnamed: 0,gene
0,A1BG
1,A1CF


In [119]:
# Load the metastasis-associated intercellular interactions, indexed by inter_id
inter_data = pd.read_csv(
    interactions_dir + 'interactions_corr_0.25.csv',
    index_col='inter_id'
)

curated_inter_data = inter_data[inter_data.is_curated=='yes']
print('# unique interactions:', inter_data.index.unique().size)
print('# curated interactions:', curated_inter_data.index.unique().size)

# unique interactions: 1121
# curated interactions: 535


In [120]:
# intercell gene labels (0-not_sign, 1-sign), one table per gene side
target_genes, source_genes = (
    pd.read_csv(intracell_dir+f'{side}_labels.csv')
    for side in ('target', 'source')
)
print(len(target_genes))
display(target_genes.head(2))
print(len(source_genes))
source_genes.head(2)

1196


Unnamed: 0,gene,curated_label,label,is_curated
0,A1BG,0,0,True
1,ABCB1,0,0,False


1464


Unnamed: 0,gene,curated_label,label,is_curated
0,A2M,0,0,True
1,ACAN,0,0,True


In [121]:
# Unique (source, target, signal) rows: first for all interactions,
# then for the curated ones only.
signal_columns = ['source', 'target', 'signal']

inter_list = inter_data[signal_columns].drop_duplicates()
print(len(inter_list))
display(inter_list.head(2))

curated_rows = inter_data.is_curated == 'yes'
inter_list_curated = inter_data.loc[curated_rows, signal_columns].drop_duplicates()
print(len(inter_list_curated))
display(inter_list_curated.head(2))

1121


Unnamed: 0_level_0,source,target,signal
inter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
422,WNT4,LRP6,-1.0
1149,APOC2,LRP1,1.0


535


Unnamed: 0_level_0,source,target,signal
inter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
422,WNT4,LRP6,-1.0
1149,APOC2,LRP1,1.0


## Gene Signal Analysis

We split the intercell genes into 4 groups:
* "positive" genes: genes that only participate in interactions that promote metastasis development
* "negative" genes: genes that only participate in interactions that hinder metastasis development
* "neutral" genes: genes that participate in both types of interactions
* no association genes: genes without metastasis association

### Complete graph

In [122]:
# Attach the interaction signal to every intercell gene that takes part in a
# metastasis-associated interaction. reset_index() keeps the row position of
# each gene in the label table as the 'index' column, which becomes the frame
# index again at the end: it is the key used to index the RWR arrays.

sign_target = (
    target_genes.reset_index()
    .merge(
        inter_list[['target', 'signal']],
        left_on='gene',
        right_on='target',
        how='inner'
    )
    .drop_duplicates()
    .drop(columns='target')
    .set_index('index')
)
display(sign_target.head(2))

sign_source = (
    source_genes.reset_index()
    .merge(
        inter_list[['source', 'signal']],
        left_on='gene',
        right_on='source',
        how='inner'
    )
    .drop_duplicates()
    .drop(columns='source')
    .set_index('index')
)
display(sign_source.head(2))

Unnamed: 0_level_0,gene,curated_label,label,is_curated,signal
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6,ACHE,0,1,False,1.0
7,ACKR1,1,1,True,1.0


Unnamed: 0_level_0,gene,curated_label,label,is_curated,signal
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,ACVR2B,1,1,True,-1.0
14,ADAM17,1,1,True,-1.0


In [123]:
# Row indices of the intercell genes in each group, per gene type.
signed_by_type = {'target': sign_target, 'source': sign_source}
labels_by_type = {'target': target_genes, 'source': source_genes}

intercell_genes = {}
for gt in gene_type:
    ag = signed_by_type[gt]   # genes associated with metastasis
    nag = labels_by_type[gt]  # label table used to pick the non-associated genes

    # genes listed once carry a single signal; genes listed more than once
    # have more than one distinct signal
    single_signal = ag[~ag.gene.duplicated(keep=False)]

    intercell_genes[gt] = {
        'positive': single_signal[single_signal.signal==1].index.to_numpy(),
        'negative': single_signal[single_signal.signal==-1].index.to_numpy(),
        # one index per repeated gene (every occurrence after the first)
        'neutral': ag[ag.gene.duplicated()].index.to_numpy(),
        'none': nag[nag.label==0].index.to_numpy(),
    }

In [124]:
# Number of distinct metastasis-associated genes on each side
for side, signed in (('target', sign_target), ('source', sign_source)):
    print(f'# metastasis associated {side} genes:', len(signed.gene.unique()))

# metastasis associated target genes: 368
# metastasis associated source genes: 389


In [125]:
# Sanity check: the positive, negative and neutral index arrays together
# should add up to the number of associated genes printed above.
for side in ('target', 'source'):
    side_groups = intercell_genes[side]
    n_associated = sum(
        side_groups[group].shape[0]
        for group in ('positive', 'negative', 'neutral')
    )
    print(f'# metastasis associated {side} genes:', n_associated)

# metastasis associated target genes: 368
# metastasis associated source genes: 389


In [126]:
# Size of the positive and negative groups on each side
for side in ('target', 'source'):
    for group in ('positive', 'negative'):
        print(f'# {group} {side} genes:', intercell_genes[side][group].shape[0])

# positive target genes: 266
# negative target genes: 86
# positive source genes: 270
# negative source genes: 96


In [127]:
# Genes that take part in both positive and negative interactions
for side in ('target', 'source'):
    print(f'# {side} genes with double signal:', intercell_genes[side]['neutral'].shape[0])

# target genes with double signal: 16
# source genes with double signal: 23


In [128]:
# Genes with no metastasis association (label == 0)
for side in ('target', 'source'):
    print(f'# {side} genes without association:', intercell_genes[side]['none'].shape[0])

# target genes without association: 828
# source genes without association: 1075


### Curated graph

In [129]:
# Same signal assignment as for the complete graph, restricted to genes
# flagged is_curated and to the curated interaction list. The 'index' column
# keeps each gene's row position in the label table (the RWR row key).

curated_target_rows = target_genes.is_curated==True
sign_target_curated = (
    target_genes.reset_index()[curated_target_rows]
    .merge(
        inter_list_curated[['target', 'signal']],
        left_on='gene',
        right_on='target',
        how='inner'
    )
    .drop_duplicates()
    .drop(columns='target')
    .set_index('index')
)
display(sign_target_curated.head(2))

curated_source_rows = source_genes.is_curated==True
sign_source_curated = (
    source_genes.reset_index()[curated_source_rows]
    .merge(
        inter_list_curated[['source', 'signal']],
        left_on='gene',
        right_on='source',
        how='inner'
    )
    .drop_duplicates()
    .drop(columns='source')
    .set_index('index')
)
display(sign_source_curated.head(2))

Unnamed: 0_level_0,gene,curated_label,label,is_curated,signal
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,ACKR1,1,1,True,1.0
8,ACKR2,1,1,True,1.0


Unnamed: 0_level_0,gene,curated_label,label,is_curated,signal
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,ACVR2B,1,1,True,-1.0
14,ADAM17,1,1,True,-1.0


In [130]:
# Row indices of the curated intercell genes in each group, per gene type.
signed_curated_by_type = {'target': sign_target_curated, 'source': sign_source_curated}
curated_labels_by_type = {
    'target': target_genes[target_genes.is_curated==True],
    'source': source_genes[source_genes.is_curated==True],
}

intercell_genes_curated = {}
for gt in gene_type:
    ag = signed_curated_by_type[gt]   # genes associated with metastasis
    nag = curated_labels_by_type[gt]  # curated genes used to pick the non-associated ones

    # genes listed once carry a single signal
    single_signal = ag[~ag.gene.duplicated(keep=False)]

    intercell_genes_curated[gt] = {
        'positive': single_signal[single_signal.signal==1].index.to_numpy(),
        'negative': single_signal[single_signal.signal==-1].index.to_numpy(),
        # one index per repeated gene (every occurrence after the first)
        'neutral': ag[ag.gene.duplicated()].index.to_numpy(),
        # NOTE(review): the background uses `label` (any interaction), not
        # `curated_label`; curated genes associated only through non-curated
        # interactions therefore fall in neither group -- confirm this is intended.
        'none': nag[nag.label==0].index.to_numpy(),
    }

In [131]:
# Number of distinct metastasis-associated genes on each side (curated graph)
for side, signed in (('target', sign_target_curated), ('source', sign_source_curated)):
    print(f'# metastasis associated {side} genes:', len(signed.gene.unique()))

# metastasis associated target genes: 268
# metastasis associated source genes: 278


In [132]:
# Sanity check: the positive, negative and neutral index arrays together
# should add up to the number of associated genes printed above.
for side in ('target', 'source'):
    side_groups = intercell_genes_curated[side]
    n_associated = sum(
        side_groups[group].shape[0]
        for group in ('positive', 'negative', 'neutral')
    )
    print(f'# metastasis associated {side} genes:', n_associated)

# metastasis associated target genes: 268
# metastasis associated source genes: 278


In [133]:
# Size of the positive and negative groups on each side (curated graph)
for side in ('target', 'source'):
    for group in ('positive', 'negative'):
        print(f'# {group} {side} genes:', intercell_genes_curated[side][group].shape[0])

# positive target genes: 180
# negative target genes: 82
# positive source genes: 184
# negative source genes: 85


In [134]:
# Genes that take part in both positive and negative interactions (curated graph)
for side in ('target', 'source'):
    print(f'# {side} genes with double signal:', intercell_genes_curated[side]['neutral'].shape[0])

# target genes with double signal: 6
# source genes with double signal: 9


In [135]:
# Curated genes with no metastasis association (label == 0)
for side in ('target', 'source'):
    print(f'# {side} genes without association:', intercell_genes_curated[side]['none'].shape[0])

# target genes without association: 741
# source genes without association: 993


## Permutation Test
We will use the RWR with $\text{damping}=0.85$

### Complete Graph

In [142]:
# Permutation test for every intracellular gene (one RWR column each):
# each associated group of intercell genes (positive / negative / neutral)
# is compared against the non-associated group, separately per gene type.
importlib.reload(utility_functions)
perm_stats = {gt: [] for gt in gene_type}
tested_groups = association_type[:-1]  # leave 'none' out of the iteration

for gt in gene_type:

    intracell_array = rwr_probs[gt]
    n_intracell = intracell_array.shape[1]
    y_index = intercell_genes[gt]['none']  # non-associated genes

    for at in tested_groups:

        x_index = intercell_genes[gt][at]  # associated genes
        x = intracell_array[x_index]
        y = intracell_array[y_index]

        # one (statistic, pvalue) pair per intracellular gene:
        # [(statistic_0, pvalue_0), (statistic_1, pvalue_1), ...]
        jobs = (
            delayed(utility_functions.perm_test)(
                x[:, i],
                y[:, i],
                n_permutations=5000)
            for i in tqdm(range(n_intracell))
        )
        perm_list = Parallel(n_jobs=-1)(jobs)

        # two-level header: (group, 'statistic') and (group, 'pvalue')
        col = pd.MultiIndex.from_tuples([(at, 'statistic'), (at, 'pvalue')])
        perm_df = pd.DataFrame(
            perm_list,
            columns=col,
            index=intracell_genes.gene)

        perm_stats[gt].append(perm_df)
        

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

In [143]:
# One wide table per gene side: the per-group results side by side
for side in ('target', 'source'):
    side_results = pd.concat(perm_stats[side], axis=1)
    side_results.to_csv(intracell_dir+f'{side}_perm_test.csv')

### Curated Graph

In [144]:
# Permutation test for every intracellular gene (one RWR column each), using
# the curated gene groups: each associated group (positive / negative /
# neutral) is compared against the non-associated group, per gene type.
importlib.reload(utility_functions)
perm_stats = {gt: [] for gt in gene_type}
tested_groups = association_type[:-1]  # leave 'none' out of the iteration

for gt in gene_type:

    intracell_array = rwr_probs[gt]
    n_intracell = intracell_array.shape[1]
    y_index = intercell_genes_curated[gt]['none']  # non-associated genes

    for at in tested_groups:

        x_index = intercell_genes_curated[gt][at]  # associated genes
        x = intracell_array[x_index]
        y = intracell_array[y_index]

        # one (statistic, pvalue) pair per intracellular gene:
        # [(statistic_0, pvalue_0), (statistic_1, pvalue_1), ...]
        jobs = (
            delayed(utility_functions.perm_test)(
                x[:, i],
                y[:, i],
                n_permutations=5000)
            for i in tqdm(range(n_intracell))
        )
        perm_list = Parallel(n_jobs=-1)(jobs)

        # two-level header: (group, 'statistic') and (group, 'pvalue')
        col = pd.MultiIndex.from_tuples([(at, 'statistic'), (at, 'pvalue')])
        perm_df = pd.DataFrame(
            perm_list,
            columns=col,
            index=intracell_genes.gene)

        perm_stats[gt].append(perm_df)
        

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

  0%|          | 0/18215 [00:00<?, ?it/s]

In [145]:
# One wide table per gene side for the curated run
for side in ('target', 'source'):
    side_results = pd.concat(perm_stats[side], axis=1)
    side_results.to_csv(intracell_dir+f'{side}_curated_perm_test.csv')

### Results

In [168]:
# Reload the intercell gene label tables, keyed by gene side
genes = {
    side: pd.read_csv(intracell_dir+f'{side}_labels.csv')
    for side in ('target', 'source')
}
genes['target'].head(2)

Unnamed: 0,gene,curated_label,label,is_curated
0,A1BG,0,0,True
1,ABCB1,0,0,False


In [169]:
# Reload the complete-graph permutation results; the two header rows are
# the (group, statistic / pvalue) column levels and the first column is the index.
perm_stats = {
    side: pd.read_csv(intracell_dir+f'{side}_perm_test.csv', index_col=0, header=[0,1])
    for side in ('target', 'source')
}
display(perm_stats['target'].head(2))
perm_stats['source'].head(2)

Unnamed: 0_level_0,positive,positive,negative,negative,neutral,neutral
Unnamed: 0_level_1,statistic,pvalue,statistic,pvalue,statistic,pvalue
gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A1BG,-0.001207546,0.256749,-0.001207586,0.189562,-0.00120735,0.077385
A1CF,1.785138e-07,0.0002,5.823224e-08,0.091982,2.254983e-07,0.029794


Unnamed: 0_level_0,positive,positive,negative,negative,neutral,neutral
Unnamed: 0_level_1,statistic,pvalue,statistic,pvalue,statistic,pvalue
gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A1BG,0.0,1.0,0.0,1.0,0.0,1.0
A1CF,0.0,1.0,0.0,1.0,0.0,1.0


In [170]:
# Reload the curated-graph permutation results (same layout as the complete run)
perm_stats_curated = {
    side: pd.read_csv(intracell_dir+f'{side}_curated_perm_test.csv', index_col=0, header=[0,1])
    for side in ('target', 'source')
}
display(perm_stats_curated['target'].head(2))
perm_stats_curated['source'].head(2)

Unnamed: 0_level_0,positive,positive,negative,negative,neutral,neutral
Unnamed: 0_level_1,statistic,pvalue,statistic,pvalue,statistic,pvalue
gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A1BG,-0.001349361,0.221356,-0.001349382,0.215957,-0.001349202,0.05079
A1CF,9.430759e-08,0.0002,6.586697e-08,0.028794,2.275775e-07,0.047391


Unnamed: 0_level_0,positive,positive,negative,negative,neutral,neutral
Unnamed: 0_level_1,statistic,pvalue,statistic,pvalue,statistic,pvalue
gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A1BG,0.0,1.0,0.0,1.0,0.0,1.0
A1CF,0.0,1.0,0.0,1.0,0.0,1.0


In [171]:
# Multiple tests correction (complete graph).
# For each gene side: drop the intercell genes themselves, FDR-correct the
# p-values of each association group, and reshape the wide table into long
# records with columns gene, signal, pvalue, statistic.
perm_stats_corr = {}
for k, stats in perm_stats.items():
    # remove intercell genes
    # (kept under its own name: rebinding the global `intercell_genes` here
    # would overwrite the dict of group index arrays that the permutation-test
    # cell reads, breaking any re-run of that cell)
    intercell_gene_names = genes[k].gene
    data = stats.loc[~(stats.index.isin(intercell_gene_names))].copy()
    for at in association_type[:-1]:
        # fdrcorrection returns (rejected, corrected p-values); keep the latter
        data[at, 'pvalue'] = multitest.fdrcorrection(data[at, 'pvalue'])[1]

    # long format for the p-values: one row per (gene, group)
    x = data.loc[:, (slice(None), 'pvalue')].stack(level=[0,1]).reset_index()
    x.drop(columns='level_2', inplace=True)
    x.rename(columns={'level_1': 'signal', 0: 'pvalue'}, inplace=True)

    # same reshaping for the test statistic
    y = data.loc[:, (slice(None), 'statistic')].stack(level=[0,1]).reset_index()
    y.drop(columns='level_2', inplace=True)
    y.rename(columns={'level_1': 'signal', 0: 'statistic'}, inplace=True)

    # joined on the shared gene and signal columns
    perm_stats_corr[k] = pd.merge(x, y)

display(perm_stats_corr['target'].head(2))
perm_stats_corr['source'].head(2)

Unnamed: 0,gene,signal,pvalue,statistic
0,A1CF,negative,0.336134,5.823224e-08
1,A1CF,neutral,0.103806,2.254983e-07


Unnamed: 0,gene,signal,pvalue,statistic
0,A1BG,negative,1.0,0.0
1,A1BG,neutral,1.0,0.0


In [172]:
# Multiple tests correction (curated graph).
# For each gene side: drop the curated intercell genes, FDR-correct the
# p-values of each association group, reshape the wide table into long records
# (gene, signal, pvalue, statistic) and save them.
perm_stats_curated_corr = {}
for k, stats in perm_stats_curated.items():
    # remove intercell genes
    # (kept under its own name: rebinding the global `intercell_genes` here
    # would overwrite the dict of group index arrays that the permutation-test
    # cell reads, breaking any re-run of that cell)
    curated_intercell_gene_names = genes[k].loc[genes[k].is_curated==True, 'gene']
    data = stats.loc[~(stats.index.isin(curated_intercell_gene_names))].copy()
    for at in association_type[:-1]:
        # fdrcorrection returns (rejected, corrected p-values); keep the latter
        data[at, 'pvalue'] = multitest.fdrcorrection(data[at, 'pvalue'])[1]

    # long format for the p-values: one row per (gene, group)
    x = data.loc[:, (slice(None), 'pvalue')].stack(level=[0,1]).reset_index()
    x.drop(columns='level_2', inplace=True)
    x.rename(columns={'level_1': 'signal', 0: 'pvalue'}, inplace=True)

    # same reshaping for the test statistic
    y = data.loc[:, (slice(None), 'statistic')].stack(level=[0,1]).reset_index()
    y.drop(columns='level_2', inplace=True)
    y.rename(columns={'level_1': 'signal', 0: 'statistic'}, inplace=True)

    # joined on the shared gene and signal columns
    perm_stats_curated_corr[k] = pd.merge(x, y)

    # The enrichment section reads '<side>_perm_test_curated_records.csv', but no
    # cell wrote it; save the curated records here so a fresh run can load them.
    perm_stats_curated_corr[k].to_csv(
        intracell_dir+f'{k}_perm_test_curated_records.csv', index=False
    )

display(perm_stats_curated_corr['target'].head(2))
perm_stats_curated_corr['source'].head(2)

Unnamed: 0,gene,signal,pvalue,statistic
0,A1CF,negative,0.145012,6.586697e-08
1,A1CF,neutral,0.177389,2.275775e-07


Unnamed: 0,gene,signal,pvalue,statistic
0,A1BG,negative,1.0,0.0
1,A1BG,neutral,1.0,0.0


In [173]:
# Per gene side: count the genes with corrected p-value < 0.05 in each
# association group, then save the long-format records.
for k, stats in perm_stats_corr.items():
    print(k + ' genes with p-value < 0.05:')

    below_threshold = stats.pvalue < 0.05
    for at in association_type[:-1]:
        n_significant = ((stats.signal == at) & below_threshold).sum()
        print(f'- {at}:', n_significant)

    records_path = intracell_dir + f'{k}_perm_test_records.csv'
    stats.to_csv(records_path, index=False)
    print()

target genes with p-value < 0.05:
- positive: 7608
- negative: 711
- neutral: 896

source genes with p-value < 0.05:
- positive: 222
- negative: 0
- neutral: 0



# Data for Enrichment Analysis

## Load permutation test results

In [4]:
# Load the corrected permutation test records (complete graph), per gene side
test_stats = {
    side: pd.read_csv(intracell_dir+f'{side}_perm_test_records.csv')
    for side in ('target', 'source')
}
display(test_stats['target'].head(2))
test_stats['source'].head(2)

Unnamed: 0,gene,signal,pvalue,statistic
0,A1CF,negative,0.336134,5.823224e-08
1,A1CF,neutral,0.103806,2.254983e-07


Unnamed: 0,gene,signal,pvalue,statistic
0,A1BG,negative,1.0,0.0
1,A1BG,neutral,1.0,0.0


In [5]:
# Load the corrected permutation test records (curated graph), per gene side
test_stats_curated = {
    side: pd.read_csv(intracell_dir+f'{side}_perm_test_curated_records.csv')
    for side in ('target', 'source')
}
display(test_stats_curated['target'].head(2))
test_stats_curated['source'].head(2)

Unnamed: 0,gene,signal,pvalue,statistic
0,A1CF,negative,0.145012,6.586697e-08
1,A1CF,neutral,0.177389,2.275775e-07


Unnamed: 0,gene,signal,pvalue,statistic
0,A1BG,negative,1.0,0.0
1,A1BG,neutral,1.0,0.0


## Select genes

In [6]:
# Select the statistically significant genes of the 'positive' group
# (complete graph). The threshold is 0.05 on the corrected p-value; the
# previous comment announced 0.01, which did not match the code or the
# printed counts.
target_records = test_stats['target']
target_sign = target_records[
    (target_records.signal=='positive')&
    (target_records.pvalue<0.05)
].reset_index(drop=True)

source_records = test_stats['source']
source_sign = source_records[
    (source_records.signal=='positive')&
    (source_records.pvalue<0.05)
].reset_index(drop=True)

# number of selected genes and a preview, for each gene side
print(target_sign.shape[0])
display(target_sign.head(2))

print(source_sign.shape[0])
display(source_sign.head(2))

7608


Unnamed: 0,gene,signal,pvalue,statistic
0,A1CF,positive,0.00125,1.785138e-07
1,AADAC,positive,0.048661,1.085023e-07


222


Unnamed: 0,gene,signal,pvalue,statistic
0,ACTN1,positive,0.045264,1.100934e-05
1,AKAP8L,positive,0.018818,5.550986e-07


In [7]:
# Select the statistically significant genes from the curated results.
# Only rows with a 'positive' signal and a p-value below the significance
# level are kept. The level is 0.05 and is named once so that both
# selections below share a single cut-off.
significance_level = 0.05

target_curated_sign = (
    test_stats_curated['target']
    .loc[lambda df: (df.signal == 'positive') & (df.pvalue < significance_level)]
    .reset_index(drop=True)  # renumber rows so later positional masks line up
)
source_curated_sign = (
    test_stats_curated['source']
    .loc[lambda df: (df.signal == 'positive') & (df.pvalue < significance_level)]
    .reset_index(drop=True)
)

# number of selected genes and a short preview, per gene role
print(target_curated_sign.shape[0])
display(target_curated_sign.head(2))

print(source_curated_sign.shape[0])
display(source_curated_sign.head(2))

4822


Unnamed: 0,gene,signal,pvalue,statistic
0,A1CF,positive,0.002807,9.430759e-08
1,A2M,positive,0.049748,0.0001163855


246


Unnamed: 0,gene,signal,pvalue,statistic
0,ABR,positive,0.017103,1.683412e-07
1,ACTN1,positive,0.017103,1.625147e-05


In [10]:
# Compare how the test statistic of the significant genes is distributed
# in the target and source gene sets (complete graph).
fig = px.violin(
    dict(target=target_sign.statistic, source=source_sign.statistic),
    title='Distribution of permutation test statistic (complete graph)',
    orientation='h',
    height=500,
).update_layout(
    xaxis_title_text='mean diff',
    yaxis_title_text='Gene dataset',
)
fig.show()

In [11]:
# Compare how the test statistic of the significant genes is distributed
# in the target and source gene sets (curated graph).
fig = px.violin(
    dict(
        target=target_curated_sign.statistic,
        source=source_curated_sign.statistic,
    ),
    title='Distribution of permutation test statistic (curated graph)',
    orientation='h',
    height=500,
).update_layout(
    xaxis_title_text='mean diff',
    yaxis_title_text='Gene dataset',
)
fig.show()

### Outlier filter
The distribution of mean differences for each intracell gene dataset is, as expected, skewed towards low values. Most statistically significant genes have a very small RWR probability, and the difference between the groups of metastasis-associated and non-associated intercell genes is small, despite being significant.
So, it is reasonable to assume that the most interesting intracell genes to analyse are those that show larger mean differences.

We'll select the outlier genes from the 2 distributions using **Tukey's fences method**.

In [12]:
# Source-gene outliers.
# Flag each significant source gene with the Tukey's fences helper and keep
# the rows flagged with 1.
# NOTE(review): presumably 1 marks values beyond the upper fence — verify
# against utility_functions.tuckeys_fences.
source_fence_flags = utility_functions.tuckeys_fences(source_sign.statistic)
source_outliers = source_sign[source_fence_flags == 1]
print(len(source_outliers))
source_outliers.head(2)

23


Unnamed: 0,gene,signal,pvalue,statistic
17,CASP5,positive,0.045264,8.1e-05
31,CIITA,positive,0.018818,0.00014


In [13]:
# Source-gene outliers (curated graph).
# Flag each significant source gene with the Tukey's fences helper and keep
# the rows flagged with 1.
# NOTE(review): presumably 1 marks values beyond the upper fence — verify
# against utility_functions.tuckeys_fences.
source_curated_fence_flags = utility_functions.tuckeys_fences(
    source_curated_sign.statistic
)
source_curated_outliers = source_curated_sign[source_curated_fence_flags == 1]
print(len(source_curated_outliers))
source_curated_outliers.head(2)

23


Unnamed: 0,gene,signal,pvalue,statistic
28,CASP5,positive,0.041089,0.000104
37,CHEK1,positive,0.017103,4.9e-05


In [14]:
# Target-gene outliers.
# Flag each significant target gene with the Tukey's fences helper and keep
# the rows flagged with 1.
# NOTE(review): presumably 1 marks values beyond the upper fence — verify
# against utility_functions.tuckeys_fences.
target_fence_flags = utility_functions.tuckeys_fences(target_sign.statistic)
target_outliers = target_sign[target_fence_flags == 1]
print(len(target_outliers))
target_outliers.head(2)

1177


Unnamed: 0,gene,signal,pvalue,statistic
34,ABI1,positive,0.00125,6.3e-05
35,ABI2,positive,0.009387,9e-06


In [15]:
# Target-gene outliers (curated graph).
# Flag each significant target gene with the Tukey's fences helper and keep
# the rows flagged with 1.
# NOTE(review): presumably 1 marks values beyond the upper fence — verify
# against utility_functions.tuckeys_fences.
target_curated_fence_flags = utility_functions.tuckeys_fences(
    target_curated_sign.statistic
)
target_curated_outliers = target_curated_sign[target_curated_fence_flags == 1]
print(len(target_curated_outliers))
target_curated_outliers.head(2)

761


Unnamed: 0,gene,signal,pvalue,statistic
1,A2M,positive,0.049748,0.000116
18,ABI1,positive,0.002807,5.7e-05


In [16]:
# Write the significant-gene tables and their outlier subsets to
# enrichment_dir, without the row index.
for filename, frame in {
    'source_sign.csv': source_sign,
    'source_sign_outliers.csv': source_outliers,
    'target_sign.csv': target_sign,
    'target_sign_outliers.csv': target_outliers,
}.items():
    frame.to_csv(enrichment_dir + filename, index=False)

In [17]:
# Write the curated significant-gene tables and their outlier subsets to
# enrichment_dir, without the row index.
for filename, frame in {
    'source_curated_sign.csv': source_curated_sign,
    'source_curated_sign_outliers.csv': source_curated_outliers,
    'target_curated_sign.csv': target_curated_sign,
    'target_curated_sign_outliers.csv': target_curated_outliers,
}.items():
    frame.to_csv(enrichment_dir + filename, index=False)