In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats import multitest
import itertools
from tqdm.notebook import tqdm, tnrange
from functions import intercell_interactions, utility_functions
import importlib
from joblib import Parallel, delayed
from IPython.display import display

from Bio import Entrez
import sqlite3

from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
raw_data_dir = '../data/raw/'
gene_exp_dir = '../data/processed/gene_expression/'
org_pairs_dir = '../data/processed/organotropism_pairs/'
intercell_net_dir = '../data/processed/intercell_networks/'
interactions_dir = '../data/processed/intercell_interactions/'
utility_functions.check_dir(interactions_dir)

In [3]:
metastasis_datasets = ['autopsy', 'hcmdb']
tissue_datasets = ['gtex', 'consensus']
network_types = ['all', 'curated']

# Load Data

In [4]:
# we will use two intercellular interactions datasets:
# all interactions
graph = pd.read_csv(intercell_net_dir+f'intercell_graph.csv')
graph.head()

Unnamed: 0,source,target
0,APP,GRM7
1,CXCL16,GRM7
2,CXCL9,GRM7
3,CCL5,GRM7
4,ANXA1,GRM7


In [5]:
genes = {}
for dataset in tissue_datasets:
    
    calls = pd.read_csv(gene_exp_dir+f'{dataset}/grouped_records.csv')
    genes[dataset] = calls['gene_id'].unique()


In [6]:
org_pairs =  pd.read_csv(org_pairs_dir+'pairs_records.csv')
org_pairs.head(2)

Unnamed: 0,cancer,metastasis,cancer_tissue,metastasis_tissue,type,metastasis_dataset,tissue_dataset,control
0,breast,adrenal_gland,Breast - Mammary Tissue,Adrenal Gland,organotropism,autopsy,gtex,organotropism
1,breast,brain,Breast - Mammary Tissue,Brain - Amygdala,organotropism,autopsy,gtex,organotropism


# Interaction analysis with gene calls

## Compute intercellular interactions presence/absence in each tissue pair

In [60]:
importlib.reload(intercell_interactions)
pairs = {}

for td in tqdm(tissue_datasets):

    directory = intercell_net_dir + f'all/{td}/grouped'
    interactions = graph[
        graph.source.isin(genes[td]) &
        graph.target.isin(genes[td])
    ]
    interactions.sort_values(['source', 'target'], inplace=True)
    
    inters_array, all_pairs = intercell_interactions.intercell_interactions_analysis(
        directory,
        interactions,
    )
    # save output to file
    with open(f'{interactions_dir}{td}_interactions.txt', 'w') as file:
        np.savetxt(file, inters_array, fmt='%.0d')
    interactions.to_csv(interactions_dir+f'{td}_interactions_graph.csv', index=False)
    all_pairs.to_csv(interactions_dir+f'{td}_pairs.csv', index=False)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/528 [00:00<?, ?it/s]

  0%|          | 0/861 [00:00<?, ?it/s]

## Compute tissue pairs

In [66]:
# load tissue pairs
pairs = {
    d: pd.read_csv(interactions_dir+f'{d}_pairs.csv') for d in tissue_datasets
}
pairs['gtex'].head(2)

Unnamed: 0,cancer,metastasis
0,adipose_tissue,adrenal_gland
1,adipose_tissue,artery


In [67]:
# compute all combinations of pairs
all_pairs = {}
for d in pairs:
    pairs_rev = pairs[d].rename(
        {'cancer': 'metastasis', 'metastasis': 'cancer'},
        axis=1)
    # we will create a column named id that will keep an unique id for
    # each pair that will hold even when pairs (rows) are removed
    # this id index is exactly the same as the index on the columns of the 
    # interactions array. That way, we can index that array with the created ids
    all_pairs[d] = pd.concat([pairs[d], pairs_rev]).reset_index().rename({'index': 'id'}, axis=1)

    display(all_pairs[d].head(2))
    all_pairs[d].info()

Unnamed: 0,id,cancer,metastasis
0,0,adipose_tissue,adrenal_gland
1,1,adipose_tissue,artery


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056 entries, 0 to 1055
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          1056 non-null   int64 
 1   cancer      1056 non-null   object
 2   metastasis  1056 non-null   object
dtypes: int64(1), object(2)
memory usage: 24.9+ KB


Unnamed: 0,id,cancer,metastasis
0,0,adipose_tissue,adrenal_gland
1,1,adipose_tissue,appendix


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1722 entries, 0 to 1721
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          1722 non-null   int64 
 1   cancer      1722 non-null   object
 2   metastasis  1722 non-null   object
dtypes: int64(1), object(2)
memory usage: 40.5+ KB


In [68]:
# load frequency tables
frequencies = {}
for mt in metastasis_datasets:
    frequencies[mt] = pd.read_csv(org_pairs_dir+f'{mt}_frequencies.csv')

frequencies['autopsy'].head(2)

Unnamed: 0,cancer_organ,adrenal_gland,bladder,bone,brain,breast,colorectum,diaphragm,gallbladder,heart,...,prostate,skeletal_muscle,skin,small_intestine,spleen,stomach,testis,thyroid,uterus,vagina
0,adrenal_gland,1,0,3,2,0,1,1,0,0,...,0,0,3,0,2,1,0,1,0,0
1,anus,3,1,4,1,0,1,1,2,2,...,0,0,2,1,3,0,0,1,0,0


In [71]:
# stack frequencies and remove pairs with the same tissues
freq_pairs = {}
for mt, f in frequencies.items():
    p = pd.melt(f, id_vars='cancer_organ', var_name='metastasis', value_name='freq')
    
    p.rename(columns={'cancer_organ': 'cancer'}, inplace=True)
    
    p = p[p['cancer'] != p['metastasis']].reset_index(drop=True)
    freq_pairs[mt] = p
    
freq_pairs['autopsy'].head()

Unnamed: 0,cancer,metastasis,freq
0,anus,adrenal_gland,3
1,appendix,adrenal_gland,0
2,bile_duct,adrenal_gland,3
3,bladder,adrenal_gland,11
4,bone,adrenal_gland,4


In [72]:
# merge the frequency in each dataset with the tissue pairs
# this will ensure we only use pairs present in the dataset
# and prepare the dataset to do an analysis based on the frequency

for mt, f in freq_pairs.items():
    for td, ts in all_pairs.items():
        
        x = pd.merge(ts, f).sort_values('id')
        x.to_csv(interactions_dir+f'{mt}_{td}_pairs.csv', index=False)

## Compute stats

### Load data

In [73]:
# load interaction presence/absence arrays
inters_array = {
    td: np.loadtxt(interactions_dir+f'{td}_interactions.txt', dtype='int64') for td in tissue_datasets
}

print(inters_array['gtex'].shape)
print(inters_array['consensus'].shape)

(9590, 528)
(9857, 861)


In [74]:
# load interactions graph
inters_graph = {
    td: pd.read_csv(interactions_dir+f'{td}_interactions_graph.csv') for td in tissue_datasets
}
print(inters_graph['gtex'].shape)
print(inters_graph['consensus'].shape)

(9590, 2)
(9857, 2)


### Organotropism pairs (hypergeometric test)

In [83]:
importlib.reload(intercell_interactions)
inter_stats = []

for mt in tqdm(metastasis_datasets, desc='metastasis_dataset'):
    for td in tqdm(tissue_datasets, desc='tissue_dataset'):
        # define the condition of the hypergeometric test organotropism pairs to use
        # we'll use fdr corrected and all_tissues data
        org_pairs_ = org_pairs.loc[
            (org_pairs.metastasis_dataset==mt) &
            (org_pairs.tissue_dataset==td) &
            (org_pairs.type=='organotropism'), ['cancer', 'metastasis', 'type']]

        # since we are using grouped tissues we only need info about cancer and metastasis organs
        # we have to remove duplicate entries for the same labels
        org_pairs_.drop_duplicates(inplace=True)

        # add org and n_org labels to tissue pairs
        tissue_pairs = pd.read_csv(interactions_dir+f'{mt}_{td}_pairs.csv')
        labels = pd.merge(tissue_pairs, org_pairs_, how='left')

        labels.type[labels.type=='organotropism'] = 'org'
        labels.fillna('n_org', inplace=True)

        # in the undirected approach we will treat each network has unique, not caring which tissue is
        # the cancer or metastasis tissue
        # if a pair is both organotropism and not organotropism depending on the tissue classification
        # we'll consider only it's appearance as an organotropism pair
        
        unique_labels = labels.sort_values('type').drop_duplicates(subset='id', keep='last')
           
        # having the labeled pairs prepared now we have to select this pairs
        # in the interactions array
        # we'll index the inters array using the unique id generated for each pair     
        pairs_inters = inters_array[td][:, unique_labels.id]
  
        # finally, we need to iterate all interactions build contingency tables and
        # compute the odds ratio
        statistics = Parallel(n_jobs=-1)(delayed(intercell_interactions.interaction_stats)(
                pairs_inters[i,:],
                unique_labels.type,
                inters_graph[td].iloc[i].to_list(),
                [('metastasis_dataset', mt), ('tissue_dataset', td)]
            ) for i in tnrange(pairs_inters.shape[0], desc='interactions'))

        inter_stats.extend(statistics)

inter_stats_plot = pd.DataFrame(inter_stats)
inter_stats_plot.to_csv(interactions_dir+'undirected_orgpairs_genecalls_stats.csv', index=False)

metastasis_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

interactions:   0%|          | 0/9590 [00:00<?, ?it/s]

interactions:   0%|          | 0/9857 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

interactions:   0%|          | 0/9590 [00:00<?, ?it/s]

interactions:   0%|          | 0/9857 [00:00<?, ?it/s]

### Frequency

In [85]:
importlib.reload(intercell_interactions)
inter_stats = []

for mt in tqdm(metastasis_datasets, desc='metastasis_dataset'):
    for td in tqdm(tissue_datasets, desc='tissue_dataset'):

        # add org and n_org labels to tissue pairs
        tissue_pairs = pd.read_csv(interactions_dir+f'{mt}_{td}_pairs.csv')

        pairs_inters = inters_array[td][:, tissue_pairs.id]

        statistics = Parallel(n_jobs=-1)(delayed(intercell_interactions.freq_interaction_stats)(
                tissue_pairs.freq,
                pairs_inters[i,:],
                inters_graph[td].iloc[i].to_list(),
                [('metastasis_dataset', mt), ('tissue_dataset', td)]
            ) for i in tnrange(pairs_inters.shape[0], desc='interactions'))

        inter_stats.extend(statistics)

inter_stats_plot = pd.DataFrame(inter_stats)
inter_stats_plot.to_csv(interactions_dir+'undirected_frequency_genecalls_stats.csv', index=False)

metastasis_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

interactions:   0%|          | 0/9590 [00:00<?, ?it/s]

interactions:   0%|          | 0/9857 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

interactions:   0%|          | 0/9590 [00:00<?, ?it/s]

interactions:   0%|          | 0/9857 [00:00<?, ?it/s]

# Interaction analysis with gene weights
We chose the two following methods to determine the gene and interaction weights:
* **Gene weight** - normalization by the maximum expression value.
* **Interaction weight** - product of the two gene weights.

## Compute Weighted Networks

In [97]:
tissues = pd.read_csv(raw_data_dir+'tissue_match.csv', index_col='tissue')

In [103]:
importlib.reload(intercell_interactions)

network_weights = {}
all_pairs = {}
interaction_graph = {}
for td in tqdm(tissue_datasets, desc='tissue dataset'):


    gene_weights = pd.read_csv(gene_exp_dir+f'{td}/grouped_records.csv')
    gene_weights = gene_weights.pivot_table(values='log2_TPM', index='gene_id', columns='tissue')
    gene_weights = gene_weights/np.max(gene_weights.to_numpy(), keepdims=True, axis=1)


    tissues_ = tissues[td].dropna().index.unique().to_list()
    pair_ids = [p for p in itertools.combinations(tissues_, 2)]
    genes = gene_weights.index
    interactions = graph[graph.source.isin(genes) & graph.target.isin(genes)]
    
    inters_pairs = np.zeros((interactions.shape[0], len(pair_ids)))
    pairs = []
    for i, pair in enumerate(tqdm(pair_ids, desc='pairs')):
        
        weights = intercell_interactions.weighted_intercell_interactions_analysis(
            pair=pair,
            weights=gene_weights,
            interactions=interactions,
        )
        
        # Keep the interaction's order to be able to index it when computing stats
        inters_array = weights.sort_values(by=['source', 'target'])
        
        inters_pairs[:,i] = inters_array.weight
        
        pairs.append(pair)
    
    all_pairs[td] = pairs
    network_weights[td] = inters_pairs
    interaction_graph[td] = interactions.sort_values(by=['source', 'target'], ignore_index=True)

tissue dataset:   0%|          | 0/2 [00:00<?, ?it/s]

pairs:   0%|          | 0/528 [00:00<?, ?it/s]

pairs:   0%|          | 0/861 [00:00<?, ?it/s]

In [104]:
# Compare tissue pairs created in the gene calls analysis with those 
# created in the weighted networks analysis
for td in tissue_datasets:
    new_pairs = pd.DataFrame(all_pairs[td], columns=['cancer', 'metastasis'])
    old_pairs = pd.read_csv(interactions_dir+f'{td}_pairs.csv')
    print(f'{td} differences:', new_pairs.compare(old_pairs))

gtex differences: Empty DataFrame
Columns: []
Index: []
consensus differences: Empty DataFrame
Columns: []
Index: []


In [105]:
# Compare interaction graphs created in the gene calls analysis with those 
# created in the weighted networks analysis
for td in tissue_datasets:
    old_graph = pd.read_csv(interactions_dir+f'{td}_interactions_graph.csv')
    print(f'{td} differences:', interaction_graph[td].compare(old_graph))

gtex differences: Empty DataFrame
Columns: []
Index: []
consensus differences: Empty DataFrame
Columns: []
Index: []


Since the order of the tissue pairs and graph is the same we can use the index ids created in the gene calls analysis to index the weighted interactions array.
No need to export the new the new data to files

In [106]:
for td in network_weights:
    np.savetxt(interactions_dir+f'{td}_weighted_networks.txt', network_weights[td])

## Compute stats

### Load data

In [108]:
# load weighted interaction arrays
inters_array = {
    td: np.loadtxt(interactions_dir+f'{td}_weighted_networks.txt') for td in tissue_datasets
}

print(inters_array['gtex'].shape)
print(inters_array['consensus'].shape)

(9590, 528)
(9857, 861)


In [109]:
# load interactions graph
inters_graph = {
    td: pd.read_csv(interactions_dir+f'{td}_interactions_graph.csv') for td in tissue_datasets
}
print(inters_graph['gtex'].shape)
print(inters_graph['consensus'].shape)

(9590, 2)
(9857, 2)


### Organotropism pairs (hypergeometric test)

In [110]:
importlib.reload(intercell_interactions)
inter_stats = []

for mt in tqdm(metastasis_datasets, desc='metastasis_dataset'):
    for td in tqdm(tissue_datasets, desc='tissue_dataset'):
        # define the condition of the hypergeometric test organotropism pairs to use
        # we'll use fdr corrected and all_tissues data
        org_pairs_ = org_pairs.loc[
            (org_pairs.metastasis_dataset==mt) &
            (org_pairs.tissue_dataset==td) &
            (org_pairs.type=='organotropism'), ['cancer', 'metastasis', 'type']]

        # since we are using grouped tissues we only need info about cancer and metastasis organs
        # we have to remove duplicate entries for the same labels
        org_pairs_.drop_duplicates(inplace=True)

        # add org and n_org labels to tissue pairs
        tissue_pairs = pd.read_csv(interactions_dir+f'{mt}_{td}_pairs.csv')
        labels = pd.merge(tissue_pairs, org_pairs_, how='left')

        labels.type[labels.type=='organotropism'] = 'org'
        labels.fillna('n_org', inplace=True)

        # in the undirected approach we will treat each network has unique, 
        # not caring which tissue is the cancer or metastasis tissue
        # If a pair is both organotropism and not organotropism depending
        # on the tissue classification, we'll consider only it's appearance
        # as an organotropism pair
        
        unique_labels = labels.sort_values('type').drop_duplicates(subset='id', keep='last')
        
        # having the labeled pairs prepared now we have to select this pairs
        # in the interactions array
        pairs_inters = inters_array[td][:, unique_labels.id]
  
        # finally, we need to iterate all interactions build contingency tables and
        # compute the odds ratio
        statistics = Parallel(n_jobs=-1)(delayed(intercell_interactions.weighted_interaction_stats)(
                pairs_inters[i,:],
                unique_labels.type,
                inters_graph[td].iloc[i].to_list(),
                [('metastasis_dataset', mt), ('tissue_dataset', td)]
            ) for i in tnrange(pairs_inters.shape[0], desc='interactions'))

        inter_stats.extend(statistics)

inter_stats_plot = pd.DataFrame(inter_stats)
inter_stats_plot.to_csv(interactions_dir+'undirected_orgpairs_weighted_network_stats.csv', index=False)

metastasis_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

interactions:   0%|          | 0/9590 [00:00<?, ?it/s]

interactions:   0%|          | 0/9857 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

interactions:   0%|          | 0/9590 [00:00<?, ?it/s]

interactions:   0%|          | 0/9857 [00:00<?, ?it/s]

### Frequency

In [112]:
importlib.reload(intercell_interactions)
inter_stats = []

for mt in tqdm(metastasis_datasets, desc='metastasis_dataset'):
    for td in tqdm(tissue_datasets, desc='tissue_dataset'):

        tissue_pairs = pd.read_csv(interactions_dir+f'{mt}_{td}_pairs.csv')
        
        pairs_inters = inters_array[td][:, tissue_pairs.id]
        
        statistics = Parallel(n_jobs=-1)(delayed(
            intercell_interactions.weighted_freq_interaction_stats)(
                pairs_inters[i,:],
                tissue_pairs.freq,
                inters_graph[td].iloc[i].to_list(),
                [('metastasis_dataset', mt), ('tissue_dataset', td)]
            ) for i in tnrange(pairs_inters.shape[0], desc='interactions'))

        inter_stats.extend(statistics)

inter_stats_plot = pd.DataFrame(inter_stats)
inter_stats_plot.to_csv(interactions_dir+'undirected_frequency_weighted_network_stats.csv', index=False)

metastasis_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

interactions:   0%|          | 0/9590 [00:00<?, ?it/s]

interactions:   0%|          | 0/9857 [00:00<?, ?it/s]

tissue_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

interactions:   0%|          | 0/9590 [00:00<?, ?it/s]

interactions:   0%|          | 0/9857 [00:00<?, ?it/s]

# Interaction Selection

In [113]:
network_type = ['genecalls', 'weighted_network']
pairs_type = ['orgpairs', 'frequency']

## load datasets

In [114]:
conditions = {pt:{} for pt in pairs_type}
for pt in pairs_type:
    for nt in network_type:
        conditions[pt][nt] = pd.read_csv(interactions_dir+f'undirected_{pt}_{nt}_stats.csv')

In [115]:
# load interaction vs tissue pairs arrays
interaction_arrays = {nt:{} for nt in network_type}
for nt in network_type:
    for td in tissue_datasets:
        if nt == 'genecalls':
            interaction_arrays[nt][td] = np.loadtxt(interactions_dir+f'{td}_interactions.txt', dtype='int64')
        else:
            interaction_arrays[nt][td] = np.loadtxt(interactions_dir+f'{td}_weighted_networks.txt')

In [116]:
for nt in interaction_arrays:
    for td in interaction_arrays[nt]:
        print(nt, td, interaction_arrays[nt][td].shape)

genecalls gtex (9590, 528)
genecalls consensus (9857, 861)
weighted_network gtex (9590, 528)
weighted_network consensus (9857, 861)


In [119]:
# load tissue pairs
tissue_pairs = {mt:{} for mt in metastasis_datasets}
for mt in metastasis_datasets:
    for td in tissue_datasets:
        tissue_pairs[mt][td] = pd.read_csv(interactions_dir+f'{mt}_{td}_pairs.csv')

In [121]:
gene_tau = {
    td: pd.read_csv(gene_exp_dir+f'{td}/records.csv', usecols=['gene_id', 'tau'])\
        .drop_duplicates(ignore_index=True) for td in tissue_datasets
}

In [122]:
# load interactions graph
inter_graph = {
    td: pd.read_csv(interactions_dir+f'{td}_interactions_graph.csv') for td in tissue_datasets
}
inter_graph['gtex'].head(2)

Unnamed: 0,source,target
0,A2M,KLK3
1,A2M,LRP1


In [123]:
full_graph = graph.copy()
for td in tissue_datasets:
    x = inter_graph[td].reset_index().rename({'index': td}, axis=1)
    full_graph = pd.merge(full_graph, x, how='left')
    
print(full_graph.shape)
full_graph.head(2)

(10170, 4)


Unnamed: 0,source,target,gtex,consensus
0,APP,GRM7,426.0,429.0
1,CXCL16,GRM7,2883.0,2927.0


## Record significant interactions
Firstly, we will record the sign of the interaction:
* Positive: interactions that promote metastasis development
* Negative: interactions that hinder metastasis development

This groups will be defined by:
* genecalls + hypertest: the $\log{(OR)}$ sign has same direction as the interaction sign
* genecalls + frequency: we use the Mann-Whitney U the sign of the difference between the Mann-Whitney U statistics of the two groups
* weighted_network + hypertest: we use the Mann-Whitney U the sign of the difference between the Mann-Whitney U statistics of the two groups
* weighted_network + frequency: the sign of the correlation coefficient has the same direction as the interaction sign

In [125]:
statistics = [['logOR'], ['MWU_org_stat', 'MWU_n_org_stat'], ['MWU_inter_stat', 'MWU_n_inter_stat'], ['spearman']]

for pt in pairs_type:
    for nt in network_type:
        x = conditions[pt][nt]
        
        for i in statistics:

            try:
                if len(i) == 1:
                    y = x.copy()
                    y[y[i[0]] > 0] = 1
                    y[y[i[0]] < 0] = -1
                    y[y[i[0]] == 0] = 0
                    
                    x['signal'] = y[i[0]]
                    
                    if i[0] == 'logOR':
                        x['value'] = x['OR']
                        x['stat'] = ['OR' for j in range(x.shape[0])]
                    else:
                        x['value'] = abs(x['spearman'])
                        x['stat'] = ['spearman' for j in range(x.shape[0])]
                    
                else:
                    y = x.copy()
                    cond = y[i[0]] - y[i[1]]
                    y[cond > 0] = 1
                    y[cond < 0] = -1
                    y[cond == 0] = 0
                    x['signal'] = y[i[0]]
                    
                    # value = U1 - U2
                    x['value'] = x.iloc[:,4] - x.iloc[:,5]
                    x['stat'] = ['MWU_ratio' for i in range(x.shape[0])]
            except KeyError:
                pass

In [126]:
for pt in pairs_type:
    for nt in network_type:
        y = conditions[pt][nt]
        tests = ['fisher_exact', 'MannWhitneyU', 'pvalue']
        stat_label = [i for i in tests if i in y.columns.to_list()][0]
        print(y.dropna(subset=[stat_label], axis=0).shape[0])
        print(y.dropna(subset=['signal'], axis=0).shape[0])
        print()

38894
27374

38894
38894

32947
32947

38894
38894



In [129]:
# record significant interactions (p-value<0.05)
# and correct for multiple tests
interactions = []

for pt in tqdm(pairs_type, desc='pair type'):
    for nt in tqdm(network_type, desc='network type'):
        x = conditions[pt][nt]
        for mt in metastasis_datasets:
            for td in tissue_datasets:
                
                y = x[(x.metastasis_dataset==mt)&
                      (x.tissue_dataset==td)].reset_index(drop=True).dropna()
                
                # each dataset (y) has the same number of interactions with the
                # same order as the intercell graph for each tissue dataset
                # that way, we can use the newly created index to index 
                # interactions on the original graph
                
                tests = ['fisher_exact', 'MannWhitneyU', 'pvalue']
                stat_label = [i for i in tests if i in y.columns.to_list()][0]
                
                reject, pval = multitest.fdrcorrection(y[stat_label])
                
                stat_label_corr = stat_label + '_corr'
                y[stat_label_corr] = pval
                
                # filter using a significance of 0.05
                sign_inters = y[y[stat_label_corr]<0.05]
                
                interactions.append({
                    'pairs_type': pt,
                    'network_type': nt,
                    'metastasis_dataset': mt,
                    'tissue_dataset': td,
                    'interaction': sign_inters.index.to_list(),
                    'signal': sign_inters.signal.to_list(),
                    'value': sign_inters['value'],
                    'stat': sign_inters['stat'],
                    'pval': sign_inters[stat_label],
                    'pval_corr': sign_inters[stat_label_corr]
                })

interactions = pd.DataFrame(interactions)
interactions.head(2)

pair type:   0%|          | 0/2 [00:00<?, ?it/s]

network type:   0%|          | 0/2 [00:00<?, ?it/s]

network type:   0%|          | 0/2 [00:00<?, ?it/s]

Unnamed: 0,pairs_type,network_type,metastasis_dataset,tissue_dataset,interaction,signal,value,stat,pval,pval_corr
0,orgpairs,genecalls,autopsy,gtex,[],[],"Series([], Name: value, dtype: float64)","Series([], Name: stat, dtype: object)","Series([], Name: fisher_exact, dtype: float64)","Series([], Name: fisher_exact_corr, dtype: flo..."
1,orgpairs,genecalls,autopsy,consensus,[],[],"Series([], Name: value, dtype: float64)","Series([], Name: stat, dtype: object)","Series([], Name: fisher_exact, dtype: float64)","Series([], Name: fisher_exact_corr, dtype: flo..."


In [130]:
# count how many times each interaction appears in all conditions
interactions_records = interactions.explode(['interaction', 'signal', 'value', 'stat', 'pval', 'pval_corr'])
display(interactions_records.tail(5))
grouped = interactions_records.groupby(['tissue_dataset', 'interaction'], as_index=False).size()
grouped.head(2)

Unnamed: 0,pairs_type,network_type,metastasis_dataset,tissue_dataset,interaction,signal,value,stat,pval,pval_corr
15,frequency,weighted_network,hcmdb,consensus,9835,1.0,0.216349,spearman,5e-05,0.00036
15,frequency,weighted_network,hcmdb,consensus,9836,1.0,0.261254,spearman,1e-06,1.9e-05
15,frequency,weighted_network,hcmdb,consensus,9837,1.0,0.142113,spearman,0.008113,0.023471
15,frequency,weighted_network,hcmdb,consensus,9851,-1.0,0.191411,spearman,0.000343,0.001697
15,frequency,weighted_network,hcmdb,consensus,9856,1.0,0.162239,spearman,0.00247,0.008668


Unnamed: 0,tissue_dataset,interaction,size
0,consensus,0,1
1,consensus,3,3


In [131]:
# merge interaction counts in each tissue dataset graph with the full interaction graph
# preserve all interactions: left (full graph) keys
# interactions that do not appear in one graph or both graphs will have a NaN count
counts = full_graph.copy()
for td in tissue_datasets:
    x = grouped.loc[grouped.tissue_dataset==td, ['interaction', 'size']]
    counts = pd.merge(counts, x, left_on=td, right_on='interaction', how='left')

# sum counts from both graphs
counts['size_x'] = counts['size_x'].fillna(0)
counts['size_y'] = counts['size_y'].fillna(0)
counts['count'] = counts['size_x']+counts['size_y']

In [132]:
inters_counts = counts[['source', 'target', 'gtex', 'consensus', 'count']]
inters_counts.sort_values('count', ascending=False)

Unnamed: 0,source,target,gtex,consensus,count
9654,PF4,CD163,7101.0,7306.0,8.0
7133,PF4,LDLR,7112.0,7317.0,8.0
2172,PF4,FGFR2,7110.0,7315.0,8.0
7749,SERPINA5,CPB2,7988.0,8236.0,8.0
4592,PRTN3,F2RL1,7363.0,7573.0,8.0
...,...,...,...,...,...
7188,F2RL1,GNAS,3649.0,3704.0,0.0
7190,LPAR2,GNAS,6202.0,6370.0,0.0
7192,EDNRA,GNAS,3218.0,3269.0,0.0
7194,CCKBR,GNAS,779.0,786.0,0.0


In [133]:
# all conditions
n_conditions = 1
for i in interactions.columns[:4]: # exclude "interaction" & "signal"
    print(i)
    n_conditions *= len(interactions[i].unique())
    print(interactions[i].unique())
print('n_conditions:', n_conditions)

pairs_type
['orgpairs' 'frequency']
network_type
['genecalls' 'weighted_network']
metastasis_dataset
['autopsy' 'hcmdb']
tissue_dataset
['gtex' 'consensus']
n_conditions: 16


In [134]:
inters_counts['pct'] = inters_counts['count']/n_conditions
inters_counts.sort_values(by='count', ascending=False)

Unnamed: 0,source,target,gtex,consensus,count,pct
9654,PF4,CD163,7101.0,7306.0,8.0,0.5
7133,PF4,LDLR,7112.0,7317.0,8.0,0.5
2172,PF4,FGFR2,7110.0,7315.0,8.0,0.5
7749,SERPINA5,CPB2,7988.0,8236.0,8.0,0.5
4592,PRTN3,F2RL1,7363.0,7573.0,8.0,0.5
...,...,...,...,...,...,...
7188,F2RL1,GNAS,3649.0,3704.0,0.0,0.0
7190,LPAR2,GNAS,6202.0,6370.0,0.0,0.0
7192,EDNRA,GNAS,3218.0,3269.0,0.0,0.0
7194,CCKBR,GNAS,779.0,786.0,0.0,0.0


In [135]:
inters_counts[inters_counts['count']>0].shape

(7262, 6)

## Tissue Diversity Analysis (interaction tissue specificity)
In this step we will evaluate each interaction tissue specificity. Ideally, we are looking for interactions that appear in most tissue pairs, i.e. have an high tissue diversity. That means that this interactions appear in most pairs but are more common in organotropism pairs or pairs with an higher frequency.
Interactions specific to one or few tissues might also be drivers of metastasis but its relevance for metastasis formation might be masked by other factors specific to that or those few tissues, like organ accessibility, vascularization, anatomic location, etc.

To quantify interaction specificity we will use:
* shannon index (entropy)
* max(tau) of each interaction

### Compute shannon index and max(tau) of each interaction

In [136]:
# Confirm if there are entries without signal
# If they exist, we need to find out why that happens
interactions_records[interactions_records.signal==0]

Unnamed: 0,pairs_type,network_type,metastasis_dataset,tissue_dataset,interaction,signal,value,stat,pval,pval_corr


In [137]:
interactions_records = interactions_records.dropna()
interactions_records.head(2)

Unnamed: 0,pairs_type,network_type,metastasis_dataset,tissue_dataset,interaction,signal,value,stat,pval,pval_corr
3,orgpairs,genecalls,hcmdb,consensus,515,1.0,5.612903,OR,0.000243,0.049071
3,orgpairs,genecalls,hcmdb,consensus,2684,-1.0,0.198223,OR,0.00025,0.049071


In [138]:
g = full_graph[['source', 'target', 'gtex']].rename(columns={'gtex': 'interaction'})
g['tissue_dataset'] = ['gtex' for i in range(g.shape[0])]
c = full_graph[['source', 'target', 'consensus']].rename(columns={'consensus': 'interaction'})
c['tissue_dataset'] = ['consensus' for i in range(g.shape[0])]
fg = pd.concat([g, c]).reset_index().rename(columns={'index':'inter_id'})
display(fg.head(2))
fg.info()

Unnamed: 0,inter_id,source,target,interaction,tissue_dataset
0,0,APP,GRM7,426.0,gtex
1,1,CXCL16,GRM7,2883.0,gtex


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20340 entries, 0 to 20339
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   inter_id        20340 non-null  int64  
 1   source          20340 non-null  object 
 2   target          20340 non-null  object 
 3   interaction     19447 non-null  float64
 4   tissue_dataset  20340 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 794.7+ KB


In [139]:
# drop interactions that do not exist (depends on the tissue dataset)
fg.dropna(inplace=True)
fg.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19447 entries, 0 to 20339
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   inter_id        19447 non-null  int64  
 1   source          19447 non-null  object 
 2   target          19447 non-null  object 
 3   interaction     19447 non-null  float64
 4   tissue_dataset  19447 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 911.6+ KB


In [140]:
a = gene_tau['gtex']
a['tissue_dataset'] = ['gtex' for i in range(a.shape[0])]
b = gene_tau['consensus']
b['tissue_dataset'] = ['consensus' for i in range(b.shape[0])]
ab = pd.concat([a, b])
ab.head(2)

Unnamed: 0,gene_id,tau,tissue_dataset
0,TSPAN6,0.442,gtex
1,TNMD,0.895,gtex


In [141]:
# add source and target genes tau
fgab = pd.merge(
    fg,
    ab.rename(columns={'gene_id': 'source'}),
    on=['source', 'tissue_dataset']
)
fgab = pd.merge(
    fgab,
    ab.rename(columns={'gene_id': 'target'}),
    suffixes=['_source', '_target'],
    on=['target', 'tissue_dataset']
) 
fgab.head(2)

Unnamed: 0,inter_id,source,target,interaction,tissue_dataset,tau_source,tau_target
0,0,APP,GRM7,426.0,gtex,0.149,0.825
1,1,CXCL16,GRM7,2883.0,gtex,0.467,0.825


In [142]:
# interaction_tau = max(source_tau, target_tau)
fgab['inter_tau'] = fgab[['tau_source', 'tau_target']].max(axis=1)
fgab.head(2)

Unnamed: 0,inter_id,source,target,interaction,tissue_dataset,tau_source,tau_target,inter_tau
0,0,APP,GRM7,426.0,gtex,0.149,0.825,0.825
1,1,CXCL16,GRM7,2883.0,gtex,0.467,0.825,0.825


In [143]:
# add the unique interactions id
# this way, the column "interaction" has the specific id for each tissue dataset
sign_inter = pd.merge(fgab, interactions_records)
sign_inter.head(2)

Unnamed: 0,inter_id,source,target,interaction,tissue_dataset,tau_source,tau_target,inter_tau,pairs_type,network_type,metastasis_dataset,signal,value,stat,pval,pval_corr
0,0,APP,GRM7,426.0,gtex,0.149,0.825,0.825,frequency,genecalls,autopsy,-1.0,-5636.0,MWU_ratio,0.00048,0.005027
1,0,APP,GRM7,426.0,gtex,0.149,0.825,0.825,frequency,weighted_network,autopsy,-1.0,0.193216,spearman,9.5e-05,0.002015


In [144]:
# Compute entropy for each interaction and condition
importlib.reload(intercell_interactions)

inter_records = sign_inter.to_dict('records')
entropy_records = Parallel(n_jobs=8)(
    delayed(intercell_interactions.inter_shannon_index)(
        inter_records[i],
        interaction_arrays,
        tissue_pairs
) for i in range(len(inter_records)))

In [145]:
# convert entropy results to dataframe
entropy = pd.DataFrame(entropy_records)

# prepare data to plot
c_entropy = entropy[entropy.columns.drop('metastasis_entropy')]
c_entropy.rename(columns={'cancer_entropy': 'entropy'}, inplace=True)
c_entropy['entropy_tissue'] = ['cancer' for i in range(c_entropy.shape[0])]
m_entropy = entropy[entropy.columns.drop('cancer_entropy')]
m_entropy.rename(columns={'metastasis_entropy': 'entropy'}, inplace=True)
m_entropy['entropy_tissue'] = ['metastasis' for i in range(m_entropy.shape[0])]

plot_entropy = pd.concat([c_entropy, m_entropy])
entropy.head(2)

Unnamed: 0,inter_id,source,target,interaction,tissue_dataset,tau_source,tau_target,inter_tau,pairs_type,network_type,metastasis_dataset,signal,value,stat,pval,pval_corr,cancer_entropy,metastasis_entropy
0,0,APP,GRM7,426.0,gtex,0.149,0.825,0.825,frequency,genecalls,autopsy,-1.0,-5636.0,MWU_ratio,0.00048,0.005027,2.573301,2.062874
1,0,APP,GRM7,426.0,gtex,0.149,0.825,0.825,frequency,weighted_network,autopsy,-1.0,0.193216,spearman,9.5e-05,0.002015,2.831882,2.756112


### Filter interactions

#### Drop tissue specific interactions ($interaction\space tau <0.9$ & $entropy > 0$)

In [146]:
filt = entropy[
    (entropy.inter_tau<0.9)&
    (entropy.cancer_entropy>0)&
    (entropy.metastasis_entropy>0)
]
print('n_interactions:', len(filt.inter_id.unique()))

n_interactions: 5975


#### Split dataset in spearman correlation and non spearman correlation entries

In [147]:
corr_inter = filt[filt.stat=='spearman']
print(corr_inter.shape[0])
print('n_interactions (correlation):', len(corr_inter.inter_id.unique()))

n_corr_inter = filt[~(filt.stat=='spearman')]
print(n_corr_inter.shape[0])
print('n_interactions (other tests):', len(n_corr_inter.inter_id.unique()))

9240
n_interactions (correlation): 5242
5069
n_interactions (other tests): 3434


More than half of spearman correlation interactions are also significant in other conditions

#### Apply correlation thresholds to remove lower correlation entries (when the spearman correlation is close to zero)

In [148]:
spearman = conditions['frequency']['weighted_network'].drop(columns='pvalue')
entropy_filt_corr = pd.merge(corr_inter, spearman)

thresholds = [0.2, 0.25, 0.3]
interaction_lists = []
for t in thresholds:
    thresh_filt = entropy_filt_corr[
        (entropy_filt_corr.spearman>t) | 
        (entropy_filt_corr.spearman<-t)
    ]
    print(f'(corr < -{t} & corr > {t})')
    print('n_interactions (correlation):', len(thresh_filt.inter_id.unique()))
    
    inter_filtered = pd.concat([n_corr_inter, thresh_filt])
    interaction_lists.append(inter_filtered)
    print('n_interactions (total):', len(inter_filtered.inter_id.unique()))
    print()

(corr < -0.2 & corr > 0.2)
n_interactions (correlation): 2114
n_interactions (total): 4164

(corr < -0.25 & corr > 0.25)
n_interactions (correlation): 806
n_interactions (total): 3646

(corr < -0.3 & corr > 0.3)
n_interactions (correlation): 152
n_interactions (total): 3457



In [149]:
thresholds = [0.2, 0.25, 0.3]
interaction_lists = []
for t in thresholds:
    # value corresponds to abs(spearman)
    thresh_filt = corr_inter[(corr_inter.value>t)]
    print(f'(corr < -{t} & corr > {t})')
    print('n_interactions (correlation):', len(thresh_filt.inter_id.unique()))
    
    inter_filtered = pd.concat([n_corr_inter, thresh_filt])
    interaction_lists.append(inter_filtered)
    print('n_interactions (total):', len(inter_filtered.inter_id.unique()))
    print()

(corr < -0.2 & corr > 0.2)
n_interactions (correlation): 2114
n_interactions (total): 4164

(corr < -0.25 & corr > 0.25)
n_interactions (correlation): 806
n_interactions (total): 3646

(corr < -0.3 & corr > 0.3)
n_interactions (correlation): 152
n_interactions (total): 3457



#### Filter interactions based on the metastasis dataset
We will keep interactions that appear both in HCMDB and Autopsy

In [151]:
final_lists = []
for t, i in zip(thresholds, interaction_lists):
    print('correlation threshold:', t)
    
    g = i.groupby(['inter_id'], as_index=False)
    filtered = g.filter(lambda x: (x.metastasis_dataset.unique().shape[0]>1)&(x.signal.unique().shape[0]==1))
    final_lists.append(filtered)
    print(filtered.shape[0])
    print('n_interactions:', len(filtered.inter_id.unique()))

correlation threshold: 0.2
4506
n_interactions: 1528
correlation threshold: 0.25
2906
n_interactions: 1121
correlation threshold: 0.3
2356
n_interactions: 1006


In [152]:
# export interaction list
for t, i in zip(thresholds, final_lists):
    i.to_csv(interactions_dir+f'interactions_corr_{t}.csv', index=False)

## Export file to publish

In [154]:
inters = pd.read_csv(interactions_dir+f'interactions_corr_0.25.csv')
print(inters.shape[0])
print(inters.inter_id.drop_duplicates().shape[0])
inters.head(2)

2906
1121


Unnamed: 0,inter_id,source,target,interaction,tissue_dataset,tau_source,tau_target,inter_tau,pairs_type,network_type,metastasis_dataset,signal,value,stat,pval,pval_corr,cancer_entropy,metastasis_entropy
0,422,WNT4,LRP6,9414.0,gtex,0.739,0.348,0.739,frequency,genecalls,autopsy,-1.0,-5756.0,MWU_ratio,0.003759,0.022155,2.9247,3.007392
1,1149,APOC2,LRP1,385.0,gtex,0.862,0.317,0.862,frequency,genecalls,hcmdb,1.0,2706.0,MWU_ratio,0.000344,0.012371,2.709835,2.472963


In [156]:
curated_graph = pd.read_csv(intercell_net_dir+'intercell_curated_graph.csv')
curated_graph.head(2)

Unnamed: 0,source,target
0,APP,GRM7
1,CXCL16,GRM7


In [157]:
curated_graph['is_curated'] = ['yes' for i in range(curated_graph.shape[0])]

In [158]:
inters_new = pd.merge(inters, curated_graph, how='left').fillna('no')
print(inters_new.inter_id.drop_duplicates().shape[0])
print(inters_new[inters_new.is_curated=='yes'].inter_id.drop_duplicates().shape[0])
inters_new.head(3)

1121
535


Unnamed: 0,inter_id,source,target,interaction,tissue_dataset,tau_source,tau_target,inter_tau,pairs_type,network_type,metastasis_dataset,signal,value,stat,pval,pval_corr,cancer_entropy,metastasis_entropy,is_curated
0,422,WNT4,LRP6,9414.0,gtex,0.739,0.348,0.739,frequency,genecalls,autopsy,-1.0,-5756.0,MWU_ratio,0.003759048,0.022155,2.9247,3.007392,yes
1,1149,APOC2,LRP1,385.0,gtex,0.862,0.317,0.862,frequency,genecalls,hcmdb,1.0,2706.0,MWU_ratio,0.0003438877,0.012371,2.709835,2.472963,yes
2,1158,FCN1,LRP1,3739.0,gtex,0.895,0.317,0.895,frequency,genecalls,autopsy,1.0,8371.0,MWU_ratio,2.148662e-07,1.1e-05,2.573301,2.062874,no


In [159]:
inters_new.to_csv('../SupplementaryFiles/'+'metastasis_associated_intercellular_interactions_v1.1.csv', index=False)

# Gene Selection and Comparison

## Pubmed Search

In [160]:
# load interaction list
inter_data = pd.read_csv(interactions_dir+f'interactions_corr_0.25.csv', index_col='inter_id')
inter_data.head(2)

Unnamed: 0_level_0,source,target,interaction,tissue_dataset,tau_source,tau_target,inter_tau,pairs_type,network_type,metastasis_dataset,signal,value,stat,pval,pval_corr,cancer_entropy,metastasis_entropy
inter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
422,WNT4,LRP6,9414.0,gtex,0.739,0.348,0.739,frequency,genecalls,autopsy,-1.0,-5756.0,MWU_ratio,0.003759,0.022155,2.9247,3.007392
1149,APOC2,LRP1,385.0,gtex,0.862,0.317,0.862,frequency,genecalls,hcmdb,1.0,2706.0,MWU_ratio,0.000344,0.012371,2.709835,2.472963


In [238]:
all_genes_list = []

for td in ['gtex', 'consensus']:
    g = pd.read_csv(gene_exp_dir+f'{td}/grouped_records.csv')['gene_id'].drop_duplicates().to_list()
    all_genes_list.extend(g)

all_genes_list = set(all_genes_list)
len(all_genes_list)

25764

In [239]:
inter_list = inter_data[['source', 'target']].drop_duplicates().reset_index(drop=True)

sign_genes = pd.DataFrame(
    pd.concat([
        inter_list['source'].rename('gene'),
        inter_list['target'].rename('gene')])
        ).drop_duplicates()

sign_genes['type'] = ['sign'] * sign_genes.shape[0]
print('total number of metastasis-associated genes:', sign_genes.shape[0])

total number of metastasis-associated genes: 616


In [240]:
intercell_genes = pd.concat([
    graph['source'].rename('gene'),
    graph['target'].rename('gene')
]).drop_duplicates().reset_index(drop=True)

intercell_genes = pd.DataFrame(intercell_genes[intercell_genes.isin(all_genes_list)])

intercell_genes = pd.merge(intercell_genes, sign_genes, how='left').fillna('not_sign')
intercell_genes.head(2)

Unnamed: 0,gene,type
0,APP,sign
1,CXCL16,sign


#### Association with metastasis OR invasion

In [None]:
# search pmids of genes from metastasis-associated interactions in article Titles and Abstracts
from Bio import Entrez
gene_pmid = {'gene': [], 'pmid': []}
for gene in tqdm(intercell_genes['gene'].to_list()):
    Entrez.email = "jamiranda@fc.ul.pt"
    handle = Entrez.esearch(
        db='pubmed',
        term=f'({gene}) AND (metastasis OR invasion)',
        field='Title/Abstract',
        RetMax=1000
    )
    record = Entrez.read(handle)
    pmids = record["IdList"]

    gene_pmid['pmid'].extend(pmids)
    gene_pmid['gene'].extend([gene] * len(pmids))
    handle.close()

gene_pmid = pd.DataFrame(gene_pmid)

In [269]:
pmids_association = pd.merge(intercell_genes, gene_pmid)
pmids_association.head(2)

Unnamed: 0,gene,type,pmid
0,APP,sign,37267825.0
1,APP,sign,37219600.0


In [270]:
# output pmdis recordes
pmids_association.to_csv(interactions_dir+'gene_pmids_association.csv', index=False)

#### Total number of papers per gene

In [None]:
# search pmids of genes from metastasis-associated interactions in article Titles and Abstracts
from Bio import Entrez
gene_pmid = {'gene': [], 'pmid': []}
for gene in tqdm(intercell_genes['gene'].to_list()):
    Entrez.email = "jamiranda@fc.ul.pt"
    handle = Entrez.esearch(
        db='pubmed',
        term=f'{gene}',
        field='Title/Abstract',
        RetMax=1000
    )
    record = Entrez.read(handle)
    pmids = record["IdList"]

    gene_pmid['pmid'].extend(pmids)
    gene_pmid['gene'].extend([gene] * len(pmids))
    handle.close()

gene_pmid = pd.DataFrame(gene_pmid)

In [None]:
pmids_total = pd.merge(intercell_genes, gene_pmid)
pmids_total.head(2)

#### Calculate PMID ratio

In [None]:
# calculate number of pmids per gene
n_pmids_association = pmids_association.groupby(['gene', 'type'], as_index=False)\
    .count().rename(columns={'pmid': 'n_pmid'})

n_pmid_total = pmids_total.groupby(['gene', 'type'], as_index=False)\
    .count().rename(columns={'pmid': 'n_pmid'})

In [301]:
# calculate the ratio of pmids associated vs total number of pmids
n_pmid_ratio = pd.merge(
    n_pmids_association.reset_index(), n_pmid_total.reset_index(),
    on=['gene', 'type'], suffixes=['_association', '_total']
    )
n_pmid_ratio['ratio'] = n_pmid_ratio['n_pmid_association']/n_pmid_ratio['n_pmid_total']

n_pmid_ratio.to_csv(interactions_dir+'pmid_ratio.csv', index=False)

## Disgenet Search

In [303]:
# load gene list
pmids = pd.read_csv(interactions_dir+'pmid_ratio.csv')
display(pmids.head(2))

Unnamed: 0,gene,type,n_pmid_association,n_pmid_total,ratio
0,A1BG,not_sign,4,73,0.054795
1,A2M,not_sign,23,654,0.035168


In [311]:
curated_graph = pd.read_csv(intercell_net_dir+'intercell_curated_graph.csv')
curated_graph = set(curated_graph['source'].tolist() + curated_graph['target'].tolist())

In [313]:
intercell_genes = pmids[['gene', 'type']].drop_duplicates()
print(intercell_genes.shape[0])
display(intercell_genes.head(2))
intercell_curated_genes = intercell_genes[intercell_genes.gene.isin(curated_graph)]
print(intercell_curated_genes.shape[0])
intercell_curated_genes.head(2)

1935


Unnamed: 0,gene,type
0,A1BG,not_sign
1,A2M,not_sign


1813


Unnamed: 0,gene,type
0,A1BG,not_sign
1,A2M,not_sign


In [315]:
disgenet = sqlite3.connect(raw_data_dir+'disgenet_2020.db')

In [316]:
gene_disease_net = pd.read_sql_query("SELECT * FROM geneDiseaseNetwork", disgenet)
gene_disease_net.head(2)

Unnamed: 0,NID,diseaseNID,geneNID,source,association,associationType,sentence,pmid,score,EL,EI,year
0,1130681,2107,793,BEFREE,,Biomarker,No correlation could be found between Broder's...,1000501.0,0.1,,0.956175,1976
1,261998,431,775,BEFREE,,GeneticVariation,"However, there are few reports describing soma...",10021299.0,0.4,,0.987013,1999


In [317]:
print(gene_disease_net.shape[0])
gene_disease_net.source.unique()

3261324


array(['BEFREE', 'MGD', 'UNIPROT', 'CTD_human', 'RGD', 'CLINVAR',
       'CLINGEN', 'GENOMICS_ENGLAND', 'ORPHANET', 'PSYGENET', 'LHGDN',
       'CTD_mouse', 'GWASDB', 'GWASCAT', 'CTD_rat', 'HPO', 'CGI'],
      dtype=object)

In [318]:
gene_disease_net.association.unique()

array([nan,  0.,  1.])

In [319]:
gene_attr = pd.read_sql_query("SELECT * FROM geneAttributes", disgenet)
print(gene_attr.shape[0])
display(gene_attr.head(2))

26137


Unnamed: 0,geneNID,geneId,geneName,geneDescription,pLI,DSI,DPI
0,1,1,A1BG,alpha-1-B glycoprotein,4.9917e-09,0.7,0.538
1,2,2,A2M,alpha-2-macroglobulin,4.5229e-11,0.529,0.769


In [320]:
intercell_gene_attr = gene_attr[gene_attr.geneName.isin(intercell_genes.gene)]
print(intercell_gene_attr.shape[0])
intercell_gene_attr.head(2)

1909


Unnamed: 0,geneNID,geneId,geneName,geneDescription,pLI,DSI,DPI
0,1,1,A1BG,alpha-1-B glycoprotein,4.9917e-09,0.7,0.538
1,2,2,A2M,alpha-2-macroglobulin,4.5229e-11,0.529,0.769


In [321]:
disease_attr = pd.read_sql_query("SELECT * FROM diseaseAttributes", disgenet)
disease_attr.head(2)

Unnamed: 0,diseaseNID,diseaseId,diseaseName,type
0,1,C0000727,"Abdomen, Acute",phenotype
1,2,C0000729,Abdominal Cramps,phenotype


In [322]:
disease2class = pd.read_sql_query("SELECT * FROM disease2class", disgenet)
display(disease2class.head(2))
disease_class = pd.read_sql_query("SELECT * FROM diseaseClass", disgenet)
display(disease_class.head(2))
disease_classes = pd.merge(disease_class, disease2class)
disease_classes.head(2)

Unnamed: 0,diseaseNID,diseaseClassNID
0,1840,20
1,38,20


Unnamed: 0,diseaseClassNID,vocabulary,diseaseClass,diseaseClassName
0,2,MSH,C23,"Pathological Conditions, Signs and Symptoms"
1,3,MSH,C16,"Congenital, Hereditary, and Neonatal Diseas..."


Unnamed: 0,diseaseClassNID,vocabulary,diseaseClass,diseaseClassName,diseaseNID
0,2,MSH,C23,"Pathological Conditions, Signs and Symptoms",8745
1,2,MSH,C23,"Pathological Conditions, Signs and Symptoms",1856


In [323]:
gene_disease_net = pd.merge(gene_disease_net, intercell_gene_attr)
gene_disease_net = pd.merge(gene_disease_net, disease_attr)
gene_disease_net = pd.merge(gene_disease_net, disease_classes)
print(gene_disease_net.shape[0])
gene_disease_net.head(2)

1620220


Unnamed: 0,NID,diseaseNID,geneNID,source,association,associationType,sentence,pmid,score,EL,...,pLI,DSI,DPI,diseaseId,diseaseName,type,diseaseClassNID,vocabulary,diseaseClass,diseaseClassName
0,1623571,3683,3138,BEFREE,,AlteredExpression,The r/r mice were not resistant to other skele...,10021460.0,0.01,,...,7.7197e-18,0.385,0.885,C0151846,Periosteal Disorder,disease,25,MSH,C05,Musculoskeletal Diseases
1,1623592,3683,4149,BEFREE,,AlteredExpression,The r/r mice were not resistant to other skele...,10021460.0,0.01,,...,0.012093,0.397,0.846,C0151846,Periosteal Disorder,disease,25,MSH,C05,Musculoskeletal Diseases


In [324]:
gene_disease_net.columns

Index(['NID', 'diseaseNID', 'geneNID', 'source', 'association',
       'associationType', 'sentence', 'pmid', 'score', 'EL', 'EI', 'year',
       'geneId', 'geneName', 'geneDescription', 'pLI', 'DSI', 'DPI',
       'diseaseId', 'diseaseName', 'type', 'diseaseClassNID', 'vocabulary',
       'diseaseClass', 'diseaseClassName'],
      dtype='object')

In [325]:
gene_disease_net.type.unique()

array(['disease', 'group', 'phenotype'], dtype=object)

In [326]:
gene_disease_net.diseaseClassName.unique()

array(['   Musculoskeletal Diseases', '   Neoplasms',
       '   Skin and Connective Tissue Diseases',
       '   Pathological Conditions, Signs and Symptoms',
       '   Digestive System Diseases', '   Endocrine System Diseases',
       '   Female Urogenital Diseases and Pregnancy Complications',
       '   Immune System Diseases',
       '   Congenital, Hereditary, and Neonatal Diseases and Abnormalities',
       '   Respiratory Tract Diseases', '   Eye Diseases',
       '   Stomatognathic Diseases', '   Male Urogenital Diseases',
       '   Nervous System Diseases', '   Cardiovascular Diseases',
       '   Nutritional and Metabolic Diseases',
       '   Otorhinolaryngologic Diseases',
       '   Hemic and Lymphatic Diseases', '   Infections',
       '   Mental Disorders', '   Behavior and Behavior Mechanisms',
       '   Chemically-Induced Disorders', '   Wounds and Injuries',
       '   Occupational Diseases', '   Animal Diseases',
       '   Disorders of Environmental Origin'], dt

In [327]:
gda = gene_disease_net.loc[
    gene_disease_net.diseaseClassName=='   Neoplasms',
    ['geneName', 'diseaseName', 'diseaseClassName', 'source', 'score']
].drop_duplicates()
print(gda.shape[0])
print(gda.geneName.unique().shape[0])
gda.sort_values('score', ascending=False).head(5)

97213
1804


Unnamed: 0,geneName,diseaseName,diseaseClassName,source,score
821712,KIT,Gastrointestinal Stromal Tumors,Neoplasms,MGD,1.0
821990,KIT,Gastrointestinal Stromal Tumors,Neoplasms,CTD_human,1.0
821626,KIT,Gastrointestinal Stromal Tumors,Neoplasms,UNIPROT,1.0
375259,MET,Liver carcinoma,Neoplasms,UNIPROT,1.0
375263,MET,Liver carcinoma,Neoplasms,CGI,1.0


In [328]:
n_associations = gda[['geneName', 'diseaseName']].drop_duplicates().groupby(['geneName'], as_index=False).count()
n_associations.rename(columns={'diseaseName': 'n_associations'}, inplace=True)
print(n_associations.shape[0])
n_associations.head(2)

1804


Unnamed: 0,geneName,n_associations
0,A1BG,12
1,A2M,27


In [329]:
score_sum = gda[['geneName', 'diseaseName', 'score']].drop_duplicates()
score_sum = gda[['geneName', 'score']].groupby(['geneName'], as_index=False).sum()
score_sum.rename(columns={'score': 'score_sum'}, inplace=True)
print(score_sum.shape[0])
score_sum.head(2)

1804


Unnamed: 0,geneName,score_sum
0,A1BG,0.18
1,A2M,5.23


In [330]:
# keep only one entry for each gene -> the entry with the highest score
gda_intercell = gda[
    ['geneName', 'score']
].sort_values('score', ascending=False).drop_duplicates(subset='geneName', keep='first')
gda_intercell.rename(columns={'score': 'max_score'}, inplace=True)
print(gda_intercell.shape[0])
gda_intercell.head(2)

1804


Unnamed: 0,geneName,max_score
821712,KIT,1.0
375259,MET,1.0


In [331]:
# number of associated neoplasms + sum of scores
gda_intercell = pd.merge(gda_intercell, n_associations)
gda_intercell = pd.merge(gda_intercell, score_sum)
print(gda_intercell.shape[0])
gda_intercell.head(2)

1804


Unnamed: 0,geneName,max_score,n_associations,score_sum
0,KIT,1.0,435,80.09
1,MET,1.0,358,75.47


In [332]:
# add missing intercell genes and assign an association score of 0
# and number of associated neoplasms also 0
gda_intercell_all = pd.merge(
    intercell_genes,
    gda_intercell,
    how='left',
    right_on='geneName',
    left_on='gene'
)[['gene', 'max_score', 'score_sum', 'n_associations', 'type']]

gda_intercell_all.fillna(0, inplace=True)
print(gda_intercell_all.shape[0])
gda_intercell_all.head(2)

1935


Unnamed: 0,gene,max_score,score_sum,n_associations,type
0,A1BG,0.03,0.18,12.0,not_sign
1,A2M,0.51,5.23,27.0,not_sign


In [333]:
# add missing intercell genes and assign an association score of 0
# and number of associated neoplasms also 0
gda_intercell_curated = pd.merge(
    intercell_curated_genes,
    gda_intercell,
    how='left',
    right_on='geneName',
    left_on='gene'
)[['gene', 'max_score', 'score_sum', 'n_associations', 'type']]

gda_intercell_curated.fillna(0, inplace=True)
print(gda_intercell_curated.shape[0])
gda_intercell_curated.head(2)

1813


Unnamed: 0,gene,max_score,score_sum,n_associations,type
0,A1BG,0.03,0.18,12.0,not_sign
1,A2M,0.51,5.23,27.0,not_sign


In [334]:
gda_intercell_all.to_csv(interactions_dir+'disgenet_search.csv', index=False)
gda_intercell_curated.to_csv(interactions_dir+'disgenet_search_curated.csv', index=False)

## Open targets search

In [57]:
pmids = pd.read_csv(interactions_dir+'pmid_ratio.csv')

curated_graph = pd.read_csv(intercell_net_dir+'intercell_curated_graph.csv')
curated_graph = set(curated_graph['source'].tolist() + curated_graph['target'].tolist())

intercell_genes = pmids[['gene', 'type']].drop_duplicates()
intercell_curated_genes = intercell_genes[intercell_genes.gene.isin(curated_graph)]

### Explore database

In [58]:
# path to ClinVar (EVA) evidence dataset 
# directory stored on your local machine
targetsPath = raw_data_dir+"open_targets/targets"
moleculePath = raw_data_dir+"open_targets/molecule"
associationPath = raw_data_dir+"open_targets/associationByOverallDirect"
diseasePath = raw_data_dir+"open_targets/diseases"

In [59]:
# establish spark connection
spark = SparkSession.builder.master('local[*]').getOrCreate()

In [60]:
# read datasets
targets = spark.read.parquet(targetsPath)
diseases = spark.read.parquet(diseasePath)
molecules = spark.read.parquet(moleculePath)
association = spark.read.parquet(associationPath)

In [61]:
# Browse the targets schema
targets.printSchema()

root
 |-- id: string (nullable = true)
 |-- approvedSymbol: string (nullable = true)
 |-- biotype: string (nullable = true)
 |-- transcriptIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genomicLocation: struct (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: integer (nullable = true)
 |-- alternativeGenes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- approvedName: string (nullable = true)
 |-- go: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- evidence: string (nullable = true)
 |    |    |-- aspect: string (nullable = true)
 |    |    |-- geneProduct: string (nullable = true)
 |    |    |-- ecoId: string (nullable = true)
 |-- hallmarks: struct (nullable = true)
 |    |-- attribu

In [62]:
# select fields of interest
targetSelect = (
    targets.select(
        "id",
        "approvedSymbol",
        #"hallmarks",
    )
)
targetSelect.show(5)

+---------------+--------------+
|             id|approvedSymbol|
+---------------+--------------+
|ENSG00000002586|          CD99|
|ENSG00000015479|         MATR3|
|ENSG00000037280|          FLT4|
|ENSG00000038427|          VCAN|
|ENSG00000050730|         TNIP3|
+---------------+--------------+
only showing top 5 rows



In [63]:
target_df = targetSelect.toPandas()
target_df.head(2)

Unnamed: 0,id,approvedSymbol
0,ENSG00000002586,CD99
1,ENSG00000015479,MATR3


In [64]:
# Browse the disease schema
diseases.printSchema()

root
 |-- id: string (nullable = true)
 |-- code: string (nullable = true)
 |-- dbXRefs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- name: string (nullable = true)
 |-- directLocationIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- obsoleteTerms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- parents: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- sko: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- synonyms: struct (nullable = true)
 |    |-- hasBroadSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- hasExactSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- hasNarrowSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- hasRelatedSynonym: array (nullable = true

In [65]:
# select fields of interest
diseaseSelect = (
    diseases.select(
        "id",
        "name",
        'description',
        'therapeuticAreas'
    )
)
diseaseSelect.show(5)

+-------------+--------------------+--------------------+--------------------+
|           id|                name|         description|    therapeuticAreas|
+-------------+--------------------+--------------------+--------------------+
|MONDO_0013767|autoimmune lympho...|RAS-associated au...|[OTAR_0000018, MO...|
|MONDO_0014375|congenital diarrh...|Congenital chroni...|[OTAR_0000018, EF...|
|MONDO_0014662|congenital insens...|A hereditary sens...|[OTAR_0000018, EF...|
|MONDO_0015440|   ring chromosome 6|Ring chromosome 6...|      [OTAR_0000018]|
|MONDO_0015650|   epilepsy syndrome|                null|       [EFO_0000618]|
+-------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [66]:
disease_df = diseaseSelect.toPandas()
disease_df.head(2)

Unnamed: 0,id,name,description,therapeuticAreas
0,MONDO_0013767,autoimmune lymphoproliferative syndrome type 4,RAS-associated autoimmune leukoproliferative d...,"[OTAR_0000018, MONDO_0045024, EFO_0000540]"
1,MONDO_0014375,congenital diarrhea 7 with exudative enteropathy,Congenital chronic diarrhea with protein-losin...,"[OTAR_0000018, EFO_0010282]"


In [67]:
disease_df = disease_df.explode(column='therapeuticAreas')
disease_df.head()

Unnamed: 0,id,name,description,therapeuticAreas
0,MONDO_0013767,autoimmune lymphoproliferative syndrome type 4,RAS-associated autoimmune leukoproliferative d...,OTAR_0000018
0,MONDO_0013767,autoimmune lymphoproliferative syndrome type 4,RAS-associated autoimmune leukoproliferative d...,MONDO_0045024
0,MONDO_0013767,autoimmune lymphoproliferative syndrome type 4,RAS-associated autoimmune leukoproliferative d...,EFO_0000540
1,MONDO_0014375,congenital diarrhea 7 with exudative enteropathy,Congenital chronic diarrhea with protein-losin...,OTAR_0000018
1,MONDO_0014375,congenital diarrhea 7 with exudative enteropathy,Congenital chronic diarrhea with protein-losin...,EFO_0010282


In [68]:
# Browse the evidence schema
association.printSchema()

root
 |-- diseaseId: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- score: double (nullable = true)
 |-- evidenceCount: long (nullable = true)



In [69]:
# select fields of interest
associationSelect = (
    association.select(
        "targetId",
        "diseaseId",
        "score",
    )
)
associationSelect.show(5)

+---------------+-----------+--------------------+
|       targetId|  diseaseId|               score|
+---------------+-----------+--------------------+
|ENSG00000000938|EFO_0000574|  0.0831554673040579|
|ENSG00000002586|EFO_0000574|0.013653922409184817|
|ENSG00000002822|EFO_0000574|  0.3326218692162316|
|ENSG00000003400|EFO_0000574|  0.3991462430594779|
|ENSG00000003402|EFO_0000574|0.027233518203149707|
+---------------+-----------+--------------------+
only showing top 5 rows



In [70]:
association_df = associationSelect.toPandas()
association_df.head(2)

                                                                                

Unnamed: 0,targetId,diseaseId,score
0,ENSG00000000938,EFO_0000574,0.083155
1,ENSG00000002586,EFO_0000574,0.013654


In [71]:
# Browse the molecule schema
molecules.printSchema()

root
 |-- id: string (nullable = true)
 |-- canonicalSmiles: string (nullable = true)
 |-- inchiKey: string (nullable = true)
 |-- drugType: string (nullable = true)
 |-- name: string (nullable = true)
 |-- yearOfFirstApproval: long (nullable = true)
 |-- maximumClinicalTrialPhase: long (nullable = true)
 |-- parentId: string (nullable = true)
 |-- hasBeenWithdrawn: boolean (nullable = true)
 |-- isApproved: boolean (nullable = true)
 |-- withdrawnNotice: struct (nullable = true)
 |    |-- countries: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- classes: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- year: long (nullable = true)
 |-- tradeNames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- synonyms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- crossReferences: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNul

In [72]:
# select fields of interest
moleculeSelect = (
    molecules.select(
        "id",
        "name",
        "isApproved",
        "linkedTargets",
        "linkedDiseases",
        'maximumClinicalTrialPhase',
        'drugType'
    )
)
moleculeSelect.show(2)

+----------+--------------+----------+--------------------+--------------------+-------------------------+--------------+
|        id|          name|isApproved|       linkedTargets|      linkedDiseases|maximumClinicalTrialPhase|      drugType|
+----------+--------------+----------+--------------------+--------------------+-------------------------+--------------+
|CHEMBL1009|      LEVODOPA|      true|{[ENSG00000151577...|{[MONDO_0005090, ...|                        4|Small molecule|
| CHEMBL101|PHENYLBUTAZONE|      true|{[ENSG00000073756...|{[HP_0002829, HP_...|                        4|Small molecule|
+----------+--------------+----------+--------------------+--------------------+-------------------------+--------------+
only showing top 2 rows



In [73]:
molecule_df = moleculeSelect.toPandas()
display(molecule_df.head(2))
molecule_df.info()

Unnamed: 0,id,name,isApproved,linkedTargets,linkedDiseases,maximumClinicalTrialPhase,drugType
0,CHEMBL1009,LEVODOPA,True,"([ENSG00000151577], 1)","([MONDO_0005090, MONDO_0043209, EFO_0002610, M...",4.0,Small molecule
1,CHEMBL101,PHENYLBUTAZONE,True,"([ENSG00000073756, ENSG00000095303], 2)","([HP_0002829, HP_0003326, EFO_0005755], 3)",4.0,Small molecule


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12854 entries, 0 to 12853
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         12854 non-null  object 
 1   name                       12854 non-null  object 
 2   isApproved                 12847 non-null  object 
 3   linkedTargets              4026 non-null   object 
 4   linkedDiseases             4026 non-null   object 
 5   maximumClinicalTrialPhase  12847 non-null  float64
 6   drugType                   12847 non-null  object 
dtypes: float64(1), object(6)
memory usage: 703.1+ KB


In [74]:
molecule_df.dropna(inplace=True)
molecule_df['target'] = molecule_df.linkedTargets.apply(lambda x: x.asDict()['rows'])

In [75]:
# for now we only want target and molecule information
target_drug = molecule_df.drop(columns=['linkedTargets', 'linkedDiseases']).explode('target')
target_drug.head(2)

Unnamed: 0,id,name,isApproved,maximumClinicalTrialPhase,drugType,target
0,CHEMBL1009,LEVODOPA,True,4.0,Small molecule,ENSG00000151577
1,CHEMBL101,PHENYLBUTAZONE,True,4.0,Small molecule,ENSG00000073756


In [76]:
sign_targets = target_df[target_df.approvedSymbol.isin(intercell_genes[intercell_genes.type=='sign'].gene)].rename(columns={'id': 'target'})
print(sign_targets.shape[0])
print(intercell_genes[intercell_genes.type=='sign'].shape[0])
sign_targets.head(2)

617
616


Unnamed: 0,target,approvedSymbol
2,ENSG00000037280,FLT4
3,ENSG00000038427,VCAN


In [77]:
sign_targets.approvedSymbol[sign_targets.approvedSymbol.duplicated()]

47584    SIGLEC5
Name: approvedSymbol, dtype: object

In [78]:
target_drug_association = pd.merge(target_drug, sign_targets).drop(columns='target')
target_drug_association.rename(
    columns={
        'approvedSymbol': 'target',
        'name': 'drug',
        'id': 'drug_id'
    },
    inplace=True
)
display(target_drug_association.head(2))
print(target_drug_association.target.unique().shape[0])

Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target
0,CHEMBL1240,PROPANTHELINE BROMIDE,True,4.0,Small molecule,CHRM3
1,CHEMBL978,METHACHOLINE,True,4.0,Small molecule,CHRM3


229


In [79]:
target_drug_association = pd.merge(target_drug, sign_targets).drop(columns='target')
target_drug_association.rename(
    columns={
        'approvedSymbol': 'target',
        'name': 'drug',
        'id': 'drug_id'
    },
    inplace=True
)
display(target_drug_association.head(2))
print(target_drug_association.target.unique().shape[0])

Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target
0,CHEMBL1240,PROPANTHELINE BROMIDE,True,4.0,Small molecule,CHRM3
1,CHEMBL978,METHACHOLINE,True,4.0,Small molecule,CHRM3


229


In [80]:
y = molecule_df.dropna()
y['disease'] = y.linkedDiseases.apply(lambda x: x.asDict()['rows'])
disease_drug_association = y.drop(columns=['linkedTargets', 'linkedDiseases']).explode('disease')
disease_drug_association.rename(
    columns={
        'name': 'drug',
        'id': 'drug_id'
    }, inplace=True)
display(disease_drug_association.head(2))

disease_drug_association = pd.merge(
    disease_drug_association,
    disease_df.rename(columns={'id': 'disease'}),
    on='disease'
)
disease_drug_association.head(2)

Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target,disease
0,CHEMBL1009,LEVODOPA,True,4.0,Small molecule,[ENSG00000151577],MONDO_0005090
0,CHEMBL1009,LEVODOPA,True,4.0,Small molecule,[ENSG00000151577],MONDO_0043209


Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target,disease,name,description,therapeuticAreas
0,CHEMBL1009,LEVODOPA,True,4.0,Small molecule,[ENSG00000151577],MONDO_0005090,schizophrenia,A major psychotic disorder characterized by ab...,MONDO_0002025
1,CHEMBL1009,LEVODOPA,True,4.0,Small molecule,[ENSG00000151577],MONDO_0005090,schizophrenia,A major psychotic disorder characterized by ab...,EFO_0000618


In [81]:
# drop drugs not associated with the significant targets
disease_drug_association = disease_drug_association[
    disease_drug_association.drug.isin(target_drug_association.drug)
]
disease_drug_association.head(2)

Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target,disease,name,description,therapeuticAreas
40,CHEMBL1467,ALLOPURINOL,True,4.0,Small molecule,[ENSG00000158125],MONDO_0005090,schizophrenia,A major psychotic disorder characterized by ab...,MONDO_0002025
41,CHEMBL1467,ALLOPURINOL,True,4.0,Small molecule,[ENSG00000158125],MONDO_0005090,schizophrenia,A major psychotic disorder characterized by ab...,EFO_0000618


In [82]:
target_disease_association = association_df.rename(columns={'targetId': 'target', 'diseaseId': 'disease'})
target_disease_association = pd.merge(target_disease_association, sign_targets)
target_disease_association = pd.merge(
    target_disease_association,
    disease_df.rename(columns={'id': 'disease'})
)
target_disease_association.drop(columns='target', inplace=True)
target_disease_association.rename(columns={'approvedSymbol': 'target'}, inplace=True)
print(target_disease_association.shape[0])

# keep only proteins that have an associated-drug
target_disease_association = target_disease_association[
    target_disease_association.target.isin(target_drug_association.target)]
print(target_disease_association.shape[0])
target_disease_association.head(2)

536500
296687


Unnamed: 0,disease,score,target,name,description,therapeuticAreas
0,EFO_0000574,0.003696,ITGAL,lymphoma,A malignant (clonal) proliferation of B- lymph...,OTAR_0000018
1,EFO_0000574,0.003696,ITGAL,lymphoma,A malignant (clonal) proliferation of B- lymph...,MONDO_0045024


### Repurposing drugs
Right now we have three kinds of associations:
* **target_drug_association**: proteins from metastasis-associated interactions that already have at least a drug that targets them
* **disease_drug_association**: diseases treated by drugs that target proteins from metastasis-associated interactions
* **target_disease_association**: direct associations between proteins from metastasis-associated interactions and diseases

We are interested in three types of targets:
* *targets that are associated with cancer/metastasis and have already a drug used for that purpose.* This is a way to validate our metastasis-associated interactions
* *targets that are associated with cancer/metastasis and have a drug that is not used to treat that condition.* This suggests that repurposing that drug might be useful
* *targets that have an associated-drug but are not associated with cancer/metastasis.* This suggests that these protein might be considered as a therapeutic target by repurposing the associated-drug 

In [83]:
display(target_drug_association.info())
target_drug_association.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1624 entries, 0 to 1623
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   drug_id                    1624 non-null   object 
 1   drug                       1624 non-null   object 
 2   isApproved                 1624 non-null   object 
 3   maximumClinicalTrialPhase  1624 non-null   float64
 4   drugType                   1624 non-null   object 
 5   target                     1624 non-null   object 
dtypes: float64(1), object(5)
memory usage: 76.2+ KB


None

Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target
0,CHEMBL1240,PROPANTHELINE BROMIDE,True,4.0,Small molecule,CHRM3
1,CHEMBL978,METHACHOLINE,True,4.0,Small molecule,CHRM3
2,CHEMBL1200330,PILOCARPINE HYDROCHLORIDE,True,4.0,Small molecule,CHRM3
3,CHEMBL667,ACETYLCHOLINE,True,4.0,Small molecule,CHRM3
4,CHEMBL1184,ACETYLCHOLINE CHLORIDE,True,4.0,Small molecule,CHRM3


In [84]:
display(disease_drug_association.info())
disease_drug_association.head()

<class 'pandas.core.frame.DataFrame'>
Index: 24683 entries, 40 to 95021
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   drug_id                    24683 non-null  object 
 1   drug                       24683 non-null  object 
 2   isApproved                 24683 non-null  object 
 3   maximumClinicalTrialPhase  24683 non-null  float64
 4   drugType                   24683 non-null  object 
 5   target                     24683 non-null  object 
 6   disease                    24683 non-null  object 
 7   name                       24683 non-null  object 
 8   description                24375 non-null  object 
 9   therapeuticAreas           24683 non-null  object 
dtypes: float64(1), object(9)
memory usage: 2.1+ MB


None

Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target,disease,name,description,therapeuticAreas
40,CHEMBL1467,ALLOPURINOL,True,4.0,Small molecule,[ENSG00000158125],MONDO_0005090,schizophrenia,A major psychotic disorder characterized by ab...,MONDO_0002025
41,CHEMBL1467,ALLOPURINOL,True,4.0,Small molecule,[ENSG00000158125],MONDO_0005090,schizophrenia,A major psychotic disorder characterized by ab...,EFO_0000618
60,CHEMBL502,DONEPEZIL,True,4.0,Small molecule,[ENSG00000087085],MONDO_0005090,schizophrenia,A major psychotic disorder characterized by ab...,MONDO_0002025
61,CHEMBL502,DONEPEZIL,True,4.0,Small molecule,[ENSG00000087085],MONDO_0005090,schizophrenia,A major psychotic disorder characterized by ab...,EFO_0000618
72,CHEMBL1201607,NATALIZUMAB,True,4.0,Antibody,[ENSG00000115232],MONDO_0005090,schizophrenia,A major psychotic disorder characterized by ab...,MONDO_0002025


In [85]:
display(target_disease_association.info())
target_disease_association.head()

<class 'pandas.core.frame.DataFrame'>
Index: 296687 entries, 0 to 536484
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   disease           296687 non-null  object 
 1   score             296687 non-null  float64
 2   target            296687 non-null  object 
 3   name              296687 non-null  object 
 4   description       263596 non-null  object 
 5   therapeuticAreas  296687 non-null  object 
dtypes: float64(1), object(5)
memory usage: 15.8+ MB


None

Unnamed: 0,disease,score,target,name,description,therapeuticAreas
0,EFO_0000574,0.003696,ITGAL,lymphoma,A malignant (clonal) proliferation of B- lymph...,OTAR_0000018
1,EFO_0000574,0.003696,ITGAL,lymphoma,A malignant (clonal) proliferation of B- lymph...,MONDO_0045024
2,EFO_0000574,0.003696,ITGAL,lymphoma,A malignant (clonal) proliferation of B- lymph...,EFO_0005803
3,EFO_0000574,0.001478,SELE,lymphoma,A malignant (clonal) proliferation of B- lymph...,OTAR_0000018
4,EFO_0000574,0.001478,SELE,lymphoma,A malignant (clonal) proliferation of B- lymph...,MONDO_0045024


In [86]:
target_disease_association[target_disease_association.target=='EGFR'].sort_values('score', ascending=False).head(5)

Unnamed: 0,disease,score,target,name,description,therapeuticAreas
12709,EFO_0003060,0.843237,EGFR,non-small cell lung carcinoma,A group of at least three distinct histologica...,OTAR_0000010
12710,EFO_0003060,0.843237,EGFR,non-small cell lung carcinoma,A group of at least three distinct histologica...,MONDO_0045024
2564,EFO_0000311,0.750973,EGFR,cancer,"A tumor composed of atypical neoplastic, often...",MONDO_0045024
11708,EFO_0000519,0.730699,EGFR,glioblastoma multiforme,The most malignant astrocytic tumor (WHO grade...,EFO_0000618
11707,EFO_0000519,0.730699,EGFR,glioblastoma multiforme,The most malignant astrocytic tumor (WHO grade...,MONDO_0045024


In [87]:
target_disease_association[
    (target_disease_association.target=='EGFR')&
    (target_disease_association.therapeuticAreas=='MONDO_0045024')
].sort_values('score', ascending=False).head(5)

Unnamed: 0,disease,score,target,name,description,therapeuticAreas
12710,EFO_0003060,0.843237,EGFR,non-small cell lung carcinoma,A group of at least three distinct histologica...,MONDO_0045024
2564,EFO_0000311,0.750973,EGFR,cancer,"A tumor composed of atypical neoplastic, often...",MONDO_0045024
11707,EFO_0000519,0.730699,EGFR,glioblastoma multiforme,The most malignant astrocytic tumor (WHO grade...,MONDO_0045024
40698,EFO_0000571,0.728355,EGFR,lung adenocarcinoma,A carcinoma that arises from the lung and is c...,MONDO_0045024
4039,EFO_0000181,0.691397,EGFR,head and neck squamous cell carcinoma,A squamous cell carcinoma that arises from any...,MONDO_0045024


[MONDO_0045024](https://platform.opentargets.org/disease/MONDO_0045024) is the main identifier for cancer or benign tumor. We will use this ontology to filter targets and drugs

Number of metastasis-associated proteins that are drug targets and are annotated has being linked to neoplastic diseases

In [88]:
target_disease_association[
    target_disease_association.therapeuticAreas=='MONDO_0045024'
].target.unique().shape[0]

229

Number of metastasis-associated proteins which are a target for at least one drug

In [89]:
target_drug_association.target.unique().shape[0]

229

All metastasis-associated proteins which have a drug that targets them are associated with cancer diseases. That means that  no target meets the criteria for the third type of target

In [90]:
cancer_related_drugs = disease_drug_association[disease_drug_association.therapeuticAreas=='MONDO_0045024'].drug_id.drop_duplicates()
cancer_related_drugs.head(2)

2016    CHEMBL3137343
2028       CHEMBL1671
Name: drug_id, dtype: object

In [91]:
target_cancer_drug = target_drug_association[target_drug_association.drug_id.isin(cancer_related_drugs)].copy()
target_cancer_drug['type'] = ['cancer' for i in range(target_cancer_drug.shape[0])]
target_other_drug = target_drug_association[~target_drug_association.drug_id.isin(cancer_related_drugs)].copy()
target_other_drug['type'] = ['other' for i in range(target_other_drug.shape[0])]
drug_records = pd.concat([target_cancer_drug, target_other_drug])
display(drug_records.head())

Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target,type
2,CHEMBL1200330,PILOCARPINE HYDROCHLORIDE,True,4.0,Small molecule,CHRM3,cancer
7,CHEMBL517712,ATROPINE,True,4.0,Small molecule,CHRM3,cancer
11,CHEMBL550,PILOCARPINE,True,4.0,Small molecule,CHRM3,cancer
17,CHEMBL1231,OXYBUTYNIN,True,4.0,Small molecule,CHRM3,cancer
19,CHEMBL1382,TOLTERODINE,True,4.0,Small molecule,CHRM3,cancer


In [92]:
drug_records[drug_records.type=='cancer'].drug.drop_duplicates().head(10)

2     PILOCARPINE HYDROCHLORIDE
7                      ATROPINE
11                  PILOCARPINE
17                   OXYBUTYNIN
19                  TOLTERODINE
28           TIOTROPIUM BROMIDE
30               GLYCOPYRRONIUM
34             ATROPINE SULFATE
37                   TIOTROPIUM
42                  BETHANECHOL
Name: drug, dtype: object

In [93]:
drug_records[drug_records.drug=='IMATINIB']

Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target,type
855,CHEMBL941,IMATINIB,True,4.0,Small molecule,KIT,cancer
1014,CHEMBL941,IMATINIB,True,4.0,Small molecule,PDGFRB,cancer


In [98]:
# add curated label
curated_targets = intercell_curated_genes.loc[intercell_curated_genes.type=='sign', ['gene']].rename(columns={'gene': 'target'})
curated_targets['is_curated'] = ['yes' for _ in range(curated_targets.shape[0])]

drug_records = pd.merge(drug_records, curated_targets, on='target', how='left').fillna('no')
drug_records.head()

Unnamed: 0,drug_id,drug,isApproved,maximumClinicalTrialPhase,drugType,target,type,is_curated
0,CHEMBL1200330,PILOCARPINE HYDROCHLORIDE,True,4.0,Small molecule,CHRM3,cancer,yes
1,CHEMBL517712,ATROPINE,True,4.0,Small molecule,CHRM3,cancer,yes
2,CHEMBL550,PILOCARPINE,True,4.0,Small molecule,CHRM3,cancer,yes
3,CHEMBL1231,OXYBUTYNIN,True,4.0,Small molecule,CHRM3,cancer,yes
4,CHEMBL1382,TOLTERODINE,True,4.0,Small molecule,CHRM3,cancer,yes


In [99]:
drug_records.to_csv(interactions_dir+'open_targets_relevant_drugs.csv', index=False)