# Generation of test dataset

Here, I explore [DrugMechDB](https://sulab.github.io/DrugMechDB/) which could ideally provide us with true positive paths / MOAs.

In [1]:
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
import json
import networkx as nx
import matplotlib.pyplot as plt
import os

from pubchempy import get_compounds
from collections import Counter

## Load in the DrugMechDB file

In [2]:
# Data locations, given as paths relative to the notebook's working directory.
KG_DIR = '../data/kg'  # knowledge-graph TSVs are read from and written to this directory
VALIDATION_DIR = '../data/validation'  # holds indication_MOA_paths.xlsx
MAPPING_DIR = '../data/mappings'  # holds uniprot2ncbi.json

In [3]:
# 'paths' sheet: one path per row, with node names in n1..n8 and edge labels in e1..e7.
node_names_df = pd.read_excel(f'{VALIDATION_DIR}/indication_MOA_paths.xlsx', sheet_name='paths')
node_names_df.head(2)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,imatinib,INHIBITS,BCR/ABL,CAUSES,CML (ph+),,,,,,,,,,
1,imatinib,INHIBITS,c-Kit,UP_REGULATES,Cellular proliferation,CAUSES,Systemic mast cell disease,,,,,,,,


In [4]:
# 'metapaths' sheet: the node type (Drug, Protein, ...) at each position of every path.
node_type_df = pd.read_excel(f'{VALIDATION_DIR}/indication_MOA_paths.xlsx', sheet_name='metapaths')
# 'node_ids' sheet: the identifier of each node. Later cells look up all three
# sheets by the same row position, so the sheets must list paths in the same order.
node_ids_df = pd.read_excel(f'{VALIDATION_DIR}/indication_MOA_paths.xlsx', sheet_name='node_ids')

In [5]:
# Preview the identifiers of the first two paths.
node_ids_df.head(2)

Unnamed: 0,n1,n2,n3,n4,n5,n6,n7,n8
0,MESH:D000068877,UniProt:P00519,MESH:D015464,,,,,
1,MESH:D000068877,UniProt:P10721,GO:0008283,MESH:D034721,,,,


In [6]:
# Preview the node/edge types of the first two paths.
node_type_df.head(2)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,Drug,INHIBITS,Protein,CAUSES,Disease,,,,,,,,,,
1,Drug,INHIBITS,Protein,UP_REGULATES,Biological Process,CAUSES,Disease,,,,,,,,


# Get all drug-bp pairs

In the main file, each row is one valid mechanistic path. Each path is considered individually: for every path that starts at a drug, each biological process (BP) node on that path yields one drug-BP pair.

In [7]:
# Collect one (drug, biological process) pair for every BP node found on a path
# that starts at a drug. Each element of a pair is a dict with the node's
# identifier ('idx') and its human-readable name ('name').
drug_bp_pairs = []

# n1 holds the path's start node; every remaining node column, including the
# last one (n8), is scanned for biological processes.
downstream_node_cols = [f'n{position}' for position in range(2, 9)]

for idx, row in tqdm(node_type_df.iterrows(), total=node_type_df.shape[0]):
    # Explicit check instead of assert/except: asserts are removed under
    # `python -O`, which would silently disable this filter.
    if row['n1'] != 'Drug':
        continue

    # The three sheets are aligned by row position (assumes the default
    # RangeIndex, so the label from iterrows() equals the position).
    id_row = node_ids_df.iloc[idx]
    name_row = node_names_df.iloc[idx]
    drug_node = {
        'idx': id_row['n1'],
        'name': name_row['n1']
    }

    for node_col in downstream_node_cols:
        if row[node_col] == 'Biological Process':
            drug_bp_pairs.append((drug_node, {
                'idx': id_row[node_col],
                'name': name_row[node_col]
            }))

len(drug_bp_pairs)


100%|██████████| 123/123 [00:00<00:00, 2801.43it/s]


101

# Harmonization of nodes to consistent ontology

For compound nodes, we ground the drug names to PubChem compound identifiers so that their overlap with the KG can be checked.

In [8]:
# Ground every drug name to a PubChem compound id and emit one
# Compound -[induces]-> Biological Process edge per drug-BP pair.
drug2mech_harmonized_edges = []
drug_id_mapping = {}      # drug name -> 'pubchem.compound:<cid>'
unresolved_drugs = set()  # drug names for which PubChem returned no hit

for cmp_node, bp_node in tqdm(drug_bp_pairs):
    drug_name = cmp_node['name']

    # The same drug appears in several pairs; query PubChem only once per name
    # (each lookup is a network round trip).
    if drug_name in unresolved_drugs:
        continue
    if drug_name not in drug_id_mapping:
        compound = get_compounds(drug_name, 'name')
        if len(compound) == 0:
            unresolved_drugs.add(drug_name)
            continue
        # Use the first hit returned by PubChem.
        drug_id_mapping[drug_name] = 'pubchem.compound:' + str(compound[0].cid)

    drug2mech_harmonized_edges.append({
        'source': drug_id_mapping[drug_name],
        'source_node_type': 'Compound',
        'target': bp_node['idx'],
        'target_node_type': 'Biological Process',
        'edge_type': 'induces'
    })

len(drug2mech_harmonized_edges)

100%|██████████| 101/101 [01:07<00:00,  1.49it/s]


94

In [9]:
# One row per harmonized drug -> BP edge; repeated edges are collapsed and the
# surviving rows keep their original index labels.
validation_df = pd.DataFrame(drug2mech_harmonized_edges).drop_duplicates()
validation_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,GO:0008283,Biological Process,induces
2,pubchem.compound:1983,Compound,GO:0001659,Biological Process,induces
3,pubchem.compound:2244,Compound,GO:0007596,Biological Process,induces
5,pubchem.compound:156391,Compound,GO:0006954,Biological Process,induces
6,pubchem.compound:4539,Compound,GO:0006260,Biological Process,induces


In [10]:
# (rows, columns) of the de-duplicated drug-BP edge table.
validation_df.shape

(85, 5)

## Now, get those which are connected to our KG:

We know they're already connected if both the drug and BP are already in the KG:

In [11]:
# Full KG: tab-separated edge list (source, target, node types, edge type).
combined_kg = pd.read_csv(f'{KG_DIR}/enriched_kg.tsv', sep='\t')
combined_kg.head(2)

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:10607,Compound,ncbigene:3553,Gene,upregulates
1,pubchem.compound:10607,Compound,ncbigene:203068,Gene,downregulates


In [12]:
# Small KG variant: tab-separated edge list.
small_kg = pd.read_csv(f'{KG_DIR}/small_enriched_kg.tsv', sep='\t')
small_kg.head(2)

Unnamed: 0,source,target,edge_type,source_node_type,target_node_type
0,ncbigene:6622,ncbigene:1861,interacts,Gene,Gene
1,ncbigene:6622,ncbigene:6477,interacts,Gene,Gene


In [13]:
# XSmall KG variant: tab-separated edge list.
xsmall_kg = pd.read_csv(f'{KG_DIR}/xsmall_enriched_kg.tsv', sep='\t')
xsmall_kg.head(2)

Unnamed: 0,source,target,edge_type,source_node_type,target_node_type
0,ncbigene:6622,ncbigene:6477,interacts,Gene,Gene
1,ncbigene:6622,GO:0043065,participates,Gene,Biological Process


We have to keep track of items for the full, Small, and XSmall variants of MoA-net:

In [14]:
# Per-variant bookkeeping: each KG variant gets its own, initially empty, dict.
variants = {variant_name: {} for variant_name in ('full', 'small', 'xsmall')}

In [15]:
# For every KG variant, collect the GO terms seen as edge targets ('bps') and
# the PubChem compounds seen as edge sources ('drugs').
for variant_name, kg_frame in (('full', combined_kg), ('small', small_kg), ('xsmall', xsmall_kg)):
    go_targets = kg_frame.loc[kg_frame['target'].str.startswith('GO'), 'target']
    pubchem_sources = kg_frame.loc[kg_frame['source'].str.startswith('pubchem'), 'source']
    variants[variant_name]['bps'] = set(go_targets.tolist())
    variants[variant_name]['drugs'] = set(pubchem_sources.tolist())

Let's see which are already within our KG:

In [16]:
# Drug-BP edges whose drug and BP both already occur in the full KG.
full_variant = variants['full']
in_full_kg = validation_df['source'].isin(full_variant['drugs']) & validation_df['target'].isin(full_variant['bps'])
validation_df.loc[in_full_kg]

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
78,pubchem.compound:2145,Compound,GO:0008283,Biological Process,induces


In [17]:
# Drug-BP edges whose drug and BP both already occur in the Small KG.
small_variant = variants['small']
in_small_kg = validation_df['source'].isin(small_variant['drugs']) & validation_df['target'].isin(small_variant['bps'])
validation_df.loc[in_small_kg]

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
78,pubchem.compound:2145,Compound,GO:0008283,Biological Process,induces


In [18]:
# Drug-BP edges whose drug and BP both already occur in the XSmall KG.
xsmall_variant = variants['xsmall']
in_xsmall_kg = validation_df['source'].isin(xsmall_variant['drugs']) & validation_df['target'].isin(xsmall_variant['bps'])
validation_df.loc[in_xsmall_kg]

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
78,pubchem.compound:2145,Compound,GO:0008283,Biological Process,induces


Let's get all the proteins and respective PPIs from the KGs:

In [19]:
# Every 'ncbigene' node of the full KG, whether it appears as an edge source or target.
gene_targets = combined_kg.loc[combined_kg['target'].str.startswith('ncbigene'), 'target']
gene_sources = combined_kg.loc[combined_kg['source'].str.startswith('ncbigene'), 'source']
variants['full']['proteins'] = set(gene_targets.tolist()) | set(gene_sources.tolist())
len(variants['full']['proteins'])

9301

In [20]:
# Every 'ncbigene' node of the Small KG, whether it appears as an edge source or target.
gene_targets = small_kg.loc[small_kg['target'].str.startswith('ncbigene'), 'target']
gene_sources = small_kg.loc[small_kg['source'].str.startswith('ncbigene'), 'source']
variants['small']['proteins'] = set(gene_targets.tolist()) | set(gene_sources.tolist())
len(variants['small']['proteins'])

7446

In [21]:
# Every 'ncbigene' node of the XSmall KG, whether it appears as an edge source or target.
gene_targets = xsmall_kg.loc[xsmall_kg['target'].str.startswith('ncbigene'), 'target']
gene_sources = xsmall_kg.loc[xsmall_kg['source'].str.startswith('ncbigene'), 'source']
variants['xsmall']['proteins'] = set(gene_targets.tolist()) | set(gene_sources.tolist())
len(variants['xsmall']['proteins'])

3176

We need a mapping file to map DrugMechDB proteins to NCBI terminology:

In [22]:
# Load the lookup used to translate DrugMechDB protein ids into KG gene ids.
# NOTE(review): presumably keyed by the 'UniProt:...' ids of the node_ids sheet - confirm.
mapping_path = f'{MAPPING_DIR}/uniprot2ncbi.json'
with open(mapping_path) as mapping_file:
    uniprot2ncbi = json.load(mapping_file)

Now, go through the DrugMechDB df again, and for each row, get the path between the drug and BP.

I know this is pretty redundant, but I will just do it for now.

In [23]:
# Reminder of the metapath layout before walking the paths again.
node_type_df.head(5)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,Drug,INHIBITS,Protein,CAUSES,Disease,,,,,,,,,,
1,Drug,INHIBITS,Protein,UP_REGULATES,Biological Process,CAUSES,Disease,,,,,,,,
2,Drug,INHIBITS,Protein,UP_REGULATES,Biological Process,CAUSES,Disease,,,,,,,,
3,Drug,INHIBITS,Pathway,ASSOCIATED_WITH,Disease,,,,,,,,,,
4,Drug,INHIBITS,Protein,PRODUCES,Compound Class,CAUSES,Disease,,,,,,,,


In [24]:
# validation_moas:    (drug id, BP id) -> ordered list of mapped proteins that
#                     lie between the drug and that BP on the path.
# drug2prot_relation: (drug id, first protein id) -> DrugMechDB label of the
#                     drug -> protein edge (column e1 of the 'paths' sheet).
validation_moas = dict()
drug2prot_relation = dict()

for idx, row in tqdm(node_type_df.iterrows(), total=node_type_df.shape[0]):
    # Only Drug -> Protein -> ... paths are used. Explicit checks replace the
    # former assert/except, which `python -O` would strip.
    if row['n1'] != 'Drug' or row['n2'] != 'Protein':
        continue

    # The sheets are aligned by row position (assumes the default RangeIndex).
    id_row = node_ids_df.iloc[idx]
    name_row = node_names_df.iloc[idx]

    drug_name = name_row['n1']
    first_prot_id = id_row['n2']
    # Skip paths whose drug was not grounded to PubChem or whose first protein
    # has no entry in the UniProt mapping.
    if drug_name not in drug_id_mapping or first_prot_id not in uniprot2ncbi:
        continue

    drug_node = drug_id_mapping[drug_name]
    prot_node = uniprot2ncbi[first_prot_id]
    path_nodes = [prot_node]

    drug2prot_relation[(drug_node, prot_node)] = name_row['e1']

    # n2 is already recorded above, so continue from n3 (starting at n2 again
    # would store the first protein twice) and include the last column, n8.
    for _index in range(3, 9):
        node_col = f'n{_index}'
        if row[node_col] == 'Protein':
            prot_id = id_row[node_col]
            # Intermediate proteins without a mapping are left out of the path.
            if prot_id in uniprot2ncbi:
                path_nodes.append(uniprot2ncbi[prot_id])
        elif row[node_col] == 'Biological Process':
            bp_node = id_row[node_col]
            # Store a snapshot: the running list keeps growing for nodes that
            # come after this BP, and must not leak into this BP's path.
            validation_moas[(drug_node, bp_node)] = list(path_nodes)

100%|██████████| 123/123 [00:00<00:00, 1794.88it/s]


In [25]:
# Number of distinct (drug, BP) pairs that have a mapped path.
len(validation_moas)

48

We have 48 MOAs which map to ontologies in our KG. That means 48 Drug-BP pairs which can go in our test dataset.

Next, I check which of these drug-BP pairs are actually connected to our KG in some way.

We will map the drug-protein relation using the following dictionary:

In [26]:
# DrugMechDB edge labels that are treated as decreasing the target ...
neg_relations = ['INHIBITS', 'DECREASES_EXPRESSION', 'DISRUPTED_IN', 'DISRUPTS', 'DOWN_REGULATES', 'REDUCED_IN', 'REDUCES']

# ... and as increasing it.
pos_relations = ['ACTIVATES', 'ACTIVATED_BY', 'INCREASED_BY', 'INCREASES', 'INCREASES_EXPRESSION', 'UPREGULATED_IN',
            'UP_REGULATES', 'ELEVATED_IN', 'STIMULATES']

# DrugMechDB label -> KG edge type (positive labels first, then negative ones).
relation_map = {
    **dict.fromkeys(pos_relations, 'upregulates'),
    **dict.fromkeys(neg_relations, 'downregulates'),
}

Finally, we will see whether connections between drug-BP pairs can be made through the PPIs in the network, since only one pair was fully present in the KG.

We have to do it for the full, Small, and XSmall variants of MoA-net, so let's loop through.

In [27]:
# Give every variant an empty result store: the connected drug-BP pairs and a
# column-oriented edge table (one list per column) for the edges to be added.
edge_columns = ('source', 'source_node_type', 'target', 'target_node_type', 'edge_type')
for variant_store in variants.values():
    variant_store['connected_pairs'] = {}
    variant_store['other_edges'] = {column: [] for column in edge_columns}

In [28]:
def _append_edge(edges, source, source_type, target, target_type, edge_type):
    """Append one edge to a column-oriented edge dict (one list per column).

    All five values are evaluated by the caller before any list is modified, so
    the columns always keep the same length.
    """
    edges['source'].append(source)
    edges['source_node_type'].append(source_type)
    edges['target'].append(target)
    edges['target_node_type'].append(target_type)
    edges['edge_type'].append(edge_type)


# For every KG variant, decide which drug-BP pairs can be attached to the KG and
# record the extra edges needed to attach them.
for kg in variants.keys():
    kg_drugs = variants[kg]['drugs']
    kg_bps = variants[kg]['bps']
    kg_proteins = variants[kg]['proteins']
    extra_edges = variants[kg]['other_edges']

    for key, val in validation_moas.items():
        drug, bp = key
        # Only GO biological processes are kept; MeSH-coded ones are skipped.
        if bp.startswith('MESH'):
            continue

        drug_in_kg = drug in kg_drugs
        bp_in_kg = bp in kg_bps
        first_prot_in_kg = len(val) > 0 and val[0] in kg_proteins
        last_prot_in_kg = len(val) > 0 and val[-1] in kg_proteins

        if drug_in_kg and bp_in_kg:
            # if both drug and GO BP in graph, great! Nothing to add.
            link_drug, link_bp = False, False
        elif drug_in_kg and last_prot_in_kg:
            # drug in graph but not the BP: attach the BP through the last protein
            link_drug, link_bp = False, True
        elif bp_in_kg and first_prot_in_kg:
            # BP in graph but not the drug: attach the drug through the first protein
            link_drug, link_bp = True, False
        elif first_prot_in_kg and last_prot_in_kg:
            # neither in graph: attach both ends through their proteins
            link_drug, link_bp = True, True
        else:
            continue

        # Resolve the drug -> protein edge type before touching the edge table,
        # so a failed lookup cannot leave a half-written edge behind.
        drug_edge_type = relation_map[drug2prot_relation[(drug, val[0])]] if link_drug else None

        if link_bp:
            _append_edge(extra_edges, val[-1], 'Gene', bp, 'Biological Process', 'participates')
        if link_drug:
            _append_edge(extra_edges, drug, 'Compound', val[0], 'Gene', drug_edge_type)

        variants[kg]['connected_pairs'][key] = val

In [29]:
# Number of drug-BP pairs that could be attached to the full KG.
len(variants['full']['connected_pairs'])

48

In [30]:
# Number of drug-BP pairs that could be attached to the Small KG.
len(variants['small']['connected_pairs'])

48

In [31]:
# Number of drug-BP pairs that could be attached to the XSmall KG.
len(variants['xsmall']['connected_pairs'])

28

Luckily, all 48 of them can be connected in the full and Small variants, and 28 of them in the XSmall variant. Now we can create the final KGs as well as the final test sets.

First the validation / test set:

In [32]:
# Turn each variant's connected (drug, BP) pairs into a column-oriented table of
# Compound -[induces]-> Biological Process edges.
for kg in variants:
    connected = list(variants[kg]['connected_pairs'].keys())
    n_pairs = len(connected)
    variants[kg]['drug2process_edges'] = {
        'source': [pair[0] for pair in connected],
        'source_node_type': ['Compound'] * n_pairs,
        'target': [pair[1] for pair in connected],
        'target_node_type': ['Biological Process'] * n_pairs,
        'edge_type': ['induces'] * n_pairs,
    }

In [33]:
# Test triples for the full KG, with repeated rows removed.
validation_df = pd.DataFrame(variants['full']['drug2process_edges']).drop_duplicates()
validation_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,GO:0008283,Biological Process,induces
1,pubchem.compound:2244,Compound,GO:0007596,Biological Process,induces
2,pubchem.compound:156391,Compound,GO:0006954,Biological Process,induces
3,pubchem.compound:5745,Compound,GO:0006954,Biological Process,induces
4,pubchem.compound:5745,Compound,GO:0050900,Biological Process,induces


In [34]:
# Number of test triples for the full KG.
len(validation_df)

48

In [35]:
# Write the full-KG test triples as a tab-separated file, without the index column.
validation_df.to_csv(f'{KG_DIR}/drugmechdb_triples.tsv', sep='\t', index=False)

In [36]:
# Test triples for the Small KG, with repeated rows removed.
validation_df = pd.DataFrame(variants['small']['drug2process_edges']).drop_duplicates()
validation_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,GO:0008283,Biological Process,induces
1,pubchem.compound:2244,Compound,GO:0007596,Biological Process,induces
2,pubchem.compound:156391,Compound,GO:0006954,Biological Process,induces
3,pubchem.compound:5745,Compound,GO:0006954,Biological Process,induces
4,pubchem.compound:5745,Compound,GO:0050900,Biological Process,induces


In [37]:
# Number of test triples for the Small KG.
len(validation_df)

48

In [38]:
# Write the Small-KG test triples as a tab-separated file, without the index column.
validation_df.to_csv(f'{KG_DIR}/drugmechdb_triples_small.tsv', sep='\t', index=False)

In [39]:
# Test triples for the XSmall KG, with repeated rows removed.
validation_df = pd.DataFrame(variants['xsmall']['drug2process_edges']).drop_duplicates()
validation_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,GO:0008283,Biological Process,induces
1,pubchem.compound:2244,Compound,GO:0007596,Biological Process,induces
2,pubchem.compound:5745,Compound,GO:0050900,Biological Process,induces
3,pubchem.compound:5743,Compound,GO:0006954,Biological Process,induces
4,pubchem.compound:2200,Compound,GO:0034776,Biological Process,induces


In [40]:
# Number of test triples for the XSmall KG.
len(validation_df)

28

In [41]:
# Write the XSmall-KG test triples as a tab-separated file, without the index column.
validation_df.to_csv(f'{KG_DIR}/drugmechdb_triples_xsmall.tsv', sep='\t', index=False)

Now, the final KGs:

In [42]:
# Extra edges needed to attach the test pairs to the full KG.
kg_data = pd.DataFrame(variants['full']['other_edges'])

kg_data.head(5)

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,ncbigene:5156,Gene,downregulates
1,ncbigene:5742,Gene,GO:0007596,Biological Process,participates
2,pubchem.compound:2244,Compound,ncbigene:5742,Gene,downregulates
3,ncbigene:5743,Gene,GO:0006954,Biological Process,participates
4,pubchem.compound:156391,Compound,ncbigene:5743,Gene,downregulates


In [43]:
# Extend the full KG with the extra edges and drop exact duplicate rows.
final_kg_df = pd.concat([combined_kg, kg_data], ignore_index=True).drop_duplicates()
final_kg_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:10607,Compound,ncbigene:3553,Gene,upregulates
1,pubchem.compound:10607,Compound,ncbigene:203068,Gene,downregulates
2,pubchem.compound:10607,Compound,ncbigene:54658,Gene,downregulates
3,pubchem.compound:10607,Compound,ncbigene:7153,Gene,downregulates
4,pubchem.compound:10607,Compound,ncbigene:7277,Gene,downregulates


In [44]:
# Write the extended full KG as a tab-separated file, without the index column.
final_kg_df.to_csv(f'{KG_DIR}/final_kg.tsv', sep='\t', index=False)

Final Small KG:

In [45]:
# Extra edges needed to attach the test pairs to the Small KG.
kg_data = pd.DataFrame(variants['small']['other_edges'])

kg_data.head(5)

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,ncbigene:5156,Gene,downregulates
1,ncbigene:5742,Gene,GO:0007596,Biological Process,participates
2,pubchem.compound:2244,Compound,ncbigene:5742,Gene,downregulates
3,ncbigene:5743,Gene,GO:0006954,Biological Process,participates
4,pubchem.compound:156391,Compound,ncbigene:5743,Gene,downregulates


In [46]:
# Extend the Small KG with the extra edges and drop exact duplicate rows
# (concat aligns the two frames by column name).
final_kg_df = pd.concat([small_kg, kg_data], ignore_index=True).drop_duplicates()
final_kg_df.head()

Unnamed: 0,source,target,edge_type,source_node_type,target_node_type
0,ncbigene:6622,ncbigene:1861,interacts,Gene,Gene
1,ncbigene:6622,ncbigene:6477,interacts,Gene,Gene
2,ncbigene:6622,GO:0043065,participates,Gene,Biological Process
3,ncbigene:3308,ncbigene:6767,interacts,Gene,Gene
4,ncbigene:3308,GO:0006457,participates,Gene,Biological Process


In [47]:
# Write the extended Small KG as a tab-separated file, without the index column.
final_kg_df.to_csv(f'{KG_DIR}/small_kg.tsv', sep='\t', index=False)

Final XSmall KG:

In [48]:
# Extra edges needed to attach the test pairs to the XSmall KG.
kg_data = pd.DataFrame(variants['xsmall']['other_edges'])

kg_data.head(5)

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,ncbigene:5156,Gene,downregulates
1,ncbigene:5742,Gene,GO:0007596,Biological Process,participates
2,pubchem.compound:2244,Compound,ncbigene:5742,Gene,downregulates
3,ncbigene:301,Gene,GO:0050900,Biological Process,participates
4,pubchem.compound:5745,Compound,ncbigene:301,Gene,upregulates


In [49]:
# Extend the XSmall KG with the extra edges and drop exact duplicate rows
# (concat aligns the two frames by column name).
final_kg_df = pd.concat([xsmall_kg, kg_data], ignore_index=True).drop_duplicates()
final_kg_df.head()

Unnamed: 0,source,target,edge_type,source_node_type,target_node_type
0,ncbigene:6622,ncbigene:6477,interacts,Gene,Gene
1,ncbigene:6622,GO:0043065,participates,Gene,Biological Process
2,ncbigene:3308,GO:0006457,participates,Gene,Biological Process
3,ncbigene:2885,GO:0007165,participates,Gene,Biological Process
4,ncbigene:5295,GO:0050821,participates,Gene,Biological Process


In [50]:
# Write the extended XSmall KG as a tab-separated file, without the index column.
final_kg_df.to_csv(f'{KG_DIR}/xsmall_kg.tsv', sep='\t', index=False)