# Generation of test dataset

Here, I explore [DrugMechDB](https://sulab.github.io/DrugMechDB/), which could ideally provide us with true-positive paths / mechanisms of action (MOAs).

In [1]:
from collections import defaultdict
import pandas as pd
from tqdm import tqdm

from pubchempy import get_compounds

## Load in the DrugMechDB file

In [2]:
# Directory with the knowledge-graph files: the MOA paths workbook is read
# from here and the generated test set is written back here.
KG_DIR = '../data/kg'

In [3]:
# The 'paths' sheet holds the node and edge names of each path, one path per row.
moa_workbook = f'{KG_DIR}/indication_MOA_paths.xlsx'
node_names_df = pd.read_excel(moa_workbook, sheet_name='paths')
node_names_df.head(2)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,imatinib,INHIBITS,BCR/ABL,CAUSES,CML (ph+),,,,,,,,,,
1,imatinib,INHIBITS,c-Kit,UP_REGULATES,Cellular proliferation,CAUSES,Systemic mast cell disease,,,,,,,,


In [4]:
# Load the node-type ('metapaths') and node-identifier ('node_ids') sheets.
# Passing a list of sheet names makes read_excel return a dict of DataFrames
# keyed by sheet name.
moa_sheets = pd.read_excel(
    f'{KG_DIR}/indication_MOA_paths.xlsx',
    sheet_name=['metapaths', 'node_ids'],
)
node_type_df = moa_sheets['metapaths']
node_ids_df = moa_sheets['node_ids']

In [5]:
# Preview the node identifiers: one column per node position (n1..n8) of a path.
node_ids_df.head(2)

Unnamed: 0,n1,n2,n3,n4,n5,n6,n7,n8
0,MESH:D000068877,UniProt:P00519,MESH:D015464,,,,,
1,MESH:D000068877,UniProt:P10721,GO:0008283,MESH:D034721,,,,


In [6]:
# Preview the metapaths: node types (n*) interleaved with edge types (e*).
node_type_df.head(2)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,Drug,INHIBITS,Protein,CAUSES,Disease,,,,,,,,,,
1,Drug,INHIBITS,Protein,UP_REGULATES,Biological Process,CAUSES,Disease,,,,,,,,


# Get all drug-bp pairs

In the main file, each row is a valid mechanism-of-action (MOA) path. Each path is considered individually, and only paths that connect a drug to a biological process (BP) are selected; every BP node found on such a path yields one drug-BP pair.

In [10]:
# Collect one (drug, biological process) pair for every Biological Process
# node found on a path that starts at a drug. Row i of the node-type,
# node-id and node-name frames is treated as describing the same path.
drug_bp_pairs = []

# n1 is the start node of a path; the remaining nodes live in n2..n8.
# The upper bound is 9 so that the last column (n8) is inspected as well.
downstream_node_cols = [f'n{position}' for position in range(2, 9)]

for idx, row in tqdm(node_type_df.iterrows(), total=node_type_df.shape[0]):
    # Skip paths that do not start at a drug. A plain condition is used
    # instead of `assert`, which is removed when Python runs with -O.
    if row['n1'] != 'Drug':
        continue

    # `idx` is the row label from iterrows(); it is used positionally here,
    # which is equivalent because the frames are read without an index column.
    ids_row = node_ids_df.iloc[idx]
    names_row = node_names_df.iloc[idx]
    drug_node = {'idx': ids_row['n1'], 'name': names_row['n1']}

    for node_col in downstream_node_cols:
        if row[node_col] == 'Biological Process':
            drug_bp_pairs.append(
                (drug_node, {'idx': ids_row[node_col], 'name': names_row[node_col]})
            )

len(drug_bp_pairs)


100%|██████████| 123/123 [00:00<00:00, 2802.34it/s]


101

# Harmonization of nodes to consistent ontology

For compound nodes, we ground the drug names to PubChem compound identifiers (CIDs) and check the overlap.

In [12]:
# Map every drug name to a PubChem compound id and emit one
# Compound -> Biological Process edge per (drug, bp) pair.
drug2mech_harmonized_edges = []

# The same drug can appear in several pairs, so the outcome of each name
# lookup is remembered and PubChem is queried only once per distinct name.
# The value is the prefixed compound id, or None when nothing matched.
curie_by_drug_name = {}

for cmp_node, bp_node in tqdm(drug_bp_pairs):
    drug_name = cmp_node['name']

    if drug_name not in curie_by_drug_name:
        matches = get_compounds(drug_name, 'name')
        if len(matches) == 0:
            curie_by_drug_name[drug_name] = None
        else:
            # As before, the first hit returned for the name is used.
            curie_by_drug_name[drug_name] = 'pubchem.compound:' + str(matches[0].cid)

    cidx = curie_by_drug_name[drug_name]
    if cidx is None:
        # Drug name could not be grounded; drop the pair.
        continue

    drug2mech_harmonized_edges.append({
        'source': cidx,
        'source_node_type': 'Compound',
        'target': bp_node['idx'],
        'target_node_type': 'Biological Process',
        'edge_type': 'induces'
    })

len(drug2mech_harmonized_edges)

  0%|          | 0/101 [00:00<?, ?it/s]

100%|██████████| 101/101 [01:58<00:00,  1.17s/it]


95

In [15]:
# Turn the edge records into a table and drop exact duplicate rows; the
# surviving rows keep their original row labels.
validation_df = pd.DataFrame(drug2mech_harmonized_edges).drop_duplicates()
validation_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,GO:0008283,Biological Process,induces
2,pubchem.compound:1983,Compound,GO:0001659,Biological Process,induces
3,pubchem.compound:2244,Compound,GO:0007596,Biological Process,induces
5,pubchem.compound:156391,Compound,GO:0006954,Biological Process,induces
6,pubchem.compound:4539,Compound,GO:0006260,Biological Process,induces


In [16]:
# (rows, columns) of the de-duplicated edge table.
validation_df.shape

(86, 5)

In [17]:
# Write the edges as a tab-separated file without the row-label column.
test_tsv_path = f'{KG_DIR}/test.tsv'
validation_df.to_csv(test_tsv_path, sep='\t', index=False)