# Generation of test dataset

Here, I explore [DrugMechDB](https://sulab.github.io/DrugMechDB/) which could ideally provide us with true positive paths / MOAs.

In [1]:
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
import json

from pubchempy import get_compounds

## Load in the DrugMechDB file

In [2]:
# Folder with the knowledge-graph TSV files this notebook reads and writes.
KG_DIR = '../data/kg'
# Folder with the DrugMechDB workbook (indication_MOA_paths.xlsx).
VALIDATION_DIR = '../data/validation'
# Folder where identifier mappings (uniprot2ncbi.json) are written.
MAPPING_DIR = '../data/mappings'

In [3]:
# 'paths' sheet: one row per path, holding node names (n1..n8) and edge labels (e1..e7).
node_names_df = pd.read_excel(f'{VALIDATION_DIR}/indication_MOA_paths.xlsx', sheet_name='paths')
node_names_df.head(2)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,imatinib,INHIBITS,BCR/ABL,CAUSES,CML (ph+),,,,,,,,,,
1,imatinib,INHIBITS,c-Kit,UP_REGULATES,Cellular proliferation,CAUSES,Systemic mast cell disease,,,,,,,,


In [4]:
# Read the two remaining sheets in a single call: 'metapaths' holds the node
# type at each path position, 'node_ids' the corresponding ontology identifiers.
moa_sheets = pd.read_excel(
    f'{VALIDATION_DIR}/indication_MOA_paths.xlsx',
    sheet_name=['metapaths', 'node_ids'],
)
node_type_df = moa_sheets['metapaths']
node_ids_df = moa_sheets['node_ids']

In [5]:
node_ids_df.head(2)

Unnamed: 0,n1,n2,n3,n4,n5,n6,n7,n8
0,MESH:D000068877,UniProt:P00519,MESH:D015464,,,,,
1,MESH:D000068877,UniProt:P10721,GO:0008283,MESH:D034721,,,,


In [6]:
node_type_df.head(2)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,Drug,INHIBITS,Protein,CAUSES,Disease,,,,,,,,,,
1,Drug,INHIBITS,Protein,UP_REGULATES,Biological Process,CAUSES,Disease,,,,,,,,


# Get all drug-bp pairs

In the main file, each row is one valid drug-to-disease path (an MOA). Each path is considered individually, and for every path that starts at a drug, each biological process (BP) found on it is paired with that drug.

In [7]:
# Pair every drug with each biological process (BP) that appears on one of its
# paths. Each entry is a (drug, bp) tuple of dicts holding the node identifier
# ('idx', from the node_ids sheet) and the node name ('name', from the paths sheet).
drug_bp_pairs = []

for idx, row in tqdm(node_type_df.iterrows(), total=node_type_df.shape[0]):
    # Only paths that start at a drug are usable. An explicit check replaces
    # the former assert/except pair, which stops filtering when Python runs
    # with -O (asserts are stripped).
    if row['n1'] != 'Drug':
        continue

    # idx is the row label from iterrows and is used positionally below; this
    # relies on the three sheets being row-aligned with a default integer index.
    drug_node = {
        'idx': node_ids_df.iloc[idx]['n1'],
        'name': node_names_df.iloc[idx]['n1']
    }

    # n1 is the drug itself, so scan n2..n8. The previous range(1, 8) stopped
    # at n7 and never looked at the last node column.
    for _index in range(2, 9):
        if row[f'n{_index}'] == 'Biological Process':
            drug_bp_pairs.append((drug_node, {
                'idx': node_ids_df.iloc[idx][f'n{_index}'],
                'name': node_names_df.iloc[idx][f'n{_index}']
            }))

len(drug_bp_pairs)


100%|██████████| 123/123 [00:00<00:00, 10053.97it/s]


101

# Harmonization of nodes to consistent ontology

For compound nodes, we ground each drug name to a PubChem compound ID so that we can check the overlap with the compounds in our KG.

In [8]:
# Harmonized drug -> BP edges, with the drug expressed as a PubChem compound ID.
drug2mech_harmonized_edges = []
# Drug name -> 'pubchem.compound:<cid>' for every name PubChem could resolve.
drug_id_mapping = {}
# Names PubChem returned no hit for, remembered so they are not queried again.
unresolved_drug_names = set()

for cmp_node, bp_node in tqdm(drug_bp_pairs):
    drug_name = cmp_node['name']
    if drug_name in unresolved_drug_names:
        continue

    # The same drug occurs in many pairs; query PubChem (a network call) only
    # the first time a name is seen and reuse the cached ID afterwards.
    if drug_name not in drug_id_mapping:
        compound = get_compounds(drug_name, 'name')
        if len(compound) == 0:
            unresolved_drug_names.add(drug_name)
            continue
        # Take the first hit returned for the name.
        drug_id_mapping[drug_name] = 'pubchem.compound:' + str(compound[0].cid)

    cidx = drug_id_mapping[drug_name]

    drug2mech_harmonized_edges.append({
        'source': cidx,
        'source_node_type': 'Compound',
        'target': bp_node['idx'],
        'target_node_type': 'Biological Process',
        'edge_type': 'induces'
    })

len(drug2mech_harmonized_edges)

100%|██████████| 101/101 [01:19<00:00,  1.27it/s]


95

In [9]:
# One row per distinct harmonized drug -> BP edge; the first occurrence of each
# duplicate is kept together with its original row label.
validation_df = pd.DataFrame(drug2mech_harmonized_edges).drop_duplicates()
validation_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,GO:0008283,Biological Process,induces
2,pubchem.compound:1983,Compound,GO:0001659,Biological Process,induces
3,pubchem.compound:2244,Compound,GO:0007596,Biological Process,induces
5,pubchem.compound:156391,Compound,GO:0006954,Biological Process,induces
6,pubchem.compound:4539,Compound,GO:0006260,Biological Process,induces


In [10]:
validation_df.shape

(86, 5)

## Now, get those which are connected to our KG:

We know they're already connected if both the drug and BP are already in the KG:

In [11]:
# Load the existing knowledge graph as a tab-separated edge list.
combined_kg = pd.read_csv(f'{KG_DIR}/full_kg.tsv', sep='\t')
combined_kg.head(2)

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:10607,Compound,ncbigene:3553,Gene,upregulates
1,pubchem.compound:10607,Compound,ncbigene:203068,Gene,downregulates


In [12]:
# Identifiers already present in the KG: GO terms that occur as edge targets
# and PubChem compounds that occur as edge sources.
is_go_target = combined_kg['target'].str.startswith('GO')
is_pubchem_source = combined_kg['source'].str.startswith('pubchem')
bps = set(combined_kg.loc[is_go_target, 'target'])
drugs = set(combined_kg.loc[is_pubchem_source, 'source'])

In [13]:
validation_df.loc[(validation_df['source'].isin(drugs)) & (validation_df['target'].isin(bps))]

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
79,pubchem.compound:2145,Compound,GO:0008283,Biological Process,induces


Proteins:

In [14]:
# Collect every ncbigene node of the KG, whichever end of an edge it sits on.
target_sets = set(combined_kg.loc[combined_kg['target'].str.startswith('ncbigene'), 'target'])
source_sets = set(combined_kg.loc[combined_kg['source'].str.startswith('ncbigene'), 'source'])
proteins_in_kg = target_sets | source_sets
len(proteins_in_kg)

9301

In [15]:
# Gene mapping table restricted to the NCBI Gene ID and UniProt ID columns, all
# read as strings. Rows without a UniProt ID are dropped right after loading.
uniprot2ncbi = pd.read_csv(
    's3://enveda-datascience/lauren/gene_mapping.tsv',
    sep='\t',
    usecols=['NCBI Gene ID(supplied by NCBI)', 'UniProt ID(supplied by UniProt)'],
    dtype=str,
).dropna(subset=['UniProt ID(supplied by UniProt)'])
uniprot2ncbi.head(2)

Unnamed: 0,NCBI Gene ID(supplied by NCBI),UniProt ID(supplied by UniProt)
0,1,P04217
2,29974,Q9NQ94


In [16]:
# Collapse the mapping table into a plain dict: prefixed UniProt ID
# ('UniProt:<accession>') -> prefixed NCBI gene ID ('ncbigene:<id>').
# When a UniProt ID occurs more than once, the last row wins.
prefixed_uniprot_ids = 'UniProt:' + uniprot2ncbi['UniProt ID(supplied by UniProt)']
prefixed_ncbi_ids = 'ncbigene:' + uniprot2ncbi['NCBI Gene ID(supplied by NCBI)']
uniprot2ncbi = dict(zip(prefixed_uniprot_ids, prefixed_ncbi_ids))
len(uniprot2ncbi)

20156

In [17]:
# Persist the UniProt -> NCBI gene mapping as JSON under MAPPING_DIR.
with open(f'{MAPPING_DIR}/uniprot2ncbi.json', 'w') as f:
    json.dump(uniprot2ncbi, f)

Now, go through the df again, and for each row, get the path between the drug and BP.

I know this is pretty redundant, but I will just do it for now.

In [18]:
node_type_df.head(20)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,Drug,INHIBITS,Protein,CAUSES,Disease,,,,,,,,,,
1,Drug,INHIBITS,Protein,UP_REGULATES,Biological Process,CAUSES,Disease,,,,,,,,
2,Drug,INHIBITS,Protein,UP_REGULATES,Biological Process,CAUSES,Disease,,,,,,,,
3,Drug,INHIBITS,Pathway,ASSOCIATED_WITH,Disease,,,,,,,,,,
4,Drug,INHIBITS,Protein,PRODUCES,Compound Class,CAUSES,Disease,,,,,,,,
5,Drug,INHIBITS,Protein,PRODUCES,Compound Class,CAUSES,Disease,,,,,,,,
6,Drug,INHIBITS,Pathway,LOCALIZED_TO,Anatomy,REGULATES,Biological Process,DISRUPTED_IN,Disease,,,,,,
7,Drug,INHIBITS,Protein,PRODUCES,Compound Class,CAUSES,Disease,,,,,,,,
8,Drug,INHIBITS,Protein,PRODUCES,Compound Class,CAUSES,Disease,,,,,,,,
9,Drug,INHIBITS,Protein,PART_OF,Pathway,PRODUCES,Compound,PART_OF,Biological Process,CAUSES,Disease,,,,


In [19]:
# (drug, BP) -> list of harmonized proteins (ncbigene IDs) lying on the path
# between the drug and that BP, in path order.
validation_moas = dict()
# (drug, first protein) -> edge label (e1) connecting them in DrugMechDB.
drug2prot_relation = dict()

for idx, row in tqdm(node_type_df.iterrows(), total=node_type_df.shape[0]):
    # Only Drug -> Protein -> ... paths are used. Explicit checks replace the
    # former assert/except pair, which stops filtering under `python -O`.
    if row['n1'] != 'Drug' or row['n2'] != 'Protein':
        continue

    drug_name = node_names_df.iloc[idx]['n1']
    first_prot_id = node_ids_df.iloc[idx]['n2']
    # Skip paths whose drug or first protein could not be harmonized.
    if drug_name not in drug_id_mapping or first_prot_id not in uniprot2ncbi:
        continue

    drug_node = drug_id_mapping[drug_name]
    prot_node = uniprot2ncbi[first_prot_id]
    path_nodes = [prot_node]

    drug2prot_relation[(drug_node, prot_node)] = node_names_df.iloc[idx]['e1']

    # n2 is already on the path, so continue from n3 (starting at n2 again
    # appended the first protein twice) and go up to and including n8.
    for _index in range(3, 9):
        node_type = row[f'n{_index}']
        if node_type == 'Protein':
            # Proteins without an NCBI mapping are left out of the path.
            prot_id = node_ids_df.iloc[idx][f'n{_index}']
            if prot_id in uniprot2ncbi:
                path_nodes.append(uniprot2ncbi[prot_id])
        elif node_type == 'Biological Process':
            bp_node = node_ids_df.iloc[idx][f'n{_index}']
            # Store a snapshot: proteins that occur after this BP on the same
            # path must not be added to the path recorded for this BP.
            validation_moas[(drug_node, bp_node)] = list(path_nodes)

100%|██████████| 123/123 [00:00<00:00, 12291.51it/s]


In [20]:
len(validation_moas)

48

We have 48 MOAs which map to ontologies in our KG. That means 48 Drug-BP pairs which can go in our test dataset.

Next, I check which of these drug-BP pairs are actually connected to our KG in some way.

We will map the drug-protein relation using the following dictionary:

In [21]:
# DrugMechDB edge labels grouped by the direction of their effect.
neg_relations = [
    'INHIBITS',
    'DECREASES_EXPRESSION',
    'DISRUPTED_IN',
    'DISRUPTS',
    'DOWN_REGULATES',
    'REDUCED_IN',
    'REDUCES',
]

pos_relations = [
    'ACTIVATES',
    'ACTIVATED_BY',
    'INCREASED_BY',
    'INCREASES',
    'INCREASES_EXPRESSION',
    'UPREGULATED_IN',
    'UP_REGULATES',
    'ELEVATED_IN',
    'STIMULATES',
]

# Collapse every label onto one of the KG's two signed edge types.
relation_map = dict.fromkeys(pos_relations, 'upregulates')
relation_map.update(dict.fromkeys(neg_relations, 'downregulates'))

In [22]:
# (drug, BP) pairs that are attached to the KG, mapped to their protein path.
connected_pairs = dict()

# Column-oriented table of the extra edges needed to attach those pairs.
other_edges = {'source': [],
                'source_node_type': [],
                'target': [],
                'target_node_type': [],
                'edge_type': []}


def _add_edge(source, source_type, target, target_type, edge_type):
    """Append a single edge to the column-oriented ``other_edges`` table."""
    other_edges['source'].append(source)
    other_edges['source_node_type'].append(source_type)
    other_edges['target'].append(target)
    other_edges['target_node_type'].append(target_type)
    other_edges['edge_type'].append(edge_type)


for pair, path in validation_moas.items():
    drug, bp = pair
    # MeSH-identified targets are skipped; only other identifiers are kept.
    if bp.startswith('MESH'):
        continue

    has_proteins = len(path) > 0
    first_prot_in_kg = has_proteins and path[0] in proteins_in_kg
    last_prot_in_kg = has_proteins and path[-1] in proteins_in_kg

    if drug in drugs and bp in bps:
        # Both endpoints are already in the graph: nothing to add.
        pass
    elif drug in drugs and last_prot_in_kg:
        # Drug is in the graph: attach the BP through the last protein of the path.
        _add_edge(path[-1], 'Gene', bp, 'Biological Process', 'participates')
    elif bp in bps and first_prot_in_kg:
        # BP is in the graph: attach the drug through the first protein of the path.
        _add_edge(drug, 'Compound', path[0], 'Gene',
                  relation_map[drug2prot_relation[(drug, path[0])]])
    elif first_prot_in_kg and last_prot_in_kg:
        # Otherwise attach both ends through their neighbouring proteins.
        _add_edge(path[-1], 'Gene', bp, 'Biological Process', 'participates')
        _add_edge(drug, 'Compound', path[0], 'Gene',
                  relation_map[drug2prot_relation[(drug, path[0])]])
    else:
        # No way to attach this pair to the graph.
        continue

    connected_pairs[pair] = path

In [23]:
len(connected_pairs)

48

Luckily, all 48 of them are! Now we can create the final KG as well as the final test set.

First the validation / test set:

In [24]:
# Column-oriented table of test edges: one Compound -induces-> Biological
# Process edge per connected (drug, BP) pair, in insertion order.
test_pairs = list(connected_pairs)
n_test_pairs = len(test_pairs)

drug2process_edges = {
    'source': [pair[0] for pair in test_pairs],
    'source_node_type': ['Compound'] * n_test_pairs,
    'target': [pair[1] for pair in test_pairs],
    'target_node_type': ['Biological Process'] * n_test_pairs,
    'edge_type': ['induces'] * n_test_pairs,
}

In [25]:
# Final test set: the connected drug -> BP edges with exact duplicates removed.
validation_df = pd.DataFrame(drug2process_edges).drop_duplicates()
validation_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,GO:0008283,Biological Process,induces
1,pubchem.compound:2244,Compound,GO:0007596,Biological Process,induces
2,pubchem.compound:156391,Compound,GO:0006954,Biological Process,induces
3,pubchem.compound:5745,Compound,GO:0006954,Biological Process,induces
4,pubchem.compound:5745,Compound,GO:0050900,Biological Process,induces


In [26]:
len(validation_df)

48

In [27]:
# Write the test edges next to the KG files, tab-separated and without the row index.
validation_df.to_csv(f'{KG_DIR}/test.tsv', sep='\t', index=False)

Now, the final KG:

In [28]:
# Extra Gene -> BP and Compound -> Gene edges collected while attaching the
# validation pairs to the KG.
kg_data = pd.DataFrame(other_edges)

kg_data.head(5)

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,ncbigene:5156,Gene,downregulates
1,ncbigene:5742,Gene,GO:0007596,Biological Process,participates
2,pubchem.compound:2244,Compound,ncbigene:5742,Gene,downregulates
3,ncbigene:5743,Gene,GO:0006954,Biological Process,participates
4,pubchem.compound:156391,Compound,ncbigene:5743,Gene,downregulates


In [29]:
# Append to the existing KG
final_kg_df = pd.concat([combined_kg, kg_data], ignore_index=True)
final_kg_df.drop_duplicates(inplace=True)
final_kg_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:10607,Compound,ncbigene:3553,Gene,upregulates
1,pubchem.compound:10607,Compound,ncbigene:203068,Gene,downregulates
2,pubchem.compound:10607,Compound,ncbigene:54658,Gene,downregulates
3,pubchem.compound:10607,Compound,ncbigene:7153,Gene,downregulates
4,pubchem.compound:10607,Compound,ncbigene:7277,Gene,downregulates


In [30]:
# Write the extended KG (without the test edges) as a tab-separated edge list.
final_kg_df.to_csv(f'{KG_DIR}/final_kg.tsv', sep='\t', index=False)

Let's double check to ensure that all pairs are connected to the KG:

In [31]:
# Add the drug -> BP test edges to the frame used by the connectivity check.
# NOTE(review): this rebinds final_kg_df after it was written to final_kg.tsv,
# and re-running the cell appends validation_df again.
final_kg_df = pd.concat([final_kg_df, validation_df], ignore_index=True)

In [32]:
# get all of the proteins connected to other proteins
prots_connected = set()
for i, row in final_kg_df.loc[final_kg_df['edge_type'] == 'interacts'].iterrows():
    prots_connected.add(row['source'])
    prots_connected.add(row['target'])

In [33]:
# get all the BPs whose protein is also connected to other proteins
bp_nodes = {row['target'] for i, row in final_kg_df.loc[final_kg_df['edge_type'] == 'participates'].iterrows() if row['source'] in prots_connected}
# get all the drugs whose protein is also connected to other proteins
drug_nodes = {row['source'] for i, row in final_kg_df.loc[final_kg_df['edge_type'].isin({'downregulates', 'upregulates'})].iterrows() if row['target'] in prots_connected}

In [34]:
# get the drug-BP pairs
drugbp_pairs = final_kg_df.loc[final_kg_df['edge_type'] == 'induces']

In [35]:
len(drugbp_pairs)

1851

In [36]:
drugbp_pairs.loc[drugbp_pairs['source'].isin(drug_nodes) & drugbp_pairs['target'].isin(bp_nodes)]

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
94908,pubchem.compound:4828,Compound,GO:0007165,Biological Process,induces
94909,pubchem.compound:4828,Compound,GO:0007267,Biological Process,induces
94910,pubchem.compound:5073,Compound,GO:0005975,Biological Process,induces
94911,pubchem.compound:5073,Compound,GO:0007165,Biological Process,induces
94912,pubchem.compound:5073,Compound,GO:0007267,Biological Process,induces
...,...,...,...,...,...
96819,pubchem.compound:5339,Compound,GO:0006954,Biological Process,induces
96820,pubchem.compound:3487,Compound,GO:1904862,Biological Process,induces
96821,pubchem.compound:16197727,Compound,GO:0042438,Biological Process,induces
96822,pubchem.compound:16197727,Compound,GO:0009650,Biological Process,induces
