# Generation of graphs with DrugMechDB

Here, I explore [DrugMechDB](https://sulab.github.io/DrugMechDB/), which could ideally provide us with true-positive paths, i.e. mechanisms of action (MOAs).

In [1]:
from collections import defaultdict
from itertools import product

import pandas as pd
import networkx as nx
from tqdm import tqdm

from pubchempy import get_compounds

## Load in the DrugMechDB file

In [2]:
# Relative path of the directory that holds the knowledge-graph input and output files.
KG_DIR = '../data/kg'

In [3]:
# Sheet 'paths': each row holds the node names (n1..n8) and edge labels (e1..e7) of one path.
node_names_df = pd.read_excel(f'{KG_DIR}/indication_MOA_paths.xlsx', sheet_name='paths')
node_names_df.head(2)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,imatinib,INHIBITS,BCR/ABL,CAUSES,CML (ph+),,,,,,,,,,
1,imatinib,INHIBITS,c-Kit,UP_REGULATES,Cellular proliferation,CAUSES,Systemic mast cell disease,,,,,,,,


In [4]:
# Load the node-type ('metapaths') and node-identifier ('node_ids') sheets
# from the same workbook with a single read.
moa_sheets = pd.read_excel(
    f'{KG_DIR}/indication_MOA_paths.xlsx',
    sheet_name=['metapaths', 'node_ids'],
)
node_type_df = moa_sheets['metapaths']
node_ids_df = moa_sheets['node_ids']

In [5]:
# Preview the identifier sheet: same n1..n8 layout, with ontology IDs instead of names.
node_ids_df.head(2)

Unnamed: 0,n1,n2,n3,n4,n5,n6,n7,n8
0,MESH:D000068877,UniProt:P00519,MESH:D015464,,,,,
1,MESH:D000068877,UniProt:P10721,GO:0008283,MESH:D034721,,,,


In [6]:
# Preview the metapath sheet: node types in the n* columns, edge labels in the e* columns.
node_type_df.head(2)

Unnamed: 0,n1,e1,n2,e2,n3,e3,n4,e4,n5,e5,n6,e6,n7,e7,n8
0,Drug,INHIBITS,Protein,CAUSES,Disease,,,,,,,,,,
1,Drug,INHIBITS,Protein,UP_REGULATES,Biological Process,CAUSES,Disease,,,,,,,,


Let's create a graph from all of this data.

In [7]:
# Create graph from node names.
# Nodes are keyed by their ontology ID and carry the readable `name` and the
# node `type`. Edges carry a sign in their `name` attribute: -1 for the
# inhibiting relations and +1 for the activating ones listed below.
NEGATIVE_RELATIONS = {
    'INHIBITS', 'DECREASES_EXPRESSION', 'DISRUPTED_IN', 'DISRUPTS', 'DOWN_REGULATES',
    'REDUCED_IN', 'REDUCES',
}
POSITIVE_RELATIONS = {
    'ACTIVATES', 'ACTIVATED_BY', 'INCREASED_BY', 'INCREASES', 'INCREASES_EXPRESSION', 'UPREGULATED_IN',
    'UP_REGULATES', 'ELEVATED_IN', 'STIMULATES',
}

G = nx.DiGraph()

# Walk the sheets by row *position*: the label yielded by `iterrows` is only a
# valid `.iloc` position while the index is the default 0..n-1 range.
# Assumes the three sheets are row-aligned (row i describes the same path in each).
for position, (_, name_row) in enumerate(node_names_df.iterrows()):
    # Fetch the matching ID / type rows once per path instead of once per lookup.
    id_row = node_ids_df.iloc[position]
    type_row = node_type_df.iloc[position]

    # Consecutive node pairs (n1, n2), (n2, n3), ..., (n7, n8).
    for source_index in range(1, 8):
        target_index = source_index + 1
        source_col, target_col = f'n{source_index}', f'n{target_index}'

        # A missing name on either side means the path has already ended.
        if pd.isna(name_row[source_col]) or pd.isna(name_row[target_col]):
            continue

        source_id, target_id = id_row[source_col], id_row[target_col]

        # Both endpoints are registered even if the connecting edge is skipped below.
        G.add_node(source_id, name=name_row[source_col], type=type_row[source_col])
        G.add_node(target_id, name=name_row[target_col], type=type_row[target_col])

        relation = name_row[f'e{source_index}']
        if relation in NEGATIVE_RELATIONS:
            edge_type = -1
        elif relation in POSITIVE_RELATIONS:
            edge_type = 1
        else:  # Not so useful edges for this usecase
            continue

        # NOTE(review): passive labels such as 'ACTIVATED_BY' are stored with the
        # same source -> target direction as active ones; confirm this is intended.
        G.add_edge(source_id, target_id, name=edge_type)
    

In [8]:
# Size of the full DrugMechDB graph as (nodes, edges).
G.number_of_nodes(), G.number_of_edges()

(383, 203)

In [9]:
# Group node IDs by their `type` attribute; the cell displays the number of distinct types.
gnodes = defaultdict(set)

for node_id, attributes in G.nodes(data=True):
    node_type = attributes['type']
    gnodes[node_type].add(node_id)

len(gnodes)

14

From these, we are only interested in drugs, proteins, and biological processes.
So, let's check which namespaces these main entity types cover.

In [10]:
# Bucket drug node IDs by namespace (the text before the first ':') and show the bucket sizes.
drug_namespaces = defaultdict(set)

for node_id in gnodes['Drug']:
    prefix = node_id.partition(':')[0]
    drug_namespaces[prefix].add(node_id)

{prefix: len(members) for prefix, members in drug_namespaces.items()}

{'MESH': 104, 'DB': 1}

In [11]:
# Bucket protein node IDs by namespace (the text before the first ':') and show the bucket sizes.
protein_namespaces = defaultdict(set)

for node_id in gnodes['Protein']:
    prefix = node_id.partition(':')[0]
    protein_namespaces[prefix].add(node_id)

{prefix: len(members) for prefix, members in protein_namespaces.items()}

{'UniProt': 73}

In [12]:
# Bucket biological-process node IDs by namespace (the text before the first ':') and show the bucket sizes.
process_namespaces = defaultdict(set)

for node_id in gnodes['Biological Process']:
    prefix = node_id.partition(':')[0]
    process_namespaces[prefix].add(node_id)

{prefix: len(members) for prefix, members in process_namespaces.items()}

{'GO': 50, 'MESH': 1}

Since there is just one MeSH process term, we will leave it out. For the rest, we will map drugs from MeSH and DrugBank to PubChem, and UniProt proteins to NCBI Gene identifiers.

Mapping the uniprot proteins to ncbigene

In [13]:
# Load the NCBI Gene <-> UniProt mapping as strings and keep only rows that have a UniProt ID.
uniprot2ncbi = pd.read_csv(
    '../data/mappings/gene_mapping.tsv',
    sep='\t',
    usecols=['NCBI Gene ID(supplied by NCBI)', 'UniProt ID(supplied by UniProt)'],
    dtype=str,
).dropna(subset=['UniProt ID(supplied by UniProt)'])

uniprot2ncbi.head(2)

Unnamed: 0,NCBI Gene ID(supplied by NCBI),UniProt ID(supplied by UniProt)
0,1,P04217
2,29974,Q9NQ94


In [14]:
# Turn the mapping frame into a {'UniProt:<id>': 'ncbigene:<id>'} lookup.
# The name `uniprot2ncbi` is rebound from a DataFrame to a dict here, so this
# cell can only be run once after the loading cell above.
prefixed_uniprot = 'UniProt:' + uniprot2ncbi['UniProt ID(supplied by UniProt)']
prefixed_ncbi = 'ncbigene:' + uniprot2ncbi['NCBI Gene ID(supplied by NCBI)']

# For a repeated UniProt ID the last row wins.
uniprot2ncbi = dict(zip(prefixed_uniprot, prefixed_ncbi))
len(uniprot2ncbi)

20156

# Load the KG files

In [15]:
# Existing knowledge graph as an edge list (source, target, their node types, and the edge type).
combined_kg = pd.read_csv(f'{KG_DIR}/full_kg.tsv', sep='\t')
combined_kg.head(2)

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:10607,Compound,ncbigene:3553,Gene,upregulates
1,pubchem.compound:10607,Compound,ncbigene:203068,Gene,downregulates


In [16]:
# Gather every 'ncbigene'-prefixed identifier that occurs on either end of a KG edge.
is_gene_target = combined_kg['target'].str.startswith('ncbigene')
is_gene_source = combined_kg['source'].str.startswith('ncbigene')

target_sets = set(combined_kg.loc[is_gene_target, 'target'])
source_sets = set(combined_kg.loc[is_gene_source, 'source'])

proteins_in_kg = target_sets | source_sets
len(proteins_in_kg)

9301

# Subsetting protein nodes to those found in KG

In [17]:
# Collect the drug, process and protein node IDs to keep. UniProt IDs are kept
# only when the UniProt -> NCBI Gene lookup has an entry for them; every other
# namespace is kept as-is.
# NOTE(review): `proteins_in_kg` is not consulted here, so proteins are not
# restricted to those already present in the KG; confirm this is intended.
nodes_of_interest = set()

for type_specific_dict in (drug_namespaces, process_namespaces, protein_namespaces):
    for namespace, node_ids in type_specific_dict.items():
        if namespace != 'UniProt':
            nodes_of_interest.update(node_ids)
            continue

        nodes_of_interest.update(
            node for node in node_ids if uniprot2ncbi.get(node) is not None
        )

len(nodes_of_interest)

210

In [18]:
# Namespaces (the text before the first ':') that remain among the selected nodes.
{node_id.partition(':')[0] for node_id in nodes_of_interest}

{'DB', 'GO', 'MESH', 'UniProt'}

In [19]:
# Restrict the DrugMechDB graph to the selected nodes and report (nodes, edges).
filtered_graph = G.subgraph(nodes_of_interest)
filtered_graph.number_of_nodes(), filtered_graph.number_of_edges()

(210, 106)

# Get all edge information from filtered graph

In [20]:
# Translate the filtered graph into (source, target, relation) triples.
# Drugs are represented by their *name*, proteins by their NCBI Gene ID, and
# biological processes keep their original identifier.
drug2mech_edges = []
counter = {
     'drug-protein': 0,
     'protein-process': 0,
     'protein-protein': 0,
     'drug-process': 0,
}

for source, target, data in filtered_graph.edges(data=True):
    source_attrs = filtered_graph.nodes[source]
    target_attrs = filtered_graph.nodes[target]

    # Edge sign -> relation label
    edge_type = 'downregulates' if data['name'] == -1 else 'upregulates'

    # Only these four type combinations are kept
    type_pair = (source_attrs['type'], target_attrs['type'])

    if type_pair == ('Drug', 'Protein'):
        triple = (source_attrs['name'], uniprot2ncbi[target], edge_type)
        bucket = 'drug-protein'
    elif type_pair == ('Protein', 'Biological Process'):
        triple = (uniprot2ncbi[source], target, edge_type)
        bucket = 'protein-process'
    elif type_pair == ('Protein', 'Protein'):
        triple = (uniprot2ncbi[source], uniprot2ncbi[target], edge_type)
        bucket = 'protein-protein'
    elif type_pair == ('Drug', 'Biological Process'):
        triple = (source_attrs['name'], target, edge_type)
        bucket = 'drug-process'
    else:
        continue  # includes GO-GO edges

    drug2mech_edges.append(triple)
    counter[bucket] += 1

len(drug2mech_edges)

102

# Harmonization of nodes to consistent ontology

For compound nodes, we ground the drug names to PubChem compound identifiers and check how many can be resolved.

In [21]:
# Ground drug names to PubChem compound IDs.
# A source without ':' is treated as a drug name (gene sources already look
# like 'ncbigene:<id>') and is resolved through a PubChem name search. Each
# distinct name is looked up only once: the search is a network call and the
# same drug usually appears on several edges.
drug2mech_harmonized_edges = []
pubchem_id_by_name = {}  # drug name -> 'pubchem.compound:<cid>', or None when PubChem has no hit

for source_node, target_node, rel_type in tqdm(drug2mech_edges):
    if ':' in source_node:
        # Already an identifier; keep the edge unchanged.
        drug2mech_harmonized_edges.append((source_node, target_node, rel_type))
        continue

    if source_node not in pubchem_id_by_name:
        compound = get_compounds(source_node, 'name')
        # The first hit is taken as the grounding for this name.
        pubchem_id_by_name[source_node] = (
            'pubchem.compound:' + str(compound[0].cid) if len(compound) > 0 else None
        )

    cidx = pubchem_id_by_name[source_node]
    if cidx is None:
        # Drug could not be resolved in PubChem; its edges are dropped.
        continue

    drug2mech_harmonized_edges.append((cidx, target_node, rel_type))

len(drug2mech_harmonized_edges)

  0%|          | 0/102 [00:00<?, ?it/s]

100%|██████████| 102/102 [00:50<00:00,  2.04it/s]


98

# Getting the final edges for the graph

In [22]:
# Convert the harmonized triples into KG-style edge records.
# Namespaces are recognised by identifier *prefix* rather than by substring,
# so e.g. 'GO' can no longer match in the middle of an unrelated identifier.
other_edges = []
edge_connections = defaultdict(set)

for source, target, edge_type in tqdm(drug2mech_harmonized_edges):
    source_is_compound = source.startswith('pubchem.compound:')
    source_is_gene = source.startswith('ncbigene:')
    target_is_gene = target.startswith('ncbigene:')
    target_is_process = target.startswith('GO:')

    # drug-protein edges
    if source_is_compound and target_is_gene:
        source_node_type, target_node_type = 'Compound', 'Gene'
        relation = edge_type

    # protein-protein edges
    elif source_is_gene and target_is_gene:
        source_node_type, target_node_type = 'Gene', 'Gene'
        relation = edge_type

    # protein-process edges: the up/down sign is replaced by a fixed 'participates' label
    elif source_is_gene and target_is_process:
        source_node_type, target_node_type = 'Gene', 'Biological Process'
        relation = 'participates'

    # anything else (e.g. direct drug-process edges) is reported and left out
    else:
        print(source, target, edge_type)
        continue

    other_edges.append({
        'source': source,
        'source_node_type': source_node_type,
        'target': target,
        'target_node_type': target_node_type,
        'edge_type': relation
    })
    edge_connections[source].add(target)

100%|██████████| 98/98 [00:00<00:00, 91689.00it/s]

pubchem.compound:5380 GO:0002553 downregulates
pubchem.compound:443943 GO:0006954 downregulates
pubchem.compound:9833444 GO:0032782 upregulates
pubchem.compound:204 GO:0042060 upregulates
pubchem.compound:6421 GO:0070265 upregulates





In [23]:
# Build a directed graph from the edge records, storing each relation as the `edge_type` attribute
graph = nx.DiGraph()
graph.add_edges_from(
    (record['source'], record['target'], {'edge_type': record['edge_type']})
    for record in other_edges
)


In [24]:
# Pair every compound with every GO process it can reach through a directed path
# in the graph, and record each reachable pair as a Compound -> Biological Process
# 'induces' edge.
drug_nodes = [node for node in graph.nodes() if 'pubchem.compound' in node]
process_nodes = [node for node in graph.nodes() if 'GO' in node]

drug2process_edges = [
    {
        'source': start_node,
        'source_node_type': 'Compound',
        'target': end_node,
        'target_node_type': 'Biological Process',
        'edge_type': 'induces'
    }
    for start_node, end_node in tqdm(list(product(drug_nodes, process_nodes)))
    if nx.has_path(graph, source=start_node, target=end_node)
]
    

100%|██████████| 928/928 [00:00<00:00, 70464.43it/s]


In [25]:
# Number of compound -> process pairs that are connected by a path.
len(drug2process_edges)

42

In [26]:
# Edge records derived from DrugMechDB, using the same columns as the existing KG.
kg_data = pd.DataFrame(other_edges)

# Append to the existing KG
# NOTE(review): rows are concatenated as-is; no de-duplication against edges
# already present in `combined_kg` happens here. Confirm that is acceptable.
final_kg_df = pd.concat([combined_kg, kg_data], ignore_index=True)
final_kg_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:10607,Compound,ncbigene:3553,Gene,upregulates
1,pubchem.compound:10607,Compound,ncbigene:203068,Gene,downregulates
2,pubchem.compound:10607,Compound,ncbigene:54658,Gene,downregulates
3,pubchem.compound:10607,Compound,ncbigene:7153,Gene,downregulates
4,pubchem.compound:10607,Compound,ncbigene:7277,Gene,downregulates


In [27]:
# Write the extended KG next to the input KG as a tab-separated file without the index column.
final_kg_df.to_csv(f'{KG_DIR}/final_kg.tsv', sep='\t', index=False)

In [28]:
# Compound -> process pairs found via graph paths, as a frame with the KG column layout.
validation_df = pd.DataFrame(drug2process_edges)
validation_df.head()

Unnamed: 0,source,source_node_type,target,target_node_type,edge_type
0,pubchem.compound:5291,Compound,GO:0008283,Biological Process,induces
1,pubchem.compound:5745,Compound,GO:0050900,Biological Process,induces
2,pubchem.compound:5745,Compound,GO:0006954,Biological Process,induces
3,pubchem.compound:5745,Compound,GO:0019370,Biological Process,induces
4,pubchem.compound:5743,Compound,GO:0050900,Biological Process,induces


In [29]:
# Write the compound -> process pairs to `test.tsv` (tab-separated, no index column).
validation_df.to_csv(f'{KG_DIR}/test.tsv', sep='\t', index=False)