In [1]:
import sys
from pathlib import Path
# !pip install -r requirements.txt
import pandas as pd
import networkx as nx
import re
import yaml
from itertools import chain
from pathlib import Path
from operator import itemgetter
from collections import defaultdict
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# !pip install mygene
import mygene
mg = mygene.MyGeneInfo()
import ast

In [2]:
from data_tools.data_tools.files._retrieval import download
from data_tools.data_tools.files._analysis import *
import warnings
warnings.filterwarnings("ignore")

# DrugMechDB data

In [3]:
# Load every DrugMechDB indication graph from the YAML dump.
# The encoding is pinned so parsing does not depend on the platform's default
# locale encoding (node names may contain non-ASCII characters).
with open('indication_paths.yaml', 'r', encoding='utf-8') as fh:
    ind = yaml.safe_load(fh)

In [4]:
import pprint
pprint.pprint(ind[0])

{'directed': True,
 'graph': {'_id': 'DB00619_MESH_D015464_1',
           'disease': 'CML (ph+)',
           'disease_mesh': 'MESH:D015464',
           'drug': 'imatinib',
           'drug_mesh': 'MESH:D000068877',
           'drugbank': 'DB:DB00619'},
 'links': [{'key': 'decreases activity of',
            'source': 'MESH:D000068877',
            'target': 'UniProt:P00519'},
           {'key': 'causes',
            'source': 'UniProt:P00519',
            'target': 'MESH:D015464'}],
 'multigraph': True,
 'nodes': [{'id': 'MESH:D000068877', 'label': 'Drug', 'name': 'imatinib'},
           {'id': 'UniProt:P00519', 'label': 'Protein', 'name': 'BCR/ABL'},
           {'id': 'MESH:D015464', 'label': 'Disease', 'name': 'CML (ph+)'}]}


In [5]:
# Pre-compute the metapath listings for all indication graphs. Both results are
# indexed by the graph `_id` when the per-graph statistics are assembled.
all_metapath_nodes = get_metapath_node(ind)
all_metapath_edges = get_metapath_edges(ind)

In [6]:
# Per-graph summary columns; converted to a DataFrame once the loop finishes.
basic_stats = defaultdict(list)

# Flat accumulators collected across all graphs.
all_metaedges = []
all_parings = []
all_targets = []
unique_metaedges = []
first_edge_type = []
all_nodes = []

# Node-id lookups merged over all graphs (entries from later graphs win).
id_to_name = {}
id_to_label = {}

# Loop through each graph object p in ind and compute stats.
for i, p in enumerate(ind):
    # _id: unique identifier of the graph/path
    _id = p["graph"]["_id"]

    # drug_id, dis_id: parsed identifiers for the drug and disease involved
    drug_id, dis_id = path_to_tup(p)
    # NOTE(review): `paths` is never read below. The call is kept in case
    # get_all_paths validates the graph or has side effects — confirm and drop.
    paths = get_all_paths(p)
    G = path_to_G(p)

    G = add_metaedges(G)
    G = add_meanode_pairs(G)

    basic_stats['idx'].append(i)  # index
    basic_stats['id'].append(_id)  # DrugMechDB id
    basic_stats['drug'].append(drug_id)  # Drug id
    basic_stats['disease'].append(dis_id)  # Disease id
    # Stored as the graph's node view (not a copied list); later cells iterate it.
    basic_stats['nodes'].append(G.nodes)  # nodes in metapath
    basic_stats['n_nodes'].append(len(G.nodes))  # number of nodes in metapath
    basic_stats['n_edges'].append(len(G.edges))  # number of edges in metapath
    basic_stats['n_paths'].append(len(all_metapath_nodes[_id]))  # number of paths
    basic_stats['metapath'].append(all_metapath_nodes[_id])
    basic_stats['metapath_with_edges'].append(all_metapath_edges[_id])

    this_metaedges = [G.edges[e]['metaedge'] for e in G.edges]

    all_metaedges += this_metaedges
    unique_metaedges += list(set(this_metaedges))

    all_parings += [G.edges[e]['mn_pair'] for e in G.edges]
    all_targets += get_targets(G)
    first_edge_type += get_target_metaedges(G)
    all_nodes += list(G.nodes)

    # Merge in place: rebuilding both dicts with {**old, **new} on every
    # iteration copied the whole mapping once per graph.
    id_to_label.update(get_id_to_type(G))
    id_to_name.update(get_id_to_name(G))

basic_stats = pd.DataFrame(basic_stats)

In [7]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [8]:
basic_stats.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges
0,0,DB00619_MESH_D015464_1,DB:DB00619,MESH:D015464,"(MESH:D000068877, UniProt:P00519, MESH:D015464)",3,2,1,[Drug - Protein - Disease],[Drug - decreases activity of - Protein - causes - Disease]


In [9]:
# NOTE(review): `basic_stats` is already a DataFrame at this point (it is
# converted right after the stats loop), so this re-wrap looks redundant —
# confirm and remove.
basic_stats = pd.DataFrame(basic_stats)

In [10]:
basic_stats.shape

(4846, 10)

In [11]:
basic_stats["n_paths"].value_counts()

n_paths
1     4258
2      440
4       48
3       41
0       29
5       15
6       13
21       1
10       1
Name: count, dtype: int64

In [12]:
type(basic_stats)

pandas.core.frame.DataFrame

In [13]:
basic_stats[basic_stats["id"]=="DB00674_MESH_D000544_1"]

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges
2805,2805,DB00674_MESH_D000544_1,DB:DB00674,MESH:D000544,"(MESH:D005702, UniProt:P22303, GO:0006581, CHEBI:15355, HP:0100543, MESH:D000544)",6,5,1,[Drug - Protein - BiologicalProcess - ChemicalSubstance - PhenotypicFeature - Disease],[Drug - decreases activity of - Protein - positively regulates - BiologicalProcess - decreases abundance of - ChemicalSubstance - negatively correlated with - PhenotypicFeature - manifestation of - Disease]


# Benchmark Creation

### Drug–Biological Process dataset creation

In [14]:
def count_go_occurrences(nodes_set):
    """Return how many node ids in ``nodes_set`` contain the substring ``'GO:'``.

    The check is a substring match (``'GO:' in node``), not a prefix match.
    """
    go_hits = [node for node in nodes_set if 'GO:' in node]
    return len(go_hits)

# Keep only graphs that contain exactly one GO node.
# `.copy()` detaches the result from `basic_stats`: the following cells add
# columns to `go_bp`, which on a bare boolean-indexed slice is a chained
# assignment (pandas flags it with SettingWithCopyWarning).
go_bp = basic_stats[basic_stats['nodes'].apply(count_go_occurrences)==1].copy()

go_bp.shape

(1840, 10)

In [15]:
go_bp.iloc[0:1]

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges
1,1,DB00619_MESH_D034721_1,DB:DB00619,MESH:D034721,"(MESH:D000068877, UniProt:P10721, UniProt:P16234, GO:0008283, MESH:D034721)",5,5,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - decreases activity of - Protein - positively regulates - BiologicalProcess - causes - Disease]


In [16]:
def extract_go_ids(nodes):
    """Join every node id that starts with ``'GO:'`` into one ``', '``-separated string.

    Input order is preserved; an empty string is returned when nothing matches.
    """
    go_ids = filter(lambda node: node.startswith('GO:'), nodes)
    return ', '.join(go_ids)

go_bp['bp'] = go_bp['nodes'].apply(extract_go_ids)

In [17]:
go_bp['Drug_MeshID'] = go_bp['nodes'].apply(lambda x: list(x)[0])

In [18]:
go_bp.iloc[0:1]

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,bp,Drug_MeshID
1,1,DB00619_MESH_D034721_1,DB:DB00619,MESH:D034721,"(MESH:D000068877, UniProt:P10721, UniProt:P16234, GO:0008283, MESH:D034721)",5,5,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - decreases activity of - Protein - positively regulates - BiologicalProcess - causes - Disease],GO:0008283,MESH:D000068877


In [19]:
def get_names(row, id_to_name):
    """Resolve drug, disease and biological-process names for one row.

    Biological-process variant of ``get_names`` (this notebook defines several
    functions with this name; whichever cell ran last is the active one).

    Parameters
    ----------
    row : mapping with keys 'Drug_MeshID', 'disease' and 'bp'
        'bp' may be a ', '-separated string of ids or a single non-string value.
    id_to_name : dict
        Lookup from node id to display name.

    Returns
    -------
    pd.Series
        ``[drug_name, disease_name, bp_names]`` where ``bp_names`` is a list;
        ids missing from ``id_to_name`` map to ``'Unknown'``.
    """
    def lookup(node_id):
        return id_to_name.get(node_id, 'Unknown')

    drug_name = lookup(row['Drug_MeshID'])
    disease_name = lookup(row['disease'])

    # A row may carry several biological-process ids joined by ', '.
    bp_field = row['bp']
    if isinstance(bp_field, str):
        bp_ids = bp_field.split(', ')
    else:
        bp_ids = [bp_field]
    bp_names = list(map(lookup, bp_ids))

    return pd.Series([drug_name, disease_name, bp_names])


In [20]:
def map_node_names_from_nodeview(node_view, id_to_name):
    """Translate each node id in ``node_view`` to its name.

    Every id is converted with ``str(...)`` and stripped of surrounding
    whitespace before the lookup; ids absent from ``id_to_name`` become
    ``'Unknown'``. A non-iterable ``node_view`` yields ``['Unknown']``.
    """
    if hasattr(node_view, '__iter__'):
        names = []
        for node_id in node_view:
            key = str(node_id).strip()
            names.append(id_to_name.get(key, 'Unknown'))
        return names

    return ['Unknown']


In [21]:
go_bp['node_names'] = go_bp['nodes'].apply(lambda x: map_node_names_from_nodeview(x, id_to_name))


In [22]:
go_bp.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,bp,Drug_MeshID,node_names
1,1,DB00619_MESH_D034721_1,DB:DB00619,MESH:D034721,"(MESH:D000068877, UniProt:P10721, UniProt:P16234, GO:0008283, MESH:D034721)",5,5,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - decreases activity of - Protein - positively regulates - BiologicalProcess - causes - Disease],GO:0008283,MESH:D000068877,"[imatinib, Mast/stem cell growth factor receptor Kit, Platelet-derived growth factor receptor alpha, cell population proliferation, Systemic mast cell disease]"


In [23]:
go_bp[['drug_name', 'disease_name', 'bp_name']] = go_bp.apply(get_names, axis=1, id_to_name=id_to_name)

In [24]:
go_bp.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,bp,Drug_MeshID,node_names,drug_name,disease_name,bp_name
1,1,DB00619_MESH_D034721_1,DB:DB00619,MESH:D034721,"(MESH:D000068877, UniProt:P10721, UniProt:P16234, GO:0008283, MESH:D034721)",5,5,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - decreases activity of - Protein - positively regulates - BiologicalProcess - causes - Disease],GO:0008283,MESH:D000068877,"[imatinib, Mast/stem cell growth factor receptor Kit, Platelet-derived growth factor receptor alpha, cell population proliferation, Systemic mast cell disease]",imatinib,Systemic mast cell disease,[cell population proliferation]


In [25]:
# convert bp_name into str
go_bp['bp_name'] = go_bp['bp_name'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [26]:
go_bp.shape

(1840, 16)

In [27]:

go_bp['question'] = "Which Drug can be used in the treatment of " + go_bp['disease_name'] + " by targeting biological process: " + go_bp['bp_name'] + "?"

In [28]:
go_bp.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,bp,Drug_MeshID,node_names,drug_name,disease_name,bp_name,question
1,1,DB00619_MESH_D034721_1,DB:DB00619,MESH:D034721,"(MESH:D000068877, UniProt:P10721, UniProt:P16234, GO:0008283, MESH:D034721)",5,5,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - decreases activity of - Protein - positively regulates - BiologicalProcess - causes - Disease],GO:0008283,MESH:D000068877,"[imatinib, Mast/stem cell growth factor receptor Kit, Platelet-derived growth factor receptor alpha, cell population proliferation, Systemic mast cell disease]",imatinib,Systemic mast cell disease,cell population proliferation,Which Drug can be used in the treatment of Systemic mast cell disease by targeting biological process: cell population proliferation?


In [29]:
go_bp["n_nodes"].value_counts()

n_nodes
5     655
6     496
7     231
4     171
8     136
9      47
3      41
10     36
11     16
13      5
12      4
14      2
Name: count, dtype: int64

In [30]:
# For the drug-centric benchmark, for now keep only graphs with more than 2 and fewer than 6 nodes: this captures mechanistic relationships without letting them get too complex as the number of intermediate nodes increases.

In [31]:
go_bp["n_paths"].value_counts()

n_paths
1    1704
2     116
5       8
0       6
3       4
4       2
Name: count, dtype: int64

In [32]:
go_bp["Drug_MeshID"] = go_bp["Drug_MeshID"].str.replace(
    r"^DB:", "DRUGBANK:", regex=True
)

In [33]:
go_bp["drug"] = go_bp["drug"].str.replace(
    r"^DB:", "DRUGBANK:", regex=True
)

In [34]:
# filter go_bp with n_paths == 1 and n_nodes >2 and n_nodes < 6
go_bp_filtered = go_bp[(go_bp["n_paths"] == 1) & (go_bp["n_nodes"] > 2) & (go_bp["n_nodes"] < 6)]
go_bp_filtered.shape

(842, 17)

In [35]:
columns_order = ['idx', 'id', 'drug', 'Drug_MeshID', 'disease', 'bp','drug_name','disease_name','bp_name','nodes', 'n_nodes', 'n_edges', 'n_paths', 'metapath', 'metapath_with_edges', 'question']
go_bp_filtered = go_bp_filtered[columns_order]

In [36]:
go_bp_filtered.head(1)

Unnamed: 0,idx,id,drug,Drug_MeshID,disease,bp,drug_name,disease_name,bp_name,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,question
1,1,DB00619_MESH_D034721_1,DRUGBANK:DB00619,MESH:D000068877,MESH:D034721,GO:0008283,imatinib,Systemic mast cell disease,cell population proliferation,"(MESH:D000068877, UniProt:P10721, UniProt:P16234, GO:0008283, MESH:D034721)",5,5,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - decreases activity of - Protein - positively regulates - BiologicalProcess - causes - Disease],Which Drug can be used in the treatment of Systemic mast cell disease by targeting biological process: cell population proliferation?


In [37]:
go_bp_filtered.shape

(842, 16)

In [38]:
# go_bp_filtered.to_csv("Benchmarks/DMDB_go_bp_filtered.csv", index=False)

### Metabolite dataset

In [39]:
def count_chebi_occurrences(nodes_set):
    """Count the node ids in ``nodes_set`` that contain the substring ``'CHEBI:'``."""
    # Each membership test is a bool; summing them gives the number of matches.
    return sum('CHEBI:' in node for node in nodes_set)

# Keep only graphs that contain exactly one CHEBI node.
# `.copy()` detaches the result from `basic_stats`: the following cells add
# columns to `chebi_metabolite`, which on a bare boolean-indexed slice is a
# chained assignment (pandas flags it with SettingWithCopyWarning).
chebi_metabolite = basic_stats[basic_stats['nodes'].apply(count_chebi_occurrences)==1].copy()

chebi_metabolite.shape

(327, 10)

In [40]:
chebi_metabolite.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges
56,56,DB00994_MESH_D007634_1,DB:DB00994,MESH:D007634,"(MESH:D009355, CHEBI:18111, GO:0006412, taxonomy:1280, MESH:D007634)",5,4,1,[Drug - ChemicalSubstance - BiologicalProcess - OrganismTaxon - Disease],[Drug - decreases activity of - ChemicalSubstance - participates in - BiologicalProcess - in taxon - OrganismTaxon - causes - Disease]


In [41]:
def extract_chebi_ids(nodes):
    """Return the ids starting with ``'CHEBI:'`` as one ``', '``-joined string.

    Input order is preserved; the result is ``''`` when no id matches.
    """
    chebi_ids = [node for node in nodes if node.startswith('CHEBI:')]
    return ', '.join(chebi_ids)

chebi_metabolite['metabolite'] = chebi_metabolite['nodes'].apply(extract_chebi_ids)

In [42]:
chebi_metabolite.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,metabolite
56,56,DB00994_MESH_D007634_1,DB:DB00994,MESH:D007634,"(MESH:D009355, CHEBI:18111, GO:0006412, taxonomy:1280, MESH:D007634)",5,4,1,[Drug - ChemicalSubstance - BiologicalProcess - OrganismTaxon - Disease],[Drug - decreases activity of - ChemicalSubstance - participates in - BiologicalProcess - in taxon - OrganismTaxon - causes - Disease],CHEBI:18111


In [43]:
chebi_metabolite['Drug_MeshID'] = chebi_metabolite['nodes'].apply(lambda x: list(x)[0])

In [44]:
def get_names(row, id_to_name):
    """Resolve drug, disease and metabolite names for one row.

    Metabolite variant of ``get_names`` (this notebook defines several
    functions with this name; whichever cell ran last is the active one).

    Parameters
    ----------
    row : mapping with keys 'Drug_MeshID', 'disease' and 'metabolite'
        'metabolite' may be a ', '-separated string of ids or a single
        non-string value.
    id_to_name : dict
        Lookup from node id to display name.

    Returns
    -------
    pd.Series
        ``[drug_name, disease_name, metabolite_names]`` where
        ``metabolite_names`` is a list; unknown ids map to ``'Unknown'``.
    """
    drug_name = id_to_name.get(row['Drug_MeshID'], 'Unknown')
    disease_name = id_to_name.get(row['disease'], 'Unknown')

    # A row may carry several metabolite ids joined by ', '.
    raw_metabolite = row['metabolite']
    metabolite_ids = raw_metabolite.split(', ') if isinstance(raw_metabolite, str) else [raw_metabolite]

    metabolite_names = []
    for metabolite_id in metabolite_ids:
        metabolite_names.append(id_to_name.get(metabolite_id, 'Unknown'))

    return pd.Series([drug_name, disease_name, metabolite_names])


In [45]:
id_to_name.get("CHEBI:18111")

'ribosomal RNA'

In [46]:
print(repr(chebi_metabolite['nodes'].iloc[0]))


NodeView(('MESH:D009355', 'CHEBI:18111', 'GO:0006412', 'taxonomy:1280', 'MESH:D007634'))


In [47]:
def map_node_names_from_nodeview(node_view, id_to_name):
    """Map every node id in ``node_view`` to its name via ``id_to_name``.

    Same contract as the identically named helper defined earlier in this
    notebook: ids are stringified and stripped before lookup, unknown ids give
    ``'Unknown'``, and a non-iterable argument gives ``['Unknown']``.
    """
    if not hasattr(node_view, '__iter__'):
        return ['Unknown']

    cleaned_ids = (str(node_id).strip() for node_id in node_view)
    return [id_to_name.get(clean_id, 'Unknown') for clean_id in cleaned_ids]


In [48]:
chebi_metabolite['node_names'] = chebi_metabolite['nodes'].apply(lambda x: map_node_names_from_nodeview(x, id_to_name))


In [49]:
chebi_metabolite.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,metabolite,Drug_MeshID,node_names
56,56,DB00994_MESH_D007634_1,DB:DB00994,MESH:D007634,"(MESH:D009355, CHEBI:18111, GO:0006412, taxonomy:1280, MESH:D007634)",5,4,1,[Drug - ChemicalSubstance - BiologicalProcess - OrganismTaxon - Disease],[Drug - decreases activity of - ChemicalSubstance - participates in - BiologicalProcess - in taxon - OrganismTaxon - causes - Disease],CHEBI:18111,MESH:D009355,"[neomycin, ribosomal RNA, translation, Staphylococcus aureus, Keratitis]"


In [50]:
# Flag rows whose stringified metapath mentions the 'ChemicalSubstance' node type
metapath_text = chebi_metabolite['metapath_with_edges'].astype(str)
has_chemical = metapath_text.str.contains('ChemicalSubstance', regex=False)

# Tally the rows on each side of the flag
count_with = has_chemical.sum()
count_without = (~has_chemical).sum()

print(f"Rows with 'ChemicalSubstance': {count_with}")
print(f"Rows without 'ChemicalSubstance': {count_without}")


Rows with 'ChemicalSubstance': 303
Rows without 'ChemicalSubstance': 24


In [51]:
chebi_metabolite[~chebi_metabolite['metapath_with_edges'].apply(lambda x: 'ChemicalSubstance' in str(x))].head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,metabolite,Drug_MeshID,node_names
542,542,DB00248_MESH_D006966_1,DB:DB00248,MESH:D006966,"(MESH:D000077465, UniProt:P14416, GO:0007195, CHEBI:17489, GO:0051209, GO:1902722, MESH:D006966)",7,6,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - increases activity of - Protein - participates in - BiologicalProcess - causes - Disease],CHEBI:17489,MESH:D000077465,"[Cabergoline, D(2) dopamine receptor, Adenylate cyclase-inhibiting dopamine receptor signaling pathway, 3',5'-cyclic AMP, Release of sequestered calcium ion into cytosol, Positive regulation of prolactin secretion, Hyperprolactinemia]"


In [52]:
chebi_metabolite[chebi_metabolite['metapath_with_edges'].apply(lambda x: 'ChemicalSubstance' in str(x))].head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,metabolite,Drug_MeshID,node_names
56,56,DB00994_MESH_D007634_1,DB:DB00994,MESH:D007634,"(MESH:D009355, CHEBI:18111, GO:0006412, taxonomy:1280, MESH:D007634)",5,4,1,[Drug - ChemicalSubstance - BiologicalProcess - OrganismTaxon - Disease],[Drug - decreases activity of - ChemicalSubstance - participates in - BiologicalProcess - in taxon - OrganismTaxon - causes - Disease],CHEBI:18111,MESH:D009355,"[neomycin, ribosomal RNA, translation, Staphylococcus aureus, Keratitis]"


In [53]:
chebi_metabolite[['drug_name', 'disease_name', 'metabolite_name']] = chebi_metabolite.apply(get_names, axis=1, id_to_name=id_to_name)

In [54]:
chebi_metabolite.shape

(327, 16)

In [55]:
chebi_metabolite["n_nodes"].value_counts()

n_nodes
7     83
5     55
8     48
6     34
4     21
10    18
11    18
9     18
3      9
13     8
12     7
14     5
15     2
18     1
Name: count, dtype: int64

In [56]:
def extract_edges(row):
    """Pull the edge labels out of a row's ``'metapath_with_edges'`` value.

    The value is read as an alternating ``node - edge - node - ...`` sequence
    and the entries at odd positions are returned.

    Accepted shapes of ``row['metapath_with_edges']``:
      * a string with tokens separated by ``' - '`` (surrounding ``[``/``]``
        characters are stripped first);
      * a one-element list holding such a string;
      * any other list, which is used as the token sequence as-is.

    Returns
    -------
    list | str
        The list of odd-position tokens; the string ``"Invalid format"`` for
        any other value type; or ``"Error: <message>"`` if anything raises.
    """
    try:
        raw = row['metapath_with_edges']

        # Unwrap the common case of a single path string wrapped in a list.
        is_single_path = isinstance(raw, list) and len(raw) == 1 and isinstance(raw[0], str)
        if is_single_path:
            raw = raw[0]

        if isinstance(raw, str):
            tokens = [piece.strip() for piece in raw.strip('[]').split(' - ')]
        elif isinstance(raw, list):
            # NOTE(review): a list with several path strings lands here and is
            # treated as already tokenized — confirm that is intended.
            tokens = raw
        else:
            return "Invalid format"

        # Edge labels sit between nodes, i.e. at indices 1, 3, 5, ...
        return tokens[1::2]
    except Exception as e:
        return f"Error: {e}"



In [57]:

chebi_metabolite['edges'] = chebi_metabolite.apply(extract_edges, axis=1)

In [58]:
def filter_and_format_metabolite_df(
    df,
    include_chemical_substance=True,
    exact_n_paths=1,
    columns_order=None
):
    """Normalize DrugBank prefixes and filter the metabolite frame.

    Works on a copy, so the caller's frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Drug_MeshID', 'drug', 'metapath_with_edges' and 'n_paths'.
    include_chemical_substance : bool, default True
        If truthy keep only rows whose stringified metapath contains
        'ChemicalSubstance'; otherwise keep only rows that do not.
    exact_n_paths : int or None, default 1
        Keep only rows with this many paths; ``None`` disables the filter.
    columns_order : list of str, optional
        Desired column order; names missing from the frame are skipped.

    Returns
    -------
    pd.DataFrame
        The filtered (and optionally re-ordered) frame with its original index.
    """
    out = df.copy()

    # Rewrite a leading 'DB:' prefix to 'DRUGBANK:' on both identifier columns.
    for id_col in ("Drug_MeshID", "drug"):
        out[id_col] = out[id_col].str.replace(r"^DB:", "DRUGBANK:", regex=True)

    path_text = out["metapath_with_edges"].astype(str)

    # Start from "no subclass edge", then add the remaining row conditions.
    keep = ~path_text.str.contains("subclass")

    mentions_chemical = path_text.str.contains("ChemicalSubstance")
    keep &= mentions_chemical if include_chemical_substance else ~mentions_chemical

    if exact_n_paths is not None:
        keep &= out["n_paths"] == exact_n_paths

    out = out[keep]

    if columns_order:
        present = [col for col in columns_order if col in out.columns]
        out = out[present]

    return out

In [59]:
chebi_metabolite.shape

(327, 17)

In [60]:
result_df = filter_and_format_metabolite_df(chebi_metabolite)
result_df.shape

(201, 17)

In [61]:
result_df["n_nodes"].value_counts()

n_nodes
7     63
5     44
8     28
6     27
4     18
9      9
3      6
10     2
12     1
11     1
14     1
13     1
Name: count, dtype: int64

In [62]:
result_df.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,metabolite,Drug_MeshID,node_names,drug_name,disease_name,metabolite_name,edges
56,56,DB00994_MESH_D007634_1,DRUGBANK:DB00994,MESH:D007634,"(MESH:D009355, CHEBI:18111, GO:0006412, taxonomy:1280, MESH:D007634)",5,4,1,[Drug - ChemicalSubstance - BiologicalProcess - OrganismTaxon - Disease],[Drug - decreases activity of - ChemicalSubstance - participates in - BiologicalProcess - in taxon - OrganismTaxon - causes - Disease],CHEBI:18111,MESH:D009355,"[neomycin, ribosomal RNA, translation, Staphylococcus aureus, Keratitis]",neomycin,Keratitis,[ribosomal RNA],"[decreases activity of, participates in, in taxon, causes]"


In [63]:
result_df.shape

(201, 17)

In [64]:
type(result_df["metabolite_name"])

pandas.core.series.Series

In [65]:
def unwrap_list(x):
    """Return the first element of a non-empty list, otherwise ``None``.

    Anything that is not a list (including tuples and strings) gives ``None``.
    """
    if not isinstance(x, list) or not x:
        return None
    return x[0]

result_df['metabolite_name_str'] = result_df['metabolite_name'].apply(unwrap_list)

In [66]:
# result_df["question_3_mod"]

In [67]:
result_df["question"] = "Which biochemical entity is affected by the Drug " + result_df["drug_name"] + " via its mechanism of action in treating the Disease " + result_df["disease_name"] + " ?"

In [68]:
result_df.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,metabolite,Drug_MeshID,node_names,drug_name,disease_name,metabolite_name,edges,metabolite_name_str,question
56,56,DB00994_MESH_D007634_1,DRUGBANK:DB00994,MESH:D007634,"(MESH:D009355, CHEBI:18111, GO:0006412, taxonomy:1280, MESH:D007634)",5,4,1,[Drug - ChemicalSubstance - BiologicalProcess - OrganismTaxon - Disease],[Drug - decreases activity of - ChemicalSubstance - participates in - BiologicalProcess - in taxon - OrganismTaxon - causes - Disease],CHEBI:18111,MESH:D009355,"[neomycin, ribosomal RNA, translation, Staphylococcus aureus, Keratitis]",neomycin,Keratitis,[ribosomal RNA],"[decreases activity of, participates in, in taxon, causes]",ribosomal RNA,Which biochemical entity is affected by the Drug neomycin via its mechanism of action in treating the Disease Keratitis ?


In [69]:
# result_df.to_csv("Benchmarks/DMDB_chebi_metabolite_filtered.csv", index=False)

In [70]:
#result_df.to_csv("_data/DMDB_chebi_metabolite_filtered_04_23_2025.csv", index=False)

### Gene-centric dataset

In [71]:
basic_stats.shape

(4846, 10)

In [72]:
basic_stats.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges
0,0,DB00619_MESH_D015464_1,DB:DB00619,MESH:D015464,"(MESH:D000068877, UniProt:P00519, MESH:D015464)",3,2,1,[Drug - Protein - Disease],[Drug - decreases activity of - Protein - causes - Disease]


In [73]:
def count_uniprot_occurrences(nodes_set):
    """Count the node ids in ``nodes_set`` that contain the substring ``'UniProt'``.

    ``nodes_set`` can be any iterable of string ids. The check is a substring
    match, not a prefix match.
    """
    total = 0
    for node in nodes_set:
        if 'UniProt' in node:
            total += 1
    return total

# Keep only graphs that contain exactly one UniProt node.
# `.copy()` detaches the result from `basic_stats`: a later cell adds the
# 'protein' column to `filtered_df`, which on a bare boolean-indexed slice is a
# chained assignment (pandas flags it with SettingWithCopyWarning).
filtered_df = basic_stats[basic_stats['nodes'].apply(count_uniprot_occurrences) == 1].copy()

In [74]:
# unique metapth find in basic_stats[(basic_stats["uniprot_count"]==1) & (basic_stats["n_paths"]==1)]["metapath"]
#basic_stats[(basic_stats["uniprot_count"]==1) & (basic_stats["n_paths"]==1)]["metapath"].value_counts()

In [75]:
filtered_df.shape

(1802, 10)

In [76]:
# filtered_df.groupby(["drug", "disease"]).head(1)

In [77]:
filtered_df["n_paths"].value_counts()

n_paths
1     1523
2      200
4       27
0       18
3       15
5       11
6        7
10       1
Name: count, dtype: int64

In [78]:
filtered_df.head(1)

Unnamed: 0,idx,id,drug,disease,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges
0,0,DB00619_MESH_D015464_1,DB:DB00619,MESH:D015464,"(MESH:D000068877, UniProt:P00519, MESH:D015464)",3,2,1,[Drug - Protein - Disease],[Drug - decreases activity of - Protein - causes - Disease]


In [79]:
def extract_uniprot_ids(nodes):
    """Return the ids starting with ``'UniProt:'`` as one ``', '``-joined string.

    Input order is preserved; the result is ``''`` when no id matches.
    """
    uniprot_ids = []
    for node in nodes:
        if node.startswith('UniProt:'):
            uniprot_ids.append(node)
    return ', '.join(uniprot_ids)

filtered_df['protein'] = filtered_df['nodes'].apply(extract_uniprot_ids)

In [80]:
columns_order = ['idx', 'id', 'drug', 'disease', 'protein', 'nodes', 'n_nodes', 'n_edges', 'n_paths', 'metapath', 'metapath_with_edges']
filtered_df = filtered_df[columns_order]
filtered_df['Drug_MeshID'] = filtered_df['nodes'].apply(lambda x: list(x)[0])
filtered_df.head(1)

Unnamed: 0,idx,id,drug,disease,protein,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,Drug_MeshID
0,0,DB00619_MESH_D015464_1,DB:DB00619,MESH:D015464,UniProt:P00519,"(MESH:D000068877, UniProt:P00519, MESH:D015464)",3,2,1,[Drug - Protein - Disease],[Drug - decreases activity of - Protein - causes - Disease],MESH:D000068877


In [81]:
def get_names(row, id_to_name):
    """Resolve drug, disease and protein names for one row.

    Protein variant of ``get_names`` (this notebook defines several functions
    with this name; whichever cell ran last is the active one). Unlike the
    other variants, the 'protein' value is looked up as a single id and the
    third result is a plain string rather than a list.

    Returns
    -------
    pd.Series
        ``[drug_name, disease_name, protein_name]``; ids missing from
        ``id_to_name`` map to ``'Unknown'``.
    """
    resolved = [
        id_to_name.get(row[column], 'Unknown')
        for column in ('Drug_MeshID', 'disease', 'protein')
    ]
    return pd.Series(resolved)

filtered_df[['drug_name', 'disease_name', 'protein_name']] = filtered_df.apply(get_names, axis=1, id_to_name=id_to_name)

In [82]:
filtered_df.head(1)

Unnamed: 0,idx,id,drug,disease,protein,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,Drug_MeshID,drug_name,disease_name,protein_name
0,0,DB00619_MESH_D015464_1,DB:DB00619,MESH:D015464,UniProt:P00519,"(MESH:D000068877, UniProt:P00519, MESH:D015464)",3,2,1,[Drug - Protein - Disease],[Drug - decreases activity of - Protein - causes - Disease],MESH:D000068877,imatinib,Chronic myeloid leukemia,Tyrosine-protein kinase ABL1


In [83]:
# Symbols already resolved in this session, keyed by the prefix-stripped query
# string, so that a value shared by many rows is sent to the service only once.
_GENE_SYMBOL_CACHE = {}


def convert_protein_to_gene_symbol(uniprot_id):
    """Resolve a UniProt identifier to a gene symbol through the ``mg`` client.

    Parameters
    ----------
    uniprot_id : str
        Identifier such as ``'UniProt:P00519'``; every ``'UniProt:'`` prefix is
        removed before querying.

    Returns
    -------
    str
        The ``symbol`` of the first hit, or ``'N/A'`` when the input is blank,
        the query returns no hits, or the first hit carries no symbol.
    """
    # Remove 'UniProt:' prefix
    accession = uniprot_id.replace('UniProt:', '')

    # Nothing to look up (row without a UniProt node): skip the network call.
    if not accession.strip():
        return 'N/A'

    if accession in _GENE_SYMBOL_CACHE:
        return _GENE_SYMBOL_CACHE[accession]

    # Query mygene to get gene symbol
    # NOTE(review): `scopes` is the field selector used with `querymany` elsewhere
    # in this notebook; confirm that `query` honours it. If it does not, this is a
    # free-text search and a field-qualified query may be more precise.
    # NOTE(review): values produced by extract_uniprot_ids can contain several
    # IDs joined by ', '; those are sent as one query string — confirm intended.
    gene_info = mg.query(accession, scopes='uniprot', fields='symbol', species='human')

    symbol = 'N/A'
    if gene_info and 'hits' in gene_info and gene_info['hits']:
        symbol = gene_info['hits'][0].get('symbol', 'N/A')

    _GENE_SYMBOL_CACHE[accession] = symbol
    return symbol

filtered_df['protein_gene_symbol'] = filtered_df['protein'].apply(convert_protein_to_gene_symbol)

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

In [84]:
filtered_df.shape

(1802, 16)

In [85]:
# Keep only the rows whose n_paths value is 1.
single_path_mask = filtered_df["n_paths"].eq(1)
filtered_df = filtered_df[single_path_mask]
filtered_df.shape

(1523, 16)

In [86]:
filtered_df[filtered_df['protein_gene_symbol'] == 'N/A'].shape

(256, 16)

In [87]:
filtered_df.columns

Index(['idx', 'id', 'drug', 'disease', 'protein', 'nodes', 'n_nodes',
       'n_edges', 'n_paths', 'metapath', 'metapath_with_edges', 'Drug_MeshID',
       'drug_name', 'disease_name', 'protein_name', 'protein_gene_symbol'],
      dtype='object')

In [88]:
# Column order: identifiers first, then names, then the path description.
columns_order = [
    'idx', 'id', 'drug', 'Drug_MeshID', 'disease', 'protein',
    'drug_name', 'disease_name', 'protein_name', 'protein_gene_symbol',
    'nodes', 'n_nodes', 'n_edges', 'n_paths', 'metapath', 'metapath_with_edges',
]
filtered_df = filtered_df.loc[:, columns_order]

In [89]:
# Build one natural-language question per row from the drug and disease names.
question_start = "Which gene plays the most significant mechanistic role in how Drug "
question_middle = " treats or impacts the Disease "
filtered_df["question"] = (
    question_start
    + filtered_df["drug_name"]
    + question_middle
    + filtered_df["disease_name"]
    + "?"
)
filtered_df.head(1)

Unnamed: 0,idx,id,drug,Drug_MeshID,disease,protein,drug_name,disease_name,protein_name,protein_gene_symbol,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,question
0,0,DB00619_MESH_D015464_1,DB:DB00619,MESH:D000068877,MESH:D015464,UniProt:P00519,imatinib,Chronic myeloid leukemia,Tyrosine-protein kinase ABL1,ABL1,"(MESH:D000068877, UniProt:P00519, MESH:D015464)",3,2,1,[Drug - Protein - Disease],[Drug - decreases activity of - Protein - causes - Disease],Which gene plays the most significant mechanistic role in how Drug imatinib treats or impacts the Disease Chronic myeloid leukemia?


In [90]:
# ids_to_select = ["DB08799_MESH_D012223_1", "DB00342_MESH_D017449_1", "DB01048_MESH_D015658_1"]
# # print rows in filtered_df where id is in ids_to_select
# filtered_df[filtered_df["id"].isin(ids_to_select)]

In [91]:
filtered_df.shape

(1523, 17)

In [92]:
# Same n_paths == 1 condition that an earlier cell already applied to filtered_df;
# the result is stored under a new name for the steps that follow.
drugmechdb_filtered_df  = filtered_df[filtered_df["n_paths"]==1]

In [93]:
drugmechdb_filtered_df.head(1)

Unnamed: 0,idx,id,drug,Drug_MeshID,disease,protein,drug_name,disease_name,protein_name,protein_gene_symbol,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,question
0,0,DB00619_MESH_D015464_1,DB:DB00619,MESH:D000068877,MESH:D015464,UniProt:P00519,imatinib,Chronic myeloid leukemia,Tyrosine-protein kinase ABL1,ABL1,"(MESH:D000068877, UniProt:P00519, MESH:D015464)",3,2,1,[Drug - Protein - Disease],[Drug - decreases activity of - Protein - causes - Disease],Which gene plays the most significant mechanistic role in how Drug imatinib treats or impacts the Disease Chronic myeloid leukemia?


In [94]:
# Rows whose gene symbol is the 'N/A' placeholder string
is_unmapped = drugmechdb_filtered_df['protein_gene_symbol'].eq('N/A')
na_count = is_unmapped.sum()
print(f"Rows with protein_gene_symbol == 'N/A': {na_count}")

# Keep every other row
drugmechdb_filtered_df = drugmechdb_filtered_df[~is_unmapped]

print("New dataframe shape:", drugmechdb_filtered_df.shape)

Rows with protein_gene_symbol == 'N/A': 256
New dataframe shape: (1267, 17)


In [95]:
drugmechdb_filtered_df["n_nodes"].value_counts().sort_index()

n_nodes
3      12
4     130
5     389
6     416
7     245
8      60
9      12
11      2
13      1
Name: count, dtype: int64

In [96]:
# for BTE ID matching and query
# for BTE ID matching and query
# Rewrite a leading "DB:" prefix of Drug_MeshID as "DRUGBANK:". `assign` builds a
# new frame, so the boolean-filtered frame is never modified in place.
drugmechdb_filtered_df = drugmechdb_filtered_df.assign(
    Drug_MeshID=drugmechdb_filtered_df["Drug_MeshID"].str.replace(r"^DB:", "DRUGBANK:", regex=True)
)

# Filter out rows with missing protein_gene_symbol values
filtered_df = drugmechdb_filtered_df.dropna(subset=["protein_gene_symbol"])
print("shape of filtered_df:",filtered_df.shape)


def _unique_in_order(values):
    """Return the distinct items of ``values`` as a list, in first-seen order.

    Unlike ``list(set(values))`` the order does not depend on string hashing,
    so the aggregated lists are identical from one kernel session to the next.
    """
    return list(dict.fromkeys(values))


# Group by disease and Drug_MeshID and aggregate the required columns
grouped_with_genes = (
    filtered_df.groupby(["disease", "Drug_MeshID"])
    .agg({
        "protein_gene_symbol": _unique_in_order,  # Distinct gene symbols of the group
        "id": _unique_in_order,  # Distinct path IDs of the group
        "protein": _unique_in_order,  # Distinct protein identifiers of the group
        "protein_name": _unique_in_order,  # Distinct protein names of the group
        "drug": "first",  # Take the first value for drug (assuming it's the same per group)
        "drug_name": "first",  # Take the first value for drug_name
        "disease_name": "first",  # Take the first value for disease_name
        "question": "first",  # Take the first value for the question
    })
    .reset_index()
)
print("shape of grouped_with_genes:",grouped_with_genes.shape)

# Add a count column: number of distinct gene symbols per (disease, drug) group
grouped_with_genes["count"] = grouped_with_genes["protein_gene_symbol"].apply(len)

# Select only the specified columns for the final DataFrame; the explicit copy
# makes final_df independent of grouped_with_genes for later modification.
final_df = grouped_with_genes[
    [
        "id",
        "drug",
        "Drug_MeshID",
        "disease",
        "protein",
        "drug_name",
        "disease_name",
        "protein_name",
        "protein_gene_symbol",
        "question",
        "count",
    ]
].copy()


shape of filtered_df: (1267, 17)
shape of grouped_with_genes: (1207, 10)


In [97]:
final_df["count"].value_counts()

count
1    1183
2      14
3       7
6       1
4       1
5       1
Name: count, dtype: int64

In [98]:
# Question generation 
final_df = final_df.copy()


In [99]:
# Each (drug_name, disease_name) pair is kept once; the first row of a pair wins.
pair_columns = ["drug_name", "disease_name"]
final_df = final_df.drop_duplicates(subset=pair_columns)
print("shape of final_df after dropping duplicates:", final_df.shape)

shape of final_df after dropping duplicates: (1206, 11)


In [100]:
final_df[["count"]].value_counts().sort_index()

count
1        1182
2          14
3           7
4           1
5           1
6           1
Name: count, dtype: int64

In [101]:
#select only the rows where count is 1
# Keep the rows with exactly one gene symbol and renumber them from 0
single_gene_mask = final_df["count"].eq(1)
final_df_count_1 = final_df.loc[single_gene_mask].reset_index(drop=True)
print("shape of final_df_count_1:", final_df_count_1.shape)

shape of final_df_count_1: (1182, 11)


In [102]:
# find for each drug how many diseases it is associated with
# Number of distinct diseases per drug, followed by the distribution of that number
diseases_per_drug = final_df_count_1.groupby("drug_name")["disease_name"].nunique()
drug_disease_counts = diseases_per_drug.reset_index()
drug_disease_counts["disease_name"].value_counts()

disease_name
1     338
2      94
3      33
4      27
6      23
5      13
8       9
7       9
9       2
11      2
59      1
12      1
Name: count, dtype: int64

In [103]:
# from final_df select only the rows where count is 1 and drug_name is in the drug_disease_counts with disease_name <6
# Keep the single-gene rows whose drug is linked to fewer than 6 distinct diseases
few_disease_drugs = drug_disease_counts.loc[
    drug_disease_counts["disease_name"] < 6, "drug_name"
]
final_df_count_1_filtered = final_df_count_1[
    final_df_count_1["drug_name"].isin(few_disease_drugs)
]
print("shape of final_df_count_1_filtered:", final_df_count_1_filtered.shape)

shape of final_df_count_1_filtered: (798, 11)


In [104]:
final_df_count_1_filtered.head(1)

Unnamed: 0,id,drug,Drug_MeshID,disease,protein,drug_name,disease_name,protein_name,protein_gene_symbol,question,count
0,[DB01219_MESH_C535694_1],DB:DB01219,MESH:D003620,MESH:C535694,[UniProt:P21817],Dantrolene,Malignant hyperthermia,[Ryanodine receptor 1],[RYR1],Which gene plays the most significant mechanistic role in how Drug Dantrolene treats or impacts the Disease Malignant hyperthermia?,1


In [105]:
# final_df_count_1_filtered.to_csv(
#     "Benchmarks/DMDB_mechanistic_genes_filtered.csv", index=False
# )

In [106]:
# final_df_count_1_filtered.to_csv(
#     "data/DMDB_questions_sampled/drugmechDB_mechanistic_genes_df_final_single_count_798qa.csv", index=False
# )

In [107]:
# save the filtered_df
# filtered_df.to_csv("_data/drugmechDB_mechanistic_genes_df.csv", index=False)

In [108]:
# mygene is already imported and `mg` already created in the notebook's first
# cell; this cell repeats both statements.
import mygene
mg = mygene.MyGeneInfo()

In [109]:
mg = mygene.MyGeneInfo()

# NCBI Gene IDs to resolve
gene_ids = [
    '80336', '246329', '31111', '8835', '79602', '23030',
    '54997', '5413', '2185', '5742', '2995', '10678',
]

gene_info = mg.querymany(gene_ids, scopes='entrezgene', fields='symbol', species='human')

# Map each queried ID to its symbol; results without a symbol become 'N/A'
gene_name_dict = {}
for hit in gene_info:
    gene_name_dict[hit['query']] = hit.get('symbol', 'N/A')

id_symbol_pairs = list(gene_name_dict.items())
df = pd.DataFrame(id_symbol_pairs, columns=['NCBI Gene ID', 'Gene Name'])

print(df)

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
1 input query terms found no hit:	['31111']


   NCBI Gene ID Gene Name
0         80336   PABPC1L
1        246329     STAC3
2         31111       N/A
3          8835     SOCS2
4         79602   ADIPOR2
5         23030     KDM4B
6         54997      TESC
7          5413   SEPTIN5
8          2185     PTK2B
9          5742     PTGS1
10         2995      GYPC
11        10678    B3GNT2


In [110]:
# UniProt accessions to resolve
uniprot_ids = [
    'P00519',
    'P35354',
    'P0A7S3',
    'Q12791',
    'P10613',
]

# Query mygene to get gene symbols
gene_info = mg.querymany(uniprot_ids, scopes='uniprot', fields='symbol', species='human')

# Map each queried accession to its symbol; results without a symbol become 'N/A'
gene_name_dict = {}
for hit in gene_info:
    gene_name_dict[hit['query']] = hit.get('symbol', 'N/A')

accession_symbol_pairs = list(gene_name_dict.items())
df = pd.DataFrame(accession_symbol_pairs, columns=['UniProt ID', 'Gene Symbol'])

print(df)

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
2 input query terms found no hit:	['P0A7S3', 'P10613']


  UniProt ID Gene Symbol
0     P00519        ABL1
1     P35354       PTGS2
2     P0A7S3         N/A
3     Q12791      KCNMA1
4     P10613         N/A


# References/Sources scraping

In [111]:
import requests
from bs4 import BeautifulSoup

def get_reference_from_page(entry_id):
    """Fetch the reference link shown on a DrugMechDB entry page.

    Parameters
    ----------
    entry_id : str
        Path identifier such as ``'DB00619_MESH_D015464_1'``. It is lower-cased
        and its underscores are replaced by hyphens to form the page URL.

    Returns
    -------
    list of str
        A one-element list with the ``href`` of the first link found after the
        "Reference:" text, or an empty list when the page cannot be fetched,
        contains no such text, or has no link after it. Fetch failures are
        reported with ``print``.
    """
    url_id = entry_id.lower().replace('_', '-')
    url = f'https://sulab.github.io/DrugMechDB/{url_id}.html'
    try:
        # Without a timeout a single unresponsive request would block forever.
        resp = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Same best-effort contract as a non-200 answer: report and return nothing.
        print(f"Failed to fetch {url} — {exc}")
        return []
    if resp.status_code != 200:
        print(f"Failed to fetch {url} — status {resp.status_code}")
        return []

    soup = BeautifulSoup(resp.text, 'html.parser')

    # Locate the "Reference:" heading (`string=` is the current name of the
    # filter that older BeautifulSoup releases called `text=`)
    ref_heading = soup.find(string='Reference:')
    if not ref_heading:
        return []

    # The link of interest is the first <a> element after the heading's parent.
    parent = ref_heading.parent
    a_tag = parent.find_next('a')
    if a_tag and a_tag.has_attr('href'):
        return [a_tag['href']]

    return []


In [112]:
refs = get_reference_from_page("DB00619_MESH_D015464_1")
print(refs)

['https://go.drugbank.com/drugs/DB00619#mechanism-of-action']


In [113]:
filtered_df.shape

(1267, 17)

In [114]:
filtered_df.columns

Index(['idx', 'id', 'drug', 'Drug_MeshID', 'disease', 'protein', 'drug_name',
       'disease_name', 'protein_name', 'protein_gene_symbol', 'nodes',
       'n_nodes', 'n_edges', 'n_paths', 'metapath', 'metapath_with_edges',
       'question'],
      dtype='object')

In [115]:
def get_reference_from_page(entry_id):
    """Fetch the reference link shown on a DrugMechDB entry page (silent variant).

    This definition replaces the earlier function of the same name in this
    notebook; unlike that one it prints nothing when a page cannot be fetched.

    Parameters
    ----------
    entry_id : str
        Path identifier such as ``'DB00619_MESH_D015464_1'``. It is lower-cased
        and its underscores are replaced by hyphens to form the page URL.

    Returns
    -------
    list of str
        A one-element list with the ``href`` of the first link found after the
        "Reference:" text, or an empty list when the page cannot be fetched,
        contains no such text, or has no link after it.
    """
    url_id = entry_id.lower().replace('_', '-')
    url = f'https://sulab.github.io/DrugMechDB/{url_id}.html'
    try:
        # Without a timeout a single unresponsive request would block forever.
        resp = requests.get(url, timeout=30)
    except requests.RequestException:
        # Treated like a non-200 answer; the row ends up with an empty list,
        # which the empty-reference checks later in the notebook will surface.
        return []
    if resp.status_code != 200:
        return []
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Find the “Reference:” heading (`string=` is the current name of the
    # filter that older BeautifulSoup releases called `text=`)
    ref_heading = soup.find(string='Reference:')
    if not ref_heading:
        return []
    a_tag = ref_heading.parent.find_next('a')
    return [a_tag['href']] if a_tag and a_tag.has_attr('href') else []


def refs_for_idx(idx):
    """Return the reference links for the path stored at position ``idx`` of ``ind``.

    The path's ``graph['_id']`` value is handed to ``get_reference_from_page``
    and that function's result is returned unchanged.
    """
    return get_reference_from_page(ind[idx]['graph']['_id'])

In [116]:
# Look up the reference link of every row. refs_for_idx requests one web page
# per row, so this cell needs network access and its run time grows with the row count.
filtered_df['references'] = filtered_df['idx'].apply(refs_for_idx)

In [117]:
references = filtered_df['references']

# missing values (NaN / None)
mask_na = references.isna()
# values that are an empty Python list
mask_empty_list = references.apply(
    lambda value: isinstance(value, list) and not value
)
# values that are blank strings ("" or only whitespace)
mask_empty_str = references.apply(
    lambda value: isinstance(value, str) and value.strip() == ""
)

# A row is flagged when any of the three conditions holds
mask_empty = mask_na | mask_empty_list | mask_empty_str

# Show the flagged rows
empty_refs_df = filtered_df.loc[mask_empty]
print(f"Found {len(empty_refs_df)} rows with empty/missing references:")
print(empty_refs_df)

# Index labels of the flagged rows
empty_indices = empty_refs_df.index.tolist()
print("Row indices with no references:", empty_indices)

Found 0 rows with empty/missing references:
Empty DataFrame
Columns: [idx, id, drug, Drug_MeshID, disease, protein, drug_name, disease_name, protein_name, protein_gene_symbol, nodes, n_nodes, n_edges, n_paths, metapath, metapath_with_edges, question, references]
Index: []
Row indices with no references: []


In [118]:
def find_empty_values(df: pd.DataFrame,
                      column: str):
    """
    Select the rows of ``df`` whose value in ``column`` counts as empty.

    A value counts as empty when it is:
      - missing (NaN / None),
      - an empty list (``[]``),
      - a string that is empty or contains only whitespace.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect.
    column : str
        Name of the column to test.

    Returns
    -------
    pd.DataFrame
        The matching rows of ``df``, with their original index labels.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    if column not in df.columns:
        raise KeyError(f"Column {column!r} not found in DataFrame.")

    values = df[column]

    def _is_blank(value):
        # Empty list or blank string; any other type is never "blank" here.
        if isinstance(value, list):
            return len(value) == 0
        if isinstance(value, str):
            return not value.strip()
        return False

    return df[values.isna() | values.apply(_is_blank)]

In [119]:
# Repeat the empty-reference check on filtered_df through the reusable helper.
empty_refs = find_empty_values(filtered_df, 'references')
print(f"{len(empty_refs)} rows with empty/missing references:")
print(empty_refs)

0 rows with empty/missing references:
Empty DataFrame
Columns: [idx, id, drug, Drug_MeshID, disease, protein, drug_name, disease_name, protein_name, protein_gene_symbol, nodes, n_nodes, n_edges, n_paths, metapath, metapath_with_edges, question, references]
Index: []


In [120]:
# Same reference lookup for go_bp_filtered (one web request per row via refs_for_idx).
go_bp_filtered['references'] = go_bp_filtered['idx'].apply(refs_for_idx)

In [121]:
# Same reference lookup for result_df (one web request per row via refs_for_idx).
result_df['references'] = result_df['idx'].apply(refs_for_idx)

In [122]:
go_bp_filtered.head(1)

Unnamed: 0,idx,id,drug,Drug_MeshID,disease,bp,drug_name,disease_name,bp_name,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,question,references
1,1,DB00619_MESH_D034721_1,DRUGBANK:DB00619,MESH:D000068877,MESH:D034721,GO:0008283,imatinib,Systemic mast cell disease,cell population proliferation,"(MESH:D000068877, UniProt:P10721, UniProt:P16234, GO:0008283, MESH:D034721)",5,5,1,[Drug - Protein - BiologicalProcess - Disease],[Drug - decreases activity of - Protein - positively regulates - BiologicalProcess - causes - Disease],Which Drug can be used in the treatment of Systemic mast cell disease by targeting biological process: cell population proliferation?,[https://go.drugbank.com/drugs/DB00619#mechanism-of-action]


In [123]:
# Empty-reference check for go_bp_filtered.
empty_refs = find_empty_values(go_bp_filtered, 'references')
print(f"{len(empty_refs)} rows with empty/missing references:")
print(empty_refs)

# Entry page of the row reported by this check (id MESH_C106301_MESH_D003233_1):
#https://sulab.github.io/DrugMechDB/mesh-c106301-mesh-d003233-1.html

1 rows with empty/missing references:
    idx                           id  drug   Drug_MeshID       disease  \
50   50  MESH_C106301_MESH_D003233_1  None  MESH:C106301  MESH:D003233   

            bp   drug_name             disease_name  \
50  GO:0002553  tazanolast  Allergic conjunctivitis   

                             bp_name  \
50  Histamine secretion by mast cell   

                                       nodes  n_nodes  n_edges  n_paths  \
50  (MESH:C106301, GO:0002553, MESH:D003233)        3        2        1   

                                metapath  \
50  [Drug - BiologicalProcess - Disease]   

                                                              metapath_with_edges  \
50  [Drug - negatively regulates - BiologicalProcess - correlated with - Disease]   

                                                                                                                                 question  \
50  Which Drug can be used in the treatment of Allergic conjunctiviti

In [124]:
# Empty-reference check for result_df.
empty_refs = find_empty_values(result_df, 'references')
print(f"{len(empty_refs)} rows with empty/missing references:")
print(empty_refs)

0 rows with empty/missing references:
Empty DataFrame
Columns: [idx, id, drug, disease, nodes, n_nodes, n_edges, n_paths, metapath, metapath_with_edges, metabolite, Drug_MeshID, node_names, drug_name, disease_name, metabolite_name, edges, metabolite_name_str, question, references]
Index: []
