# Mechanistic genes data


In [11]:
import pandas as pd
import numpy as np

In [12]:
drugmechdb_filtered_df = pd.read_csv("data/benchmark_data/drugmechDB_mechanistic_genes_df.csv")
print(drugmechdb_filtered_df.shape)
drugmechdb_filtered_df.head(1)

(1523, 17)


Unnamed: 0,idx,id,drug,Drug_MeshID,disease,protein,drug_name,disease_name,protein_name,protein_gene_symbol,nodes,n_nodes,n_edges,n_paths,metapath,metapath_with_edges,question
0,0,DB00619_MESH_D015464_1,DB:DB00619,MESH:D000068877,MESH:D015464,UniProt:P00519,imatinib,Chronic myeloid leukemia,Tyrosine-protein kinase ABL1,ABL1,"['MESH:D000068877', 'UniProt:P00519', 'MESH:D015464']",3,2,1,['Drug - Protein - Disease'],['Drug - decreases activity of - Protein - causes - Disease'],What gene is most mechanistically related to the indication of Drug imatinib to Disease Chronic myeloid leukemia?


In [12]:
# find how many rows have protein_gene_symbol as NaN 
print(drugmechdb_filtered_df["protein_gene_symbol"].isnull().sum())
# remove rows where protein_gene_symbol is NaN
drugmechdb_filtered_df = drugmechdb_filtered_df.dropna(subset=["protein_gene_symbol"])
drugmechdb_filtered_df.shape

0


(1267, 17)

In [None]:
drugmechdb_filtered_df.columns

Index(['idx', 'id', 'drug', 'Drug_MeshID', 'disease', 'protein', 'drug_name',
       'disease_name', 'protein_name', 'protein_gene_symbol', 'nodes',
       'n_nodes', 'n_edges', 'n_paths', 'metapath', 'metapath_with_edges',
       'question'],
      dtype='object')

In [13]:
drugmechdb_filtered_df["n_nodes"].value_counts().sort_index()

n_nodes
3      12
4     130
5     389
6     416
7     245
8      60
9      12
11      2
13      1
Name: count, dtype: int64

In [14]:
# for BTE ID matching and query
drugmechdb_filtered_df["Drug_MeshID"] = drugmechdb_filtered_df["Drug_MeshID"].str.replace(r"^DB:", "DRUGBANK:", regex=True)

# Filter out rows with missing protein_gene_symbol values
filtered_df = drugmechdb_filtered_df.dropna(subset=["protein_gene_symbol"])
print("shape of filtered_df:",filtered_df.shape)

# Group by disease and Drug_MeshID and aggregate the required columns
grouped_with_genes = (
    filtered_df.groupby(["disease", "Drug_MeshID"])
    .agg({
        "protein_gene_symbol": lambda x: list(set(x)),  # Concatenate unique gene symbols
        "id": lambda x: list(set(x)),  # Concatenate IDs into a list
        "protein": lambda x: list(set(x)),  # Concatenate proteins into a list
        "protein_name": lambda x: list(set(x)),  # Concatenate protein names into a list
        "drug": "first",  # Take the first value for drug (assuming it's the same per group)
        "drug_name": "first",  # Take the first value for drug_name
        "disease_name": "first",  # Take the first value for disease_name
        "question": "first",  # Take the first value for the question
    })
    .reset_index()
)
print("shape of grouped_with_genes:",grouped_with_genes.shape)

# Add a count column 
grouped_with_genes["count"] = grouped_with_genes["protein_gene_symbol"].apply(len)

# Select only the specified columns for the final DataFrame
final_df = grouped_with_genes[
    [
        "id",
        "drug",
        "Drug_MeshID",
        "disease",
        "protein",
        "drug_name",
        "disease_name",
        "protein_name",
        "protein_gene_symbol",
        "question",
        "count",  
        
    ]
]


shape of filtered_df: (1267, 17)
shape of grouped_with_genes: (1207, 10)


In [15]:
final_df["count"].value_counts()

count
1    1183
2      14
3       7
6       1
4       1
5       1
Name: count, dtype: int64

In [16]:
final_df[final_df["count"]==5].head(5)

Unnamed: 0,id,drug,Drug_MeshID,disease,protein,drug_name,disease_name,protein_name,protein_gene_symbol,question,count
749,"[DB00508_MESH_D011618_1, DB00508_MESH_D011618_...",DB:DB00508,MESH:D014273,MESH:D011618,"[UniProt:P14416, UniProt:P21728, UniProt:P1122...",Triflupromazine,Psychotic Disorders,"[Muscarinic acetylcholine receptor M2, D(2) do...","[CHRM1, HTR2B, DRD2, DRD1, CHRM2]",What gene is most mechanistically related to t...,5


In [17]:
# Question generation 
final_df = final_df.copy()
final_df.loc[:, 'question_V3'] = final_df.apply(
    lambda row: f"Which gene plays the most significant mechanistic role in how Drug {row['drug_name']} treats or impacts the Disease {row['disease_name']}?",
    axis=1
)

In [18]:
final_df.head(1)

Unnamed: 0,id,drug,Drug_MeshID,disease,protein,drug_name,disease_name,protein_name,protein_gene_symbol,question,count,question_V3
0,[DB01219_MESH_C535694_1],DB:DB01219,MESH:D003620,MESH:C535694,[UniProt:P21817],Dantrolene,Malignant hyperthermia,[Ryanodine receptor 1],[RYR1],What gene is most mechanistically related to t...,1,Which gene plays the most significant mechanis...


In [19]:
final_df[["drug_name", "disease_name"]].drop_duplicates().shape

(1206, 2)

In [20]:
final_df = final_df.drop_duplicates(subset=["drug_name", "disease_name"])
print("shape of final_df after dropping duplicates:", final_df.shape)

shape of final_df after dropping duplicates: (1206, 12)


In [21]:
final_df[["count"]].value_counts().sort_index()

count
1        1182
2          14
3           7
4           1
5           1
6           1
Name: count, dtype: int64

In [22]:
#select only the rows where count is 1
final_df_count_1 = final_df[final_df["count"] == 1]
final_df_count_1 = final_df_count_1.reset_index(drop=True)
print("shape of final_df_count_1:", final_df_count_1.shape)

shape of final_df_count_1: (1182, 12)


In [23]:
# find for each drug how many diseases it is associated with
drug_disease_counts = final_df_count_1.groupby("drug_name")["disease_name"].nunique().reset_index()
drug_disease_counts["disease_name"].value_counts()

disease_name
1     338
2      94
3      33
4      27
6      23
5      13
8       9
7       9
9       2
11      2
59      1
12      1
Name: count, dtype: int64

In [24]:
338+94+94+33+33+33+27+27+27+27+13+13+13+13+13

798

In [25]:
# from final_df select only the rows where count is 1 and drug_name is in the drug_disease_counts with disease_name <6
final_df_count_1_filtered = final_df_count_1[
    final_df_count_1["drug_name"].isin(drug_disease_counts[drug_disease_counts["disease_name"] < 6]["drug_name"])
]
print("shape of final_df_count_1_filtered:", final_df_count_1_filtered.shape)

shape of final_df_count_1_filtered: (798, 12)


In [26]:
# save the final DataFrame to a CSV file
final_df_count_1_filtered.to_csv(
    "data/DMDB_questions_sampled/drugmechDB_mechanistic_genes_df_final_single_count_798qa.csv", index=False
)

In [3776]:
final_df_count_1_filtered.head(1)

Unnamed: 0,id,drug,Drug_MeshID,disease,protein,drug_name,disease_name,protein_name,protein_gene_symbol,question,count,question_V3
0,[DB01219_MESH_C535694_1],DB:DB01219,MESH:D003620,MESH:C535694,[UniProt:P21817],Dantrolene,Malignant hyperthermia,[Ryanodine receptor 1],[RYR1],What gene is most mechanistically related to the indication of Drug Dantrolene to Disease Malignant hyperthermia?,1,Which gene plays the most significant mechanistic role in how Drug Dantrolene treats or impacts the Disease Malignant hyperthermia?


In [3780]:
#set seed and select random 500 from final_df_count_1_filtered
import pandas as pd
import numpy as np
np.random.seed(42)
random_sample = final_df_count_1_filtered.sample(n=500, random_state=42).reset_index(drop=True)
print("shape of random_sample:", random_sample.shape)
# Save the final DataFrame to a CSV file
random_sample.to_csv("data/DMDB_questions_sampled/drugmechDB_mechanistic_genes_df_final_single_count_1.csv", index=False)


shape of random_sample: (500, 12)


## BTE querying format

In [2937]:
df_drug = drugmechdb_filtered_df[["Drug_MeshID", "drug_name"]]

df_drug[df_drug.duplicated(keep=False)].value_counts()

Drug_MeshID       drug_name                           
MESH:D003907      dexamethasone                           59
MESH:D010632      pheniramine                             12
MESH:D000068656   mometasone furoate                      11
MESH:D000068298   fluticasone propionate                  11
MESH:D011395      promazine                                9
MESH:C531958      lenvatinib                               9
MESH:D004809      Ephedrine                                9
MESH:D004319      doxylamine                               9
MESH:D003501      cyclizine                                8
MESH:D003915      dextromethorphan                         8
MESH:C004649      carbinoxamine                            8
MESH:D017374      Paroxetine                               8
MESH:C006656      azatadine                                8
MESH:D006206      halcinonide                              8
MESH:D002927      Cimetidine                               8
MESH:D000069470   venlafaxine 

In [2943]:
drugmechdb_drug = drugmechdb_filtered_df[["Drug_MeshID", "drug_name"]].drop_duplicates()
drugmechdb_disease = drugmechdb_filtered_df[["disease", "disease_name"]].drop_duplicates()
print("Number of unique drugs",drugmechdb_drug.shape[0])
print("Number of unique diseases",drugmechdb_disease.shape[0])
print("Number of unique drug-disease pairs",drugmechdb_filtered_df.shape[0])
print("Number of unique drugs and diseases", drugmechdb_filtered_df[["Drug_MeshID", "disease"]].drop_duplicates().shape[0])

Number of unique drugs 571
Number of unique diseases 366
Number of unique drug-disease pairs 1267
Number of unique drugs and diseases 1207


In [None]:
571+366

937

In [2949]:
# BTE query results for drug and disease IDs
bte_parquet = pd.read_parquet("data/analysis_results/bte_results.parquet")
bte_parquet.shape

(937, 4)

In [2962]:
bte_parquet.columns

Index(['entity_id', 'type', 'context', 'context_decoded'], dtype='object')