In [19]:
import pandas as pd
import numpy as np

# Load DGIdb files with only necessary columns
interactions_columns = ['gene_name', 'drug_concept_id', 'interaction_group_score']
genes_columns = ['gene_name', 'gene_claim_name']
drugs_columns = ['drug_claim_name', 'concept_id']

# DGIdb files
drugs_df = pd.read_csv('../data/indication_gene_link/dgidb/drugs.tsv', sep='\t', usecols=drugs_columns)
genes_df = pd.read_csv('../data/indication_gene_link/dgidb/genes.tsv', sep='\t', usecols=genes_columns)
interactions_df = pd.read_csv('../data/indication_gene_link/dgidb/interactions.tsv', sep='\t', usecols=interactions_columns)

In [20]:
# Handle missing values before merging
drugs_df.dropna(subset=['concept_id'], inplace=True)
genes_df.dropna(subset=['gene_name'], inplace=True)
interactions_df.dropna(subset=['drug_concept_id'], inplace=True)

In [21]:
# Merge DGIdb data
dgIdb_merged = pd.merge(interactions_df, genes_df, on='gene_name', how='inner')
dgIdb_merged = pd.merge(dgIdb_merged, drugs_df, left_on='drug_concept_id', right_on='concept_id', how='inner')
dgIdb_merged['drug_concept_id'] = dgIdb_merged['drug_concept_id'].str.replace('chembl:', '', regex=False)

dgIdb_merged.tail(30)

Unnamed: 0,gene_name,drug_concept_id,interaction_group_score,gene_claim_name,drug_claim_name,concept_id
4230272,GCH1,CHEMBL219568,127.3,GTP cyclohydrolase-I,chembl:CHEMBL219568,chembl:CHEMBL219568
4230273,GCH1,CHEMBL219568,127.3,GTP cyclohydrolase-I,Guanine,chembl:CHEMBL219568
4230274,TNFSF12,CHEMBL2109600,127.3,ENSG00000239697,chembl:CHEMBL2109600,chembl:CHEMBL2109600
4230275,TNFSF12,CHEMBL2109600,127.3,ENSG00000239697,RO-5458640,chembl:CHEMBL2109600
4230276,TNFSF12,CHEMBL2109600,127.3,ENSG00000239697,chembl:CHEMBL2109600,chembl:CHEMBL2109600
4230277,TNFSF12,CHEMBL2109600,127.3,ENSG00000239697,RO-5458640,chembl:CHEMBL2109600
4230278,TNFSF12,CHEMBL2109600,127.3,TNF-related weak inducer of apoptosis,chembl:CHEMBL2109600,chembl:CHEMBL2109600
4230279,TNFSF12,CHEMBL2109600,127.3,TNF-related weak inducer of apoptosis,RO-5458640,chembl:CHEMBL2109600
4230280,LIFR,CHEMBL2108400,127.3,3977,Emfilermin,chembl:CHEMBL2108400
4230281,LIFR,CHEMBL2108400,127.3,3977,chembl:CHEMBL2108400,chembl:CHEMBL2108400


In [22]:
# Ruiz data
ruiz_df = pd.read_csv('../data/indication_gene_link/ruiz_filtered_data.csv')

from chembl_webresource_client.new_client import new_client

# Create a function to fetch ChEMBL ID using DrugBank ID
def get_chembl_id_from_drugbank(drugbank_id):
    try:
        # Use the new_client to search for the drug
        res = new_client.molecule.search(drugbank_id)
        # If there are results and ChEMBL ID is present, return it
        if res:
            chembl_id = res[0]['molecule_chembl_id']
            return chembl_id if chembl_id else None
        return None
    except Exception as e:
        print(f"Error fetching ChEMBL ID for DrugBank ID {drugbank_id}: {e}")
        return None


ruiz_df['chembl_id'] = ruiz_df['drug'].apply(get_chembl_id_from_drugbank)
ruiz_df.head(100)

Unnamed: 0,drug,drug_name,indication,indication_name,Disease,chembl_id
0,DB00130,glutamine-(l),C0003467,anxiety,Anxiety Disorder,CHEMBL930
1,DB00163,Vitamin E,C0002395,Alzheimer's disease,Alzheimer’s Disease,CHEMBL47
2,DB00176,fluvoxamine,C0028768,obsessive compulsive disorder (OCD),OCD & Tourette Syndrome,CHEMBL814
3,DB00182,Amphetamine,C1263846,attention-deficit/hyperactivity disorder (ADHD),Attention Deficit Hyperactivity Disorder,CHEMBL405
4,DB00186,lorazepam,C0003467,anxiety,Anxiety Disorder,CHEMBL580
...,...,...,...,...,...,...
95,DB01151,desipramine,C0011581,Depressive disorder,Major Depressive Disorder,CHEMBL72
96,DB01156,bupropion,C0011581,Depressive disorder,Major Depressive Disorder,CHEMBL894
97,DB01156,bupropion,C1269683,Major Depressive Disorder,Major Depressive Disorder,CHEMBL894
98,DB01171,moclobemide,C0011581,Depressive disorder,Major Depressive Disorder,CHEMBL86304


In [23]:
# merge with the DGIdb data using the 'chembl_id' column
merged_df = pd.merge(ruiz_df, dgIdb_merged, left_on='chembl_id', right_on='drug_concept_id', how='inner')

# Select and rename required columns
final_df = merged_df[['indication', 'indication_name', 'Disease', 'chembl_id', 'drug_name', 'gene_name', 'gene_claim_name']]
final_df.rename(columns={'chembl_id': 'drugbank_id', 'drug_name': 'db_name'}, inplace=True)

# Rename the 'drug' column to 'drugbank_id'
final_df.rename(columns={'drug': 'drugbank_id'}, inplace=True)

# Remove duplicate rows
final_df.drop_duplicates(inplace=True)

# Display the head of the final DataFrame to check the changes
final_df.head(100)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={'chembl_id': 'drugbank_id', 'drug_name': 'db_name'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={'drug': 'drugbank_id'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop_duplicates(inplace=True)


Unnamed: 0,indication,indication_name,Disease,drugbank_id,db_name,gene_name,gene_claim_name
0,C0003467,anxiety,Anxiety Disorder,CHEMBL930,glutamine-(l),GLS,GLS
5,C0003467,anxiety,Anxiety Disorder,CHEMBL930,glutamine-(l),GLS,ENSG00000115419
15,C0003467,anxiety,Anxiety Disorder,CHEMBL930,glutamine-(l),GLS,Glutaminase
20,C0003467,anxiety,Anxiety Disorder,CHEMBL930,glutamine-(l),GLS,2744
25,C0003467,anxiety,Anxiety Disorder,CHEMBL930,glutamine-(l),DDIT3,DDIT3
...,...,...,...,...,...,...,...
883,C0028768,obsessive compulsive disorder (OCD),OCD & Tourette Syndrome,CHEMBL814,fluvoxamine,HTR2A,P28223
891,C0028768,obsessive compulsive disorder (OCD),OCD & Tourette Syndrome,CHEMBL814,fluvoxamine,HTR2A,HTR2A
899,C0028768,obsessive compulsive disorder (OCD),OCD & Tourette Syndrome,CHEMBL814,fluvoxamine,HTR2A,5-HT 2A receptor
915,C0028768,obsessive compulsive disorder (OCD),OCD & Tourette Syndrome,CHEMBL814,fluvoxamine,HTR2A,3356


In [24]:
# Save to a single file
final_df.to_csv('../data/indication_gene_link/ruiz_dgidb.csv', index=False)