In [8]:
import pandas as pd

chembl_to_drugbank_mapping = pd.read_csv('../data/Chembl_to_drugbank_dgidb_202202.txt', sep='\t', header=None)
chembl_to_drugbank_mapping.columns = ['ChEMBL_ID', 'DrugBank_ID']
chembl_to_drugbank_mapping.head()

Unnamed: 0,ChEMBL_ID,DrugBank_ID
0,CHEMBL296468,DB05969
1,CHEMBL360861,
2,CHEMBL942,DB09020
3,CHEMBL75967,
4,CHEMBL635,DB00635


In [9]:
drugbank_df = pd.read_csv('../data/indication_gene_link/filtered_drugbank.csv')
drugbank_df.head()

Unnamed: 0,DrugBank ID,Generic Name,Indication,Disease
0,DB00002,Cetuximab,Cetuximab indicated for the treatment of local...,Eating Disorders
1,DB00068,Interferon beta-1b,Interferon beta-1b is a drug used for the trea...,Substance Use Disorders
2,DB00109,Enfuvirtide,Enfuvirtide is an antiretroviral drug used in ...,Substance Use Disorders
3,DB00114,Pyridoxal phosphate,For nutritional supplementation and for treati...,Eating Disorders
4,DB00116,Tetrahydrofolic acid,"For nutritional supplementation, also for trea...",Eating Disorders


In [10]:
# Load DGIdb files with only necessary columns
interactions_columns = ['gene_name', 'drug_concept_id', 'interaction_group_score']
genes_columns = ['gene_name', 'gene_claim_name']
drugs_columns = ['drug_claim_name', 'concept_id']

# DGIdb files
drugs_df = pd.read_csv('../data/indication_gene_link/dgidb/drugs.tsv', sep='\t', usecols=drugs_columns)
genes_df = pd.read_csv('../data/indication_gene_link/dgidb/genes.tsv', sep='\t', usecols=genes_columns)
interactions_df = pd.read_csv('../data/indication_gene_link/dgidb/interactions.tsv', sep='\t', usecols=interactions_columns)

# Handle missing values before merging
drugs_df.dropna(subset=['concept_id'], inplace=True)
genes_df.dropna(subset=['gene_name'], inplace=True)
interactions_df.dropna(subset=['drug_concept_id'], inplace=True)

# Merge DGIdb data
dgIdb_merged = pd.merge(interactions_df, genes_df, on='gene_name', how='inner')
dgIdb_merged = pd.merge(dgIdb_merged, drugs_df, left_on='drug_concept_id', right_on='concept_id', how='inner')
dgIdb_merged['drug_concept_id'] = dgIdb_merged['drug_concept_id'].str.replace('chembl:', '', regex=False)

dgIdb_merged.tail(30)

Unnamed: 0,gene_name,drug_concept_id,interaction_group_score,gene_claim_name,drug_claim_name,concept_id
4230272,GCH1,CHEMBL219568,127.3,GTP cyclohydrolase-I,chembl:CHEMBL219568,chembl:CHEMBL219568
4230273,GCH1,CHEMBL219568,127.3,GTP cyclohydrolase-I,Guanine,chembl:CHEMBL219568
4230274,TNFSF12,CHEMBL2109600,127.3,ENSG00000239697,chembl:CHEMBL2109600,chembl:CHEMBL2109600
4230275,TNFSF12,CHEMBL2109600,127.3,ENSG00000239697,RO-5458640,chembl:CHEMBL2109600
4230276,TNFSF12,CHEMBL2109600,127.3,ENSG00000239697,chembl:CHEMBL2109600,chembl:CHEMBL2109600
4230277,TNFSF12,CHEMBL2109600,127.3,ENSG00000239697,RO-5458640,chembl:CHEMBL2109600
4230278,TNFSF12,CHEMBL2109600,127.3,TNF-related weak inducer of apoptosis,chembl:CHEMBL2109600,chembl:CHEMBL2109600
4230279,TNFSF12,CHEMBL2109600,127.3,TNF-related weak inducer of apoptosis,RO-5458640,chembl:CHEMBL2109600
4230280,LIFR,CHEMBL2108400,127.3,3977,Emfilermin,chembl:CHEMBL2108400
4230281,LIFR,CHEMBL2108400,127.3,3977,chembl:CHEMBL2108400,chembl:CHEMBL2108400


In [11]:
merged_df = pd.merge(dgIdb_merged, chembl_to_drugbank_mapping, left_on='drug_concept_id', right_on='ChEMBL_ID', how='inner')
merged_df.head()

Unnamed: 0,gene_name,drug_concept_id,interaction_group_score,gene_claim_name,drug_claim_name,concept_id,ChEMBL_ID,DrugBank_ID
0,CDK7,CHEMBL296468,0.82,ENSG00000134058,SNS-032,chembl:CHEMBL296468,CHEMBL296468,DB05969
1,CDK7,CHEMBL296468,0.82,ENSG00000134058,SNS-032,chembl:CHEMBL296468,CHEMBL296468,DB05969
2,CDK7,CHEMBL296468,0.82,ENSG00000134058,SNS-032,chembl:CHEMBL296468,CHEMBL296468,DB05969
3,CDK7,CHEMBL296468,0.82,ENSG00000134058,chembl:CHEMBL296468,chembl:CHEMBL296468,CHEMBL296468,DB05969
4,CDK7,CHEMBL296468,0.82,ENSG00000134058,178102298,chembl:CHEMBL296468,CHEMBL296468,DB05969


In [12]:
final_merged_df = pd.merge(merged_df, drugbank_df, left_on='DrugBank_ID', right_on='DrugBank ID', how='inner')
final_merged_df.head()

Unnamed: 0,gene_name,drug_concept_id,interaction_group_score,gene_claim_name,drug_claim_name,concept_id,ChEMBL_ID,DrugBank_ID,DrugBank ID,Generic Name,Indication,Disease
0,VDR,CHEMBL1113,0.01,ENSG00000111424,CHEMBL1113,chembl:CHEMBL1113,CHEMBL1113,DB00543,DB00543,Amoxapine,For the relief of symptoms of depression in pa...,Anxiety Disorder
1,VDR,CHEMBL1113,0.01,ENSG00000111424,AMOXAPINE,chembl:CHEMBL1113,CHEMBL1113,DB00543,DB00543,Amoxapine,For the relief of symptoms of depression in pa...,Anxiety Disorder
2,VDR,CHEMBL1113,0.01,ENSG00000111424,AMOXAPINE,chembl:CHEMBL1113,CHEMBL1113,DB00543,DB00543,Amoxapine,For the relief of symptoms of depression in pa...,Anxiety Disorder
3,VDR,CHEMBL1113,0.01,ENSG00000111424,Amoxapine,chembl:CHEMBL1113,CHEMBL1113,DB00543,DB00543,Amoxapine,For the relief of symptoms of depression in pa...,Anxiety Disorder
4,VDR,CHEMBL1113,0.01,ENSG00000111424,AMOXAPINE,chembl:CHEMBL1113,CHEMBL1113,DB00543,DB00543,Amoxapine,For the relief of symptoms of depression in pa...,Anxiety Disorder


In [13]:
columns_to_keep = ['gene_name', 'ChEMBL_ID', 'DrugBank_ID', 'Disease', 'Indication']
processed_df = final_merged_df[columns_to_keep]
unique_df = processed_df.drop_duplicates()

unique_df.head()               

Unnamed: 0,gene_name,ChEMBL_ID,DrugBank_ID,Disease,Indication
0,VDR,CHEMBL1113,DB00543,Anxiety Disorder,For the relief of symptoms of depression in pa...
144,DRD4,CHEMBL1113,DB00543,Anxiety Disorder,For the relief of symptoms of depression in pa...
234,SLC6A4,CHEMBL1113,DB00543,Anxiety Disorder,For the relief of symptoms of depression in pa...
531,CYP2D6,CHEMBL1113,DB00543,Anxiety Disorder,For the relief of symptoms of depression in pa...
882,DRD2,CHEMBL1113,DB00543,Anxiety Disorder,For the relief of symptoms of depression in pa...


In [14]:
unique_df.to_csv('../data/indication_gene_link/drugbank_dgidb.csv', index=False)