In [1]:
import ast
import concurrent.futures
import time

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
df = pd.read_csv("drugbank.csv", index_col=0)
df.head()

Unnamed: 0,Drug Name,DrugBank ID,PubChem CID,PubChem SID,SMILES,Targets Name,Targets
0,Lepirudin,DB00001,,46507011.0,,['Prothrombin'],['P00734']
1,Cetuximab,DB00002,,46507042.0,,"['Epidermal growth factor receptor', 'Low affi...","['P00533', 'O75015', 'P02745', 'P02746', 'P027..."
2,Denileukin diftitox,DB00004,,46506950.0,,"['Interleukin-2 receptor subunit alpha', 'Inte...","['P01589', 'P14784', 'P31785']"
3,Etanercept,DB00005,,46506732.0,,"['Tumor necrosis factor', 'Lymphotoxin-alpha',...","['P01375', 'P01374', 'P12314', 'P12318', 'P319..."
4,Bivalirudin,DB00006,16129704.0,46507415.0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,['Prothrombin'],['P00734']


In [3]:
df = (
    df.dropna(subset=["PubChem CID", "PubChem SID", "SMILES"], how="all")
    .drop(["Targets Name"], axis=1)
    .reset_index(drop=True)
)
df["Targets"] = df["Targets"].apply(ast.literal_eval)
df = df.explode("Targets")
df.shape

(18576, 6)

In [4]:
df.dropna(subset="PubChem CID")

Unnamed: 0,Drug Name,DrugBank ID,PubChem CID,PubChem SID,SMILES,Targets
4,Bivalirudin,DB00006,16129704.0,46507415.0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,P00734
12,Goserelin,DB00014,5311128.0,46507336.0,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,P22888
12,Goserelin,DB00014,5311128.0,46507336.0,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,P30968
43,Cetrorelix,DB00050,25074887.0,46505494.0,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,P30968
43,Cetrorelix,DB00050,25074887.0,46505494.0,CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,P22888
...,...,...,...,...,...,...
7441,Dexamethasone acetate,DB14649,236702.0,,[H][C@@]12C[C@@H](C)[C@](O)(C(=O)COC(C)=O)[C@@...,P04083
7441,Dexamethasone acetate,DB14649,236702.0,,[H][C@@]12C[C@@H](C)[C@](O)(C(=O)COC(C)=O)[C@@...,P35228
7441,Dexamethasone acetate,DB14649,236702.0,,[H][C@@]12C[C@@H](C)[C@](O)(C(=O)COC(C)=O)[C@@...,P51843
7441,Dexamethasone acetate,DB14649,236702.0,,[H][C@@]12C[C@@H](C)[C@](O)(C(=O)COC(C)=O)[C@@...,O75469


In [5]:
df.dropna(subset="PubChem SID")

Unnamed: 0,Drug Name,DrugBank ID,PubChem CID,PubChem SID,SMILES,Targets
0,Lepirudin,DB00001,,46507011.0,,P00734
1,Cetuximab,DB00002,,46507042.0,,P00533
1,Cetuximab,DB00002,,46507042.0,,O75015
1,Cetuximab,DB00002,,46507042.0,,P02745
1,Cetuximab,DB00002,,46507042.0,,P02746
...,...,...,...,...,...,...
7330,Equine Botulinum Neurotoxin D Immune FAB2,DB13902,,347911471.0,,P19321
7331,Equine Botulinum Neurotoxin B Immune FAB2,DB13903,,347911472.0,,P10844
7333,Volixibat,DB13914,24987688.0,347829333.0,[H][C@@]1([C@@H](O)[C@](CC)(CCCC)CS(=O)(=O)C2=...,Q12908
7334,Axicabtagene ciloleucel,DB13915,,347911476.0,,P15391


In [6]:
df.dropna(subset="SMILES")

Unnamed: 0,Drug Name,DrugBank ID,PubChem CID,PubChem SID,SMILES,Targets
4,Bivalirudin,DB00006,16129704.0,46507415.0,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,P00734
5,Leuprolide,DB00007,,46507635.0,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,P30968
12,Goserelin,DB00014,5311128.0,46507336.0,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,P22888
12,Goserelin,DB00014,5311128.0,46507336.0,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,P30968
30,Desmopressin,DB00035,,,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,P30518
...,...,...,...,...,...,...
7561,Lutetium Lu-177 vipivotide tetraxetan,DB16778,,,[177Lu+3].OC(=O)CC[C@H](NC(=O)N[C@@H](CCCCNC(=...,P07288
7562,Indigo,DB16862,,,O=C1\C(NC2=C1C=CC=C2)=C1/NC2=C(C=CC=C2)C1=O,P35869
7563,Linzagolix,DB17083,,,COC1=C(COC2=C(OC)C=C(F)C(=C2)N2C(=O)NC3=CSC(C(...,P30968
7564,UCB7362,DB17096,,,C[C@H]1C[C@H](CCO1)N1C(=O)C[C@](C)(NC1=N)C1=C(...,Q8IAS0


In [7]:
tmp = pd.read_csv(
    "../../-code-drugbank_xml2csv_python/nsc_cid_smiles_mechanism_name_dbid.csv",
    index_col=0,
)
tmp = tmp.rename(columns={"CID": "PubChem CID", "Name": "Drug Name"})
tmp.head()

Unnamed: 0,NSC,PubChem CID,SMILES,MECHANISM,Drug Name,DrugBank ID
0,1,11122,CC1=CC(=O)C=CC1=O,Other,"2-methylcyclohexa-2,5-diene-1,4-dione",
1,17,219123,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N,Other,4-amino-3-pentadecylphenol,
2,89,92844,CN(C)CCC(=O)C1=CC=CC=C1.Cl,Other,3-(dimethylamino)-1-phenylpropan-1-one;hydroch...,
3,185,6197,CC1CC(C(=O)C(C1)C(CC2CC(=O)NC(=O)C2)O)C,Other,"4-[(2R)-2-[(1S,3S,5S)-3,5-dimethyl-2-oxocycloh...",
4,185,6197,CC1CC(C(=O)C(C1)C(CC2CC(=O)NC(=O)C2)O)C,Other,"4-[(2R)-2-[(1S,3S,5S)-3,5-dimethyl-2-oxocycloh...",


In [8]:
def run_job(ids):
    url = "https://rest.uniprot.org/idmapping/run"
    data = {"from": "UniProtKB_AC-ID", "to": "Gene_Name", "ids": ",".join(ids)}
    response = requests.post(url, data=data)
    response.raise_for_status()
    return response.json().get("jobId")


def get_results(job_id):
    url = f"https://rest.uniprot.org/idmapping/results/{job_id}"
    response = requests.get(url)
    response.raise_for_status()
    return response.json()


def process_batch(ids):
    job_id = run_job(ids)
    result_data = get_results(job_id)
    return pd.DataFrame(result_data["results"])


def parallel_id_mapping(tar, num_workers=4):
    res = pd.DataFrame()
    batches = [tar[i : i + 10] for i in range(0, len(tar), 10)]

    with tqdm(total=len(batches), desc="Processing batches") as pbar:
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
            futures = [executor.submit(process_batch, batch) for batch in batches]

            for future in concurrent.futures.as_completed(futures):
                res = pd.concat([res, future.result()])
                pbar.update(1)

    return res


res = parallel_id_mapping(df["Targets"].unique())

Processing batches: 100%|█████████████████████████████████████████████████████████████| 472/472 [01:37<00:00,  4.85it/s]


In [9]:
res.columns = ["Targets", "Gene"]
res

Unnamed: 0,Targets,Gene
0,P14784,IL2RB
1,P31785,IL2RG
2,P01375,TNF
3,P01374,LTA
4,P31994,FCGR2B
...,...,...
5,Q504U8,EGFR
6,Q92952,KCNN1
7,Q9H2S1,KCNN2
8,Q8IAS0,PMX


In [10]:
df = df.merge(res).drop("Targets", axis=1).dropna(subset="Gene")
df

Unnamed: 0,Drug Name,DrugBank ID,PubChem CID,PubChem SID,SMILES,Gene
0,Lepirudin,DB00001,,46507011.0,,F2
1,Cetuximab,DB00002,,46507042.0,,EGFR
2,Cetuximab,DB00002,,46507042.0,,FCGR3B
3,Cetuximab,DB00002,,46507042.0,,C1QA
4,Cetuximab,DB00002,,46507042.0,,C1QB
...,...,...,...,...,...,...
18302,Lutetium Lu-177 vipivotide tetraxetan,DB16778,,,[177Lu+3].OC(=O)CC[C@H](NC(=O)N[C@@H](CCCCNC(=...,KLK3
18303,Indigo,DB16862,,,O=C1\C(NC2=C1C=CC=C2)=C1/NC2=C(C=CC=C2)C1=O,AHR
18304,Linzagolix,DB17083,,,COC1=C(COC2=C(OC)C=C(F)C(=C2)N2C(=O)NC3=CSC(C(...,GNRHR
18305,UCB7362,DB17096,,,C[C@H]1C[C@H](CCO1)N1C(=O)C[C@](C)(NC1=N)C1=C(...,PMX


In [11]:
final = (
    pd.concat(
        [
            df.merge(tmp[["DrugBank ID", "NSC"]].dropna(), how="left"),
            df.merge(tmp[["PubChem CID", "NSC"]].dropna(), how="left"),
            df.merge(tmp[["SMILES", "NSC"]].dropna(), how="left"),
            df.merge(tmp[["Drug Name", "NSC"]].dropna(), how="left"),
        ]
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
# final.to_csv('../data/full_table.csv', index=False)
final

Unnamed: 0,Drug Name,DrugBank ID,PubChem CID,PubChem SID,SMILES,Gene,NSC
0,Lepirudin,DB00001,,46507011.0,,F2,
1,Cetuximab,DB00002,,46507042.0,,EGFR,
2,Cetuximab,DB00002,,46507042.0,,FCGR3B,
3,Cetuximab,DB00002,,46507042.0,,C1QA,
4,Cetuximab,DB00002,,46507042.0,,C1QB,
...,...,...,...,...,...,...,...
19495,Menadione,DB00170,4055.0,46505447.0,CC1=CC(=O)C2=CC=CC=C2C1=O,NQO2,
19496,Menadione,DB00170,4055.0,46505447.0,CC1=CC(=O)C2=CC=CC=C2C1=O,NQO1,
19497,Menadione,DB00170,4055.0,46505447.0,CC1=CC(=O)C2=CC=CC=C2C1=O,BGLAP,
19498,Tributyrin,DB12709,6050.0,347828907.0,CCCC(=O)OCC(COC(=O)CCC)OC(=O)CCC,CASP3,
