In [26]:
import os
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
from tqdm import tqdm

In [27]:
# Path to the DrugBank XML export
xml_file_path = os.path.join("../../Downloads/", "full database.xml")

# Parse the XML file and keep a handle on its root element
tree = ET.parse(xml_file_path)
root = tree.getroot()

# DrugBank XML namespace, written in the "{uri}" form that is prefixed to
# every tag looked up below
namespace = "{http://www.drugbank.ca}"

# <resource> text of an external identifier -> record field it fills
external_id_fields = {
    "PubChem Compound": "PubChem CID",
    "PubChem Substance": "PubChem SID",
}
# <kind> text of a calculated property -> record field it fills
calculated_property_fields = {"SMILES": "SMILES", "InChIKey": "InChIKey"}


def _iter_text_pairs(parent, label_tag, value_tag):
    """Yield ``(label_text, value_text)`` for each child of ``parent``.

    ``label_tag`` and ``value_tag`` are un-prefixed tag names; the module-level
    ``namespace`` is prepended before the lookup. Children that lack either
    sub-element are skipped, and a ``parent`` of ``None`` yields nothing.
    """
    if parent is None:
        return
    for child in parent:
        label = child.find(f"{namespace}{label_tag}")
        value = child.find(f"{namespace}{value_tag}")
        if label is None or value is None:
            continue
        yield label.text, value.text


# Accumulates one record (dict) per drug
drug_data_list = []

# Extract the fields of interest from every <drug> directly under the root
for drug in tqdm(root.findall(f"{namespace}drug")):
    name_element = drug.find(f"{namespace}name")
    drug_info = {
        "Drug Name": "N/A" if name_element is None else name_element.text,
        "DrugBank ID": None,
        "PubChem CID": None,
        "PubChem SID": None,
        "SMILES": None,
        "InChIKey": None,
        "Targets Name": [],
        "Targets": [],
    }

    # DrugBank ID: the <drugbank-id> element flagged primary='true'
    primary_id = drug.find(f"{namespace}drugbank-id[@primary='true']")
    if primary_id is not None:
        drug_info["DrugBank ID"] = primary_id.text

    # PubChem CID / SID from the 'external-identifiers' section
    # (if a resource appears more than once, the last occurrence wins)
    for resource_text, identifier_text in _iter_text_pairs(
        drug.find(f"{namespace}external-identifiers"), "resource", "identifier"
    ):
        field = external_id_fields.get(resource_text)
        if field is not None:
            drug_info[field] = identifier_text

    # SMILES / InChIKey from the 'calculated-properties' section
    for kind_text, value_text in _iter_text_pairs(
        drug.find(f"{namespace}calculated-properties"), "kind", "value"
    ):
        field = calculated_property_fields.get(kind_text)
        if field is not None:
            drug_info[field] = value_text

    # Target names and polypeptide IDs from the 'targets' section.
    # NOTE: the two lists are filled independently, so they are not guaranteed
    # to line up: a target without a <polypeptide> carrying an "id" attribute
    # adds a name but no ID, and only the first <polypeptide> of each target
    # is inspected.
    targets = drug.find(f"{namespace}targets")
    for target in (targets if targets is not None else ()):
        target_name = target.find(f"{namespace}name")
        if target_name is not None:
            drug_info["Targets Name"].append(target_name.text)

        polypeptide = target.find(f"{namespace}polypeptide")
        polypeptide_id = None if polypeptide is None else polypeptide.attrib.get("id")
        if polypeptide_id is not None:
            drug_info["Targets"].append(polypeptide_id)

    drug_data_list.append(drug_info)

# Convert the records into a pandas DataFrame (one row per drug)
drug_df = pd.DataFrame(drug_data_list)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15235/15235 [00:00<00:00, 18353.06it/s]


In [28]:
# Inspect the record left in `drug_info` by the extraction loop, i.e. the one
# built for the last <drug> element that was processed.
drug_info

{'Drug Name': 'Xenon Xe-129',
 'DrugBank ID': 'DB17386',
 'PubChem CID': None,
 'PubChem SID': None,
 'SMILES': '[129Xe]',
 'InChIKey': 'FHNFHKCVQCLJFQ-YPZZEJLDSA-N',
 'Targets Name': [],
 'Targets': []}

In [29]:
# Display the assembled table: one row per drug record collected from the XML.
drug_df

Unnamed: 0,Drug Name,DrugBank ID,PubChem CID,PubChem SID,SMILES,InChIKey,Targets Name,Targets
0,Lepirudin,DB00001,,46507011,,,[Prothrombin],[P00734]
1,Cetuximab,DB00002,,46507042,,,"[Epidermal growth factor receptor, Low affinit...","[P00533, O75015, P02745, P02746, P02747, P0863..."
2,Dornase alfa,DB00003,,46507792,,,[DNA],[]
3,Denileukin diftitox,DB00004,,46506950,,,"[Interleukin-2 receptor subunit alpha, Interle...","[P01589, P14784, P31785]"
4,Etanercept,DB00005,,46506732,,,"[Tumor necrosis factor, Lymphotoxin-alpha, Hig...","[P01375, P01374, P12314, P12318, P31994, P3199..."
...,...,...,...,...,...,...,...,...
15230,AUM-601,DB17382,,,,,[],[]
15231,FN-1501,DB17383,,,CN1CCN(CC2=CC=C(NC(=O)C3=NNC=C3NC3=C4C=CNC4=NC...,VXLAKHWYGRKCGI-UHFFFAOYSA-N,[],[]
15232,Tinengotinib,DB17384,,,CC1=C2N=C(C3=CC=CC=C3Cl)C3=C(NC2=NN1)C=C(N=C3)...,DQFCVOOFMXEPOC-UHFFFAOYSA-N,[],[]
15233,Lipotecan,DB17385,,,CC[C@@]1(OC(=O)C(C)ON=C2C3=C(C4=C2C=C(C=C4[N+]...,JCCCLGDYMMTBPM-HXDHBHDHSA-N,[],[]


In [22]:
# Keep only the rows whose "Targets" entry is truthy. This assumes the entries
# are the Python lists built by the extraction loop, so an empty list (no
# target polypeptide ID) is dropped.
# A stricter variant that additionally required at least one of PubChem CID,
# PubChem SID or SMILES to be present was drafted here but is not applied.
has_target_ids = drug_df["Targets"].astype(bool)
drug_df = drug_df.loc[has_target_ids].reset_index(drop=True)
drug_df

Unnamed: 0,Drug Name,DrugBank ID,PubChem CID,PubChem SID,SMILES,Targets Name,Targets
0,Lepirudin,DB00001,,46507011,,[Prothrombin],[P00734]
1,Cetuximab,DB00002,,46507042,,"[Epidermal growth factor receptor, Low affinit...","[P00533, O75015, P02745, P02746, P02747, P0863..."
2,Denileukin diftitox,DB00004,,46506950,,"[Interleukin-2 receptor subunit alpha, Interle...","[P01589, P14784, P31785]"
3,Etanercept,DB00005,,46506732,,"[Tumor necrosis factor, Lymphotoxin-alpha, Hig...","[P01375, P01374, P12314, P12318, P31994, P3199..."
4,Bivalirudin,DB00006,16129704,46507415,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,[Prothrombin],[P00734]
...,...,...,...,...,...,...,...
7751,Betibeglogene autotemcel,DB16900,,,,[Alpha globin],[V9H1D9]
7752,KW-6356,DB17080,,,,[Adenosine receptor A2a],[P29274]
7753,Linzagolix,DB17083,,,COC1=C(COC2=C(OC)C=C(F)C(=C2)N2C(=O)NC3=CSC(C(...,[Gonadotropin-releasing hormone receptor],[P30968]
7754,JNJ-17216498,DB17087,,,,[Histamine H3 receptor],[Q9Y5N1]


In [471]:
# Discard the PubChem identifiers parsed from the XML.
# NOTE: this cell is not re-runnable; once the columns are gone a second run
# raises KeyError.
drug_df = drug_df.drop(columns=["PubChem CID", "PubChem SID"])
drug_df.head()

Unnamed: 0,Drug Name,DrugBank ID,SMILES,Targets Name,Targets
0,Lepirudin,DB00001,,['Prothrombin'],['P00734']
1,Cetuximab,DB00002,,"['Epidermal growth factor receptor', 'Low affi...","['P00533', 'O75015', 'P02745', 'P02746', 'P027..."
2,Denileukin diftitox,DB00004,,"['Interleukin-2 receptor subunit alpha', 'Inte...","['P01589', 'P14784', 'P31785']"
3,Etanercept,DB00005,,"['Tumor necrosis factor', 'Lymphotoxin-alpha',...","['P01375', 'P01374', 'P12314', 'P12318', 'P319..."
4,Bivalirudin,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,['Prothrombin'],['P00734']


In [472]:
# Columns kept from the "drug links" CSV, in the order they should appear.
link_columns = [
    "DrugBank ID",
    "CAS Number",
    "KEGG Compound ID",
    "KEGG Drug ID",
    "PubChem Compound ID",
    "PubChem Substance ID",
    "ChEBI ID",
    "UniProt ID",
    "BindingDB ID",
]

# Load the whole CSV, then narrow it to the columns above.
db = pd.read_csv("/Users/yoshitakainoue/Downloads/drug links.csv")[link_columns]
db.head()

Unnamed: 0,DrugBank ID,CAS Number,KEGG Compound ID,KEGG Drug ID,PubChem Compound ID,PubChem Substance ID,ChEBI ID,UniProt ID,BindingDB ID
0,DB00001,138068-37-8,,D06880,,46507011.0,,P01050,
1,DB00002,205923-56-4,,D03455,,46507042.0,,,
2,DB00003,143831-71-4,,,,46507792.0,,P24855,
3,DB00004,173146-27-5,,,,46506950.0,,P00587,
4,DB00005,185243-69-0,C07897,D00742,,46506732.0,,P20333,


In [473]:
# Inner-join the external-link columns onto the drug table. No join key is
# passed, so pandas joins on whatever columns the two frames share; in the
# frames displayed above the only shared column is "DrugBank ID". Drugs with
# no matching row in `db` are dropped by the inner join.
drug_df = pd.merge(drug_df, db, how="inner")
drug_df.head()

Unnamed: 0,Drug Name,DrugBank ID,SMILES,Targets Name,Targets,CAS Number,KEGG Compound ID,KEGG Drug ID,PubChem Compound ID,PubChem Substance ID,ChEBI ID,UniProt ID,BindingDB ID
0,Lepirudin,DB00001,,['Prothrombin'],['P00734'],138068-37-8,,D06880,,46507011.0,,P01050,
1,Cetuximab,DB00002,,"['Epidermal growth factor receptor', 'Low affi...","['P00533', 'O75015', 'P02745', 'P02746', 'P027...",205923-56-4,,D03455,,46507042.0,,,
2,Denileukin diftitox,DB00004,,"['Interleukin-2 receptor subunit alpha', 'Inte...","['P01589', 'P14784', 'P31785']",173146-27-5,,,,46506950.0,,P00587,
3,Etanercept,DB00005,,"['Tumor necrosis factor', 'Lymphotoxin-alpha',...","['P01375', 'P01374', 'P12314', 'P12318', 'P319...",185243-69-0,C07897,D00742,,46506732.0,,P20333,
4,Bivalirudin,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,['Prothrombin'],['P00734'],128270-60-0,,D03136,16129704.0,46507415.0,59173.0,,50248103.0


In [474]:
# Write the DrugBank IDs to dbid.csv, one per line, without the index and
# without a header row. `header=False` is the documented way to suppress the
# header in `to_csv`; the previous `header=None` is the `read_csv` convention
# and only had the same effect because None is falsy.
drug_df["DrugBank ID"].to_csv("dbid.csv", index=False, header=False)

In [475]:
# Tab-separated, header-less, gzip-compressed table. In the preview below,
# column 0 holds DrugBank IDs and column 1 the numeric IDs that are later used
# as PubChem CIDs.
# NOTE(review): presumably the result of submitting dbid.csv to an ID-mapping
# service; confirm the provenance of this file.
cid_export_path = "/Users/yoshitakainoue/Downloads/4512713032607354874.txt.gz"
db_cid = pd.read_csv(cid_export_path, header=None, sep="\t")

# Preview only: rows with a missing value are hidden here, but `db_cid` itself
# still contains them.
db_cid.dropna()

Unnamed: 0,0,1
4,DB00006,16129704.0
5,DB00007,657181.0
12,DB00014,5311128.0
30,DB00035,5311065.0
43,DB00050,25074887.0
...,...,...
7621,DB15588,64945.0
7623,DB15598,169535.0
7625,DB15617,86278348.0
7626,DB15623,53361968.0


In [476]:
# Build the DrugBank ID -> PubChem CID lookup from two sources, stacked in
# this order: the rows of `db_cid`, then the (DrugBank ID, PubChem Compound ID)
# pairs already present in `drug_df`. Rows with a missing value are ignored.
cid_pairs_from_export = db_cid.dropna().values
cid_pairs_from_drug_df = drug_df[["DrugBank ID", "PubChem Compound ID"]].dropna().values
stacked_cid_pairs = np.concatenate([cid_pairs_from_export, cid_pairs_from_drug_df])

# Exact duplicate (ID, CID) rows are removed, keeping the first occurrence.
# When one DrugBank ID still has several distinct CIDs, the pair that comes
# last among the remaining rows is the one that ends up in the dict.
unique_cid_pairs = pd.DataFrame(stacked_cid_pairs).drop_duplicates()
conv = dict(unique_cid_pairs.values)

In [477]:
# Replace the CID column with the value from the `conv` lookup; DrugBank IDs
# that have no entry get None.
drug_df["PubChem Compound ID"] = list(map(conv.get, drug_df["DrugBank ID"]))
drug_df

Unnamed: 0,Drug Name,DrugBank ID,SMILES,Targets Name,Targets,CAS Number,KEGG Compound ID,KEGG Drug ID,PubChem Compound ID,PubChem Substance ID,ChEBI ID,UniProt ID,BindingDB ID
0,Lepirudin,DB00001,,['Prothrombin'],['P00734'],138068-37-8,,D06880,,46507011.0,,P01050,
1,Cetuximab,DB00002,,"['Epidermal growth factor receptor', 'Low affi...","['P00533', 'O75015', 'P02745', 'P02746', 'P027...",205923-56-4,,D03455,,46507042.0,,,
2,Denileukin diftitox,DB00004,,"['Interleukin-2 receptor subunit alpha', 'Inte...","['P01589', 'P14784', 'P31785']",173146-27-5,,,,46506950.0,,P00587,
3,Etanercept,DB00005,,"['Tumor necrosis factor', 'Lymphotoxin-alpha',...","['P01375', 'P01374', 'P12314', 'P12318', 'P319...",185243-69-0,C07897,D00742,,46506732.0,,P20333,
4,Bivalirudin,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,['Prothrombin'],['P00734'],128270-60-0,,D03136,16129704.0,46507415.0,59173.0,,50248103.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7751,Betibeglogene autotemcel,DB16900,,['Alpha globin'],['V9H1D9'],,,,,,,,
7752,KW-6356,DB17080,,['Adenosine receptor A2a'],['P29274'],,,,,,,,
7753,Linzagolix,DB17083,COC1=C(COC2=C(OC)C=C(F)C(=C2)N2C(=O)NC3=CSC(C(...,['Gonadotropin-releasing hormone receptor'],['P30968'],935283-04-8,,,,,,,160329.0
7754,JNJ-17216498,DB17087,,['Histamine H3 receptor'],['Q9Y5N1'],1035626-05-1,,,,,,,


In [478]:
# Tab-separated, header-less, gzip-compressed table. In the preview below,
# column 0 holds DrugBank IDs and column 1 SMILES strings.
# NOTE(review): presumably produced the same way as the CID export; confirm
# the provenance of this file.
smiles_export_path = "/Users/yoshitakainoue/Downloads/119911432016837011.txt.gz"
db_smiles_raw = pd.read_csv(smiles_export_path, header=None, sep="\t")

# Unlike `db_cid`, the rows with a missing value are removed for good here.
db_smiles = db_smiles_raw.dropna()
db_smiles

Unnamed: 0,0,1
4,DB00006,CC[C@H](C)[C@@H](C(=O)N1CCC[C@H]1C(=O)N[C@@H](...
5,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCN=C(N)N)NC(=...
12,DB00014,CC(C)C[C@@H](C(=O)N[C@@H](CCCN=C(N)N)C(=O)N1CC...
30,DB00035,C1C[C@H](N(C1)C(=O)[C@@H]2CSSCCC(=O)N[C@H](C(=...
43,DB00050,C[C@H](C(=O)N)NC(=O)[C@@H]1CCCN1C(=O)[C@H](CCC...
...,...,...
7621,DB15588,C[C@@H]1CC[C@@]2(CC[C@@]3(C(=CC[C@H]4[C@]3(CC[...
7623,DB15598,CC1=C(C(=O)C=CO1)[O-].CC1=C(C(=O)C=CO1)[O-].CC...
7625,DB15617,C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)OC[C@@H]2...
7626,DB15623,CC(C)CN(C[C@H]([C@H](CC1=CC=CC=C1)NC(=O)O[C@H]...


In [479]:
# Rebuild `conv`, this time as a DrugBank ID -> SMILES lookup. Sources are
# stacked in this order: the rows of `db_smiles`, then the
# (DrugBank ID, SMILES) pairs already present in `drug_df`. Rows with a
# missing value are ignored.
smiles_pairs_from_export = db_smiles.dropna().values
smiles_pairs_from_drug_df = drug_df[["DrugBank ID", "SMILES"]].dropna().values
stacked_smiles_pairs = np.concatenate(
    [smiles_pairs_from_export, smiles_pairs_from_drug_df]
)

# Exact duplicate (ID, SMILES) rows are removed, keeping the first occurrence.
# When one DrugBank ID still has several distinct SMILES strings, the pair
# that comes last among the remaining rows is the one that ends up in the dict.
unique_smiles_pairs = pd.DataFrame(stacked_smiles_pairs).drop_duplicates()
conv = dict(unique_smiles_pairs.values)

In [480]:
# Number of DrugBank IDs that have an entry in the lookup.
len(conv)

7055

In [481]:
# Replace the SMILES column with the value from the `conv` lookup; DrugBank
# IDs that have no entry get None.
drug_df["SMILES"] = list(map(conv.get, drug_df["DrugBank ID"]))
drug_df.head()

Unnamed: 0,Drug Name,DrugBank ID,SMILES,Targets Name,Targets,CAS Number,KEGG Compound ID,KEGG Drug ID,PubChem Compound ID,PubChem Substance ID,ChEBI ID,UniProt ID,BindingDB ID
0,Lepirudin,DB00001,,['Prothrombin'],['P00734'],138068-37-8,,D06880,,46507011.0,,P01050,
1,Cetuximab,DB00002,,"['Epidermal growth factor receptor', 'Low affi...","['P00533', 'O75015', 'P02745', 'P02746', 'P027...",205923-56-4,,D03455,,46507042.0,,,
2,Denileukin diftitox,DB00004,,"['Interleukin-2 receptor subunit alpha', 'Inte...","['P01589', 'P14784', 'P31785']",173146-27-5,,,,46506950.0,,P00587,
3,Etanercept,DB00005,,"['Tumor necrosis factor', 'Lymphotoxin-alpha',...","['P01375', 'P01374', 'P12314', 'P12318', 'P319...",185243-69-0,C07897,D00742,,46506732.0,,P20333,
4,Bivalirudin,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,['Prothrombin'],['P00734'],128270-60-0,,D03136,16129704.0,46507415.0,59173.0,,50248103.0


In [482]:
# Columns written to the final CSV, in output order.
export_columns = [
    "Drug Name",
    "DrugBank ID",
    "Targets Name",
    "Targets",
    "SMILES",
    "PubChem Compound ID",
    "PubChem Substance ID",
]

# `to_csv` is called with its defaults, so the row index is written as the
# first, unnamed column of drug_bank.csv.
drug_df[export_columns].to_csv("drug_bank.csv")