In [6]:
import pandas as pd
import numpy as np
import io
import urllib.request
import matplotlib.pyplot as plt
import os
import gzip
import collections
import re
import json
import xml.etree.ElementTree as ET
import zipfile
import math

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

For Drugbank Import, download from the website the compressed file, see [here](https://www.drugbank.ca/releases/latest)  
The gzip format works best, so if in *.zip* format, convert to *.gz*

In [2]:
def drugbank_import(path, create_alias='N'):
    
    with gzip.open(path) as xml_file:
        tree = ET.parse(xml_file)
    root = tree.getroot()

    ns = '{http://www.drugbank.ca}'
    calc = "{ns}calculated-properties/{ns}property"
    exp = "{ns}experimental-properties/{ns}property"
    extern = "{ns}external-identifiers/{ns}external-identifier"
    inchikey_template = calc+"[{ns}kind='InChIKey']/{ns}value"
    inchi_template = calc+"[{ns}kind='InChI']/{ns}value"

    melt_point_template = exp+"[{ns}kind='Melting Point']/{ns}value"
    Hydrophobicity_template = exp+"[{ns}kind='Hydrophobicity']/{ns}value"
    isoelectric_template = exp+"[{ns}kind='Isoelectric Point']/{ns}value"
    molweight_template = exp+"[{ns}kind='Molecular Weight']/{ns}value"
    molform_template = exp+"[{ns}kind='Molecular Formula']/{ns}value"
    logP_template = exp+"[{ns}kind='logP']/{ns}value"
    logS_template = exp+"[{ns}kind='logS']/{ns}value"
    
    boil_template = exp+"[{ns}kind='Boiling Point']/{ns}value"
    caco_template = exp+"[{ns}kind='caco2 Permeability']/{ns}value"
    water_exp_template = exp+"[{ns}kind='Water Solubility']/{ns}value"
    pKa_template = exp+"[{ns}kind='pKa']/{ns}value"

    psa_template = calc+"[{ns}kind='Polar Surface Area (PSA)']/{ns}value"
    refr_template = calc+"[{ns}kind='Refractivity']/{ns}value"
    pola_template = calc+"[{ns}kind='Polarizability']/{ns}value"
    bioa_template = calc+"[{ns}kind='Bioavailability']/{ns}value"
    ghose_template = calc+"[{ns}kind='Ghose Filter']/{ns}value"
    mddr_template = calc+"[{ns}kind='MDDR-Like Rule']/{ns}value"
    smiles_template=calc+"[{ns}kind='SMILES']/{ns}value"

    # external identifiers
    DPD_template = extern + \
        "[{ns}resource='Drugs Product Database (DPD)']/{ns}identifier"
    PubChem_template = extern+"[{ns}resource='PubChem Substance']/{ns}identifier"
    kegg_template = extern+"[{ns}resource='KEGG Drug']/{ns}identifier"
    GKB_template = extern+"[{ns}resource='PharmGKB']/{ns}identifier"
    UPKB_template = extern+"[{ns}resource='UniProtKB']/{ns}identifier"
    TTD_template = extern + \
        "[{ns}resource='Therapeutic Targets Database']/{ns}identifier"
    wiki_template = extern+"[{ns}resource='Wikipedia']/{ns}identifier"
    ChEMBL_template = extern+"[{ns}resource='ChEMBL']/{ns}identifier"

    rows = list()
    for i, drug in enumerate(root):
        row = collections.OrderedDict()
        assert drug.tag == ns + 'drug'
        row['type'] = drug.get('type')
        row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
        row['average-mass'] = drug.findtext(ns + "average-mass")
        row['monoisotopic-mass'] = drug.findtext(ns + "monoisotopic-mass")
        row['name'] = drug.findtext(ns + "name")
        # free text
        row['volume-of-distribution'] = drug.findtext(
            ns + "volume-of-distribution")
        row['clearance'] = drug.findtext(ns + "clearance")
        row['half-life'] = drug.findtext(ns + "half-life")
        row['toxicity'] = drug.findtext(ns + "toxicity")
        row['metabolism'] = drug.findtext(ns + "metabolism")
        row['absorption'] = drug.findtext(ns + "absorption")
        
        row['smiles'] = drug.findtext(smiles_template.format(ns=ns))
        # experimental
        row['melting point'] = drug.findtext(melt_point_template.format(ns=ns))
        row['Hydrophobicity'] = drug.findtext(
            Hydrophobicity_template.format(ns=ns))
        row['Isoelectric Point'] = drug.findtext(
            isoelectric_template.format(ns=ns))
        row['Molecular Weight'] = drug.findtext(molweight_template.format(ns=ns))
        row['Molecular Formula'] = drug.findtext(molform_template.format(ns=ns))
        row['logP EXP'] = drug.findtext(logS_template.format(ns=ns))
        row['logS EXP'] = drug.findtext(melt_point_template.format(ns=ns))
        row['pKa EXP'] = drug.findtext(pKa_template.format(ns=ns))
        row['Boiling Point'] = drug.findtext(boil_template.format(ns=ns))
        row['Caco2 Permeability'] = drug.findtext(caco_template.format(ns=ns))
        row['Water Solubility EXP'] = drug.findtext(
            water_exp_template.format(ns=ns))
        # calculated
        row['PSA calc'] = drug.findtext(psa_template.format(ns=ns))
        row['Refractivity calc'] = drug.findtext(refr_template.format(ns=ns))
        row['Polarizability'] = drug.findtext(pola_template.format(ns=ns))
        row['Bioavailability'] = drug.findtext(bioa_template.format(ns=ns))
        row['Ghose Filter'] = drug.findtext(ghose_template.format(ns=ns))
        row['MDDR-Like Rule'] = drug.findtext(mddr_template.format(ns=ns))
        # external
        row['Drugs Product Database (DPD)'] = drug.findtext(
            DPD_template.format(ns=ns))
        row['PubChem Substance'] = drug.findtext(PubChem_template.format(ns=ns))
        row['KEGG Drug'] = drug.findtext(kegg_template.format(ns=ns))
        row['PharmGKB'] = drug.findtext(GKB_template.format(ns=ns))
        row['UniProtKB'] = drug.findtext(UPKB_template.format(ns=ns))
        row['Therapeutic Targets Database'] = drug.findtext(
            TTD_template.format(ns=ns))
        row['Wikipedia'] = drug.findtext(wiki_template.format(ns=ns))
        row['ChEMBL'] = drug.findtext(ChEMBL_template.format(ns=ns))

        # others
        row['groups'] = [group.text for group in
                         drug.findall("{ns}groups/{ns}group".format(ns=ns))]
        row['atc_codes'] = [code.get('code') for code in
                            drug.findall("{ns}atc-codes/{ns}atc-code".format(ns=ns))]
        row['categories'] = [x.findtext(ns + 'category') for x in
                             drug.findall("{ns}categories/{ns}category".format(ns=ns))]
        row['inchi'] = drug.findtext(inchi_template.format(ns=ns))
        row['inchikey'] = drug.findtext(inchikey_template.format(ns=ns))
        
        # Add drug aliases
        aliases = {
            elem.text for elem in
            drug.findall("{ns}international-brands/{ns}international-brand".format(ns=ns)) +
            drug.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns=ns)) +
            drug.findall("{ns}international-brands/{ns}international-brand".format(ns=ns)) +
            drug.findall("{ns}products/{ns}product/{ns}name".format(ns=ns))

        }
        aliases.add(row['name'])
        row['aliases'] = sorted(aliases)

        rows.append(row)
        
    if create_alias=='Y':
        alias_dict = {row['drugbank_id']: row['aliases'] for row in rows}
        with open('aliases.json', 'w') as fp:
            json.dump(alias_dict, fp, indent=2, sort_keys=True)
        


    def collapse_list_values(row):
        for key, value in row.items():
            if isinstance(value, list):
                row[key] = '|'.join(value)
        return row


    rows = list(map(collapse_list_values, rows))

    columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes',
               'categories', 'inchikey', 'inchi', 'average-mass',
               'monoisotopic-mass', 'volume-of-distribution', 'clearance', 'half-life',
               'toxicity', 'metabolism', 'metabolism', 'absorption', 'smiles',
               'melting point', 'logS EXP', 'logP EXP', 'pKa EXP', 'Isoelectric Point', 'Molecular Weight', 'Molecular Formula',
               'Hydrophobicity', 'Boiling Point', 'Caco2 Permeability', 'Water Solubility EXP', 'PSA calc', 'Refractivity calc', 'Polarizability', 'Ghose Filter', 'MDDR-Like Rule',
               'Drugs Product Database (DPD)', 'PubChem Substance', 'KEGG Drug', 'PharmGKB', 'UniProtKB', 'Therapeutic Targets Database', 'ChEMBL', 'Wikipedia']

    drugbank_df = pd.DataFrame.from_dict(rows)[columns]

    drugbank_slim_df = drugbank_df[
        drugbank_df.groups.map(lambda x: 'approved' in x) &
        drugbank_df.inchi.map(lambda x: x is not None) &
        drugbank_df.type.map(lambda x: x == 'small molecule')
    ]

    return drugbank_df,drugbank_slim_df

In [3]:
dbdata,dbdataslim=drugbank_import('../files/full database.xml.gz')

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,average-mass,monoisotopic-mass,...,Ghose Filter,MDDR-Like Rule,Drugs Product Database (DPD),PubChem Substance,KEGG Drug,PharmGKB,UniProtKB,Therapeutic Targets Database,ChEMBL,Wikipedia
0,DB00001,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,,,,...,,,11916.0,46507011,D06880,PA450195,P01050,DAP000541,CHEMBL1201666,Lepirudin
1,DB00002,Cetuximab,biotech,approved,L01XC06,"Amino Acids, Peptides, and Proteins|Antibodies...",,,,,...,,,13175.0,46507042,D03455,PA10040,,DNC000788,CHEMBL1201577,Cetuximab
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,,,,...,,,650.0,46507792,,PA10318,P24855,DAP000981,CHEMBL1201431,Dornase_alfa
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,,,,...,,,,46506950,,PA164750594,P00587,DAP001098,CHEMBL1201550,Denileukin_diftitox
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Amino Acids, Peptides, and Proteins|Analgesics...",,,,,...,,,12032.0,46506732,D00742,PA449515,P20333,DNC000605,CHEMBL1201572,Etanercept


In [5]:
dbdata.head()
dbdata.describe()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,average-mass,monoisotopic-mass,volume-of-distribution,clearance,half-life,toxicity,metabolism,metabolism.1,absorption,smiles,melting point,logS EXP,logP EXP,pKa EXP,Isoelectric Point,Molecular Weight,Molecular Formula,Hydrophobicity,Boiling Point,Caco2 Permeability,Water Solubility EXP,PSA calc,Refractivity calc,Polarizability,Ghose Filter,MDDR-Like Rule,Drugs Product Database (DPD),PubChem Substance,KEGG Drug,PharmGKB,UniProtKB,Therapeutic Targets Database,ChEMBL,Wikipedia
0,DB00001,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,,,,"* 12.2 L [Healthy young subjects (n = 18, age ...",* 164 ml/min [Healthy 18-60 yrs]\r\n* 139 ml/m...,Approximately 1.3 hours,"In case of overdose (eg, suggested by excessiv...",Lepirudin is thought to be metabolized by rele...,Lepirudin is thought to be metabolized by rele...,Bioavailability is 100% following injection.,,65 °C,65 °C,,,4.04,6963.425,C287H440N80O110S6,-0.777,,,,,,,,,11916.0,46507011,D06880,PA450195,P01050,DAP000541,CHEMBL1201666,Lepirudin
1,DB00002,Cetuximab,biotech,approved,L01XC06,"Amino Acids, Peptides, and Proteins|Antibodies...",,,,,appeared to be independent of dose and approxi...,Female patients had 25% lower intrinsic cleara...,The mean half-life for Cetuximab is 114 hours ...,Pulmonary Toxicity\r\n\r\nInterstitial lung di...,,,,,"61 °C (FAB fragment), 71 °C (whole mAb)","61 °C (FAB fragment), 71 °C (whole mAb)",,,8.48,145781.6,C6484H10042N1732O2023S36,-0.413,,,,,,,,,13175.0,46507042,D03455,PA10040,,DNC000788,CHEMBL1201577,Cetuximab
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,,,,"In studies in rats and monkeys, the initial vo...","Studies in rats indicate that, following aeros...",,Adverse reactions occur at a frequency of < 1/...,While no conclusive studies have yet been publ...,While no conclusive studies have yet been publ...,Studies in rats and monkeys after inhalation o...,,67 °C,67 °C,,,4.58,29253.9,C1321H1999N339O396S9,-0.083,,,,,,,,,650.0,46507792,,PA10318,P24855,DAP000981,CHEMBL1201431,Dornase_alfa
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,,,,* 0.06 to 0.09 L/kg,* 0.6 - 2.0 mL/min/kg [Lymphoma],70-80 min,,,,,,,,,,5.45,57647.3,C2560H4042N678O799S17,-0.301,,,,,,,,,,46506950,,PA164750594,P00587,DAP001098,CHEMBL1201550,Denileukin_diftitox
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Amino Acids, Peptides, and Proteins|Analgesics...",,,,,,* 160 +/- 80 mL/hr [RA patients],102 +/- 30 hrs in individuals with rheumatoid ...,,,,Bioavailability following sub-Q administration...,,71 °C (whole mAb),71 °C (whole mAb),,,7.89,51234.9,C2224H3475N621O698S36,-0.529,,,,,,,,,12032.0,46506732,D00742,PA449515,P20333,DNC000605,CHEMBL1201572,Etanercept


Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,average-mass,monoisotopic-mass,volume-of-distribution,clearance,half-life,toxicity,metabolism,metabolism.1,absorption,smiles,melting point,logS EXP,logP EXP,pKa EXP,Isoelectric Point,Molecular Weight,Molecular Formula,Hydrophobicity,Boiling Point,Caco2 Permeability,Water Solubility EXP,PSA calc,Refractivity calc,Polarizability,Ghose Filter,MDDR-Like Rule,Drugs Product Database (DPD),PubChem Substance,KEGG Drug,PharmGKB,UniProtKB,Therapeutic Targets Database,ChEMBL,Wikipedia
count,11292,11292,11292,11292,11292.0,11292.0,9299,9299,9297.0,9297.0,11292.0,11292.0,11292.0,11292.0,11292.0,11292.0,11292.0,9296,1728,1728,192.0,470.0,99.0,216.0,186,81.0,422,83.0,1482,9296,9296,9291.0,9296,9296,1965,10484,1838,1795,90,1427,6555,4767
unique,11292,11292,2,46,2414.0,4735.0,9297,9297,7954.0,7987.0,1071.0,990.0,1734.0,1963.0,1649.0,1649.0,1883.0,9294,1300,1300,159.0,388.0,75.0,188.0,171,62.0,382,70.0,1050,3950,6627,4730.0,2,2,1942,10484,1838,1784,64,1426,6555,4718
top,DB03178,Formoterol,small molecule,experimental,,,MUMGGOZAMZWBJJ-DYKIIFRCSA-N,InChI=1S/C19H28O2/c1-18-9-7-13(20)11-12(18)3-4...,180.1559,180.063388116,,,,,,,,[H][C@@]12CC[C@H](O)[C@@]1(C)CC[C@@]1([H])[C@@...,< 25 °C,< 25 °C,-3.37,4.5,5.99,148000.0,C860H1353N227O255S9,-0.33,Decomposes,-4.69,Insoluble,0,0,1.78,1,0,309,99444950,D00439,PA153906323,P01857,DAP000903,CHEMBL1200572,Phosphoinositide_3-kinase_inhibitor
freq,1,1,9949,4942,8870.0,5150.0,2,2,12.0,12.0,10156.0,10253.0,9281.0,9281.0,9378.0,9378.0,9285.0,2,25,25,3.0,4.0,7.0,5.0,3,4.0,13,3.0,85,150,55,57.0,4956,7118,4,1,1,3,8,2,1,4


In [22]:
dbdata['average-mass']=pd.to_numeric(dbdata['average-mass'], errors='coerce')
dbdata['monoisotopic-mass']=pd.to_numeric(dbdata['monoisotopic-mass'], errors='coerce')
dbdata['logP EXP'].describe()

count       192
unique      159
top       -3.37
freq          3
Name: logP EXP, dtype: object

In [24]:
dbdata.to_csv('drugbank.csv',index=False)