# Dependencies

In [4]:
import xml.etree.ElementTree as ET
import collections
import pandas as pd
import matplotlib
import random as rd
from rdkit import Chem

# Drug Selection

## Open the XML file from DrugBank (version 5.1)

In [6]:
# Read the DrugBank XML export from disk and keep both the parsed tree
# and its root element for the extraction step.
drugbank_xml_path = "Data_files/drugbank.xml"
tree = ET.parse(drugbank_xml_path)
root = tree.getroot()

In [7]:
# Namespace URI in braces, as ElementTree expects in front of every
# qualified tag name of the DrugBank document.
ns = '{http://www.drugbank.ca}'

# XPath templates: each selects the <value> of the property whose <kind>
# matches the quoted text, under calculated- or experimental-properties.
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template    = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"
logP_template     = "{ns}calculated-properties/{ns}property[{ns}kind='logP']/{ns}value"
cb_pKa_template   = "{ns}calculated-properties/{ns}property[{ns}kind='pKa (strongest basic)']/{ns}value"
ca_pKa_template   = "{ns}calculated-properties/{ns}property[{ns}kind='pKa (strongest acidic)']/{ns}value"
ex_pKa_template   = "{ns}experimental-properties/{ns}property[{ns}kind='pKa']/{ns}value"
SMILES_template   = "{ns}calculated-properties/{ns}property[{ns}kind='SMILES']/{ns}value"
chg_template      = "{ns}calculated-properties/{ns}property[{ns}kind='Physiological Charge']/{ns}value"

# Resolve every XPath once: they do not depend on the drug being processed,
# so there is no reason to re-format them on each loop iteration.
groups_path     = "{ns}groups/{ns}group".format(ns = ns)
atc_codes_path  = "{ns}atc-codes/{ns}atc-code".format(ns = ns)
categories_path = "{ns}categories/{ns}category".format(ns = ns)

# (row key, resolved XPath) pairs, listed in the order the keys are added
# to each row.
property_paths = [
    ('inchi',    inchi_template.format(ns = ns)),
    ('inchikey', inchikey_template.format(ns = ns)),
    ('logP',     logP_template.format(ns = ns)),
    ('cb_pKa',   cb_pKa_template.format(ns = ns)),
    ('ca_pKa',   ca_pKa_template.format(ns = ns)),
    ('ex_pKa',   ex_pKa_template.format(ns = ns)),
    ('SMILES',   SMILES_template.format(ns = ns)),
    ('Phys Chg', chg_template.format(ns = ns)),
]

# Element paths whose text is collected as an alias of the drug. Each source
# is listed once; results are gathered into a set, so duplicates collapse.
# NOTE(review): the synonym filter matches language='English' exactly
# (XPath string comparison is case-sensitive) — confirm the attribute casing
# used by this DrugBank release.
alias_paths = [
    "{ns}international-brands/{ns}international-brand".format(ns = ns),
    "{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns),
    "{ns}products/{ns}product/{ns}name".format(ns = ns),
]

# Build one ordered record per child element of the document root.
rows = list()
for drug in root:
    # Every direct child of the root is expected to be a <drug> element.
    assert drug.tag == ns + 'drug'

    row = collections.OrderedDict()
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['cas'] = drug.findtext(ns + 'cas-number')
    row['name'] = drug.findtext(ns + "name")
    row['description'] = drug.findtext(ns + "description")
    row['groups'] = [group.text for group in drug.findall(groups_path)]
    row['MW'] = drug.findtext(ns + "average-mass")
    row['atc_codes'] = [code.get('code') for code in drug.findall(atc_codes_path)]
    row['categories'] = [x.findtext(ns + 'category') for x in drug.findall(categories_path)]

    # findtext returns None when the property is absent for this drug.
    for prop_key, prop_xpath in property_paths:
        row[prop_key] = drug.findtext(prop_xpath)

    # Add drug aliases. Elements without text (elem.text is None) are skipped:
    # a None mixed with strings would make sorted() raise TypeError.
    aliases = {
        elem.text
        for alias_xpath in alias_paths
        for elem in drug.findall(alias_xpath)
        if elem.text is not None
    }
    if row['name'] is not None:
        aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

# Drug properties of interest, in the column order wanted in the DataFrame
columns = [
    'drugbank_id', 'cas', 'name', 'type', 'groups', 'categories',
    'MW', 'logP', 'cb_pKa', 'ca_pKa', 'ex_pKa', 'Phys Chg', 'SMILES',
]
# Store the extracted records in a pandas DataFrame. The list of per-drug
# dicts goes straight to the constructor (from_dict is documented for dict
# input); passing `columns` selects and orders the wanted keys and still
# yields a well-formed empty frame if `rows` is empty.
drugbank_df = pd.DataFrame(rows, columns=columns)
# Summarise each column (count / unique / top / freq)
drugbank_df.describe()

Unnamed: 0,drugbank_id,cas,name,type,groups,categories,MW,logP,cb_pKa,ca_pKa,ex_pKa,Phys Chg,SMILES
count,14315,14315.0,14315,14315,14315,14315,11172.0,11170.0,10362,9404.0,590.0,11170,11168
unique,14315,8922.0,14315,2,47,6348,9613.0,986.0,1442,1860.0,469.0,22,11167
top,DB00001,,Lepirudin,small molecule,[experimental],[],180.1559,-1.4,-3,3.09,9.2,0,CCNC1=NC(Cl)=NC(NC(C)(CC)C#N)=N1
freq,1,5372.0,1,11834,5994,5848,14.0,112.0,350,37.0,6.0,5161,2


# Export file

In [8]:
# Persist the column-restricted DrugBank DataFrame as a pickle file
drugbank_df.to_pickle(path='Data_files/drugbank_df.pkl')

# Dataset inspection

In [9]:
# Keep only the rows whose `type` column equals "small molecule"
db_sm = drugbank_df.loc[drugbank_df["type"].eq("small molecule")]
db_sm

Unnamed: 0,drugbank_id,cas,name,type,groups,categories,MW,logP,cb_pKa,ca_pKa,ex_pKa,Phys Chg,SMILES
5,DB00006,128270-60-0,Bivalirudin,small molecule,"[approved, investigational]","[Amino Acids, Peptides, and Proteins, Anticoag...",2180.2853,-0.76,11.88,2.79,,-4,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
6,DB00007,53714-56-0,Leuprolide,small molecule,"[approved, investigational]","[Adrenal Cortex Hormones, Agents Causing Muscl...",1209.3983,1.04,11.92,9.49,9.6,1,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...
13,DB00014,65807-02-5,Goserelin,small molecule,[approved],"[Adrenal Cortex Hormones, Amino Acids, Peptide...",1269.4105,0.3,10.82,9.27,,2,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
25,DB00027,1405-97-6,Gramicidin D,small molecule,[approved],"[Amino Acids, Peptides, and Proteins, Anti-Bac...",1811.253,4.38,,11.56,,0,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
33,DB00035,16679-58-6,Desmopressin,small molecule,[approved],"[Agents that produce hypertension, Amino Acids...",1069.22,-1,11.77,9.5,,1,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14293,DB16403,21637-25-2,Isoquercitrin,small molecule,[investigational],"[Benzopyrans, Chromones, Flavonoids, Flavonols...",464.379,0.57,-3,6.37,,-1,[H][C@@]1(O[C@@H](OC2=C(OC3=C(C(O)=CC(O)=C3)C2...
14294,DB16404,499-75-2,Carvacrol,small molecule,[experimental],[Terpenes],150.221,3.2,-5.5,10.42,,0,CC(C)C1=CC(O)=C(C)C=C1
14296,DB16406,1858276-04-6,Subasumstat,small molecule,[investigational],[],578.1,2.47,7.73,11.4,,1,CC1=C(C=C(S1)C(=O)C1=C(N[C@H]2C[C@H](O)[C@@H](...
14297,DB16407,1011529-10-4,Azvudine,small molecule,[investigational],"[Carbohydrates, Deoxyribonucleosides, Glycosid...",287.231,-1.2,,,,,NC1=NC(=O)N(C=C1)[C@@H]1O[C@@](CO)(N=[N]=N)[C@...


In [10]:
# Experimental pKa values are sparse: compare this column's count with the
# total number of rows in the frame.
drugbank_df.loc[:, "ex_pKa"].describe()

count     590
unique    469
top       9.2
freq        6
Name: ex_pKa, dtype: object

In [11]:
# Summary of the calculated "pKa (strongest basic)" column
drugbank_df.loc[:, "cb_pKa"].describe()

count     10362
unique     1442
top          -3
freq        350
Name: cb_pKa, dtype: object

In [11]:
# Several DrugBank entries have no CAS number; here that shows up as an
# empty string in the `cas` column.
# NOTE(review): `db_sm_LPC` is not defined in any cell shown before this one —
# confirm it exists at this point after a fresh Restart & Run All.
cas_is_blank = db_sm_LPC["cas"].eq("")
no_cas = db_sm_LPC.loc[cas_is_blank]
len(no_cas.index)

334