In [1]:
import xml.etree.ElementTree as ET
import pandas as pd


from collections import OrderedDict

from itertools import islice

In [2]:
# Input locations, relative to the notebook's working directory:
# the XSD schema file and the full DrugBank XML database export.
drugbank_schema = 'resources/drugbank.xsd'
drugbank_xml = 'resources/drugbank_all_full_database.xml'

In [3]:
# Parse the whole XML export into an in-memory element tree and keep a handle
# on its top-level element, which the extraction step iterates over.
tree = ET.parse(source=drugbank_xml)
root = tree.getroot()

In [4]:
# Every tag in the export lives in this XML namespace (ElementTree "{uri}" form).
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

# The namespace-qualified paths do not change between drugs, so they are built
# once here instead of being re-formatted on every loop iteration.
drug_tag = ns + 'drug'
primary_id_path = ns + "drugbank-id[@primary='true']"
name_path = ns + "name"
description_path = ns + "description"
groups_path = "{ns}groups/{ns}group".format(ns = ns)
atc_codes_path = "{ns}atc-codes/{ns}atc-code".format(ns = ns)
categories_path = "{ns}categories/{ns}category".format(ns = ns)
category_name_path = ns + 'category'
inchi_path = inchi_template.format(ns = ns)
inchikey_path = inchikey_template.format(ns = ns)

# Build one ordered record per direct child of the root element.
# findtext() returns None when a path has no match, so missing scalar fields
# end up as None; the list-valued fields end up as empty lists.
rows = list()
for drug in root:
    # Fail early (with the offending tag) if the root holds anything but drugs.
    assert drug.tag == drug_tag, 'unexpected top-level element: {}'.format(drug.tag)

    row = OrderedDict()
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(primary_id_path)
    row['name'] = drug.findtext(name_path)
    row['description'] = drug.findtext(description_path)
    row['groups'] = [group.text for group in drug.findall(groups_path)]
    # The ATC code is stored in the element's 'code' attribute, not its text.
    row['atc_codes'] = [code.get('code') for code in drug.findall(atc_codes_path)]
    # Each categories/category element wraps a nested <category> child that
    # holds the category name.
    row['categories'] = [entry.findtext(category_name_path)
                         for entry in drug.findall(categories_path)]
    row['inchi'] = drug.findtext(inchi_path)
    row['inchikey'] = drug.findtext(inchikey_path)

    rows.append(row)

In [5]:
# Column order of the resulting table.
columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories', 'inchikey', 'inchi', 'description']
# `rows` is a list of per-drug records, which the DataFrame constructor accepts
# directly (from_dict is documented for dict input only). Selecting with
# [columns] reorders the columns and raises KeyError if one is missing.
drugbank_df = pd.DataFrame(rows)[columns]
drugbank_df.head(7)

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
0,DB00001,Lepirudin,biotech,[approved],[B01AE02],"[Amino Acids, Peptides, and Proteins, Anticoag...",,,Lepirudin is identical to natural hirudin exce...
1,DB00002,Cetuximab,biotech,[approved],[L01XC06],"[Amino Acids, Peptides, and Proteins, Antibodi...",,,Cetuximab is an epidermal growth factor recept...
2,DB00003,Dornase alfa,biotech,[approved],[],"[Amino Acids, Peptides, and Proteins, Deoxyrib...",,,Dornase alfa is a biosynthetic form of human d...
3,DB00004,Denileukin diftitox,biotech,"[approved, investigational]",[L01XX29],"[ADP Ribose Transferases, Amino Acids, Peptide...",,,A recombinant DNA-derived cytotoxic protein co...
4,DB00005,Etanercept,biotech,"[approved, investigational]",[L04AB01],"[Amino Acids, Peptides, and Proteins, Analgesi...",,,Dimeric fusion protein consisting of the extra...
5,DB00006,Bivalirudin,small molecule,"[approved, investigational]",[B01AE06],"[Amino Acids, Peptides, and Proteins, Anticoag...",OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,Bivalirudin is a synthetic 20 residue peptide ...
6,DB00007,Leuprolide,biotech,"[approved, investigational]",[L02AE02],"[Amino Acids, Peptides, and Proteins, Antineop...",,,Leuprolide belongs to the general class of dru...


In [6]:
def valid_len(x):
    """Return the number of ATC codes in list `x`; the placeholder [None] counts as 0."""
    return 0 if x == [None] else len(x)


def pop(x):
    """Return the first element of `x` (used to unwrap single-item lists)."""
    return x[0]


# Only keep drugs that have exactly one ATC code.
atc_counts = drugbank_df['atc_codes'].apply(valid_len)

# Select rows and columns in a single .loc call and take an explicit copy, so
# the column assignment below writes to an independent frame instead of a
# slice of drugbank_df (which would raise SettingWithCopyWarning).
drugbank_df_filtered = drugbank_df.loc[atc_counts == 1, ['name', 'atc_codes']].copy()

# Transform the ATC codes from single-item lists to plain strings.
drugbank_df_filtered['atc_codes'] = drugbank_df_filtered['atc_codes'].apply(pop)

drugbank_df_filtered.head(10)

Unnamed: 0,name,atc_codes
0,Lepirudin,B01AE02
1,Cetuximab,L01XC06
3,Denileukin diftitox,L01XX29
4,Etanercept,L04AB01
5,Bivalirudin,B01AE06
6,Leuprolide,L02AE02
10,Interferon alfa-n1,L03AB06
11,Darbepoetin alfa,B03XA02
12,Urokinase,B01AD04
13,Goserelin,L02AE03


In [7]:
# Write the filtered name/ATC-code table to CSV. to_csv is called with its
# defaults, so the DataFrame index is written as the first column.
output_dir = 'resources/'
file_name = output_dir + 'drugbank_filter.csv'
drugbank_df_filtered.to_csv(path_or_buf=file_name)