In [73]:
import xml.etree.ElementTree as ET
import collections
import pandas as pd

In [2]:
# Path of the XML file to load (relative path, resolved against the working directory).
xml_file = "full database.xml"
# Parse the entire file into an in-memory ElementTree.
tree = ET.parse(xml_file)

In [3]:
# Top-level element of the parsed document; the row-building loop iterates over its children.
root = tree.getroot()


In [11]:
# XML namespace in the "{uri}" form ElementTree uses in qualified tag names;
# it is prepended to every tag and path looked up in this notebook.
ns = '{http://www.drugbank.ca}'


In [76]:
def collapse_list_values(row):
    """Replace every list value in ``row`` with a single '|'-separated string.

    The mapping is modified in place and also returned, which makes the
    function usable with ``map``. Values that are not lists are left as-is.

    Parameters
    ----------
    row : dict-like
        Mapping of field name to value. List values are expected to contain
        strings; ``None`` entries are skipped.

    Returns
    -------
    The same ``row`` object. An empty list becomes the empty string.
    """
    for key, value in row.items():
        if isinstance(value, list):
            # Element.text is None for empty XML elements; str.join would raise
            # TypeError on such an entry, so leave it out of the joined string.
            row[key] = '|'.join(item for item in value if item is not None)
    return row


In [77]:
def _findall_text(parent, path):
    """Return the text of each element under ``parent`` that matches ``path``.

    ``{ns}`` placeholders in ``path`` are replaced with the namespace prefix
    ``ns``. Elements whose ``.text`` is None (empty elements) are skipped, so
    the result holds only strings and can be sorted or '|'-joined safely.
    """
    return [
        elem.text
        for elem in parent.findall(path.format(ns=ns))
        if elem.text is not None
    ]


# Build one ordered record per child element of the document root.
rows = []
for drug in root:
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name").lower()
    row['groups'] = _findall_text(drug, "{ns}groups/{ns}group")
    row['indication'] = drug.findtext(ns + "indication")
    # Patent numbers are de-duplicated and sorted.
    row['patents'] = sorted(set(_findall_text(drug, "{ns}patents/{ns}patent/{ns}number")))
    row['target'] = _findall_text(drug, "{ns}targets/{ns}target/{ns}name")
    # Each <category> entry carries its label in a nested <category> child;
    # findtext returns None when that child is absent, so drop those entries.
    category_labels = (
        entry.findtext(ns + 'category')
        for entry in drug.findall("{ns}categories/{ns}category".format(ns=ns))
    )
    row['categories'] = [label for label in category_labels if label is not None]
    # Products are the distinct product names plus the lower-cased drug name.
    aliases = set(_findall_text(drug, "{ns}products/{ns}product/{ns}name"))
    aliases.add(row['name'])
    row['products'] = sorted(aliases)
    rows.append(row)
    

In [78]:
# Flatten list-valued fields into '|'-separated strings so every record is a flat row.
rows = [collapse_list_values(record) for record in rows]
# `rows` is a list of dict records, so use the DataFrame constructor directly;
# DataFrame.from_dict is documented for dict input.
drugbank_df = pd.DataFrame(rows)
# NOTE: the file is written tab-separated even though its extension is .csv;
# readers must pass sep='\t'.
drugbank_df.to_csv("drugbank.csv", sep='\t', index=False)

In [82]:
"2065150|5196404|7582727|7598343".split("|")

['2065150', '5196404', '7582727', '7598343']

In [81]:
# Display only the rows whose `type` column equals "small molecule".
drugbank_df.loc[drugbank_df["type"] == "small molecule"]

Unnamed: 0,type,drugbank_id,name,groups,indication,patents,target,categories,products
5,small molecule,DB00006,bivalirudin,approved|investigational,For treatment of heparin-induced thrombocytope...,2065150|5196404|7582727|7598343,Prothrombin,"Amino Acids, Peptides, and Proteins|Anticoagul...",Angiomax|Bivalirudin|Bivalirudin for Injection...
13,small molecule,DB00014,goserelin,approved,Used to treat hormone-sensitive cancers of the...,7118552|7220247|7500964,Lutropin-choriogonadotropic hormone receptor|G...,"Adrenal Cortex Hormones|Amino Acids, Peptides,...",Zoladex|Zoladex LA|goserelin
25,small molecule,DB00027,gramicidin d,approved,"For treatment of skin lesions, surface wounds ...",,,"Amino Acids, Peptides, and Proteins|Anti-Bacte...",Antibiotic Cream|Antibiotic Cream Plus Pain Re...
33,small molecule,DB00035,desmopressin,approved,- Indicated for the treatment of nocturia due ...,2484724|2486833|5500413|7022340|7405203|756042...,Vasopressin V2 receptor|Vasopressin V1a recept...,"Agents that produce hypertension|Amino Acids, ...",DDAVP Rhinal Tube|Ddavp|Ddavp Inj 4mcg/ml|Ddav...
47,small molecule,DB00050,cetrorelix,approved|investigational,For the inhibition of premature LH surges in w...,2115943|5198533|6319192,Gonadotropin-releasing hormone receptor|Lutrop...,"Amino Acids, Peptides, and Proteins|Anti-Gonad...",Cetrotide|cetrorelix
63,small molecule,DB00067,vasopressin,approved,"For the treatment of enuresis, polyuria, diabe...",9375478|9687526|9744209|9744239|9750785|9937223,Vasopressin V2 receptor|Vasopressin V1a recept...,"Amino Acids, Peptides, and Proteins|Antidiuret...",Pitressin|Pitressin Inj 10 Unit/0.5ml|Pressyn ...
74,small molecule,DB00080,daptomycin,approved|investigational,For the treatment of complicated skin and skin...,2344318|6468967|6852689|8003673|8058238|812934...,Bacterial outer membrane|Lipoteichoic acid syn...,"Amino Acids, Peptides, and Proteins|Anti-Bacte...",Cubicin|Cubicin RF|Daptomycin|daptomycin
84,small molecule,DB00091,ciclosporin,approved|investigational|vet_approved,"For treatment of transplant (kidney, liver, an...",1332150|2108018|4839342|5985321|8292129|856185...,Calcium signal-modulating cyclophilin ligand|C...,Agents causing hyperkalemia|Agents that produc...,Apo-cyclosporine Oral Solution|Aqua-stasis|Ceq...
86,small molecule,DB00093,felypressin,experimental,For use as an alternative to adrenaline as a l...,,Vasopressin V1a receptor,"Amino Acids, Peptides, and Proteins|Cardiovasc...",felypressin
95,small molecule,DB00104,octreotide,approved|investigational,For treatment of acromegaly and reduction of s...,1328402|5538739|5728396|5753618|5922338|592268...,Somatostatin receptor type 1|Somatostatin rece...,"Amino Acids, Peptides, and Proteins|Antineopla...",Ocphyl|Octreotide|Octreotide Acetate|Octreotid...


In [90]:
# (rows, columns) of the subset whose `patents` field is not the empty string.
drugbank_df.loc[drugbank_df["patents"] != ""].shape

(1006, 9)