In [1]:
import xml.etree.ElementTree as ET
import collections
import pandas as pd
import itertools

In [2]:
xml_file = "full database.xml"
tree = ET.parse(xml_file)

In [173]:
root = tree.getroot()


In [47]:
def collapse_list_values(row):
    for key, value in row.items():
        if isinstance(value, list):
            row[key] = '|'.join(value)
    return row


In [176]:
drugbank_df = pd.DataFrame(columns=["drugbank_id", "name", "type", "approved", "products", 
                                "indication", "inchikey", "chembl_id","patent_number", "patent_country", "patent_date"])

In [177]:
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
chembl_id_template = "{ns}external-identifiers/{ns}external-identifier[{ns}resource='ChEMBL']/{ns}identifier"

for i, drug in enumerate(root):
    #row = collections.OrderedDict()
    row = dict()
    assert drug.tag == ns + 'drug'
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name").lower()
    row['type'] = drug.get('type')    
    # check if approved
    groups = [group.text for group in
        drug.findall("{ns}groups/{ns}group".format(ns = ns))]
    if "approved" in groups:
        row["approved"] = True
    else:
        row["approved"] = False
    row['indication'] = drug.findtext(ns + "indication")
    #row['target'] = [group.text for group in
    #    drug.findall("{ns}targets/{ns}target/{ns}name".format(ns = ns))]
    #row['categories'] = [x.findtext(ns + 'category') for x in
    #    drug.findall("{ns}categories/{ns}category".format(ns = ns))]
    row['inchikey'] = drug.findtext(inchikey_template.format(ns = ns))
    row['chembl_id'] = drug.findtext(chembl_id_template.format(ns = ns))
    patents = [group.text for group in
        drug.findall("{ns}patents/{ns}patent/{ns}number".format(ns = ns))]
    patent_countries = [group.text for group in
        drug.findall("{ns}patents/{ns}patent/{ns}country".format(ns = ns))]
    patent_dates = [group.text for group in
        drug.findall("{ns}patents/{ns}patent/{ns}approved".format(ns = ns))]
    
    aliases = {
        elem.text.lower() for elem in 
        drug.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))
    }
    aliases.add(row['name'])
    aliases = sorted(aliases)
    aliases_str = '|'.join(aliases)
    row['products'] = aliases_str
    
    for patent, country, date in zip(patents, patent_countries, patent_dates):
        row["patent_number"] = patent
        row["patent_country"] = country
        row["patent_date"] = date
        drugbank_df = drugbank_df.append(row, ignore_index=True)
    

In [178]:
drugbank_df

Unnamed: 0,drugbank_id,name,type,approved,products,indication,inchikey,chembl_id,patent_number,patent_country,patent_date
0,DB00001,lepirudin,biotech,True,lepirudin|refludan,For the treatment of heparin-induced thrombocy...,,CHEMBL1201666,5180668,United States,1993-01-19
1,DB00002,cetuximab,biotech,True,cetuximab|erbitux,"Cetuximab, used in combination with irinotecan...",,CHEMBL1201577,1340417,Canada,1999-03-02
2,DB00003,dornase alfa,biotech,True,dornase alfa|pulmozyme|pulmozyme 1mg/ml,Used as adjunct therapy in the treatment of cy...,,CHEMBL1201431,2184581,Canada,2005-02-22
3,DB00003,dornase alfa,biotech,True,dornase alfa|pulmozyme|pulmozyme 1mg/ml,Used as adjunct therapy in the treatment of cy...,,CHEMBL1201431,2137237,Canada,2004-10-26
4,DB00005,etanercept,biotech,True,brenzys|enbrel|erelzi|etanercept,Etanercept is indicated for the treatment of m...,,CHEMBL1201572,2476934,Canada,2009-06-16
5,DB00005,etanercept,biotech,True,brenzys|enbrel|erelzi|etanercept,Etanercept is indicated for the treatment of m...,,CHEMBL1201572,2123593,Canada,2000-03-14
6,DB00005,etanercept,biotech,True,brenzys|enbrel|erelzi|etanercept,Etanercept is indicated for the treatment of m...,,CHEMBL1201572,7276477,United States,2007-10-02
7,DB00006,bivalirudin,small molecule,True,angiomax|bivalirudin|bivalirudin for injection...,For treatment of heparin-induced thrombocytope...,OIRCOABEOLEUMC-GEJPAHFPSA-N,CHEMBL2103749,5196404,United States,1993-03-23
8,DB00006,bivalirudin,small molecule,True,angiomax|bivalirudin|bivalirudin for injection...,For treatment of heparin-induced thrombocytope...,OIRCOABEOLEUMC-GEJPAHFPSA-N,CHEMBL2103749,2065150,Canada,1999-12-14
9,DB00006,bivalirudin,small molecule,True,angiomax|bivalirudin|bivalirudin for injection...,For treatment of heparin-induced thrombocytope...,OIRCOABEOLEUMC-GEJPAHFPSA-N,CHEMBL2103749,7598343,United States,2009-10-06


In [180]:
drugbank_df.drugbank_id.nunique()

1006

In [185]:
drugbank_df.patent_country.nunique()

2

In [191]:
drugbank_df.loc[(~drugbank_df.inchikey.isnull()) & (drugbank_df.chembl_id.isnull())]

Unnamed: 0,drugbank_id,name,type,approved,products,indication,inchikey,chembl_id,patent_number,patent_country,patent_date
414,DB00185,cevimeline,small molecule,True,cevimeline|cevimeline hydrochloride|evoxac,For the treatment of symptoms of dry mouth in ...,WUTYZMFRCNBCHQ-LHIURRSHSA-N,,5340821,United States,1994-08-23
415,DB00185,cevimeline,small molecule,True,cevimeline|cevimeline hydrochloride|evoxac,For the treatment of symptoms of dry mouth in ...,WUTYZMFRCNBCHQ-LHIURRSHSA-N,,4855290,United States,1989-08-08
561,DB00225,gadodiamide,small molecule,True,gadodiamide|omniscan|omniscan liq iv 287mg/ml,For intravenous use in MRI to visualize lesion...,HZHFFEYYPYZMNU-UHFFFAOYSA-K,,5560903,United States,1996-10-01
562,DB00225,gadodiamide,small molecule,True,gadodiamide|omniscan|omniscan liq iv 287mg/ml,For intravenous use in MRI to visualize lesion...,HZHFFEYYPYZMNU-UHFFFAOYSA-K,,5362475,United States,1994-11-08
563,DB00225,gadodiamide,small molecule,True,gadodiamide|omniscan|omniscan liq iv 287mg/ml,For intravenous use in MRI to visualize lesion...,HZHFFEYYPYZMNU-UHFFFAOYSA-K,,1335819,Canada,1995-06-06
2323,DB00686,pentosan polysulfate,small molecule,True,elmiron|pentosan polysulfate,For the relief of bladder pain or discomfort a...,FCCNSUIJIOOXEZ-SJYYZXOBSA-N,,5180715,United States,1993-01-19
4668,DB01345,potassium cation,small molecule,True,potassium cation|rurina|telom-x-gene,Potassium is used to regulate hypokalemia as a...,NPYPAHLBTDXSSS-UHFFFAOYSA-N,,6946149,United States,2005-09-20
4804,DB01592,iron,small molecule,True,active fe|active ob|actyform|advanced b & t fo...,Used in preventing and treating iron-deficienc...,XEEYBQQBJWHFJM-UHFFFAOYSA-N,,6667050,United States,2003-12-23
5284,DB06215,ferumoxytol,small molecule,True,feraheme|feridex|ferumoxytol,This drug is indicated for the treatment of ir...,WTFXARWRTYJXII-UHFFFAOYSA-N,,8501158,United States,2013-08-06
5285,DB06215,ferumoxytol,small molecule,True,feraheme|feridex|ferumoxytol,This drug is indicated for the treatment of ir...,WTFXARWRTYJXII-UHFFFAOYSA-N,,8591864,United States,2013-11-26


In [188]:
drugbank_df.loc[(~drugbank_df.inchikey.isnull()) | (~drugbank_df.chembl_id.isnull())].patent_number.nunique()

2

In [187]:
drugbank_df.approved.unique()

array([True, False], dtype=object)

In [192]:
drugbank_df.patent_number.nunique()

5616

In [195]:
drugbank_df.to_csv("drugbank_df.csv", sep='\t', index=False)

In [2]:
drugbank_df = pd.read_csv("drugbank_df.csv", sep='\t')

In [4]:
drugbank_df.shape

(7463, 11)

In [6]:
hiv_list = ["nelfinavir", "efavirenz", "lamivudine", "zidovudine", "latanoprost", "ritonavir",
           "didanosine", "nevirapine", "stavudine", "lopinavir", "viramune", "3TC", "ledipasvir", 
           "sofosbuvir", "nexavar", "triAvir", "gleevic", "viread", "kaletra", "indinavir", "abacavir"]


In [9]:
hiv_df = pd.DataFrame(columns=drugbank_df.columns)
for drug in hiv_list:
    drug_df = drugbank_df.loc[drugbank_df.products.str.contains(drug)]
    print(drug_df.shape)
    hiv_df = hiv_df.append(drug_df)
    

(2, 11)
(100, 11)
(33, 11)
(33, 11)
(25, 11)
(59, 11)
(3, 11)
(23, 11)
(1, 11)
(59, 11)
(3, 11)
(0, 11)
(40, 11)
(56, 11)
(10, 11)
(0, 11)
(0, 11)
(33, 11)
(59, 11)
(4, 11)
(30, 11)


In [11]:
def get_cc(country):
    if country == "United States":
        return "US"
    if country == "Canada":
        return "CA"
hiv_df["country_code"] = hiv_df["patent_country"].apply(lambda x: get_cc(x))

In [13]:
hiv_df["publication_number"] = hiv_df["country_code"] + "-" + hiv_df["patent_number"]

In [15]:
hiv_df.to_csv("drugbank_hiv_list.csv", sep="\t", index=False)