<font color='red' size=5><b> Software Requirements: The below scripts are tested in Python version 3.7.0</b> </font><br>   
<font color='red' size=5><b> Note: Reading drugbank.xml file will take more time</b> </font><br>   
# <font color='blue'>Extraction of drug details from drugbank XML </font> <br>
# <font color='blue'> Important Instructions </font>
## <font color='blue'> Setup Working directory, copy files, execute scripts </font>
<font color='blue'> <br>
1) Download drugbank.xml from https://go.drugbank.com/releases/latestCopy<br>
2) Create a new directory called drugbank under HOME/working directory directory<br>
3) Copy the downloaded XML file in the drugbank folder<br>
4) Make sure that the XML file is named as drug_bank.xml<br>
5) Copy this ipython (ipynb) notebook into your HOME/working directory<br>
6) Execute below scripts by sequence order </font> <br>


In [6]:
#import required packages to read the XML file
import pandas
import json
import os
import collections
import re
import xml.etree.ElementTree as ET

### Reading XML file downloaded from the drugbank.ca

In [7]:
#load drugbank.xml
tree = ET.parse("./drugbank/drug_bank.xml")
#get the root of the tree
root = tree.getroot()

### Preprocess the XML format to extract required columns and rows

In [12]:
#Pre-process drug data and extract required information
ns = '{http://www.drugbank.ca}'
rows = list()

for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['description'] = drug.findtext(ns + "description")
    row['groups'] = [group.text for group in
        drug.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['categories'] = [x.findtext(ns + 'category') for x in
        drug.findall("{ns}categories/{ns}category".format(ns = ns))]  
    aliases = {
        elem.text for elem in 
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns)) +
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))

    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)
    rows.append(row)
drugbank_df = pandas.DataFrame.from_dict(rows) 


In [13]:
#Extract additional details fro the XML file
protein_rows = list()
for i, drug in enumerate(root):
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
        for protein in proteins:
            row = {'drugbank_id': drugbank_id, 'category': category}
            row['organism'] = protein.findtext('{}organism'.format(ns))
            row['known_action'] = protein.findtext('{}known-action'.format(ns))
            actions = protein.findall('{ns}actions/{ns}action'.format(ns=ns))
            row['actions'] = '|'.join(action.text for action in actions)
            uniprot_ids = [polypep.text for polypep in protein.findall(
                "{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier".format(ns=ns))]            
            if len(uniprot_ids) != 1:
                continue
            row['uniprot_id'] = uniprot_ids[0]
            ref_text = protein.findtext("{ns}references[@format='textile']".format(ns=ns))
            protein_rows.append(row)
protein_df = pandas.DataFrame.from_dict(protein_rows)

In [23]:
#Merge both drug dataframe and protein dataframe to create a final dataframe
drugs_final_df = drugbank_df.merge(protein_df, on='drugbank_id')
drugs_final_df['name'] = drugs_final_df['name'].str.lower() 
#Check final dataframe
drugs_final_df.head()

Unnamed: 0,type,drugbank_id,name,description,groups,categories,aliases,category,organism,known_action,actions,uniprot_id
0,biotech,DB00001,lepirudin,Lepirudin is identical to natural hirudin exce...,[approved],"[Amino Acids, Peptides, and Proteins, Anticoag...","[Lepirudin, Refludan]",target,Humans,yes,inhibitor,P00734
1,biotech,DB00002,cetuximab,Cetuximab is an epidermal growth factor recept...,[approved],"[Amino Acids, Peptides, and Proteins, Antibodi...","[Cetuximab, Erbitux]",target,Humans,yes,antagonist,P00533
2,biotech,DB00002,cetuximab,Cetuximab is an epidermal growth factor recept...,[approved],"[Amino Acids, Peptides, and Proteins, Antibodi...","[Cetuximab, Erbitux]",target,Humans,unknown,,O75015
3,biotech,DB00002,cetuximab,Cetuximab is an epidermal growth factor recept...,[approved],"[Amino Acids, Peptides, and Proteins, Antibodi...","[Cetuximab, Erbitux]",target,Humans,unknown,,P02745
4,biotech,DB00002,cetuximab,Cetuximab is an epidermal growth factor recept...,[approved],"[Amino Acids, Peptides, and Proteins, Antibodi...","[Cetuximab, Erbitux]",target,Humans,unknown,,P02746


#Merge both drug dataframe and protein dataframe to create a final dataframe
drugs_final_df = drugbank_df.merge(protein_df, on='drugbank_id')

### Save data into a CSV file for later use

In [24]:
#Generate CSV file from the newly created dataframe
path = os.path.join('drugbank', 'drugbank_df.csv')
drugs_final_df.to_csv(path, sep=',', index=False)

### Check the saved data

In [25]:
#Import the data from the csv file to a pandas dataframe
df_drugs = pd.read_csv(path)
df_drugs.head()

Unnamed: 0,type,drugbank_id,name,description,groups,categories,aliases,category,organism,known_action,actions,uniprot_id
0,biotech,DB00001,lepirudin,Lepirudin is identical to natural hirudin exce...,['approved'],"['Amino Acids, Peptides, and Proteins', 'Antic...","['Lepirudin', 'Refludan']",target,Humans,yes,inhibitor,P00734
1,biotech,DB00002,cetuximab,Cetuximab is an epidermal growth factor recept...,['approved'],"['Amino Acids, Peptides, and Proteins', 'Antib...","['Cetuximab', 'Erbitux']",target,Humans,yes,antagonist,P00533
2,biotech,DB00002,cetuximab,Cetuximab is an epidermal growth factor recept...,['approved'],"['Amino Acids, Peptides, and Proteins', 'Antib...","['Cetuximab', 'Erbitux']",target,Humans,unknown,,O75015
3,biotech,DB00002,cetuximab,Cetuximab is an epidermal growth factor recept...,['approved'],"['Amino Acids, Peptides, and Proteins', 'Antib...","['Cetuximab', 'Erbitux']",target,Humans,unknown,,P02745
4,biotech,DB00002,cetuximab,Cetuximab is an epidermal growth factor recept...,['approved'],"['Amino Acids, Peptides, and Proteins', 'Antib...","['Cetuximab', 'Erbitux']",target,Humans,unknown,,P02746


In [18]:
#Check the rows and columns of the dataframe
df_drugs.shape

(26965, 12)

In [19]:
#check information of the dataframe
df_drugs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26965 entries, 0 to 26964
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          26965 non-null  object
 1   drugbank_id   26965 non-null  object
 2   name          26965 non-null  object
 3   description   19997 non-null  object
 4   groups        26965 non-null  object
 5   categories    26965 non-null  object
 6   aliases       26965 non-null  object
 7   category      26965 non-null  object
 8   organism      26814 non-null  object
 9   known_action  26965 non-null  object
 10  actions       15637 non-null  object
 11  uniprot_id    26965 non-null  object
dtypes: object(12)
memory usage: 2.5+ MB


In [20]:
# Inspecting the percentages of Null values of the dataframe
null_data = round(100*(df_drugs.isnull().sum()/len(df_drugs.index)), 2)
null_data.sort_values()

type             0.00
drugbank_id      0.00
name             0.00
groups           0.00
categories       0.00
aliases          0.00
category         0.00
known_action     0.00
uniprot_id       0.00
organism         0.56
description     25.84
actions         42.01
dtype: float64

In [21]:
#Check for symptom based filter to validate the drugbank dataframe content
df_drugs_for_symptoms = df_drugs[df_drugs['description'].str.contains("fever") | df_drugs['description'].str.contains('dry cough') | df_drugs['description'].str.contains('tiredness') | 
         df_drugs['description'].str.contains('aches and pains') | df_drugs['description'].str.contains('sore throat') 
         | df_drugs['description'].str.contains('diarrhoea') | df_drugs['description'].str.contains('headache') | df_drugs['description'].str.contains('loss of taste or smell')
        | df_drugs['description'].str.contains('anti viral') | df_drugs['description'].str.contains('corona')] 
print(df_drugs_for_symptoms['name'].unique())

['Insulin human' 'Insulin lispro' 'Insulin glargine' 'Oseltamivir'
 'Pantoprazole' 'Eletriptan' 'Lovastatin' 'Butalbital' 'Methysergide'
 'Argatroban' 'Clemastine' 'Venlafaxine' 'Acetaminophen' 'Indomethacin'
 'Dexbrompheniramine' 'Ketorolac' 'Hexylcaine' 'Raloxifene' 'Bentiromide'
 'Nicardipine' 'Docosanol' 'Simvastatin' 'Dyclonine' 'Acamprosate'
 'Sumatriptan' 'Esomeprazole' 'Clopidogrel' 'Tirofiban' 'Perindopril'
 'Tripelennamine' 'Candesartan cilexetil' 'Ribavirin' 'Almotriptan'
 'Acetylsalicylic acid' 'Naratriptan' 'Rizatriptan' 'Methylprednisolone'
 'Alosetron' 'Frovatriptan' 'Isosorbide mononitrate' 'Irbesartan'
 'Mequitazine' 'Perhexiline' 'Atorvastatin' 'Rosuvastatin' 'Arbutamine'
 'Chlorpheniramine' 'Clomipramine' 'Bepridil' 'Darunavir' 'Insulin aspart'
 'Insulin detemir' 'Insulin glulisine' 'Lithium cation' 'Phenindamine'
 'D-Treitol' 'Erythritol' 'Huperzine A' 'Egaptivon pegol'
 'Fenoxaprop-ethyl' 'Dexlansoprazole' 'SR-123781A' 'Succinobucol'
 'OPC-28326' 'Golotimod' 'Techn

In [22]:
#Check for COVID-19 based filter to validate the drugbank dataframe content
df_drugs_for_COVID = df_drugs[df_drugs['description'].str.contains("coronavirus") | df_drugs['description'].str.contains("covid-19") | df_drugs['description'].str.contains("SARS") | df_drugs['description'].str.contains("nCoV")| df_drugs['description'].str.contains("SARS-CoV-2") | df_drugs['description'].str.contains("COVID-19")]
df_drugs_for_COVID['name'].unique()

array(['Bevacizumab', 'Azithromycin', 'Chloroquine', 'Methylprednisolone',
       'Dexamethasone', 'Darunavir', 'Lopinavir', 'Hydroxychloroquine',
       'Leronlimab', 'Tocilizumab', 'Fingolimod', 'Elbasvir',
       'Galidesivir', 'Favipiravir', 'Metenkefalin', 'Umifenovir',
       'Camostat', 'Dexamethasone acetate', 'Remdesivir',
       'Human interferon beta', 'TMC-310911',
       'N-(2-Aminoethyl)-1-aziridineethanamine', 'GS-441524',
       'Vazegepant'], dtype=object)

### There are already some of the drugs are approved or under clinical investigations for COVID-19. This helps in our investigation later

## END of drugbank data extraction