# DrugBank Processing 

**Author:** Laetitia Tam

**Last Modified**: 2020-03-06

In this notebook, files from the DrugBank dataset are imported and merged, and preliminary data cleaning is performed. The processed table is exported for later matching against the CVADR dataset.

In [2]:
# Import libraries

import pandas as pd
import numpy as np

In [3]:
# Allow up to 1000 columns to be shown when a dataframe is displayed
pd.options.display.max_columns = 1000

In [4]:
# Directory holding the DrugBank CSV files, relative to this notebook.
# Forward slashes are used instead of backslashes: the backslash form contains
# invalid escape sequences ('\d', '\D') and only resolves on Windows, whereas
# forward slashes work on both Windows and POSIX systems.
# The trailing separator is required because file names are appended to this
# string by plain concatenation.
data_dir = '../data/DrugBank/db/'

In [16]:
# Read the DrugBank source tables

# Drug master list, cut down immediately to the columns needed downstream
drugs = pd.read_csv(data_dir + 'drugs.csv')[['primary_key', 'name', 'average_mass']]

# ATC classification codes
atc_codes = pd.read_csv(data_dir + 'drug_atc_codes.csv')

# Calculated properties (kind / value rows); the SMILES rows are used further down
calc_prop = pd.read_csv(data_dir + 'drug_calculated_properties.csv')

# Drug targets
targets = pd.read_csv(data_dir + 'drug_targets.csv')

In [9]:
# Preview the first rows of the ATC table to see which columns it provides
atc_codes.head()

Unnamed: 0,atc_code,level_1,code_1,level_2,code_2,level_3,code_3,level_4,code_4,parent_key
0,B01AE02,Direct thrombin inhibitors,B01AE,ANTITHROMBOTIC AGENTS,B01A,ANTITHROMBOTIC AGENTS,B01,BLOOD AND BLOOD FORMING ORGANS,B,DB00001
1,L01XC06,Monoclonal antibodies,L01XC,OTHER ANTINEOPLASTIC AGENTS,L01X,ANTINEOPLASTIC AGENTS,L01,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L,DB00002
2,R05CB13,Mucolytics,R05CB,"EXPECTORANTS, EXCL. COMBINATIONS WITH COUGH SU...",R05C,COUGH AND COLD PREPARATIONS,R05,RESPIRATORY SYSTEM,R,DB00003
3,L01XX29,Other antineoplastic agents,L01XX,OTHER ANTINEOPLASTIC AGENTS,L01X,ANTINEOPLASTIC AGENTS,L01,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L,DB00004
4,L04AB01,Tumor necrosis factor alpha (TNF-α) inhibitors,L04AB,IMMUNOSUPPRESSANTS,L04A,IMMUNOSUPPRESSANTS,L04,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L,DB00005


In [10]:
# The ATC classification has five levels:
#  1st level, anatomical main group
#  2nd level, therapeutic subgroup
#  3rd level, pharmacological subgroup
#  4th level, chemical subgroup
#  5th level, chemical substance
# For the preliminary analysis, restrict to level 2, therapeutic subgroup.
# NOTE(review): in this table the level_N / code_N columns appear to run in the
# opposite direction to the list above (level_4 / code_4 hold the single-letter
# anatomical group, e.g. 'B'), so the `level_2` column may not be the therapeutic
# subgroup -- confirm which column is intended before modelling.

# Count the distinct values in each column
atc_codes.nunique()

atc_code      4482
level_1        678
code_1         745
level_2        234
code_2         239
level_3         90
code_3          90
level_4         14
code_4          14
parent_key    3150
dtype: int64

In [11]:
# A single drug can carry one or more level_2 ATC groups and there are 234 distinct
# level_2 values, so feature hashing is used to encode this categorical data for the model.
#
# In this cell the groups are only collected into one dictionary per drug
# ({group name: 1}). scikit-learn's FeatureHasher is applied right before modelling
# to avoid converting from an array to a dataframe and then back to an array.

# One row per distinct (drug, level_2 group) pair. The count is overwritten with a
# constant 1 because only membership of the group matters.
atc_1 = atc_codes.groupby(['parent_key','level_2'])[['atc_code']].count().reset_index()[['parent_key','level_2','atc_code']]
atc_1['atc_code'] = 1

# Columns are selected with a list (double brackets): indexing a groupby with a bare
# tuple of names was deprecated in pandas 1.0 and raises an error in pandas 2.x.
atc_2 = (
    atc_1.groupby(['parent_key'])[['level_2', 'atc_code']]
    .apply(lambda x: dict(zip(x['level_2'], x['atc_code'])))
    .reset_index(name='atc_level_2')
)
atc_2.head(3)

Unnamed: 0,parent_key,atc_level_2
0,DB00001,{'ANTITHROMBOTIC AGENTS': 1}
1,DB00002,{'OTHER ANTINEOPLASTIC AGENTS': 1}
2,DB00003,"{'EXPECTORANTS, EXCL. COMBINATIONS WITH COUGH ..."


In [15]:
# Attach each drug's ATC dictionary. The default inner join keeps only drugs
# that have an entry in atc_2.
drugbank_1 = pd.merge(
    drugs,
    atc_2,
    left_on='primary_key',
    right_on='parent_key',
)

# Keep the columns needed downstream; this drops the redundant 'parent_key' key
drugbank_1 = drugbank_1[['primary_key', 'name', 'average_mass', 'atc_level_2']]

drugbank_1.head()

Unnamed: 0,primary_key,name,average_mass,atc_level_2
0,DB00001,Lepirudin,,{'ANTITHROMBOTIC AGENTS': 1}
1,DB00002,Cetuximab,,{'OTHER ANTINEOPLASTIC AGENTS': 1}
2,DB00003,Dornase alfa,,"{'EXPECTORANTS, EXCL. COMBINATIONS WITH COUGH ..."
3,DB00004,Denileukin diftitox,,{'OTHER ANTINEOPLASTIC AGENTS': 1}
4,DB00005,Etanercept,,{'IMMUNOSUPPRESSANTS': 1}


In [20]:
# Next, get the SMILES strings: start by listing the property kinds in calc_prop

calc_prop['kind'].unique()

array(['logP', 'logS', 'Water Solubility', 'IUPAC Name',
       'Traditional IUPAC Name', 'Molecular Weight',
       'Monoisotopic Weight', 'SMILES', 'Molecular Formula', 'InChI',
       'InChIKey', 'Polar Surface Area (PSA)', 'Refractivity',
       'Polarizability', 'Rotatable Bond Count', 'H Bond Acceptor Count',
       'H Bond Donor Count', 'pKa (strongest acidic)',
       'pKa (strongest basic)', 'Physiological Charge', 'Number of Rings',
       'Bioavailability', 'Rule of Five', 'Ghose Filter',
       'MDDR-Like Rule'], dtype=object)

In [21]:
# Keep only the calculated-property rows whose kind is 'SMILES'
smiles = calc_prop.loc[calc_prop['kind'] == 'SMILES']

In [22]:
# Preview the SMILES rows; the string itself is in the 'value' column
smiles.head()

Unnamed: 0,kind,value,source,parent_key
8,SMILES,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,ChemAxon,DB00006
34,SMILES,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,ChemAxon,DB00007
60,SMILES,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,ChemAxon,DB00014
86,SMILES,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,ChemAxon,DB00027
111,SMILES,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,ChemAxon,DB00035


In [23]:
# Add the SMILES string (column 'value') for each drug. The default inner join
# keeps only drugs that have a SMILES row.
drugbank_2 = pd.merge(drugbank_1, smiles, left_on='primary_key', right_on='parent_key')
drugbank_2 = drugbank_2[['primary_key', 'name', 'average_mass', 'atc_level_2', 'value']]

drugbank_2.head(2)

Unnamed: 0,primary_key,name,average_mass,atc_level_2,value
0,DB00006,Bivalirudin,2180.2853,{'ANTITHROMBOTIC AGENTS': 1},CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,DB00007,Leuprolide,1209.3983,{'HORMONES AND RELATED AGENTS': 1},CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...


In [24]:
# Next, bring in the drug targets.
# A single drug can have zero or more targets. There is also a large number of unique targets,
# so hashing is used for this variable as well.

# Count the distinct values in each column of the targets table
targets.nunique()

id              4887
name            4387
organism         539
known_action       3
position         301
parent_key      7565
dtype: int64

In [25]:
# Create one dictionary of targets per drug ({target name: 1}) for hashing later

# One row per distinct (drug, target name) pair. The count is overwritten with a
# constant 1 because only the presence of the target matters.
target_1 = targets.groupby(['parent_key','name'])[['id']].count().reset_index()[['parent_key','name','id']]
target_1['id'] = 1

# Columns are selected with a list (double brackets): indexing a groupby with a bare
# tuple of names was deprecated in pandas 1.0 and raises an error in pandas 2.x.
target_2 = (
    target_1.groupby(['parent_key'])[['name', 'id']]
    .apply(lambda x: dict(zip(x['name'], x['id'])))
    .reset_index(name='targets')
)
target_2.head(5)

Unnamed: 0,parent_key,targets
0,DB00001,{'Prothrombin': 1}
1,DB00002,"{'Complement C1q subcomponent subunit A': 1, '..."
2,DB00003,{'DNA': 1}
3,DB00004,"{'Cytokine receptor common subunit gamma': 1, ..."
4,DB00005,"{'Complement C1q subcomponent subunit A': 1, '..."


In [29]:
# Merge the per-drug target dictionaries onto the dataframe. The default inner
# join keeps only drugs that have an entry in target_2.

drugbank_3 = pd.merge(drugbank_2, target_2, left_on='primary_key', right_on='parent_key')
drugbank_3 = drugbank_3[['primary_key', 'name', 'average_mass', 'atc_level_2', 'value', 'targets']]

# 'value' holds the SMILES string, so relabel that column as 'smiles'
drugbank_3.columns = ['primary_key', 'name', 'average_mass', 'atc_level_2', 'smiles', 'targets']
drugbank_3.head()

Unnamed: 0,primary_key,name,average_mass,atc_level_2,smiles,targets
0,DB00006,Bivalirudin,2180.2853,{'ANTITHROMBOTIC AGENTS': 1},CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,{'Prothrombin': 1}
1,DB00007,Leuprolide,1209.3983,{'HORMONES AND RELATED AGENTS': 1},CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,{'Gonadotropin-releasing hormone receptor': 1}
2,DB00014,Goserelin,1269.4105,{'HORMONES AND RELATED AGENTS': 1},CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,"{'Gonadotropin-releasing hormone receptor': 1,..."
3,DB00035,Desmopressin,1069.22,{'POSTERIOR PITUITARY LOBE HORMONES': 1},NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,"{'Vasopressin V1a receptor': 1, 'Vasopressin V..."
4,DB00050,Cetrorelix,1431.038,{'HYPOTHALAMIC HORMONES': 1},CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...,"{'Gonadotropin-releasing hormone receptor': 1,..."


In [30]:
# Check the row count, dtypes and missing values after the three merges
drugbank_3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1743 entries, 0 to 1742
Data columns (total 6 columns):
primary_key     1743 non-null object
name            1743 non-null object
average_mass    1742 non-null float64
atc_level_2     1743 non-null object
smiles          1743 non-null object
targets         1743 non-null object
dtypes: float64(1), object(5)
memory usage: 95.3+ KB


In [32]:
# Load the drug synonym table; a drug (parent_key) can appear on several rows,
# one per synonym
synonyms = pd.read_csv(data_dir + 'drug_syn.csv')
synonyms.head()

Unnamed: 0,parent_key,synonym,language,coder
0,DB00001,Hirudin variant-1,english,
1,DB00001,Lepirudin recombinant,english,
2,DB00002,Cetuximab,english/spanish/german,inn
3,DB00002,Cétuximab,french,inn
4,DB00002,Cetuximabum,latin,inn


In [36]:
# Finally, add the drug synonyms to help with matching to CVADR later.
# Left join: every drug in drugbank_3 is kept and repeated once per synonym;
# a drug with no synonym row keeps a single row with a missing 'synonym'.

drugbank_4 = pd.merge(
    drugbank_3,
    synonyms,
    how='left',
    left_on='primary_key',
    right_on='parent_key',
)
drugbank_4 = drugbank_4[['primary_key', 'name', 'average_mass', 'atc_level_2', 'smiles', 'targets', 'synonym']]
drugbank_4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10092 entries, 0 to 10091
Data columns (total 7 columns):
primary_key     10092 non-null object
name            10092 non-null object
average_mass    10091 non-null float64
atc_level_2     10092 non-null object
smiles          10092 non-null object
targets         10092 non-null object
synonym         10035 non-null object
dtypes: float64(1), object(6)
memory usage: 630.8+ KB


In [37]:
# Preview the final table: the drug-level columns repeat for each synonym
drugbank_4.head()

Unnamed: 0,primary_key,name,average_mass,atc_level_2,smiles,targets,synonym
0,DB00006,Bivalirudin,2180.2853,{'ANTITHROMBOTIC AGENTS': 1},CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,{'Prothrombin': 1},Bivalirudin
1,DB00006,Bivalirudin,2180.2853,{'ANTITHROMBOTIC AGENTS': 1},CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,{'Prothrombin': 1},Bivalirudina
2,DB00006,Bivalirudin,2180.2853,{'ANTITHROMBOTIC AGENTS': 1},CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,{'Prothrombin': 1},Bivalirudinum
3,DB00007,Leuprolide,1209.3983,{'HORMONES AND RELATED AGENTS': 1},CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,{'Gonadotropin-releasing hormone receptor': 1},Leuprorelin
4,DB00007,Leuprolide,1209.3983,{'HORMONES AND RELATED AGENTS': 1},CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,{'Gonadotropin-releasing hormone receptor': 1},Leuprorelina


In [38]:
# Export the processed DrugBank table (one row per drug/synonym pair) as a pickle file
drugbank_4.to_pickle('../data/drugbank_processed.pkl')