In [1]:
# Import libraries

import pandas as pd
import numpy as np

In [2]:
# Show wide DataFrames without truncating columns. The option used to be set
# twice (None, then 1000); only the last call takes effect, so a single call
# with the effective value is kept.
pd.set_option('display.max_columns', 1000)

In [3]:
# Directory holding the DrugBank CSV exports, relative to the notebook.
# Forward slashes avoid the invalid '\d' / '\D' escape sequences of a
# backslash literal and work on Windows as well as POSIX systems. The trailing
# separator is required because paths are built as `data_dir + file_name`.
data_dir = '../data/DrugBank/db/'

In [4]:
# Load the main drug table from the DrugBank export directory.

drugs = pd.read_csv(f'{data_dir}drugs.csv')


In [5]:
# Keep only the identifier, name and average-mass columns.
drugs = drugs.loc[:, ['primary_key', 'name', 'average_mass']]

In [6]:
# Load the drug -> category table.
categories = pd.read_csv(f'{data_dir}drug_categories.csv')


In [7]:
# Preview the first rows of the category table.
categories.head()

Unnamed: 0,category,mesh-id,parent_key
0,"Amino Acids, Peptides, and Proteins",D000602,DB00001
1,Anticoagulants,D000925,DB00001
2,Antithrombin Proteins,D058833,DB00001
3,Antithrombins,D000991,DB00001
4,Blood and Blood Forming Organs,,DB00001


In [8]:
# Count the distinct values in each column of the category table.
categories.nunique()

category      4047
mesh-id       2124
parent_key    7910
dtype: int64

In [9]:
# Load the ATC classification rows and preview three of them.
atc_codes = pd.read_csv(f'{data_dir}drug_atc_codes.csv')

atc_codes.head(n=3)

Unnamed: 0,atc_code,level_1,code_1,level_2,code_2,level_3,code_3,level_4,code_4,parent_key
0,B01AE02,Direct thrombin inhibitors,B01AE,ANTITHROMBOTIC AGENTS,B01A,ANTITHROMBOTIC AGENTS,B01,BLOOD AND BLOOD FORMING ORGANS,B,DB00001
1,L01XC06,Monoclonal antibodies,L01XC,OTHER ANTINEOPLASTIC AGENTS,L01X,ANTINEOPLASTIC AGENTS,L01,ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS,L,DB00002
2,R05CB13,Mucolytics,R05CB,"EXPECTORANTS, EXCL. COMBINATIONS WITH COUGH SU...",R05C,COUGH AND COLD PREPARATIONS,R05,RESPIRATORY SYSTEM,R,DB00003


In [10]:
# Count the distinct values in each column of the ATC table.
atc_codes.nunique()

atc_code      4482
level_1        678
code_1         745
level_2        234
code_2         239
level_3         90
code_3          90
level_4         14
code_4          14
parent_key    3150
dtype: int64

In [11]:
# Distinct values per column for each drug, with the drugs that carry the
# most ATC codes listed first.
(
    atc_codes
    .groupby('parent_key')
    .nunique()
    .sort_values(by='atc_code', ascending=False)
)

Unnamed: 0_level_0,atc_code,level_1,code_1,level_2,code_2,level_3,code_3,level_4,code_4,parent_key
parent_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DB00783,39,6,8,6,6,2,2,1,1,1
DB00381,26,9,9,7,7,4,4,1,1,1
DB00977,25,4,4,3,3,2,2,2,2,1
DB00945,20,7,7,7,7,6,6,5,5,1
DB00860,19,13,18,14,16,10,10,7,7,1
...,...,...,...,...,...,...,...,...,...,...
DB04812,1,1,1,1,1,1,1,1,1,1
DB04818,1,1,1,1,1,1,1,1,1,1
DB04819,1,1,1,1,1,1,1,1,1,1
DB04820,1,1,1,1,1,1,1,1,1,1


In [12]:
# Prep for hashing: build one {level_2 name: 1} dict per drug.

# One row per distinct (parent_key, level_2) pair. The count is overwritten
# with a constant 1 so each level-2 class acts as a binary indicator.
atc_1 = (
    atc_codes
    .groupby(['parent_key', 'level_2'])[['atc_code']]
    .count()
    .reset_index()[['parent_key', 'level_2', 'atc_code']]
)
atc_1['atc_code'] = 1

# Columns are selected with a list (double brackets): indexing a GroupBy with
# a bare tuple of column names is rejected by current pandas versions.
atc_2 = (
    atc_1
    .groupby(['parent_key'])[['level_2', 'atc_code']]
    .apply(lambda x: dict(zip(x['level_2'], x['atc_code'])))
    .reset_index(name='atc_level_2')
)
atc_2.head(3)

Unnamed: 0,parent_key,atc_level_2
0,DB00001,{'ANTITHROMBOTIC AGENTS': 1}
1,DB00002,{'OTHER ANTINEOPLASTIC AGENTS': 1}
2,DB00003,"{'EXPECTORANTS, EXCL. COMBINATIONS WITH COUGH ..."


In [13]:
from sklearn.feature_extraction import FeatureHasher
import sys

# Hash each drug's {level_2: 1} dict into a fixed-width numeric vector.
# 234 equals the number of distinct level_2 values reported by
# atc_codes.nunique().
# NOTE(review): with as many buckets as categories, distinct classes may hash
# to the same position — confirm this is acceptable for the use case.
h = FeatureHasher(n_features=234)
D = list(atc_2['atc_level_2'])
f = h.transform(D)
# Dense list-of-lists form: one inner list per drug.
atc_array = f.toarray().tolist()


In [14]:
# Wrap the per-drug vectors in a Series; it shares the default 0..n-1 index
# with atc_2, so the column-wise concat lines the rows up positionally.
atc_series = pd.Series(atc_array)

atc_3 = pd.concat([atc_2, atc_series.rename('atc_vector')], axis=1)
atc_3.head(2)

Unnamed: 0,parent_key,atc_level_2,atc_vector
0,DB00001,{'ANTITHROMBOTIC AGENTS': 1},"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,DB00002,{'OTHER ANTINEOPLASTIC AGENTS': 1},"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [15]:
# Non-null counts per column of the per-drug ATC table.
atc_3.count()

parent_key     3150
atc_level_2    3150
atc_vector     3150
dtype: int64

In [16]:
# Inner-join drugs to their ATC features; drugs without an ATC row drop out.
drugbank_1 = pd.merge(drugs, atc_3, left_on='primary_key', right_on='parent_key')

# Keep only level 2. A drug with multiple ATC classes keeps all of them:
# 'atc_level_2' is a dict of its level-2 names and 'atc_vector' the hashed form.
keep_cols = ['primary_key', 'name', 'average_mass', 'atc_level_2', 'atc_vector']
drugbank_1 = drugbank_1[keep_cols]

In [17]:
# Load the calculated-properties table (one row per drug / property / source).
calc_prop = pd.read_csv(f'{data_dir}drug_calculated_properties.csv')

calc_prop.head(n=5)

Unnamed: 0,kind,value,source,parent_key
0,logP,-0.76,ALOGPS,DB00006
1,logS,-4.7,ALOGPS,DB00006
2,Water Solubility,4.64e-02 g/l,ALOGPS,DB00006
3,logP,-14,ChemAxon,DB00006
4,IUPAC Name,(4S)-4-[(2S)-2-[(2S)-2-[(2S)-2-{2-[(2S)-2-(2-{...,ChemAxon,DB00006


In [18]:
# List the distinct property kinds available.
calc_prop['kind'].unique()

array(['logP', 'logS', 'Water Solubility', 'IUPAC Name',
       'Traditional IUPAC Name', 'Molecular Weight',
       'Monoisotopic Weight', 'SMILES', 'Molecular Formula', 'InChI',
       'InChIKey', 'Polar Surface Area (PSA)', 'Refractivity',
       'Polarizability', 'Rotatable Bond Count', 'H Bond Acceptor Count',
       'H Bond Donor Count', 'pKa (strongest acidic)',
       'pKa (strongest basic)', 'Physiological Charge', 'Number of Rings',
       'Bioavailability', 'Rule of Five', 'Ghose Filter',
       'MDDR-Like Rule'], dtype=object)

In [19]:
# Keep only the rows whose property kind is the SMILES string.
is_smiles = calc_prop['kind'].eq('SMILES')
smiles = calc_prop.loc[is_smiles]

In [20]:
# Preview the SMILES rows.
smiles.head()

Unnamed: 0,kind,value,source,parent_key
8,SMILES,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...,ChemAxon,DB00006
34,SMILES,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...,ChemAxon,DB00007
60,SMILES,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...,ChemAxon,DB00014
86,SMILES,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...,ChemAxon,DB00027
111,SMILES,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...,ChemAxon,DB00035


In [21]:
# Inner-join the SMILES rows onto the drug table. The SMILES string stays in
# the column named 'value'.
drugbank_2 = pd.merge(drugbank_1, smiles, left_on='primary_key', right_on='parent_key')
drugbank_2 = drugbank_2[
    ['primary_key', 'name', 'average_mass', 'atc_level_2', 'atc_vector', 'value']
]

drugbank_2.head(2)

Unnamed: 0,primary_key,name,average_mass,atc_level_2,atc_vector,value
0,DB00006,Bivalirudin,2180.2853,{'ANTITHROMBOTIC AGENTS': 1},"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1,DB00007,Leuprolide,1209.3983,{'HORMONES AND RELATED AGENTS': 1},"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(N)=N)NC(=...


In [22]:
# Load the drug -> target table.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# 'drug_targets.csv' — confirm the file name and that it exists in data_dir.
targets = pd.read_csv(f'{data_dir}drug_targets.csv')

targets.head(n=5)

FileNotFoundError: [Errno 2] File b'..\\data\\DrugBank\\db\\drug_targets.csv' does not exist: b'..\\data\\DrugBank\\db\\drug_targets.csv'

In [None]:
# Count the distinct values in each column of the targets table.
targets.nunique()

In [None]:
# One row per distinct (parent_key, target name) pair. The count is overwritten
# with a constant 1 so each target acts as a binary indicator.
target_1 = (
    targets
    .groupby(['parent_key', 'name'])[['id']]
    .count()
    .reset_index()[['parent_key', 'name', 'id']]
)
target_1['id'] = 1

# Build one {target name: 1} dict per drug. Columns are selected with a list
# (double brackets): indexing a GroupBy with a bare tuple of column names is
# rejected by current pandas versions.
target_2 = (
    target_1
    .groupby(['parent_key'])[['name', 'id']]
    .apply(lambda x: dict(zip(x['name'], x['id'])))
    .reset_index(name='targets')
)
target_2.head(5)

In [None]:
# Hash each drug's {target name: 1} dict into a fixed-width numeric vector.
# NOTE(review): 4887 is presumably the number of distinct target names —
# confirm against targets.nunique().
h = FeatureHasher(n_features=4887)
D = list(target_2['targets'])
f = h.transform(D)
# Dense list-of-lists form: one inner list per drug.
target_array = f.toarray().tolist()


In [None]:
# Wrap the per-drug vectors in a Series; it shares the default 0..n-1 index
# with target_2, so the column-wise concat lines the rows up positionally.
target_series = pd.Series(target_array)

target_3 = pd.concat([target_2, target_series.rename('target_vector')], axis=1)

In [None]:
# Alias for the finished per-drug targets table (same object as target_3, not a copy).
targets_final = target_3 

In [None]:
# List the columns available on drugbank_2 before it is merged with the
# targets. Inspecting drugbank_3 at this point fails with NameError on a fresh
# top-to-bottom run, because drugbank_3 is only created in a later cell.
drugbank_2.columns.values

In [None]:
# Attach each drug's target dict and hashed target vector (inner join).
# drugbank_2 carries the ATC dict under 'atc_level_2' — it has no 'level_2'
# column, so selecting 'level_2' raises KeyError. Selecting the real name also
# makes a follow-up column rename unnecessary.
# NOTE(review): 'atc_vector' is not carried forward, matching the column list
# this cell always produced — confirm that dropping it is intended.
drugbank_3 = drugbank_2.merge(target_3, left_on='primary_key', right_on='parent_key')[
    ['primary_key', 'name', 'average_mass', 'atc_level_2', 'value', 'targets', 'target_vector']
]

drugbank_3.head()

In [None]:
# Column dtypes and non-null counts after the target merge.
drugbank_3.info()

In [None]:
# Load the drug synonym table and preview it.
synonyms = pd.read_csv(f'{data_dir}drug_syn.csv')
synonyms.head(n=5)

In [None]:
# Left-join synonyms so drugs without any synonym are kept. A drug with
# several matching synonym rows appears once per synonym.
drugbank_4 = pd.merge(
    drugbank_3,
    synonyms,
    how='left',
    left_on='primary_key',
    right_on='parent_key',
)
drugbank_4 = drugbank_4[
    ['primary_key', 'name', 'average_mass', 'atc_level_2', 'value',
     'targets', 'target_vector', 'synonym']
]
drugbank_4.info()

In [None]:
# Distinct-value counts for the descriptive columns.
# 'atc_level_2' holds dict objects, which are unhashable and make nunique()
# raise TypeError, so that column is counted by its string form instead.
summary_cols = ['primary_key', 'name', 'average_mass', 'atc_level_2', 'value', 'synonym']
drugbank_4[summary_cols].assign(
    atc_level_2=lambda frame: frame['atc_level_2'].astype(str)
).nunique()

In [None]:
# Preview the merged table instead of rendering the whole frame.
drugbank_4.head()

In [None]:
# Write the merged table to CSV without the index column.
# NOTE(review): this path is relative to the working directory ('data/...'),
# whereas the inputs are read from data_dir — confirm the intended location.
drugbank_4.to_csv('data/drugbank_processed.csv', index=False)

In [None]:
# Final look at the exported table; a preview rather than the whole frame.
drugbank_4.head()