In [1]:
import psycopg2 as ps
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem as ch

## Grab all DrugBank molecules

In [2]:
suppl = ch.SDMolSupplier('/home/uzivatel/data/ivan/projekty/chemspacescripts/scratch/drugbank.sdf')
mols = [x for x in suppl if x is not None]

## Connect to the DB && get a cursor

In [3]:
connection = ps.connect(user='uzivatel',
                        dbname='drugbank',
                        port='9000') # change as needed
cursor = connection.cursor()

## Load DrugBank into the database

In [4]:
cursor.execute("delete from drugbank.drugbank") # in case there are old values
query = "insert into drugbank.drugbank (smiles) values %s" % (', '.join(["('%s')" % (ch.MolToSmiles(mol))
                                                                         for mol in mols]))
cursor.execute(query)
cursor.execute("select count(*) from drugbank.drugbank")
print(cursor.fetchall())
cursor.execute("select id, smiles from drugbank.drugbank limit 10")
print(cursor.fetchall())

[(7109L,)]
[(49764, 'CCC(C)C(NC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)NC(=O)C(Cc1ccccc1)NC(=O)C(CC(=O)O)NC(=O)CNC(=O)C(CC(N)=O)NC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)C1CCCN1C(=O)C(CCCNC(=N)N)NC(=O)C1CCCN1C(=O)C(N)Cc1ccccc1)C(=O)N1CCCC1C(=O)NC(CCC(=O)O)C(=O)NC(CCC(=O)O)C(=O)NC(Cc1ccc(O)cc1)C(=O)NC(CC(C)C)C(=O)O', None, None, None, None, None, None), (49765, 'CC(C)CC(NC(=O)C(COC(C)(C)C)NC(=O)C(Cc1ccc(O)cc1)NC(=O)C(CO)NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)C(Cc1cnc[nH]1)NC(=O)C1CCC(=O)N1)C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NNC(N)=O', None, None, None, None, None, None), (49766, 'N=C(N)NCCCC(NC(=O)C1CCCN1C(=O)C1CSSCCC(=O)NC(Cc2ccc(O)cc2)C(=O)NC(Cc2ccccc2)C(=O)NC(CCC(N)=O)C(=O)NC(CC(N)=O)C(=O)N1)C(=O)NCC(N)=O', None, None, None, None, None, None), (49767, 'CC(=O)NC(Cc1ccc2ccccc2c1)C(=O)NC(Cc1ccc(Cl)cc1)C(=O)NC(Cc1cccnc1)C(=O)NC(CO)C(=O)NC(Cc1ccc(O)cc1)C(=O)NC(CCCNC(N)=O)C(=O)NC(CC(C)C)C(=O)NC(CCCNC(=N)N)C(=O)N1CCCC1C(=O)NC(C)C(N)=O', None, None, None, None, None, None), (49768, 'CC=CCC(C)C(O)C1C(=O)NC(CC)C(=O

## Create pickled mol instance in the database

In [5]:
cursor.execute("update drugbank.drugbank set molinstance = mol_from_smiles(smiles::cstring)")
cursor.execute("select * from drugbank.drugbank limit 10")
print(cursor.fetchall())

[(49764, 'CCC(C)C(NC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)NC(=O)C(Cc1ccccc1)NC(=O)C(CC(=O)O)NC(=O)CNC(=O)C(CC(N)=O)NC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)C1CCCN1C(=O)C(CCCNC(=N)N)NC(=O)C1CCCN1C(=O)C(N)Cc1ccccc1)C(=O)N1CCCC1C(=O)NC(CCC(=O)O)C(=O)NC(CCC(=O)O)C(=O)NC(Cc1ccc(O)cc1)C(=O)NC(CC(C)C)C(=O)O', 'CCC(C)C(NC(=O)C(CCC(=O)O)NC(=O)C(CCC(=O)O)NC(=O)C(Cc1ccccc1)NC(=O)C(CC(=O)O)NC(=O)CNC(=O)C(CC(N)=O)NC(=O)CNC(=O)CNC(=O)CNC(=O)CNC(=O)C1CCCN1C(=O)C(CCCNC(=N)N)NC(=O)C1CCCN1C(=O)C(N)Cc1ccccc1)C(=O)N1CCCC1C(=O)NC(CCC(=O)O)C(=O)NC(CCC(=O)O)C(=O)NC(Cc1ccc(O)cc1)C(=O)NC(CC(C)C)C(=O)O', None, None, None, None, None), (49765, 'CC(C)CC(NC(=O)C(COC(C)(C)C)NC(=O)C(Cc1ccc(O)cc1)NC(=O)C(CO)NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)C(Cc1cnc[nH]1)NC(=O)C1CCC(=O)N1)C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NNC(N)=O', 'CC(C)CC(NC(=O)C(COC(C)(C)C)NC(=O)C(Cc1ccc(O)cc1)NC(=O)C(CO)NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)C(Cc1cnc[nH]1)NC(=O)C1CCC(=O)N1)C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NNC(N)=O', None, None, None, None, None), (49766, 'N=C(N)

## Fingerprint the created mol instances

In [6]:
cursor.execute("update drugbank.drugbank set morganfp = morganbv_fp(molinstance)")
cursor.execute("select * from drugbank.drugbank limit 10")
print(cursor.fetchall())

[(50951, '[Li+]', '[Li+]', '\\x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002000000000000000001000000000000000', None, None, None, None), (51453, 'O=CO', 'O=CO', '\\x00000000000000000000000000000000000400000000400000000000000000000000000080000001000000000000000800000000000000000000000000100000', None, None, None, None), (51578, 'NOCC(=O)O', 'NOCC(=O)O', '\\x00000000000400000000010000000000000408000010800000020000000000000000000080000000040000000000003021000000000000000000100000000000', None, None, None, None), (52195, 'c1cn[nH]c1', 'c1cn[nH]c1', '\\x00800000000000000100001402000000000000000004800000100000000000000000000001000000000402000000000400000000000000000000000004000000', None, None, None, None), (52560, 'CC(=O)[O-]', 'CC(=O)[O-]', '\\x000000000200000000000000000000000004000000000000000c0000000000000000020080000000000000000000002000000000000000000000000000000002', None, None, None, None), (52658, 'NC(=O)C(N)CS', 'NC(=O)C(N)CS', '\\

## Similarity search the fingerprints

In [7]:
cursor.execute("select * from drugbank.drugbank where morganfp%morganbv_fp('O=C(C)Oc1ccccc1C(=O)O'::mol)") # aspirin
print(cursor.fetchall())

[(50588, 'CC(=O)Oc1ccccc1C(=O)O', 'CC(=O)Oc1ccccc1C(=O)O', '\\x00088000020000000100010000000000000400000080800002004000008000000000000080000000000002001000002024200000800004108001000000000002', None, None, None, None), (50977, 'O=C(Oc1ccccc1C(=O)O)c1ccccc1O', 'O=C(Oc1ccccc1C(=O)O)c1ccccc1O', '\\x00008000000000008500010000000001000402000080800802004000000000000000000080004000000002001000002024200000800084008001000000000000', None, None, None, None), (52185, 'O=C(O)c1ccccc1C(=O)O', 'O=C(O)c1ccccc1C(=O)O', '\\x00008000000000000100000000800000000400000080000000004000000000000000000080000000000002001000002020000000800000008001000000000000', None, None, None, None), (56390, 'CC(=O)Oc1cc(C(F)(F)F)ccc1C(=O)O', 'CC(=O)Oc1cc(C(F)(F)F)ccc1C(=O)O', '\\x00088000020800000810010000000400020400000000800002004000009000000000000080020000000002001008002020030000002004108001000000000402', None, None, None, None)]


## Substructure search

In [10]:
cursor.execute("select * from drugbank.drugbank where molinstance@>'c1ccccc1' limit 100")
print(cursor.fetchall())

[(51314, 'NS(=O)(=O)c1ccc(F)cc1', 'NS(=O)(=O)c1ccc(F)cc1', '\\x00000000000000008000000400000000400408000020000800084000000000000000000000000080020002401000000004010000000000001000020000000000', 3, 2, 175.184005737305, 0.473100006580353), (49846, 'CC(C)(N)Cc1ccccc1', 'CC(C)(N)Cc1ccccc1', '\\x00000000024400000100010000000400800008400980000000004004000000000000000000000000000002001000000120010000000000000000000000000000', 1, 2, 149.236999511719, 1.96640002727509), (51667, 'O=CCc1ccccc1', 'O=CCc1ccccc1', '\\x00000000000400000100010000800000000400000080400000004004000100000000000000000000000002001000000020008000000000000000002000100000', 1, 0, 120.151000976562, 1.42799997329712), (51681, 'OCCc1ccccc1', 'OCCc1ccccc1', '\\x0000200000040040010001000000000000000000008000000000404c000000000000000080000000000002001800000020000000000000000000000200000000', 1, 1, 122.166999816895, 1.22140002250671), (50440, 'Clc1cc(Cl)c(OCC#CI)cc1Cl', 'Clc1cc(Cl)c(OCC#CI)cc1Cl', '\\x00100400100002000200c10000200000

## Calculate Descriptors

In [8]:
cursor.execute("update drugbank.drugbank set (hba, hbd, mw, logp) = (mol_hba(molinstance), mol_hbd(molinstance), mol_amw(molinstance), mol_logp(molinstance))")
cursor.execute("select * from drugbank.drugbank limit 10")
print(cursor.fetchall())

[(56610, '[Cl-].[Cl-].[Cl-].[Cr+3]', '[Cl-].[Cl-].[Cl-].[Cr+3]', '\\x00000000000008000000000000000000000000000000000000000000000000000000000000000000000400000000000000000000000000000000000000100000', 0, 0, 158.354995727539, -8.99050045013428), (56611, '[Cu+2]', '[Cu+2]', '\\x00000000000000000000000000000000000000000000000000000000000800000000000000000400000000000000000000000000000000000000000000000000', 0, 0, 63.5460014343262, -0.00249999994412065), (56612, 'Cl[Cu]Cl', 'Cl[Cu]Cl', '\\x00000000000000000000000000000000000008000000000000000000401000000000000000000000000000000000000000000000000000008000000000000000', 0, 0, 134.451995849609, 1.37650001049042), (49926, 'NCCc1ccn[nH]1', 'NCCc1ccn[nH]1', '\\x00080000001000080004010402000000000009004000000000004008000000000000000000000100000412081000001400000000000000000000200000000000', 3, 3, 111.148002624512, -0.0891000032424927), (51464, 'OCCCCO', 'OCCCCO', '\\x000000200000000000000100000400000000000000000000000000400000000000000000800000000

## Lipinski filter

In [11]:
cursor.execute("select * from drugbank.drugbank where hba <= 10 and hbd <=5 and mw < 500 and logp <= 5")
print(cursor.fetchall()[:10])

[(56610, '[Cl-].[Cl-].[Cl-].[Cr+3]', '[Cl-].[Cl-].[Cl-].[Cr+3]', '\\x00000000000008000000000000000000000000000000000000000000000000000000000000000000000400000000000000000000000000000000000000100000', 0, 0, 158.354995727539, -8.99050045013428), (56611, '[Cu+2]', '[Cu+2]', '\\x00000000000000000000000000000000000000000000000000000000000800000000000000000400000000000000000000000000000000000000000000000000', 0, 0, 63.5460014343262, -0.00249999994412065), (56612, 'Cl[Cu]Cl', 'Cl[Cu]Cl', '\\x00000000000000000000000000000000000008000000000000000000401000000000000000000000000000000000000000000000000000008000000000000000', 0, 0, 134.451995849609, 1.37650001049042), (49926, 'NCCc1ccn[nH]1', 'NCCc1ccn[nH]1', '\\x00080000001000080004010402000000000009004000000000004008000000000000000000000100000412081000001400000000000000000000200000000000', 3, 3, 111.148002624512, -0.0891000032424927), (51464, 'OCCCCO', 'OCCCCO', '\\x000000200000000000000100000400000000000000000000000000400000000000000000800000000

## Lipinski filter + substructure search

In [12]:
cursor.execute("select * from drugbank.drugbank where hba <= 10 and hbd <=5 and mw < 500 and logp <= 5 and molinstance@>'c1ccccc1'")
print(cursor.fetchall()[:10])

[(51314, 'NS(=O)(=O)c1ccc(F)cc1', 'NS(=O)(=O)c1ccc(F)cc1', '\\x00000000000000008000000400000000400408000020000800084000000000000000000000000080020002401000000004010000000000001000020000000000', 3, 2, 175.184005737305, 0.473100006580353), (49846, 'CC(C)(N)Cc1ccccc1', 'CC(C)(N)Cc1ccccc1', '\\x00000000024400000100010000000400800008400980000000004004000000000000000000000000000002001000000120010000000000000000000000000000', 1, 2, 149.236999511719, 1.96640002727509), (51667, 'O=CCc1ccccc1', 'O=CCc1ccccc1', '\\x00000000000400000100010000800000000400000080400000004004000100000000000000000000000002001000000020008000000000000000002000100000', 1, 0, 120.151000976562, 1.42799997329712), (51681, 'OCCc1ccccc1', 'OCCc1ccccc1', '\\x0000200000040040010001000000000000000000008000000000404c000000000000000080000000000002001800000020000000000000000000000200000000', 1, 1, 122.166999816895, 1.22140002250671), (50440, 'Clc1cc(Cl)c(OCC#CI)cc1Cl', 'Clc1cc(Cl)c(OCC#CI)cc1Cl', '\\x00100400100002000200c10000200000

## Extract info about various drug groups & insert it into the database

In [13]:
grouplists = (mol.GetProp('DRUG_GROUPS').split('; ') for mol in mols)
groupnames = set()
for grouplist in grouplists:
    groupnames.update(grouplist)
cursor.execute("delete from drugbank.druggroup") # in case there are old values
query = "insert into drugbank.druggroup (groupname) values %s" % (', '.join(["('%s')" % (groupname)
                                                                             for groupname in groupnames]))
cursor.execute(query)

## Check that it got into the database & get the database-generated ID's

In [14]:
cursor.execute("select * from drugbank.druggroup")
ids_groupnames = cursor.fetchall()
print(ids_groupnames)
groupname2id = { groupname:gid for gid, groupname in ids_groupnames}
print(groupname2id)

[(15, 'vet_approved'), (16, 'approved'), (17, 'nutraceutical'), (18, 'illicit'), (19, 'investigational'), (20, 'withdrawn'), (21, 'experimental')]
{'vet_approved': 15, 'approved': 16, 'nutraceutical': 17, 'illicit': 18, 'investigational': 19, 'withdrawn': 20, 'experimental': 21}


In [None]:
Delete the DrugBank and reinsert values, now fill the 

## Let's not be messy :)

In [15]:
cursor.close()
connection.close()