# Dockstring

- tool wrapped around AutoDock to fully automate docking process pipeline, SMILES to docking score
- available at https://github.com/dockstring/dockstring , follow the install instructions, requires OpenBabel


In [1]:
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem import Lipinski
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors

import dockstring

import re
import csv
import pandas as pd
import numpy as np
import gzip

# Standardization

In [2]:
""" contribution from Hans de Winter """
def _InitialiseNeutralisationReactions():
    patts= (
        # Imidazoles
        ('[n+;H]','n'),
        # Amines
        ('[N+;!H0]','N'),
        # Carboxylic acids and alcohols
        ('[$([O-]);!$([O-][#7])]','O'),
        # Thiols
        ('[S-;X1]','S'),
        # Sulfonamides
        ('[$([N-;X2]S(=O)=O)]','N'),
        # Enamines
        ('[$([N-;X2][C,N]=C)]','N'),
        # Tetrazoles
        ('[n-]','[nH]'),
        # Sulfoxides
        ('[$([S-]=O)]','S'),
        # Amides
        ('[$([N-]C=O)]','N'),
        )
    return [(Chem.MolFromSmarts(x),Chem.MolFromSmiles(y,False)) for x,y in patts]

_reactions=None
def NeutraliseCharges(mol, reactions=None):
    global _reactions
    if reactions is None:
        if _reactions is None:
            _reactions=_InitialiseNeutralisationReactions()
        reactions=_reactions
    replaced = False
    for i,(reactant, product) in enumerate(reactions):
        while mol.HasSubstructMatch(reactant):
            replaced = True
            rms = Chem.ReplaceSubstructs(mol, reactant, product)
            mol = rms[0]
    return mol, replaced

In [3]:
_saltRemover = SaltRemover()
_inorganicPatt = Chem.MolFromSmarts("[!#6;!#7;!#8;!#16;!F;!Cl;!Br;!I]") # to remove compounds with unwanted atom types
_carbonPatt = Chem.MolFromSmarts("[#6]") # to remove compounds without carbon - inorganic
def standardize(mol):
    if mol.HasSubstructMatch(_carbonPatt):
        mol = _saltRemover(mol)
        if mol.GetNumAtoms()==0:
            return None
        else:
            mol, neutralized = NeutraliseCharges(mol)
            if mol.HasSubstructMatch(_inorganicPatt):
                return None
            else:
                # Sanitize mol, without sanitization some structures can't be drawn or fingerprinted
                Chem.SanitizeMol(mol) # add catch block?
                return mol
    else:
        return None

# Data loading

In [4]:
mols = []

In [5]:
# some pChEMBL values in our file are not valid floats
float_pattern = re.compile("[0-9]+(\.[0-9]*)?")
# function for Lipinski's descriptors
# load rock1, we want to remember source of data and Standard and pChEMBLE value (it's transfomed compound activity to "standard" value)
with open("../data/rock1/rock1_ic50_chembl.csv") as inp:
    rock1 = [row for row in csv.DictReader(inp, delimiter=";", quotechar='"')]
    for m in rock1:
        moldict = {"smiles": m["Smiles"]}
        mol = Chem.MolFromSmiles(moldict["smiles"])
        if mol:
            moldict["rdmol"] = mol
            stdmol = standardize(mol)
            moldict["source"] = "rock1_chembl"
            moldict["pChEMBL Value"] = float(m["pChEMBL Value"]) if float_pattern.match(m["pChEMBL Value"]) else 0
            moldict["std_rdmol"] = None
            moldict["Standard Value"] = float(m["Standard Value"]) if float_pattern.match(m["Standard Value"]) else np.NaN
        if stdmol:
            moldict["std_rdmol"] = stdmol
            moldict["std_smiles"] = Chem.MolToSmiles(stdmol)
            mols.append(moldict)

In [6]:
mols

[{'smiles': 'O=C(Nc1cccc(-c2ccnc3cc(-c4ccncc4)nn23)c1)c1cccc(C(F)(F)F)c1',
  'rdmol': <rdkit.Chem.rdchem.Mol at 0x7f16924d44e0>,
  'source': 'rock1_chembl',
  'pChEMBL Value': 0,
  'std_rdmol': <rdkit.Chem.rdchem.Mol at 0x7f1691f91530>,
  'Standard Value': 50000.0,
  'std_smiles': 'O=C(Nc1cccc(-c2ccnc3cc(-c4ccncc4)nn23)c1)c1cccc(C(F)(F)F)c1'},
 {'smiles': 'C[C@@H](Nc1c(Nc2ccncc2)c(=O)c1=O)c1ccccc1',
  'rdmol': <rdkit.Chem.rdchem.Mol at 0x7f1691f91b20>,
  'source': 'rock1_chembl',
  'pChEMBL Value': 7.22,
  'std_rdmol': <rdkit.Chem.rdchem.Mol at 0x7f1691f92120>,
  'Standard Value': 60.0,
  'std_smiles': 'C[C@@H](Nc1c(Nc2ccncc2)c(=O)c1=O)c1ccccc1'},
 {'smiles': 'C[C@@H](N)[C@H]1CC[C@H](C(=O)Nc2ccncc2)CC1',
  'rdmol': <rdkit.Chem.rdchem.Mol at 0x7f1691f92170>,
  'source': 'rock1_chembl',
  'pChEMBL Value': 6.82,
  'std_rdmol': <rdkit.Chem.rdchem.Mol at 0x7f1691f927b0>,
  'Standard Value': 150.0,
  'std_smiles': 'C[C@@H](N)[C@H]1CC[C@H](C(=O)Nc2ccncc2)CC1'},
 {'smiles': 'Cc1n[nH]c2ccc(-c3c

In [6]:
%%capture
# %%capture hides stderr (and maybe all output)
# same for drugbank
for m in Chem.SDMolSupplier("../data/drugbank.sdf"):
    if m:
        stdmol = standardize(m)
    else:
        continue    
    if stdmol:
        std_smiles = Chem.MolToSmiles(stdmol)
        mols.append({
         "rdmol": m,
         "std_rdmol": stdmol,
         "source": "drugbank",
         "std_smiles": std_smiles,
         "Smiles": Chem.MolToSmiles(m),
         "pChEMBL Value": 0,
         "Standard Value": np.NaN,
        })

In [8]:
%%capture
with gzip.open('../ohusak/Data/actives_final.sdf.gz', 'r') as sdf:
    suppl = Chem.ForwardSDMolSupplier(sdf)
    for m in suppl:
        print(m)
        if m:
            stdmol = standardize(m)
        else:
            continue    
        if stdmol:
            std_smiles = Chem.MolToSmiles(stdmol)
            mols.append({
             "rdmol": m,
             "std_rdmol": stdmol,
             "source": "rock1_active",
             "std_smiles": std_smiles,
             "Smiles": Chem.MolToSmiles(m),
             "pChEMBL Value": 0,
             "Standard Value": np.NaN,
            })

In [9]:
%%capture
with gzip.open('../ohusak/Data/decoys_final.sdf.gz', 'r') as sdf:
    suppl = Chem.ForwardSDMolSupplier(sdf)
    for m in suppl:
        print(m)
        if m:
            stdmol = standardize(m)
        else:
            continue    
        if stdmol:
            std_smiles = Chem.MolToSmiles(stdmol)
            mols.append({
             "rdmol": m,
             "std_rdmol": stdmol,
             "source": "rock1_decoy",
             "std_smiles": std_smiles,
             "Smiles": Chem.MolToSmiles(m),
             "pChEMBL Value": 0,
             "Standard Value": np.NaN,
            })

In [10]:
df = pd.DataFrame(mols)
df

Unnamed: 0,smiles,rdmol,source,pChEMBL Value,std_rdmol,Standard Value,std_smiles,Smiles
0,O=C(Nc1cccc(-c2ccnc3cc(-c4ccncc4)nn23)c1)c1ccc...,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,rock1_chembl,0.00,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,50000.0,O=C(Nc1cccc(-c2ccnc3cc(-c4ccncc4)nn23)c1)c1ccc...,
1,C[C@@H](Nc1c(Nc2ccncc2)c(=O)c1=O)c1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,rock1_chembl,7.22,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,60.0,C[C@@H](Nc1c(Nc2ccncc2)c(=O)c1=O)c1ccccc1,
2,C[C@@H](N)[C@H]1CC[C@H](C(=O)Nc2ccncc2)CC1,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,rock1_chembl,6.82,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,150.0,C[C@@H](N)[C@H]1CC[C@H](C(=O)Nc2ccncc2)CC1,
3,Cc1n[nH]c2ccc(-c3cc(OC[C@@H](N)Cc4ccccc4)cnc3-...,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,rock1_chembl,5.22,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,6000.0,Cc1n[nH]c2ccc(-c3cc(OC[C@@H](N)Cc4ccccc4)cnc3-...,
4,Cc1n[nH]c2ccc(-c3cc(OC[C@@H](N)Cc4ccccc4)cnc3-...,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,rock1_chembl,0.00,<rdkit.Chem.rdchem.Mol object at 0x000001BDBC5...,10000.0,Cc1n[nH]c2ccc(-c3cc(OC[C@@H](N)Cc4ccccc4)cnc3-...,
...,...,...,...,...,...,...,...,...
31455,,<rdkit.Chem.rdchem.Mol object at 0x000001BD9CF...,rock1_decoy,0.00,<rdkit.Chem.rdchem.Mol object at 0x000001BD9CF...,,COc1cc(/C=C/C(=O)c2c(O)cc(C)oc2=O)ccc1SC,COc1cc(/C=C/C(=O)c2c([O-])cc(C)oc2=O)ccc1SC
31456,,<rdkit.Chem.rdchem.Mol object at 0x000001BD9CF...,rock1_decoy,0.00,<rdkit.Chem.rdchem.Mol object at 0x000001BD9CF...,,COc1ccccc1CC1CCN(c2cnnc(O)c2Cl)CC1,COc1ccccc1CC1CCN(c2cnnc([O-])c2Cl)CC1
31457,,<rdkit.Chem.rdchem.Mol object at 0x000001BD9CF...,rock1_decoy,0.00,<rdkit.Chem.rdchem.Mol object at 0x000001BD9CF...,,CN(C)S(=O)(=O)/N=C(\O)C1(c2ccc(C(C)(C)C)cc2)CC1,CN(C)S(=O)(=O)/N=C(\[O-])C1(c2ccc(C(C)(C)C)cc2...
31458,,<rdkit.Chem.rdchem.Mol object at 0x000001BD9CF...,rock1_decoy,0.00,<rdkit.Chem.rdchem.Mol object at 0x000001BD9CF...,,CN(C)S(=O)(=O)/N=C(\O)c1cc2sc3ccccc3c2s1,CN(C)S(=O)(=O)/N=C(\[O-])c1cc2sc3ccccc3c2s1


# Docking prep

In [11]:
def run_dockstring(target, ligand_smiles):
    """wrapper around Dockstring to return None if the process fails"""
    try:
        docking = target.dock(ligand_smiles)
        return docking
    except:
        return None, None

In [17]:
target = dockstring.load_target('ROCK1')
run_dockstring(target, 'CCO')



(None, None)

In [21]:
target = dockstring.load_target('ROCK1')
docking = target.dock('C1CCCC1')

TypeError: argument of type 'WindowsPath' is not iterable

# Docking

In [14]:
# stratified sampling, 10 from each category
df_sample = df.groupby('source', group_keys=False).apply(lambda x: x.sample(min(len(x), 10)))
df_sample

Unnamed: 0,smiles,rdmol,source,pChEMBL Value,std_rdmol,Standard Value,std_smiles,Smiles
4150,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,,CCCCCCCCCCNC(=O)NC1CCCCC1,CCCCCCCCCCNC(=O)NC1CCCCC1
3635,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC66...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC66...,,C[C@]12CCC(=O)C[C@@H]1CC[C@@H]1[C@@H]2CC[C@]2(...,C[C@]12CCC(=O)C[C@@H]1CC[C@@H]1[C@@H]2CC[C@]2(...
2142,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC16...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC16...,,C=CCc1ccccc1OCC(O)CNC(C)C,C=CCc1ccccc1OCC(O)CNC(C)C
5508,,<rdkit.Chem.rdchem.Mol object at 0x000001BDCC6...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDCC6...,,O=C(O)c1ccccc1NC(=O)N1CCC(c2ccccc2C(F)(F)F)CC1,O=C(O)c1ccccc1NC(=O)N1CCC(c2ccccc2C(F)(F)F)CC1
5925,,<rdkit.Chem.rdchem.Mol object at 0x000001BDCE6...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDCE6...,,CCCC(=O)N[C@H]1/C(=N/OC(=O)Nc2ccccc2)O[C@H](CO...,CCCC(=O)N[C@H]1/C(=N/OC(=O)Nc2ccccc2)O[C@H](CO...
4189,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,,COc1cc[nH]c1/C=C1\C(=O)Nc2ccc(F)c(C#C[C@@H](O)...,COc1cc[nH]c1/C=C1\C(=O)Nc2ccc(F)c(C#C[C@@H](O)...
7253,,<rdkit.Chem.rdchem.Mol object at 0x000001BDD36...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDD36...,,FC(F)(F)COCC(F)(F)F,FC(F)(F)COCC(F)(F)F
7116,,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,,COc1ccc(CC(N)=O)cc1,COc1ccc(CC(N)=O)cc1
3791,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC76...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC76...,,N=C(N)c1cc2c(I)cccc2s1,NC(=[NH2+])c1cc2c(I)cccc2s1
6865,,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,,N=C(N)c1cccc(CN2CCN(S(=O)(=O)c3cc4ccc(Cl)cc4s3...,N=C(N)c1cccc(CN2CCN(S(=O)(=O)c3cc4ccc(Cl)cc4s3...


In [15]:
df_sample['rock1_dockstring_score'] = df_sample['std_smiles'].apply(lambda x: run_dockstring(target, x)[0])
df_sample



Unnamed: 0,smiles,rdmol,source,pChEMBL Value,std_rdmol,Standard Value,std_smiles,Smiles,rock1_dockstring_score
4150,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,,CCCCCCCCCCNC(=O)NC1CCCCC1,CCCCCCCCCCNC(=O)NC1CCCCC1,
3635,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC66...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC66...,,C[C@]12CCC(=O)C[C@@H]1CC[C@@H]1[C@@H]2CC[C@]2(...,C[C@]12CCC(=O)C[C@@H]1CC[C@@H]1[C@@H]2CC[C@]2(...,
2142,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC16...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC16...,,C=CCc1ccccc1OCC(O)CNC(C)C,C=CCc1ccccc1OCC(O)CNC(C)C,
5508,,<rdkit.Chem.rdchem.Mol object at 0x000001BDCC6...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDCC6...,,O=C(O)c1ccccc1NC(=O)N1CCC(c2ccccc2C(F)(F)F)CC1,O=C(O)c1ccccc1NC(=O)N1CCC(c2ccccc2C(F)(F)F)CC1,
5925,,<rdkit.Chem.rdchem.Mol object at 0x000001BDCE6...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDCE6...,,CCCC(=O)N[C@H]1/C(=N/OC(=O)Nc2ccccc2)O[C@H](CO...,CCCC(=O)N[C@H]1/C(=N/OC(=O)Nc2ccccc2)O[C@H](CO...,
4189,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,,COc1cc[nH]c1/C=C1\C(=O)Nc2ccc(F)c(C#C[C@@H](O)...,COc1cc[nH]c1/C=C1\C(=O)Nc2ccc(F)c(C#C[C@@H](O)...,
7253,,<rdkit.Chem.rdchem.Mol object at 0x000001BDD36...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDD36...,,FC(F)(F)COCC(F)(F)F,FC(F)(F)COCC(F)(F)F,
7116,,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,,COc1ccc(CC(N)=O)cc1,COc1ccc(CC(N)=O)cc1,
3791,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC76...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC76...,,N=C(N)c1cc2c(I)cccc2s1,NC(=[NH2+])c1cc2c(I)cccc2s1,
6865,,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,,N=C(N)c1cccc(CN2CCN(S(=O)(=O)c3cc4ccc(Cl)cc4s3...,N=C(N)c1cccc(CN2CCN(S(=O)(=O)c3cc4ccc(Cl)cc4s3...,


In [16]:
df_sample2 = df.groupby('source', group_keys=False).apply(lambda x: x.sample(min(len(x), 30)))
df_sample2['rock1_dockstring_score'] = df_sample2['std_smiles'].apply(lambda x: run_dockstring(target, x)[0])
df_sample2

[13:42:24] Explicit valence for atom # 1 O, 3, is greater than permitted


Unnamed: 0,smiles,rdmol,source,pChEMBL Value,std_rdmol,Standard Value,std_smiles,Smiles,rock1_dockstring_score
4123,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC86...,,N[C@H](C(=O)O)[C@@H](O)C(=O)O,N[C@H](C(=O)O)[C@@H](O)C(=O)O,
2302,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC16...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC16...,,CCN(CC)CCNC(=O)c1ccc(N)cc1,CCN(CC)CCNC(=O)c1ccc(N)cc1,
1931,,<rdkit.Chem.rdchem.Mol object at 0x000001BDC06...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDC06...,,N=c1nc(O)c2c(CCc3ccc(C(=O)N[C@@H](CCC(=O)O)C(=...,N=c1nc(O)c2c(CCc3ccc(C(=O)N[C@@H](CCC(=O)O)C(=...,
6882,,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDD16...,,COC(=O)[C@H](Cc1ccc(Br)cc1)NC(=O)CCCCCCC(=O)NO,COC(=O)[C@H](Cc1ccc(Br)cc1)NC(=O)CCCCCCC(=O)NO,
6182,,<rdkit.Chem.rdchem.Mol object at 0x000001BDCE6...,drugbank,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDCE6...,,COc1ccc2[nH]cc(CCC(=O)O)c2c1,COc1ccc2[nH]cc(CCC(=O)O)c2c1,
...,...,...,...,...,...,...,...,...,...
22169,,<rdkit.Chem.rdchem.Mol object at 0x000001BDFFC...,rock1_decoy,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDFFC...,,CC[C@H](C)[C@H](NC(=O)Nc1c(C)cc(Br)cc1C)C(=O)O,CC[C@H](C)[C@H](NC(=O)Nc1c(C)cc(Br)cc1C)C(=O)[O-],
12557,,<rdkit.Chem.rdchem.Mol object at 0x000001BDE29...,rock1_decoy,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDE29...,,Cc1ccc(NC(=O)c2ccc(OC[C@H]3CCCO3)c(Br)c2)c(Br)c1,Cc1ccc(NC(=O)c2ccc(OC[C@H]3CCCO3)c(Br)c2)c(Br)c1,
12833,,<rdkit.Chem.rdchem.Mol object at 0x000001BDE39...,rock1_decoy,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDE39...,,Cc1cc(Br)c(-n2nnc(C(=O)O)c2C2CC2)c(Br)c1,Cc1cc(Br)c(-n2nnc(C(=O)[O-])c2C2CC2)c(Br)c1,
25552,,<rdkit.Chem.rdchem.Mol object at 0x000001BDFFF...,rock1_decoy,0.0,<rdkit.Chem.rdchem.Mol object at 0x000001BDFFF...,,N#C[C@H](C(=O)c1cc2c(s1)CCCC2)c1cccc(F)c1,N#C[C@H](C(=O)c1cc2c(s1)CCCC2)c1cccc(F)c1,
