## Murcko Scaffolds

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os
# Add the parent directory to the module search path so that modules located
# there can be imported from this notebook.
sys.path.append('..')
# Global seaborn theme applied to every figure drawn afterwards.
sns.set(style='white', context='talk', font_scale=0.9)

In [4]:
import deepchem
from rdkit import Chem

### Load the data

In [5]:
# Pickled DataFrame with the molecules analysed below. According to the original
# note, it was computed as `df_dims_mds` in the `2_Comparing...ipynb` notebook.
pkl_file = './df_COCRYS_DUD_DEKOIS_with_Fingerprints_1.pkl'

# Fail fast: if the file is missing, `df_all_mols` would never be defined and the
# notebook would only break several cells later with a confusing NameError.
if not os.path.isfile(pkl_file):
    raise FileNotFoundError(
        f'Input dataset not found: {pkl_file!r}. '
        'Generate it first, then re-run this notebook.')

# NOTE: unpickling can execute arbitrary code; only load pickles you created yourself.
df_all_mols = pd.read_pickle(pkl_file)

### Murcko Scaffolds

In [22]:
from rdkit.Chem.Scaffolds import MurckoScaffold 
from rdkit.Chem import MolToSmiles
from rdkit.Chem import rdDepictor

In [212]:
# Helper that computes the (optionally generic) Murcko scaffold of a molecule
def scaffold2smiles(mol, generic, return_smiles = True):
    '''Compute the Murcko scaffold of a molecule.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        Molecule to process.
    generic : bool
        If truthy, `mol` is first passed through
        `MurckoScaffold.MakeScaffoldGeneric` (all atoms -> carbon and all
        bonds -> single) and the scaffold is extracted from that result.
        Otherwise the scaffold is extracted from `mol` directly.
    return_smiles : bool, default True
        If truthy return the scaffold as a SMILES string, otherwise return
        the scaffold molecule object.

    Returns
    -------
    str, molecule object or float
        The scaffold in the requested representation, or `np.nan` when any
        step of the computation raises (e.g. a valence error) for either
        value of `generic`.
    '''
    try:
        # NOTE(review): the generic conversion is applied to the whole molecule
        # *before* the scaffold is extracted; the more common recipe extracts the
        # scaffold first and then makes it generic. Order kept as in the original
        # -- confirm that this is intended.
        target = MurckoScaffold.MakeScaffoldGeneric(mol) if generic else mol
        scff = MurckoScaffold.GetScaffoldForMol(target)
        scff_smiles = MolToSmiles(scff)
    except Exception:
        # Problematic molecules are flagged with NaN instead of aborting a whole
        # batch run. `Exception` (not a bare `except`) keeps KeyboardInterrupt and
        # SystemExit working.
        scff = scff_smiles = np.nan
    return scff_smiles if return_smiles else scff

#### Compute Murcko Scaffolds

In [213]:
%%time
# Generic Murcko scaffolds as SMILES, one entry per molecule in `df_all_mols`
generic_murcko_smiles = list(map(
    lambda rdk_mol: scaffold2smiles(rdk_mol, generic=True, return_smiles=True),
    df_all_mols.mol_rdk))

# Plain Murcko scaffolds as SMILES, in the same molecule order
murcko_smiles = list(map(
    lambda rdk_mol: scaffold2smiles(rdk_mol, generic=False, return_smiles=True),
    df_all_mols.mol_rdk))

RDKit ERROR: [21:11:20] Explicit valence for atom # 23 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 4 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 5 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 30 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 23 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 3 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 33 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 13 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 3 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 3 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom # 3 C, 5, is greater than permitted
RDKit ERROR: [21:11:20] Explicit valence for atom

RDKit ERROR: [21:11:23] Explicit valence for atom # 24 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 28 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 32 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 17 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 25 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 19 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 21 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 19 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 21 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [21:11:23] Explicit valence for

RDKit ERROR: [21:11:25] Explicit valence for atom # 23 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 28 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 23 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 19 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 23 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 30 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [21:11:25] Explicit valence for

RDKit ERROR: [21:11:27] Explicit valence for atom # 17 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 21 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 23 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 26 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 23 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 29 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 29 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for atom # 22 N, 4, is greater than permitted
RDKit ERROR: [21:11:27] Explicit valence for 

RDKit ERROR: [21:11:28] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 25 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 17 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [21:11:28] Explicit valence fo

CPU times: user 25.9 s, sys: 200 ms, total: 26.1 s
Wall time: 26 s


RDKit ERROR: [21:11:29] Explicit valence for atom # 13 N, 4, is greater than permitted


### Create a dataframe

In [214]:
# Keep only the identifying columns and attach both scaffold representations
# (`assign` returns a new frame, so `df_all_mols` is left untouched).
df_murcko = df_all_mols[['name', 'library', 'Activity', 'mol_rdk']].assign(
    scff=murcko_smiles,
    scff_generic=generic_murcko_smiles,
)

Unnamed: 0,name,library,Activity,mol_rdk,scff,scff_generic
0,RPR,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee1f0>,,CCC1CCCC(C2CCC(C(C)CC(C)C(CC3CCCC(C(C)C)C3)C(C...
1,815,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee270>,O=C1C(NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1ccc2ccncc2c1,CC1CCCC2CCC(CC3CCC(CC(C)(C)C4CC5CCCCC5C4)C3C)CC12
2,PR2,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee2f0>,O=C1C(NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1cc2ccncc2...,CC1C(CC2CC3CCCCC3C2)CCC1CC(C)(C)C1CC2CCCCC2C1
3,DX9,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee370>,,CC(C)C1CCC(CC2CCC(C(CC3CCC4CCC(C(C)C)CC4C3)C(C...
4,Z34,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee3f0>,,CC(C)CC(C)C1C(C)C(CC2CCCC(C3CCCC3C)C2)CC(CC2CC...
...,...,...,...,...,...,...
7254,decoy_5741,DUD,inactive,<rdkit.Chem.rdchem.Mol object at 0x7ff19357cbf0>,O=C(N=C(Nc1ccccc1)Nc1nc(=O)cc[nH]1)c1ccccc1,CCCC(C)CC1CC(C)CC(CC(CC2CCCC(C)C2C)CC(C)C2CCCC...
7255,decoy_5742,DUD,inactive,<rdkit.Chem.rdchem.Mol object at 0x7ff19357cc70>,O=C(Nc1ccccc1)ON=Cc1cccc(CS(=O)(=O)c2ccccn2)c1,CC1CCC(CC(C)CCC(C)C2CCCC(CC(C)(C)C3CCCCC3)C2)CC1
7256,decoy_5743,DUD,inactive,<rdkit.Chem.rdchem.Mol object at 0x7ff19357ccf0>,c1ccc(NC(=NOC(c2ccccc2)c2ccccc2)c2cnon2)cc1,CC1CCC(CC(CCC(C2CCCCC2)C2CCCCC2)C2CCCC2C)CC1
7257,decoy_5744,DUD,inactive,<rdkit.Chem.rdchem.Mol object at 0x7ff19357cd70>,,CC(CCC(C)C1CCC(CC2CCC(C)CC2C)C(C(C)C)C1)CC1CCCCC1


In [215]:
# Discard every row with a missing value in any column, e.g. molecules whose
# Murcko scaffold could not be computed (NaN in `scff`).
# Rebinding instead of `inplace=True` keeps the data lineage explicit.
df_murcko = df_murcko.dropna(how='any')
df_murcko

Unnamed: 0,name,library,Activity,mol_rdk,scff,scff_generic
1,815,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee270>,O=C1C(NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1ccc2ccncc2c1,CC1CCCC2CCC(CC3CCC(CC(C)(C)C4CC5CCCCC5C4)C3C)CC12
2,PR2,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee2f0>,O=C1C(NS(=O)(=O)c2cc3ncccc3s2)CCN1Cc1cc2ccncc2...,CC1C(CC2CC3CCCCC3C2)CCC1CC(C)(C)C1CC2CCCCC2C1
7,XMA,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee570>,O=C1CN(S(=O)(=O)c2ccc3ccccc3c2)CC2OC3(CCN(c4cc...,CCCC12CC(C(C)(C)C3CCC4CC(C)CCC4C3)CC(C)C1CC1(C...
8,XMB,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee5f0>,O=C(C(CNS(=O)(=O)c1ccc2ccccc2c1)[NH2+]CC1CCN(c...,CC1CCC2CC(C(C)(C)CCC(CCC3CCC(C4CCCCC4)CC3)C(C)...
9,XMD,COCRYS,active,<rdkit.Chem.rdchem.Mol object at 0x7ff1931ee670>,O=C1CN(S(=O)(=O)c2ccc3ccccc3c2)CCN1CC1CCN(c2cc...,CCCC(C)C1CC(C(C)(C)C2CCC3CC(C)CCC3C2)CC(C)C1CC...
...,...,...,...,...,...,...
7253,decoy_5740,DUD,inactive,<rdkit.Chem.rdchem.Mol object at 0x7ff19357cb70>,O=C(N=C(Nc1ccccc1)Nc1ncccn1)c1ccccc1,CCCC(C)CC1CC(C)CC(CC(CC2CCCC(C)C2C)CC(C)C2CCCC...
7254,decoy_5741,DUD,inactive,<rdkit.Chem.rdchem.Mol object at 0x7ff19357cbf0>,O=C(N=C(Nc1ccccc1)Nc1nc(=O)cc[nH]1)c1ccccc1,CCCC(C)CC1CC(C)CC(CC(CC2CCCC(C)C2C)CC(C)C2CCCC...
7255,decoy_5742,DUD,inactive,<rdkit.Chem.rdchem.Mol object at 0x7ff19357cc70>,O=C(Nc1ccccc1)ON=Cc1cccc(CS(=O)(=O)c2ccccn2)c1,CC1CCC(CC(C)CCC(C)C2CCCC(CC(C)(C)C3CCCCC3)C2)CC1
7256,decoy_5743,DUD,inactive,<rdkit.Chem.rdchem.Mol object at 0x7ff19357ccf0>,c1ccc(NC(=NOC(c2ccccc2)c2ccccc2)c2cnon2)cc1,CC1CCC(CC(CCC(C2CCCCC2)C2CCCCC2)C2CCCC2C)CC1


### Use DeepChem to compute the Scaffold splitting

In [216]:
from deepchem.data import NumpyDataset 
from rdkit import Chem

In [233]:
# As we can see in https://deepchem.io/docs/_modules/deepchem/splits/splitters.html
# and https://deepchem.io/deepchem.splits.html, the molecules are required in
# SMILES format.
df_dataset = df_murcko[['scff_generic', 'scff', 'name']].copy()
# Binary label: 1 for 'active', 0 for anything else
df_dataset['y'] = [int(activity == 'active') for activity in df_murcko.Activity]
df_dataset['smiles'] = list(map(Chem.MolToSmiles, df_murcko.mol_rdk))

# DeepChem dataset whose ids are the SMILES of the full molecules
dataset = NumpyDataset(
    X=df_dataset.drop(columns='y'),
    y=df_dataset['y'],
    ids=df_dataset['smiles'],
)
dataset.ids.size

# Same features and labels, but the ids are the generic-scaffold SMILES
dataset_g = NumpyDataset(
    X=df_dataset.drop(columns='y'),
    y=df_dataset['y'],
    ids=df_dataset['scff_generic'],
)

In [234]:
from deepchem.splits.splitters import ScaffoldSplitter

### Split function used by deepchem

```python
def split(self,
            dataset,
            frac_train=.8,
            frac_valid=.1,
            frac_test=.1,
            log_every_n=1000): # progress is logged once every `log_every_n` molecules
    """
        Splits internal compounds into train/validation/test by scaffold.

        Returns three lists of dataset indices: (train_inds, valid_inds, test_inds).
        """
    # The three fractions must add up to 1
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    
    # Start from an empty dictionary that maps scaffold -> list of molecule indices
    scaffolds = {}
    log("About to generate scaffolds", self.verbose)
    data_len = len(dataset)
    
    # The Dataset object exposes the SMILES of each molecule through its `ids` attribute
    for ind, smiles in enumerate(dataset.ids): 
      if ind % log_every_n == 0:
        log("Generating scaffold %d/%d" % (ind, data_len), self.verbose)
        
      # Generate the Murcko scaffold
      scaffold = generate_scaffold(smiles) # Annotator's note: this helper implements MurckoScaffoldSmiles,
      # so it offers no option for generic scaffolds and only works from SMILES
      # (the helper itself is not shown in this excerpt).
        
      #********************
      # This is the key step that fills the bins
      #********************
      if scaffold not in scaffolds:
        scaffolds[scaffold] = [ind] # Create a new entry if the scaffold has not been 'seen' yet
        # The key is the scaffold and the value is a list holding the index of its first occurrence
      else:
        scaffolds[scaffold].append(ind) # If the scaffold already exists, append the index of this occurrence
        
    # Sort the indices of the molecules that belong to each scaffold
    scaffolds = {key: sorted(value) for key, value in scaffolds.items()}
    
    # Build a list with the index lists of every scaffold.
    # Sort from largest to smallest scaffold sets: the sort key is
    # (number of indices in the set, value of its first index) and the order is reversed.
    # Each scaffold_set is the list of indices that belong to one particular scaffold
    scaffold_sets = [
        scaffold_set
        for (scaffold, scaffold_set) in sorted(
            scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]), reverse=True)
    ]
    # The size limits of the train (and train + valid) sets are derived from the dataset length
    train_cutoff = frac_train * len(dataset)
    valid_cutoff = (frac_train + frac_valid) * len(dataset)
    # Create the empty lists of the three sets
    # When no validation set is required, it is skipped by setting frac_valid = 0
    train_inds, valid_inds, test_inds = [], [], []
    log("About to sort in scaffold sets", self.verbose)
    
    # For every scaffold_set:
    for scaffold_set in scaffold_sets:
      # If what is already in train_inds plus this scaffold_set exceeds the train cutoff
      if len(train_inds) + len(scaffold_set) > train_cutoff:
        # Same check against the train + valid cutoff
        if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
          test_inds += scaffold_set
        else:
          valid_inds += scaffold_set
      else:
        train_inds += scaffold_set
        
    return train_inds, valid_inds, test_inds
```

In [235]:
# Scaffold-based train/test split, computed once for the dataset identified by
# molecule SMILES (`dataset`) and once for the one identified by generic-scaffold
# SMILES (`dataset_g`). A fresh splitter is created for each dataset.
x, x_g = (ScaffoldSplitter().train_test_split(ds) for ds in (dataset, dataset_g))

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'k_fold_split',
 'split',
 'train_test_split',
 'train_valid_test_split',
 'verbose']

In [221]:
# Sanity check of the split: count the actives (label 1) on each side through the
# public `.y` accessor and verify that none were lost. Only the last expression
# of a cell is displayed, so intermediate values are bound to names instead of
# being computed and thrown away.
n_actives_train = np.sum(x[0].y)
n_actives_test = np.sum(x[1].y)
n_actives_total = df_dataset.y.sum()
assert n_actives_train + n_actives_test == n_actives_total, (
    'train/test actives do not add up to the total number of actives')
n_actives_total

292

In [239]:
# Turn the feature matrix of every split into a DataFrame for inspection:
# molecule-SMILES based split first, generic-scaffold based split (`_g`) second.
X_train, X_test = pd.DataFrame(x[0].X), pd.DataFrame(x[1].X)
X_train_g, X_test_g = pd.DataFrame(x_g[0].X), pd.DataFrame(x_g[1].X)

In [242]:
# Column 2 of the feature matrix is `name` (third column of `df_dataset` once
# `y` is dropped): molecule names in the test split of the SMILES-based split.
X_test[2]

0       decoy_829
1       decoy_828
2       decoy_771
3       decoy_759
4       decoy_757
          ...    
1348          XMK
1349          XMJ
1350          XMB
1351          XMA
1352          815
Name: 2, Length: 1353, dtype: object

In [243]:
# Column 2 of the feature matrix is `name` (third column of `df_dataset` once
# `y` is dropped): molecule names in the test split of the generic-scaffold split.
X_test_g[2]

0       decoy_4499
1       decoy_4498
2       decoy_4496
3       decoy_4494
4       decoy_4481
           ...    
1348           I1H
1349           D91
1350           XMC
1351           XMB
1352           XMA
Name: 2, Length: 1353, dtype: object