## Enumerate SLAP VL

This notebook was used to enumerate the SLAP virtual library (VL). Note that the reactionSMARTS templates are underspecified, leading to some erroneous products if the building blocks contain additional carbonyls (e.g. esters, amides). These were later filtered out by removing all products that still contained an aldehyde or ketone. In the interest of reproducibility, we give the original, erroneous version of the reactionSMARTS here.

Run this notebook only on systems with lots of RAM. We used a 96GB RAM workstation, but more would be better.

In [None]:
import os
from joblib import Parallel, delayed

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.SimpleEnum.Enumerator import EnumerateReaction
from rdkit.Chem.rdchem import KekulizeException

### Reactions
Define and initialize the reactions that form our VL

In [None]:
# Reaction templates, kept exactly as originally used (see the note at the top
# of the notebook about their known underspecification).

# Two-carbonyl template: reactant 1 is any carbon double-bonded to O (map 3),
# reactant 2 is a carbon (map 4) attached to a C=O carbon. The single product
# is a six-membered ring containing one O and one N.
rxn_morpholine = AllChem.ReactionFromSmarts(
    '[#6:3]=O.[#6:4]-[#6]=O>>'
    '[#6:4]-[#6]-1-[#6]-[#8]-[#6]-[#6:3]-[#7]-1'
)

# Template for two reactants of the form C-C=O (maps 1 and 2).
# It produces four products per reactant pair.
rxn_only_aldehyde = AllChem.ReactionFromSmarts(
    '[#6:1]-[#6]=O.[#6:2]-[#6]=O>>'
    # product 1: six-membered O/N ring, unsubstituted at the carbon next to O
    '[#6:1]-[#6]-1-[#6]-[#8]-[#6]-[#6](-[#6:2])-[#7]-1.'
    # product 2: six-membered ring with two N atoms, one bearing a C(=O)O-C-C group
    '[#6]-[#6]-[#8]-[#6](=O)-[#7]-1-[#6]-[#6](-[#6:1])-[#7]-[#6](-[#6:2])-[#6]-1.'
    # product 3: six-membered O/N ring with one extra carbon on the carbon next to O
    '[#6:1]-[#6]-1-[#6]-[#8]-[#6](-[#6])-[#6](-[#6:2])-[#7]-1.'
    # product 4: six-membered O/N ring with two extra carbons on the carbon next to O
    '[#6:1]-[#6]-1-[#6]-[#8][C@]([#6])([#6])[#6](-[#6:2])-[#7]-1'
)

In [None]:
# Initialize both reaction objects and run RDKit's reaction sanitization on
# them before they are used for enumeration.
rxn_morpholine.Initialize()
AllChem.SanitizeRxn(rxn_morpholine)
rxn_only_aldehyde.Initialize()
# As the last expression of the cell, the return value of this call is what the
# notebook displays as the cell output.
AllChem.SanitizeRxn(rxn_only_aldehyde)

In [None]:
# Show the morpholine reaction (the bare expression is rendered by the notebook
# as the cell output).
rxn_morpholine

In [None]:
# Show the reaction for two aldehyde reactants (the bare expression is rendered
# by the notebook as the cell output).
rxn_only_aldehyde

### Building Blocks
Import the building blocks for the VL

In [None]:
# Load the aldehyde building blocks from SDF.
# The input file can be obtained by running `vl_building_block_filtering.ipynb`.
aldehyde_sdf = os.path.abspath('../data/filtered_aldehydes.sdf')
supplier = Chem.SDMolSupplier(aldehyde_sdf)
aldehydes = [mol for mol in supplier]
# Number of aldehyde building blocks read (shown as the cell output).
len(aldehydes)

In [None]:
# Load the ketone building blocks from SDF.
# The input file can be obtained by running `vl_building_block_filtering.ipynb`.
ketone_sdf = os.path.abspath('../data/filtered_ketones.sdf')
supplier = Chem.SDMolSupplier(ketone_sdf)
ketones = [mol for mol in supplier]
# Number of ketone building blocks read (shown as the cell output).
len(ketones)

In [None]:
# Show a few building blocks: a grid image of the first five aldehydes
# followed by the first four ketones.
Draw.MolsToGridImage(aldehydes[:5] + ketones[:4])

### Parallelized Enumeration
Set up the VL enumeration in a parallelized way.

In [None]:
"""This one is for only aldehyde reactants"""
# Module-level list to which `get_products` appends every product molecule that
# raises KekulizeException during sanitization.
# NOTE(review): when the function is executed by joblib in separate worker
# processes, the appends presumably land in each worker's own copy of this list
# and are not visible here afterwards — confirm before relying on its contents.
errors = []

def get_products(sm1, sm2):
    """Enumerate `rxn_only_aldehyde` over two sets of reactants and return SMILES.

    Every product molecule yielded by the enumeration is sanitized and converted
    to a SMILES string. SMILES strings are returned instead of molecule objects
    for memory reasons.

    Args:
        sm1: Collection of molecules used as the first reactant.
        sm2: Collection of molecules used as the second reactant.

    Returns:
        A list with one entry per enumerated product molecule: its SMILES
        string, or None if a KekulizeException was raised while processing it.
        Failing molecules are additionally appended to the module-level
        `errors` list.
    """
    global errors
    smiles_out = []
    for product_set in EnumerateReaction(rxn_only_aldehyde, (sm1, sm2)):
        for mol in product_set:
            try:
                Chem.SanitizeMol(mol)
                smiles_out.append(Chem.MolToSmiles(mol))
            except KekulizeException:
                # Keep the failing molecule for inspection and leave a
                # placeholder so the entry count matches the product count.
                errors.append(mol)
                smiles_out.append(None)
    return smiles_out

In [None]:
"""This one is for aldehyde + ketone reactants"""
# Module-level list to which `get_products_ketone` appends every product
# molecule that raises KekulizeException during sanitization. Running this
# assignment rebinds `errors` to a new, empty list.
# NOTE(review): when the function is executed by joblib in separate worker
# processes, the appends presumably land in each worker's own copy of this list
# and are not visible here afterwards — confirm before relying on its contents.
errors = []

def get_products_ketone(sm1, sm2):
    """Enumerate `rxn_morpholine` over two sets of reactants and return SMILES.

    Every product molecule yielded by the enumeration is sanitized and converted
    to a SMILES string. SMILES strings are returned instead of molecule objects
    for memory reasons.

    Args:
        sm1: Collection of molecules used as the first reactant.
        sm2: Collection of molecules used as the second reactant.

    Returns:
        A list with one entry per enumerated product molecule: its SMILES
        string, or None if a KekulizeException was raised while processing it.
        Failing molecules are additionally appended to the module-level
        `errors` list.
    """
    global errors
    smiles_out = []
    for product_set in EnumerateReaction(rxn_morpholine, (sm1, sm2)):
        for mol in product_set:
            try:
                Chem.SanitizeMol(mol)
                smiles_out.append(Chem.MolToSmiles(mol))
            except KekulizeException:
                # Keep the failing molecule for inspection and leave a
                # placeholder so the entry count matches the product count.
                errors.append(mol)
                smiles_out.append(None)
    return smiles_out

In [None]:
# Prepare the aldehydes for parallel enumeration: partition them into 8
# interleaved slices (every 8th molecule, starting at offsets 0 through 7).
aldehyde_slices = [aldehydes[offset::8] for offset in range(8)]

In [None]:
# Prepare the combined reactant pool for parallel enumeration: all aldehydes
# followed by all ketones, partitioned into 8 interleaved slices (every 8th
# molecule, starting at offsets 0 through 7).
ket_and_ald = aldehydes + ketones
ket_and_ald_slices = [ket_and_ald[offset::8] for offset in range(8)]

### Generate Virtual Library
Enumerate ketone and aldehyde products sequentially.
Note that you will need lots of RAM to run the enumeration.

In [None]:
# Ketone library: one task per (aldehyde slice, aldehyde+ketone slice) pair,
# i.e. 8 x 8 = 64 calls to `get_products_ketone`, run with n_jobs=16.
# The result is a list holding one list of SMILES (or None) per task.
library_ket = Parallel(n_jobs=16)(
    delayed(get_products_ketone)(ald_slice, partner_slice)
    for ald_slice in aldehyde_slices
    for partner_slice in ket_and_ald_slices
)


In [None]:
# Aldehyde library: one task per ordered pair of aldehyde slices,
# i.e. 8 x 8 = 64 calls to `get_products`, run with n_jobs=16.
# The result is a list holding one list of SMILES (or None) per task.
library_ald = Parallel(n_jobs=16)(
    delayed(get_products)(first_slice, second_slice)
    for first_slice in aldehyde_slices
    for second_slice in aldehyde_slices
)


In [None]:
# Expected library size, assuming one product set per reactant pair.
# Ketone enumeration: every aldehyde is paired with every member of the
# combined aldehyde + ketone pool, and `rxn_morpholine` has one product template.
print(f'Expected ketone products: {len(ket_and_ald) * len(aldehydes)}')
# Aldehyde enumeration: every ordered aldehyde pair, times the four product
# templates defined in `rxn_only_aldehyde`.
print(f'Expected aldehyde products: {len(aldehydes) ** 2 * 4}')

In [None]:
# Flatten the per-task result lists into one list of SMILES per reaction type,
# dropping the None placeholders left for products that failed sanitization.
library_ket_flattened = [
    smiles
    for task_result in library_ket
    for smiles in task_result
    if smiles is not None
]

library_ald_flattened = [
    smiles
    for task_result in library_ald
    for smiles in task_result
    if smiles is not None
]

# Deduplicate across both libraries. The set is filled in two steps instead of
# from `library_ald_flattened + library_ket_flattened`, which would allocate a
# temporary list as large as the whole library just to be thrown away again.
library_set = set(library_ald_flattened)
library_set.update(library_ket_flattened)

print(f'Size of the enumerated aldehyde library: {len(library_ald_flattened)}')
print(f'Size of the enumerated ketone library: {len(library_ket_flattened)}')
# Sum the two lengths rather than concatenating the lists only to count them.
print(f'Total size of the enumerated library: {len(library_ald_flattened) + len(library_ket_flattened)}')
print(f'Size of the library after duplicate removal: {len(library_set)}')

### Save VL to file
We save the (canonical) SMILES strings of the VL to a text file. At this point, duplicates have been removed from the VL, but it has not been processed in any other way.

In [None]:
# Write the deduplicated library to disk, one SMILES string per line.
vl_path = os.path.abspath('../data/VL_smiles.txt')
with open(vl_path, 'w') as out_file:
    out_file.writelines(f'{sm}\n' for sm in library_set)