# Filter building blocks

Before enumerating the VL, a series of filters is applied to the building blocks to remove those with undesired properties or a high chance of side reactions.

## Filter aldehyde building blocks



In [None]:
import os

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Descriptors import MolWt

In [None]:
# Load the aldehyde building blocks (this file was obtained from Enamine).
# SDMolSupplier yields None for records RDKit cannot parse; drop those here so the
# descriptor and substructure calls in the following cells never receive None.
supplier = Chem.SDMolSupplier(os.path.abspath('../data/Enamine_Aldehydes_6455cmpds_20210405.sdf'))
aldehydes = [entry for entry in supplier if entry is not None]
len(aldehydes)

In [None]:
"""Now we filter the building blocks. First by mass. We set a weight limit of 200"""
aldehydes = [m for m in aldehydes if MolWt(m) <= 200]
len(aldehydes)


In [None]:
"""Filter carbonic acid / carboxylate anion"""
sub_acid = Chem.MolFromSmarts('[CX3](=O)[OX1H0-,OX2H1] ')
aldehydes = [m for m in aldehydes if not m.HasSubstructMatch(sub_acid)]
len(aldehydes)

In [None]:
"""Filter ketone"""
sub_ketone = Chem.MolFromSmarts('[#6][CX3](=O)[#6]')
mol = [m for m in aldehydes if len(m.GetSubstructMatches(sub_ketone)) >= 1]
len(mol)  # THERE ARE NO KETONES (in aldehyde building blocks) so we don't apply a filter

In [None]:
"""Filter if a second aldehyde is present"""
sub_aldehyde = Chem.MolFromSmarts('[CX3H1](=O)[#6]')
mol = [m for m in aldehydes if len(m.GetSubstructMatches(sub_aldehyde)) > 1]
len(mol) # THERE ARE NO DI-ALDEHYDES so we don't apply a filter


In [None]:
"""Filter Boc"""
sub_boc = Chem.MolFromSmarts('[NX3][CX3](=[OX1])[OX2H0]C([#6])([#6])[#6]')
aldehydes = [m for m in aldehydes if not m.HasSubstructMatch(sub_boc)]
len(aldehydes)

In [None]:
"""Filter enolizable aldehydes. E.g. alpha- phenyl, nitrile, nitro"""
sub_enol_phenyl = Chem.MolFromSmarts('O=[#6]-[CX4H,CX4H2]-[cX3]1[cX3H][cX3H][cX3H][cX3H][cX3H]1')
sub_enol_nitrile = Chem.MolFromSmarts('O=[#6]-[CX4H,CX4H2]-[CX2]#[NX1]')
sub_enol_nitro = Chem.MolFromSmarts('O=[#6]-[CX4H,CX4H2]-[$([NX3](=O)=O),$([NX3+](=O)[O-])]')
aldehydes = [m for m in aldehydes if not (m.HasSubstructMatch(sub_enol_phenyl) or m.HasSubstructMatch(sub_enol_nitrile) or m.HasSubstructMatch(sub_enol_nitro))]
len(aldehydes)

In [None]:
"""Filter branched aliphatic (alpha position). We remove tertiary and quarternary non-cyclic alpha carbons and quarternary cyclic alpha carbons """
sub_branched = Chem.MolFromSmarts('O=[#6]-[CR0D3,CR0D4,CD4]')
aldehydes = [m for m in aldehydes if not m.HasSubstructMatch(sub_branched)]
len(aldehydes)

In [None]:
"""Filter some heteroaromatics (randomly + deterministically). We discard 2 thirds of them, because they tend to not synthesize"""
sub_heteroaromatic =  Chem.MolFromSmarts('[oR,sR,nR]')
aldehydes = [m for i, m in enumerate(aldehydes) if not m.HasSubstructMatch(sub_heteroaromatic) or i % 3 == 0] # we drop roughly 2 in 3 heteroaromatics
len(aldehydes)

In [None]:
"""Filter azides"""
sub_azide = Chem.MolFromSmarts('[$(*-[NX2-]-[NX2+]#[NX1]),$(*-[NX2]=[NX2+]=[NX1-])]')
aldehydes = [m for m in aldehydes if not m.HasSubstructMatch(sub_azide)]
len(aldehydes)

In [None]:
"""Filter acetals"""
sub_acetal = Chem.MolFromSmarts('[#6]-[CX3,CX4](-[OX2])-[OX2]')
aldehydes = [m for m in aldehydes if not m.HasSubstructMatch(sub_acetal)]
len(aldehydes)

In [None]:
"""Check duplicates (there should be none if Enamine did their job right)"""
s = len(set([Chem.MolToSmiles(m) for m in aldehydes]))
l = len(aldehydes)
print(s)
print(l)
assert s == l  # if s < l, we have duplicates

In [None]:
# Draw a sample of the molecules that survived filtering (list positions 500-549)
preview = aldehydes[500:550]
Draw.MolsToGridImage(preview)

In [None]:
# Report how often each aldehyde subclass occurs in the remaining data.
# The 'Subclass' SD property is read from every molecule; each label is tested
# independently against the three subclass names (substring match).
classes = [bb.GetProp('Subclass') for bb in aldehydes]
count_het = sum('Hetero_aromatic_aldehydes' in label for label in classes)
count_arom = sum('Aromatic_aldehydes' in label for label in classes)
count_aliph = sum('Aliphatic_Aldehydes' in label for label in classes)
print(count_het, count_arom, count_aliph, sep='\n')

In [None]:
"""Write to sdf file"""
with open(os.path.abspath('../data/filtered_aldehydes.sdf'), 'wt') as file:
    writer = Chem.SDWriter(file)
    for m in aldehydes:
        writer.write(m)
    writer.close()

## Filter ketone building blocks

The filters are (almost) the same as above. The only change is that we now check for diketones instead of ketones, and for aldehydes instead of dialdehydes.


In [None]:
# Load the ketone building blocks (this file was obtained from Enamine).
# SDMolSupplier yields None for records RDKit cannot parse; drop those here so the
# descriptor and substructure calls in the following cells never receive None.
supplier = Chem.SDMolSupplier(os.path.abspath('../data/Enamine_Ketones_8649cmpds_20210405.sdf'))
ketones = [entry for entry in supplier if entry is not None]
len(ketones)

In [None]:
"""Now we filter the building blocks. First by mass. We set a weight limit of 200"""
ketones = [m for m in ketones if MolWt(m) <= 200]
len(ketones)


In [None]:
"""Filter carbonic acid / carboxylate anion"""
sub_acid = Chem.MolFromSmarts('[CX3](=O)[OX1H0-,OX2H1] ')
ketones = [m for m in ketones if not m.HasSubstructMatch(sub_acid)]
len(ketones)

In [None]:
"""Filter diketone"""
sub_ketone = Chem.MolFromSmarts('[#6][CX3](=O)[#6]')
mol = [m for m in ketones if len(m.GetSubstructMatches(sub_ketone)) > 1]
len(mol)  # THERE ARE NO DIKETONES so we don't apply a filter

In [None]:
"""Filter aldehyde"""
sub_aldehyde = Chem.MolFromSmarts('[CX3H1](=O)[#6]')
mol = [m for m in ketones if len(m.GetSubstructMatches(sub_aldehyde)) >= 1]
len(mol) # THERE ARE NO ALDEHYDES (in the ketone set) so we don't apply a filter

In [None]:
"""Filter Boc"""
sub_boc = Chem.MolFromSmarts('[NX3][CX3](=[OX1])[OX2H0]C([#6])([#6])[#6]')
ketones = [m for m in ketones if not m.HasSubstructMatch(sub_boc)]
len(ketones)

In [None]:
"""Filter enolizable aldehydes. E.g. alpha- phenyl, nitrile, nitro"""
sub_enol_phenyl = Chem.MolFromSmarts('O=[#6]-[CX4H,CX4H2]-[cX3]1[cX3H][cX3H][cX3H][cX3H][cX3H]1')
sub_enol_nitrile = Chem.MolFromSmarts('O=[#6]-[CX4H,CX4H2]-[CX2]#[NX1]')
sub_enol_nitro = Chem.MolFromSmarts('O=[#6]-[CX4H,CX4H2]-[$([NX3](=O)=O),$([NX3+](=O)[O-])]')
ketones = [m for m in ketones if not (m.HasSubstructMatch(sub_enol_phenyl) or m.HasSubstructMatch(sub_enol_nitrile) or m.HasSubstructMatch(sub_enol_nitro))]
len(ketones)

In [None]:
"""Filter branched aliphatic (alpha position). We remove tertiary and quarternary non-cyclic alpha carbons and quarternary cyclic alpha carbons """
sub_branched = Chem.MolFromSmarts('O=[#6]-[CR0D3,CR0D4,CD4]')
ketones = [m for m in ketones if not m.HasSubstructMatch(sub_branched)]
len(ketones)

In [None]:
"""Filter some heteroaromatics (randomly + deterministically). We discard 2 thirds of them, because they tend to not synthesize"""
sub_heteroaromatic =  Chem.MolFromSmarts('[oR,sR,nR]')
ketones = [m for i, m in enumerate(ketones) if not m.HasSubstructMatch(sub_heteroaromatic) or i % 3 == 0] # we drop roughly 2 in 3 heteroaromatics
len(ketones)

In [None]:
"""Filter azides"""
sub_azide = Chem.MolFromSmarts('[$(*-[NX2-]-[NX2+]#[NX1]),$(*-[NX2]=[NX2+]=[NX1-])]')
ketones = [m for m in ketones if not m.HasSubstructMatch(sub_azide)]
len(ketones)

In [None]:
"""Filter acetals"""
sub_acetal = Chem.MolFromSmarts('[#6]-[CX3,CX4](-[OX2])-[OX2]')
ketones = [m for m in ketones if not m.HasSubstructMatch(sub_acetal)]
len(ketones)

In [None]:
"""Check duplicates (there should be none if Enamine did their job right)"""
s = len(set([Chem.MolToSmiles(m) for m in ketones]))
l = len(ketones)
print(s)
print(l)
assert s == l  # if s < l, we have duplicates

In [None]:
# Draw a sample of the molecules that survived filtering (list positions 500-549)
preview = ketones[500:550]
Draw.MolsToGridImage(preview)

In [None]:
"""Write to sdf file"""
with open(os.path.abspath('../data/filtered_ketones.sdf'), 'wt') as file:
    writer = Chem.SDWriter(file)
    for m in ketones:
        writer.write(m)
    writer.close()