In [7]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import rdFMCS
from rdkit.Chem import AllChem
from rdkit.Chem.rdMolDescriptors import CalcNumRings
import copy

# Element symbols a fragment may contain; is_element_allowed() rejects a
# molecule as soon as it finds an atom whose symbol is not in this set.
ALLOWED_ELEMENTS = {"C", "N", "O", "S", "P", "F", "Cl", "Br", "I", "B", "Si", "Se"}

# Hann filter SMARTS (problematic groups).
# Reading guide: inside brackets `N`/`O`/`C` are aliphatic atoms, `R0` means
# "not in any ring", and `$(...)` is a recursive environment anchored on the atom.
# NOTE(review): several trailing labels below name a narrower functional group
# than the pattern actually matches; each comment gives the original label and
# then what the SMARTS matches as written. Confirm the intended scope.
HANN_SMARTS = [
    "[N;R0;$(N=*)]",         # labelled "nitroso"; as written: any acyclic N double-bonded to any atom (also imines, N=O, ...)
    "[N;R0;$(N#N)]",         # labelled "azide"; as written: acyclic N triple-bonded to another N
    "[N;R0;$(N-[N]=N)]",     # labelled "diazo"; as written: acyclic N single-bonded to an N that is double-bonded to N
    "[N;R0;$(N-[C]=[N])]",   # labelled "isocyanide"; as written: acyclic N single-bonded to C=N (amidine-like N-C=N)
    "[N;R0;$(N=O)]",         # labelled "nitro"; acyclic N=O -- already covered by the first pattern (N=*)
    "[N;R0;$(N-[O])]",       # hydroxylamine-like: acyclic N single-bonded to an aliphatic O
    "[Cl,Br,I][Cl,Br,I]",    # labelled "dihalides"; as written: two heavy halogens bonded directly to each other
    "[O;R0;$(O-O)]",         # peroxide: acyclic O single-bonded to O (ring peroxides are not matched)
]

# Compile every SMARTS once so has_unwanted_substructure() can reuse the query
# molecules. NOTE(review): RDKit's MolFromSmarts is understood to return None
# for an unparsable pattern -- confirm if this list is ever edited.
hann_patterns = [Chem.MolFromSmarts(smarts) for smarts in HANN_SMARTS]

def is_element_allowed(mol):
    """Tell whether every atom of ``mol`` has a symbol listed in ALLOWED_ELEMENTS.

    Args:
        mol: Object exposing ``GetAtoms()``, whose items expose ``GetSymbol()``.

    Returns:
        bool: False as soon as one atom's symbol is outside ALLOWED_ELEMENTS,
        True otherwise (including for a molecule with no atoms).
    """
    return all(atom.GetSymbol() in ALLOWED_ELEMENTS for atom in mol.GetAtoms())

def has_incorrect_valence(mol):
    """Report whether RDKit sanitization of ``mol`` fails.

    Despite the name, any error raised by ``Chem.SanitizeMol`` counts as a
    failure, not only valence problems.

    Note:
        ``Chem.SanitizeMol`` is called on ``mol`` itself, so a molecule that
        passes is left sanitized in place.

    Args:
        mol: RDKit Mol to check.

    Returns:
        bool: True if sanitization raised an exception, False if it succeeded.
    """
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit; only ordinary errors should mark the molecule as bad.
        return True
    return False

def has_unwanted_substructure(mol):
    """Check ``mol`` against the compiled Hann filter patterns.

    Args:
        mol: Object exposing ``HasSubstructMatch(pattern)``.

    Returns:
        bool: True at the first pattern in ``hann_patterns`` that matches,
        False when none of them match.
    """
    for pattern in hann_patterns:
        if mol.HasSubstructMatch(pattern):
            return True
    return False

def standardize_protonation(mol):
    """Run RDKit's Uncharger on ``mol`` and sanitize the result.

    Args:
        mol: RDKit Mol to process.

    Returns:
        The molecule returned by ``Uncharger.uncharge``, after
        ``Chem.SanitizeMol`` has been applied to it.

    Raises:
        Whatever ``uncharge`` or ``Chem.SanitizeMol`` raise; nothing is caught here.
    """
    neutral = rdMolStandardize.Uncharger().uncharge(mol)
    Chem.SanitizeMol(neutral)
    return neutral

def filter_fragments(mols, min_mw=30, max_mw=300, max_rings=4, max_fluorine=3):
    """Filter and de-duplicate fragments with the DOGS-style rules.

    A molecule is kept only if it passes every check, in this order: exact
    molecular weight within ``[min_mw, max_mw]``, at most ``max_rings`` rings,
    only elements from ALLOWED_ELEMENTS, at most ``max_fluorine`` fluorine
    atoms, sanitizes cleanly, and matches no Hann filter pattern. Survivors are
    passed through ``standardize_protonation`` and de-duplicated by isomeric
    SMILES (first occurrence wins).

    Note:
        ``has_incorrect_valence`` sanitizes each input molecule in place, so
        the objects in ``mols`` may be modified as a side effect.

    Args:
        mols: Iterable of RDKit Mol objects; ``None`` entries are skipped.
        min_mw: Lower bound (inclusive) on exact molecular weight.
        max_mw: Upper bound (inclusive) on exact molecular weight.
        max_rings: Largest ring count that is still accepted.
        max_fluorine: Largest number of fluorine atoms that is still accepted.

    Returns:
        list[tuple]: ``(mol, smiles)`` pairs in input order, where ``mol`` is
        the standardized molecule and ``smiles`` its isomeric SMILES.
    """
    filtered = []
    seen = set()

    for mol in mols:
        if mol is None:
            continue
        try:
            mw = Descriptors.ExactMolWt(mol)
            if not (min_mw <= mw <= max_mw):
                continue
            if rdMolDescriptors.CalcNumRings(mol) > max_rings:
                continue
            if not is_element_allowed(mol):
                continue
            fluorine_count = sum(
                1 for atom in mol.GetAtoms() if atom.GetSymbol() == "F"
            )
            if fluorine_count > max_fluorine:
                continue
            if has_incorrect_valence(mol):
                continue
            if has_unwanted_substructure(mol):
                continue
            mol = standardize_protonation(mol)
            smi = Chem.MolToSmiles(mol, isomericSmiles=True)
            if smi in seen:
                continue
            seen.add(smi)
            filtered.append((mol, smi))
        except Exception:
            # Best effort: a molecule that breaks any RDKit call is dropped.
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and made a long filtering run impossible to interrupt.
            continue
    return filtered


In [8]:
from dogs_core import load_fragments_from_sdf

# Read the raw fragment library from its SDF file.
mols, smiles = load_fragments_from_sdf(
    "Enamine_Essential_Fragment_Library_320cmpds_20231002.sdf"
)

# Run the DOGS-style filter; each surviving entry is a (mol, smiles) pair.
filtered_data = filter_fragments(mols)

# Split the pairs into parallel lists of molecules and SMILES strings.
filtered_mols = [pair[0] for pair in filtered_data]
filtered_smiles = [pair[1] for pair in filtered_data]


Loaded 320 valid fragments


In [4]:
from rdkit.Chem import AllChem, rdChemReactions

# Defined FGA/FGI reactions as SMARTS.
# Each entry is a one-reactant reaction SMARTS ("reactant>>product"). Atoms that
# carry the same map number on both sides are kept (together with everything
# attached to them that the reactant template does not match); reactant-template
# atoms without a counterpart in the product are removed, and unmapped product
# atoms are newly created.
# NOTE(review): `[C:1][OX2H:2]` (used by the three "aliphatic alcohol" entries)
# only requires an aliphatic carbon next to the OH, so it also matches the OH of
# a carboxylic acid -- confirm whether that is intended.
reaction_smarts_list = [
    # Acyl chloride formation from carboxylic acid (neutral)
    "[C:1](=O)[OX2H1:2]>>[C:1](=O)Cl",

    # Acyl chloride formation from carboxylic acid (charged)
    "[C:1](=O)[O-]>>[C:1](=O)Cl",

    # Aliphatic alcohol to bromide
    "[C:1][OX2H:2]>>[C:1]Br",

    # Aliphatic alcohol to chloride
    "[C:1][OX2H:2]>>[C:1]Cl",

    # Sulfonic acid to sulfonyl chloride (neutral)
    # The carbon and sulfur are mapped so the carbon skeleton is carried into
    # the product. The previous template had no atom maps at all, so every
    # match produced only a free-standing S(=O)(=O)Cl fragment.
    "[#6:1][#16X4:2](=[OX1])(=[OX1])[OX2H1]>>[#6:1][#16:2](=O)(=O)Cl",

    # Sulfonic acid to sulfonyl chloride (charged)
    # Matches the sulfonate anion ([O-]). The previous pattern used [OX2H0],
    # i.e. an oxygen with two heavy-atom neighbours (a sulfonate ester), and
    # like the neutral entry it had no atom maps.
    # NOTE(review): the charge-separated [S+2]([O-])([O-]) drawing is no longer
    # matched by either sulfonic-acid entry; add it back if inputs use it.
    "[#6:1][#16X4:2](=[OX1])(=[OX1])[OX1H0-]>>[#6:1][#16:2](=O)(=O)Cl",

    # Aromatic halide to nitrile
    "[c:1][Cl,Br]>>[c:1]C#N",

    # Aliphatic alcohol to nitrile
    "[C:1][OX2H:2]>>[C:1]C#N",

    # Primary Amine to nitrile (neutral)
    # NOTE(review): `H2,H1` also admits N-H amines with a second substituent,
    # which would be lost together with the nitrogen -- confirm the scope.
    "[C:1][NX3;H2,H1;!$(NC=O):2]>>[C:1]C#N",

    # Primary Amine to nitrile (charged)
    "[C:1][NX3+1;H3;!$(NC=O):2]>>[C:1]C#N",

    # Alkyne to nitrile
    "[C:1][$([CX2]#C):2]>>[C:1]C#N",
]

# Convert SMARTS to RDKit reaction objects (same order as reaction_smarts_list)
REACTIONS = [rdChemReactions.ReactionFromSmarts(s) for s in reaction_smarts_list]

def apply_reactions(mol):
    """Apply all FGA/FGI reactions to a molecule and return its unique products.

    Every reaction in ``REACTIONS`` is run with ``mol`` as the only reactant.
    The first product of each product set is sanitized and converted to
    SMILES; products that fail either step are skipped.

    Args:
        mol: RDKit Mol used as the single reactant, or ``None``.

    Returns:
        list[str]: Unique product SMILES in sorted order, so the result is the
        same from run to run. Empty when ``mol`` is ``None`` or when nothing
        reacts.
    """
    if mol is None:
        # Nothing to react; avoids an opaque error from RunReactants.
        return []

    products = set()
    for rxn in REACTIONS:
        for prod_tuple in rxn.RunReactants((mol,)):
            prod = prod_tuple[0]
            try:
                Chem.SanitizeMol(prod)
                products.add(Chem.MolToSmiles(prod))
            except Exception:
                # Chemically invalid product: drop it, keep the others.
                continue
    # A set has no stable order; sort so repeated runs give the same list.
    return sorted(products)


In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem, rdChemReactions

def expand_fragments_with_reactions(mols, keep_original=True):
    """
    Applies FGA/FGI reactions to a list of RDKit molecules.

    Each input molecule is sanitized (in place) and run through every reaction
    in ``REACTIONS``. The first product of each product set is sanitized and
    kept if its SMILES has not been seen yet. Inputs that are ``None`` or fail
    sanitization are skipped, and so are individual products that fail
    sanitization.

    Args:
        mols (list): List of RDKit Mol objects (filtered fragments).
        keep_original (bool): If True, include original mols in the output.

    Returns:
        List of unique RDKit Mol objects (original + transformed if desired),
        in order of first appearance.
    """
    all_smiles = set()
    result_mols = []

    for mol in mols:
        if mol is None:
            continue
        try:
            Chem.SanitizeMol(mol)
            smi = Chem.MolToSmiles(mol)
        except Exception:
            continue  # skip molecules that cannot be sanitized
        if keep_original and smi not in all_smiles:
            all_smiles.add(smi)
            result_mols.append(mol)

        for rxn in REACTIONS:
            try:
                product_sets = rxn.RunReactants((mol,))
            except Exception:
                continue  # this reaction cannot be run on this molecule
            for prod_tuple in product_sets:
                prod = prod_tuple[0]
                try:
                    Chem.SanitizeMol(prod)
                    prod_smi = Chem.MolToSmiles(prod)
                except Exception:
                    # Handle failures per product: previously one bad product
                    # discarded every remaining product of the same reaction.
                    continue
                if prod_smi not in all_smiles:
                    all_smiles.add(prod_smi)
                    result_mols.append(prod)

    return result_mols


In [9]:
# Grow the filtered fragment set with the FGA/FGI reaction products,
# keeping the original fragments in the result.
expanded_mols = expand_fragments_with_reactions(
    mols=filtered_mols,
    keep_original=True,
)
