In [3]:
import copy
import logging

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdFMCS
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.rdMolDescriptors import CalcNumRings

# Element symbols a fragment may contain; an atom with any other symbol
# causes the fragment to be rejected by is_element_allowed().
ALLOWED_ELEMENTS = {
    "C", "N", "O", "S", "P",
    "F", "Cl", "Br", "I",
    "B", "Si", "Se",
}

# "Hann filter" SMARTS for unwanted groups. Every N/O query below is limited
# to aliphatic, non-ring atoms (upper-case symbol + R0).
# NOTE(review): the short labels are the original author's; for several
# entries the pattern text matches something broader than, or different from,
# the label. The literal match is spelled out after each label -- confirm
# which of the two is intended before relying on this list.
HANN_SMARTS = [
    "[N;R0;$(N=*)]",         # "nitroso"       -- literally: N double-bonded to any atom
    "[N;R0;$(N#N)]",         # "azide"         -- literally: N triple-bonded to N
    "[N;R0;$(N-[N]=N)]",     # "diazo"         -- literally: N-N=N chain
    "[N;R0;$(N-[C]=[N])]",   # "isocyanide"    -- literally: N-C=N (amidine-like)
    "[N;R0;$(N=O)]",         # "nitro"         -- literally: any N=O
    "[N;R0;$(N-[O])]",       # "hydroxylamine" -- literally: N single-bonded to O
    "[Cl,Br,I][Cl,Br,I]",    # "dihalides"     -- literally: two Cl/Br/I bonded to each other
    "[O;R0;$(O-O)]",         # "peroxide"      -- O single-bonded to O
]

# Compile each SMARTS once at import time so the filter can reuse the queries.
hann_patterns = list(map(Chem.MolFromSmarts, HANN_SMARTS))

def is_element_allowed(mol):
    """Return True when every atom of ``mol`` has a symbol in ALLOWED_ELEMENTS.

    A molecule with no atoms is accepted (there is nothing to reject).
    """
    return all(
        atom.GetSymbol() in ALLOWED_ELEMENTS
        for atom in mol.GetAtoms()
    )

def has_incorrect_valence(mol):
    """Report whether RDKit fails to sanitize ``mol``.

    ``Chem.SanitizeMol`` is applied directly to ``mol`` (no copy is made), so
    a molecule that passes this check is left sanitized in place.

    Args:
        mol: RDKit molecule to check.

    Returns:
        bool: ``False`` when sanitization succeeds, ``True`` when it raises.
        Any error raised by sanitization counts, not only valence errors.
    """
    try:
        Chem.SanitizeMol(mol)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt / SystemExit
        # must propagate instead of being reported as a chemistry problem.
        return True
    return False

def has_unwanted_substructure(mol):
    """Return True as soon as ``mol`` matches one of the compiled ``hann_patterns``."""
    for query in hann_patterns:
        if mol.HasSubstructMatch(query):
            return True
    return False

def standardize_protonation(mol):
    """Run ``mol`` through RDKit's ``Uncharger`` and sanitize the result.

    A fresh ``rdMolStandardize.Uncharger`` is built on every call. The
    molecule returned by ``uncharge`` is sanitized and handed back; errors
    from either step are not caught here.
    """
    neutral = rdMolStandardize.Uncharger().uncharge(mol)
    Chem.SanitizeMol(neutral)
    return neutral

def filter_fragments(mols, min_mw=30, max_mw=300, max_rings=4, max_fluorines=3):
    """Filter and de-duplicate fragment molecules.

    Each molecule is checked, in this order, for: exact molecular weight
    within ``[min_mw, max_mw]``, at most ``max_rings`` rings, only allowed
    elements, at most ``max_fluorines`` fluorine atoms, successful
    sanitization, and absence of the unwanted substructures. Survivors are
    passed through ``standardize_protonation`` and de-duplicated on the
    isomeric SMILES of the standardized molecule (first occurrence wins).

    Args:
        mols: Iterable of RDKit molecules; ``None`` entries are skipped.
        min_mw: Lower bound (inclusive) on ``Descriptors.ExactMolWt``.
        max_mw: Upper bound (inclusive) on ``Descriptors.ExactMolWt``.
        max_rings: Largest accepted ``CalcNumRings`` value.
        max_fluorines: Largest accepted number of F atoms.

    Returns:
        list[tuple]: ``(standardized_mol, smiles)`` pairs in input order.

    Notes:
        A molecule whose processing raises an ordinary exception is skipped
        (best effort) and the error is logged at DEBUG level. Interrupts such
        as ``KeyboardInterrupt`` are not swallowed.
    """
    log = logging.getLogger(__name__)
    filtered = []
    seen = set()

    for mol in mols:
        if mol is None:
            continue
        try:
            mw = Descriptors.ExactMolWt(mol)
            if not (min_mw <= mw <= max_mw):
                continue
            if rdMolDescriptors.CalcNumRings(mol) > max_rings:
                continue
            if not is_element_allowed(mol):
                continue
            n_fluorine = sum(1 for atom in mol.GetAtoms() if atom.GetSymbol() == "F")
            if n_fluorine > max_fluorines:
                continue
            # Sanitizes ``mol`` in place as a side effect of the check.
            if has_incorrect_valence(mol):
                continue
            if has_unwanted_substructure(mol):
                continue
            mol = standardize_protonation(mol)
            smi = Chem.MolToSmiles(mol, isomericSmiles=True)
        except Exception:
            # Best effort: one bad record must not abort the whole batch, but
            # keep a trace of why it was dropped. ``Exception`` (not a bare
            # ``except``) lets Ctrl-C / SystemExit stop the loop.
            log.debug("Skipping fragment that raised during filtering", exc_info=True)
            continue

        # De-duplicate on the SMILES of the standardized molecule.
        if smi in seen:
            continue
        seen.add(smi)
        filtered.append((mol, smi))
    return filtered


In [4]:
from dogs_core import load_fragments_from_sdf

# Read the raw fragment library from its SDF file; the loader hands back the
# molecules together with a second value bound to `smiles`.
mols, smiles = load_fragments_from_sdf(
    "Enamine_Essential_Fragment_Library_320cmpds_20231002.sdf"
)

# Run the DOGS-style filters; each surviving entry is a (mol, smiles) pair.
filtered_data = filter_fragments(mols)

# Split the pairs into two parallel lists.
filtered_mols = [pair[0] for pair in filtered_data]
filtered_smiles = [pair[1] for pair in filtered_data]


In [5]:
import pandas as pd
from rdkit.Chem import Descriptors

# One record per surviving fragment: its SMILES and the value of
# Descriptors.ExactMolWt for the standardized molecule.
data = [
    dict(SMILES=smi, MolWeight=Descriptors.ExactMolWt(mol))
    for mol, smi in filtered_data
]

df = pd.DataFrame(data)

# Persist the table without the index column.
df.to_csv("filtered_fragments.csv", index=False)

# Report how many rows were written.
print(f"Exported {len(df)} fragments to filtered_fragments.csv")



Exported 318 fragments to filtered_fragments.csv
