In [None]:
! pip install rdkit-pypi

In [None]:
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

In [None]:
# `_dh` is IPython's directory history; its most recent entry is taken as the
# notebook's working directory, and the data folder is resolved beneath it.
HERE = Path(_dh[-1])
DATA = HERE.joinpath("data")

In [None]:
# Read the compound table from ro5_properties_filtered.csv; the first CSV
# column is used as the frame's index.
new_data = pd.read_csv(
    "/content/sample_data/ro5_properties_filtered.csv", index_col=0
)

In [None]:
# Attach an RDKit molecule column ("ROMol") built from the "smiles" column.
# This modifies `new_data` in place.
PandasTools.AddMoleculeColumnToFrame(new_data, smilesCol="smiles")

# Preview the first six molecules, captioned with their ChEMBL identifiers.
Chem.Draw.MolsToGridImage(
    mols=new_data["ROMol"].head(6).tolist(),
    legends=new_data["molecule_chembl_id"].head(6).tolist(),
)

In [None]:
# Build a filter catalog that holds only the PAINS substructure definitions.
pains_catalog_id = FilterCatalogParams.FilterCatalogs.PAINS
params = FilterCatalogParams()
params.AddCatalog(pains_catalog_id)
catalog = FilterCatalog(params)

In [None]:
# Screen every compound against the PAINS catalog and split the data set into
# flagged compounds (collected in `matches`) and unflagged compounds (kept in
# `new_data`).
matches = []
clean = []
unparsable = []
for index, row in tqdm(new_data.iterrows(), total=new_data.shape[0]):
    molecule = Chem.MolFromSmiles(row.smiles)
    if molecule is None:
        # MolFromSmiles signals a parse failure by returning None. Such a
        # compound cannot be screened, so it is reported below and left out
        # instead of crashing the whole loop inside the catalog lookup.
        unparsable.append(index)
        continue

    # Only the first PAINS hit of a molecule is recorded.
    entry = catalog.GetFirstMatch(molecule)
    if entry is None:
        clean.append(index)
        continue

    matches.append(
        {
            "chembl_id": row.molecule_chembl_id,
            "rdkit_molecule": molecule,
            "pains": entry.GetDescription().capitalize(),
        }
    )

if unparsable:
    print(f"Skipped {len(unparsable)} compounds with unparsable SMILES: {unparsable}")

# Explicit columns keep the frame usable by the following cells even when no
# compound was flagged (an empty list would otherwise give a column-less frame).
matches = pd.DataFrame(matches, columns=["chembl_id", "rdkit_molecule", "pains"])
new_data = new_data.loc[clean]

In [None]:
# Persist the PAINS-filtered compounds. The "ROMol" column holds in-memory RDKit
# molecule objects, which a CSV cannot store in a reloadable form, so it is
# left out of the export; the molecules can be rebuilt from the "smiles" column.
# errors="ignore" keeps this cell working if the column was never added.
new_data.drop(columns=["ROMol"], errors="ignore").to_csv(
    '/content/sample_data/molecules_after_PAINS_Filteration.csv', index=True
)

In [None]:
# Report how the PAINS screen split the data set.
print(f"Number of compounds with PAINS: {matches.shape[0]}")
print(f"Number of compounds without PAINS: {new_data.shape[0]}")

In [None]:
# Show the first six flagged molecules, each captioned with the PAINS
# description recorded for it.
Chem.Draw.MolsToGridImage(
    mols=matches["rdkit_molecule"].head(6).tolist(),
    legends=matches["pains"].head(6).tolist(),
)

In [None]:
# NOTE(review): both names are already imported in the import cell at the top
# of the notebook; these re-imports are redundant.
import pandas as pd
from rdkit import Chem

# Load the unwanted-substructure definitions and compile every SMARTS pattern
# into an RDKit query molecule.
substructures = pd.read_csv('/content/sample_data/unwanted_substructures.csv', sep=",")
substructures["rdkit_molecule"] = substructures.smarts.apply(Chem.MolFromSmarts)

# MolFromSmarts returns None for a pattern it cannot parse. Fail here, listing
# the offending patterns, rather than later inside the matching loop.
invalid_smarts = substructures.loc[substructures["rdkit_molecule"].isna(), "smarts"]
if not invalid_smarts.empty:
    raise ValueError(f"Invalid SMARTS patterns: {invalid_smarts.tolist()}")

print("Number of unwanted substructures in collection:", len(substructures))


Number of unwanted substructures in collection: 99


In [None]:
# Draw the substructure patterns at positions 2-4 together with their names.
Chem.Draw.MolsToGridImage(
    mols=list(substructures["rdkit_molecule"].iloc[2:5]),
    legends=list(substructures["name"].iloc[2:5]),
)

In [None]:
# Screen every remaining compound against all unwanted substructures. One
# record is stored per (compound, substructure) hit, so a compound can occur in
# `matches` several times; compounds without any hit are kept in `new_data`.

# The pattern table does not change while looping, so pair names and query
# molecules once instead of re-iterating the frame for every compound.
patterns = list(zip(substructures["name"], substructures["rdkit_molecule"]))

matches = []
clean = []
unparsable = []
for index, row in tqdm(new_data.iterrows(), total=new_data.shape[0]):
    molecule = Chem.MolFromSmiles(row.smiles)
    if molecule is None:
        # MolFromSmiles signals a parse failure by returning None. Such a
        # compound cannot be screened, so it is reported below and left out
        # instead of crashing the loop.
        unparsable.append(index)
        continue

    hits = [
        {
            "chembl_id": row.molecule_chembl_id,
            "rdkit_molecule": molecule,
            "substructure": pattern,
            "substructure_name": name,
        }
        for name, pattern in patterns
        if molecule.HasSubstructMatch(pattern)
    ]
    if hits:
        matches.extend(hits)
    else:
        clean.append(index)

if unparsable:
    print(f"Skipped {len(unparsable)} compounds with unparsable SMILES: {unparsable}")

# Explicit columns keep the frame usable by the following cells (e.g. the
# group-by on "substructure_name") even when nothing matched.
matches = pd.DataFrame(
    matches,
    columns=["chembl_id", "rdkit_molecule", "substructure", "substructure_name"],
)
new_data = new_data.loc[clean]

In [None]:
# Persist the fully filtered compounds. The "ROMol" column holds in-memory RDKit
# molecule objects, which a CSV cannot store in a reloadable form, so it is
# left out of the export; the molecules can be rebuilt from the "smiles" column.
# errors="ignore" keeps this cell working if the column was never added.
# NOTE(review): index=False discards the frame's index here, while the PAINS
# export earlier in the notebook is written with index=True. Confirm that the
# difference is intended.
new_data.drop(columns=["ROMol"], errors="ignore").to_csv(
    '/content/sample_data/final_cleaned_file_from_substructures.csv', index=False
)

In [None]:
# Report the number of recorded hits (one per compound/substructure pair) and
# the number of compounds that had none.
print(f"Number of found unwanted substructure: {matches.shape[0]}")
print(f"Number of compounds without unwanted substructure: {new_data.shape[0]}")

In [None]:
# For the first three hits, look up which atoms of each molecule match its
# unwanted substructure so that they can be highlighted in the drawing.
to_highlight = [
    molecule.GetSubstructMatch(pattern)
    for molecule, pattern in zip(
        matches.head(3)["rdkit_molecule"], matches.head(3)["substructure"]
    )
]
Chem.Draw.MolsToGridImage(
    mols=matches["rdkit_molecule"].head(3).tolist(),
    highlightAtomLists=to_highlight,
    legends=matches["substructure_name"].head(3).tolist(),
)

In [None]:
# Count the hits per unwanted substructure and show the ten most frequent.
groups = matches.groupby("substructure_name")
group_frequencies = groups.size().sort_values(ascending=False)
group_frequencies.head(10)