# Drop rows with missing values for each target

In [2]:
import polars as pl

# Read the host-provided training table, then preview its first rows as the
# cell output.
host_train_csv = '../../data/from_host/train.csv'
host_df = pl.read_csv(host_train_csv)
host_df.head()

id,SMILES,Tg,FFV,Tc,Density,Rg
i64,str,f64,f64,f64,f64,f64
87817,"""*CC(*)c1ccccc1C(=O)OCCCCCC""",,0.374645,0.205667,,
106919,"""*Nc1ccc([C@H](CCC)c2ccc(C3(c4c…",,0.3704102,,,
388772,"""*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(…",,0.37886,,,
519416,"""*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c…",,0.3873239,,,
539187,"""*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCO…",,0.35547,,,


In [3]:
# Keep only the rows that carry a Tg value and expose that value under the
# generic column name 'TARGET', next to the SMILES string.
host_tg_df = (
    host_df
    .drop_nulls(subset='Tg')
    .select(
        pl.col('SMILES'),
        pl.col('Tg').alias('TARGET'),
    )
)

host_tg_df.head()

SMILES,TARGET
str,f64
"""*NC(C)C(=O)NCC(=O)NCC(*)=O""",208.639749
"""*CCCCCCSSCCCCSS*""",-41.266724
"""*C=CCCCCCCCC*""",-17.282022
"""*CCCCCCCCCCOC(=O)c1ccc(C(=O)NC…",4.250403
"""*c1nc2cc3sc(-c4cc(OCCCCCC)c(*)…",168.526313


In [4]:
from unimol_tools.data.conformer import mol2unimolv2
from rdkit import Chem
from rdkit.Chem import AllChem

# Parse one example SMILES containing '*' wildcard atoms and try to give it
# 3D coordinates.
mol = Chem.MolFromSmiles("*C=CCCCCCCCC*")
# mol2unimolv2(mol)

# EmbedMolecule reports success through its return value (0 == success), so
# check it explicitly instead of letting a failed embed surface later as an
# opaque error from GetConformer().
embed_status = AllChem.EmbedMolecule(mol)
assert embed_status == 0, f"3D embedding failed (status {embed_status})"

mol.GetConformer()

<rdkit.Chem.rdchem.Conformer at 0x7fb48f8d3d10>

In [5]:
import traceback
from pathlib import Path

from tqdm import tqdm

# Targets exported by this cell. Only FFV is enabled here; the full list is
# ["Tg", "FFV", "Tc", "Density", "Rg"].
TARGET_NAMES = [
    "FFV",
]


def can_embed(
    smiles_string: str,
    max_atoms: int = 130,
    max_attempts: int = 10,
) -> bool:
    """
    Tell whether a SMILES string is usable for 3D conformer generation.

    Returns True only if all of the following hold:
      * RDKit parses the SMILES (``Chem.MolFromSmiles`` does not return None),
      * the molecule has at most ``max_atoms`` atoms, counted with
        ``onlyExplicit=False``,
      * ``AllChem.EmbedMolecule`` returns status 0 within ``max_attempts``
        attempts.

    Parameters
    ----------
    smiles_string : SMILES text to test.
    max_atoms : size limit; larger molecules are rejected without embedding.
    max_attempts : forwarded to ``EmbedMolecule`` as ``maxAttempts``.

    Any parsing, sanitisation, or embedding error yields False rather than
    raising, so the function is safe to use as a row filter.
    """
    try:
        molecule = Chem.MolFromSmiles(smiles_string)

        # The None check must come before any method call on the molecule;
        # otherwise unparsable input is only handled by the blanket except.
        if molecule is None:
            return False

        # Size gate first: it is cheap and avoids embedding large molecules.
        if molecule.GetNumAtoms(onlyExplicit=False) > max_atoms:
            return False

        embed_status: int = AllChem.EmbedMolecule(
            molecule,
            maxAttempts=max_attempts,
            clearConfs=True,
        )
        return embed_status == 0
    except Exception:
        # Deliberate best-effort: any error raised while parsing or embedding
        # means "not embeddable" for filtering purposes.
        # traceback.print_exc()
        return False

# Destination folder for the per-target CSVs. Create it up front so the write
# below cannot fail on a fresh checkout where the folder is absent.
output_dir = Path('from_host')
output_dir.mkdir(parents=True, exist_ok=True)

for target_name in tqdm(TARGET_NAMES):
    subset_df = (
        host_df
        # keep only rows that have a value for this target
        .drop_nulls(subset=target_name)
        # expose the target under the generic column name 'TARGET'
        .with_columns(
            pl.col(target_name).alias('TARGET')
        )
        # keep only rows whose SMILES passes can_embed
        .filter(
            pl.col("SMILES").map_elements(
                can_embed,
                return_dtype=pl.Boolean,
            )
        )
        .select(['SMILES', 'TARGET'])
    )
    subset_df.write_csv(output_dir / f'{target_name}.csv')

100%|██████████| 1/1 [01:44<00:00, 104.30s/it]


In [15]:
from __future__ import annotations

from pathlib import Path
from typing import Callable

from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
import polars as pl


# Property columns of the host table that each get their own training file.
TARGET_NAMES: list[str] = [
    "Tg",
    "FFV",
    "Tc",
    "Density",
    "Rg",
]

# Folder (relative to the working directory) that receives the per-target
# CSVs; created here if it does not exist yet.
OUTPUT_DIR: Path = Path("from_host")
OUTPUT_DIR.mkdir(exist_ok=True)


def build_smiles_processor(
    target_name: str,
    wildcard_atomic_num: int = 85,
    ffv_max_atoms: int = 110,
) -> Callable[[str], str | None]:
    """
    Build the per-row SMILES converter used for one target's dataset.

    The returned function takes a SMILES string and, in this order:
      1. parses it without sanitisation,
      2. rewrites every wildcard atom ('*', atomic number 0) to
         ``wildcard_atomic_num`` (default 85, i.e. astatine, not carbon),
      3. sanitises the molecule,
      4. for the "FFV" target only, rejects molecules with more than
         ``ffv_max_atoms`` atoms (counted with ``onlyExplicit=False``),
      5. requires ``AllChem.EmbedMolecule`` to return status 0,
      6. returns the canonical isomeric SMILES of the rewritten molecule.

    Parameters
    ----------
    target_name : name of the target column; only "FFV" enables the size
        filter.
    wildcard_atomic_num : atomic number substituted for wildcard atoms.
    ffv_max_atoms : atom-count limit applied when ``target_name == "FFV"``.

    Returns
    -------
    A callable mapping a SMILES string to its processed canonical SMILES, or
    to None when any step fails, so failed rows can be dropped afterwards.
    """
    apply_size_filter = target_name == "FFV"

    def _process(original_smiles: str) -> str | None:
        """Convert one SMILES; return None if it cannot be processed."""
        try:
            # 1. Parse without sanitising so the wildcard atoms can be
            #    rewritten before sanitisation runs.
            molecule = Chem.MolFromSmiles(original_smiles, sanitize=False)
            if molecule is None:  # unparsable string
                return None

            # 2. Replace wildcard atoms (atomic number 0) with the
            #    placeholder element.
            for atom in molecule.GetAtoms():
                if atom.GetAtomicNum() == 0:
                    atom.SetAtomicNum(wildcard_atomic_num)

            # 3. Full sanitisation of the rewritten molecule.
            Chem.SanitizeMol(molecule)

            # 4. Dataset-specific size filter (FFV only).
            if (
                apply_size_filter
                and molecule.GetNumAtoms(onlyExplicit=False) > ffv_max_atoms
            ):
                return None

            # 5. Conformer generation must succeed (status 0).
            embed_status: int = AllChem.EmbedMolecule(
                molecule, maxAttempts=5, clearConfs=True
            )
            if embed_status != 0:
                return None

            # 6. Canonical SMILES of the rewritten molecule.
            return Chem.MolToSmiles(
                molecule, canonical=True, isomericSmiles=True
            )

        except Exception:
            # Deliberate best-effort: any failure means "drop this row".
            return None

    return _process


for target_name in tqdm(TARGET_NAMES, desc="processing targets"):
    # One converter per target, because the size filter depends on the target.
    preprocess_and_embed = build_smiles_processor(target_name)

    # Rows that actually have a value for this target.
    labelled_df = host_df.drop_nulls(subset=target_name)

    # Two output columns: the converted SMILES (null where conversion failed)
    # and the target value under the generic name "TARGET".
    converted_df = labelled_df.select(
        pl.col("SMILES")
        .map_elements(preprocess_and_embed, return_dtype=pl.Utf8)
        .alias("SMILES"),
        pl.col(target_name).alias("TARGET"),
    )

    # Discard the rows whose SMILES could not be converted.
    subset_df: pl.DataFrame = converted_df.drop_nulls(subset=["SMILES"])

    subset_df.write_csv(OUTPUT_DIR / f"{target_name}_At.csv")


processing targets: 100%|██████████| 5/5 [01:44<00:00, 20.81s/it]


In [5]:
import polars as pl
from rdkit import Chem

# NOTE: despite its name, tg_smiles holds the SMILES column of FFV.csv.
tg_smiles = pl.read_csv('from_host/FFV.csv').get_column('SMILES')

# Atom count per molecule, using the same GetNumAtoms(onlyExplicit=False)
# call that can_embed uses for its size limit.
# NOTE(review): the recorded output shows a maximum of 192 although can_embed
# rejects molecules above 130 atoms. The CSV was presumably written by a
# different version of the filter; re-run the export to confirm.
atom_counts = []
for smiles in tg_smiles:
    atom_counts.append(
        Chem.MolFromSmiles(smiles).GetNumAtoms(onlyExplicit=False)
    )

print(atom_counts[:5])
print(max(atom_counts))

[39, 101, 78, 48, 40]
192


# Sanity check

In [1]:
from unimol_tools import MolTrain, MolPredict

# Quick end-to-end training run on the exported Tg file, to confirm the CSV
# can be consumed by Uni-Mol tools.
train_settings = dict(
    task='regression',
    data_type='molecule',
    epochs=10,
    learning_rate=1e-4,
    batch_size=16,
    kfold=5,
    model_name='unimolv2',
    model_size='84m',
    # NOTE(review): presumably large enough that early stopping never
    # triggers within 10 epochs; confirm against unimol_tools.
    early_stopping=1e9,
    metrics='mae',
    # The recorded log shows conformers being saved to ./exp/Tg.sdf at this
    # cache level.
    conf_cache_level=2,
    save_path='./exp',
)
clf = MolTrain(**train_settings)
clf.fit(data='from_host/Tg.csv')

2025-07-15 23:06:31 | unimol_tools/data/conformer.py | 437 | INFO | Uni-Mol Tools | Start generating conformers...
410it [00:04, 92.45it/s] 
2025-07-15 23:06:35 | unimol_tools/data/conformer.py | 452 | INFO | Uni-Mol Tools | Succeeded in generating conformers for 100.00% of molecules.
2025-07-15 23:06:35 | unimol_tools/data/conformer.py | 469 | INFO | Uni-Mol Tools | Succeeded in generating 3d conformers for 100.00% of molecules.
2025-07-15 23:06:35 | unimol_tools/data/datahub.py | 181 | INFO | Uni-Mol Tools | conf_cache_level is 2, saving conformers to ./exp/Tg.sdf.
2025-07-15 23:06:36 | unimol_tools/data/datahub.py | 195 | INFO | Uni-Mol Tools | Successfully saved sdf file to ./exp/Tg.sdf
2025-07-15 23:06:36 | unimol_tools/data/datahub.py | 146 | INFO | Uni-Mol Tools | Split method: random, fold: 5
2025-07-15 23:06:36 | unimol_tools/train.py | 223 | INFO | Uni-Mol Tools | Output directory already exists: ./exp
2025-07-15 23:06:36 | unimol_tools/tasks/trainer.py | 78 | INFO | Uni-Mol 

In [7]:
import joblib

# Metrics file under 'exp', the save_path given to the sanity-check training
# run. joblib.load unpickles its input, so only point it at files produced
# locally.
exp_metric_path = 'exp/metric.result'
joblib.load(exp_metric_path)

{'mae': 54.72370237721238,
 'pearsonr': 0.7795227932409188,
 'spearmanr': 0.7459312521845795,
 'mse': 5122.315569984664,
 'r2': 0.5991287588357533}

In [9]:
# Metrics file stored under the Tc folder of the UniMol2_2025_07_16 model
# directory. joblib.load unpickles its input, so only point it at trusted
# files.
tc_metric_path = '../../models/UniMol2_2025_07_16/Tc/metric.result'
joblib.load(tc_metric_path)

{'mae': 0.022057177832748225,
 'pearsonr': 0.901795441267618,
 'spearmanr': 0.925422367748698,
 'mse': 0.001351675419052299,
 'r2': 0.8124799961411993}