In [1]:
import os
import pandas as pd

from tqdm import tqdm
from multiprocessing import Pool, cpu_count

from STOUT import translate_forward, translate_reverse

2025-08-29 16:46:13.447658: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-29 16:46:13.503657: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-08-29 16:46:15.210685: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


/home/rlawlsgurjh/.data/STOUT-V2/models/translator_forward/
Downloading trained model to /home/rlawlsgurjh/.data/STOUT-V2/models


Downloading models.zip: 0.00B [00:00, ?B/s]

DownloadError: Failed with urllib to download https://zenodo.org/records/12542360/files/models.zip?download=1 to /home/rlawlsgurjh/.data/STOUT-V2/models.zip

In [None]:
# Round-trip smoke test of the two STOUT translators on a single molecule.
# NOTE: SMILES and IUPAC_name are each rebound in the second half of this
# cell, so afterwards IUPAC_name holds the hand-written name below and
# SMILES holds whatever translate_reverse returned (not the literal above).

# SMILES to IUPAC name translation
SMILES = "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"
IUPAC_name = translate_forward(SMILES)
print(f"🧪 IUPAC name of {SMILES} is: {IUPAC_name}")

# IUPAC name to SMILES translation
IUPAC_name = "1,3,7-trimethylpurine-2,6-dione"
SMILES = translate_reverse(IUPAC_name)
print(f"🔬 SMILES of {IUPAC_name} is: {SMILES}")

In [None]:
# Project root. Defaults to the original author's checkout; set the
# RXNFLOW_ROOT environment variable to run the notebook on another machine
# without editing this cell. An unset or empty variable falls back to the default.
ROOT_DIR = os.environ.get('RXNFLOW_ROOT') or os.path.join(
    '/home', 'rlawlsgurjh', 'hdd', 'work', 'RxnFlow'
)

# Directory holding the building-block catalogue and derived files.
DATA_DIR = os.path.join(ROOT_DIR, 'data', 'building_blocks')

# Input: tab-separated .smi file read by check_iupac_parallel (first field = SMILES).
SMI_PATH = os.path.join(DATA_DIR, 'enamine_catalog.smi')

# CSV path for the translation results.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether the results are meant to be written here.
CHECKPOINT_PATH = os.path.join(DATA_DIR, 'iupac_results.csv')

In [None]:
def fetch_iupac(smi):
    """Translate one SMILES string with ``translate_forward`` and tag the outcome.

    Args:
        smi: The SMILES string handed to ``translate_forward``.

    Returns:
        A 3-tuple ``(status, smi, iupac_name)``:
        ``("success", smi, name)`` when the translator returns a non-empty
        ``str``; ``("error", smi, None)`` when it returns anything else or
        raises. Exceptions are deliberately swallowed so that a single bad
        input does not abort a batch run.
    """
    failure = ("error", smi, None)

    try:
        name = translate_forward(smi)
    except Exception:
        return failure

    # Only a non-empty string counts as a usable name.
    if isinstance(name, str) and name:
        return ("success", smi, name)
    return failure

In [None]:
def _build_iupac_frame(smiles, iupac, errors):
    """Assemble the result frame, padding shorter columns with None to equal length."""
    n_rows = max(len(smiles), len(iupac), len(errors))

    def pad(values):
        return values + [None] * (n_rows - len(values))

    return pd.DataFrame({
        "smiles": pad(smiles),
        "iupac": pad(iupac),
        "error": pad(errors),
    })


def _iter_iupac_results(smiles, num_cpus):
    """Yield ``fetch_iupac`` results, in-process for one worker, otherwise via a Pool."""
    if num_cpus == 1:
        # A single worker gains nothing from a pool; stay in this process.
        yield from map(fetch_iupac, smiles)
        return

    with Pool(processes=num_cpus) as pool:
        # imap_unordered yields results as they complete, keeping the progress bar live.
        yield from pool.imap_unordered(fetch_iupac, smiles)


def check_iupac_parallel(smi_path, num_cpus=None):
    """Read a .smi file and translate every SMILES in it with ``fetch_iupac``.

    The SMILES of each line is the text before the first tab; lines whose
    first field is empty are skipped.
    NOTE(review): only tab is treated as a delimiter, so a space-delimited
    line is passed to the translator whole — confirm the input format.

    Args:
        smi_path: Path of the .smi file to read.
        num_cpus: Number of worker processes. ``None`` uses ``cpu_count()``;
            ``1`` runs in the current process without a pool.

    Returns:
        A ``pandas.DataFrame`` with columns ``smiles``, ``iupac`` and ``error``.
        ``smiles``/``iupac`` are row-aligned and hold the successful
        translations. ``error`` is an independent list of the SMILES that
        failed and is NOT row-aligned with the other two columns. Whichever
        side is shorter is padded with missing values, because building the
        frame from unequal-length lists raises ``ValueError``. With more than
        one worker the row order is completion order, not file order.
        An empty frame with the same columns is returned when the file does
        not exist or contains no SMILES.
    """
    if num_cpus is None:
        # Default to using all available CPU cores
        num_cpus = cpu_count()
        print(f"Using {num_cpus} CPU cores.")

    # Collect the first tab-separated field of every line that has one.
    try:
        with open(smi_path, "r") as f:
            first_fields = (line.strip().split('\t')[0] for line in f)
            smiles_to_process = [smi for smi in first_fields if smi]
    except FileNotFoundError:
        print(f"Error: File not found at {smi_path}")
        return _build_iupac_frame([], [], [])

    if not smiles_to_process:
        # Nothing to translate: skip starting worker processes.
        return _build_iupac_frame([], [], [])

    translated_smiles, translated_names, failed_smiles = [], [], []

    progress = tqdm(
        _iter_iupac_results(smiles_to_process, num_cpus),
        total=len(smiles_to_process),
        desc="Processing SMILES -> IUPAC",
    )
    for status, smi, iupac in progress:
        if status == "success":
            translated_smiles.append(smi)
            translated_names.append(iupac)
        else:
            failed_smiles.append(smi)

    return _build_iupac_frame(translated_smiles, translated_names, failed_smiles)

In [None]:
# Translate the whole catalogue with 12 worker processes.
df = check_iupac_parallel(SMI_PATH, num_cpus=12)