In [1]:
import shutil
import subprocess
import tempfile
import time
from os import PathLike
from pathlib import Path

import ase.io
import numpy as np
import pandas as pd
import torchani
from ase.optimize import BFGS
from calculate import calculate_compound
from joblib import Parallel, delayed, parallel_config
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm

Define helper functions

In [2]:
def run_compound(input_file: str | PathLike) -> dict[str, list[float]]:
    try:
        results = calculate_compound(str(input_file))
    except (ValueError, NotImplementedError, IndexError):
        results = {
            "s1_cis": [np.nan],
            "t1_cis": [np.nan],
            "exchange_integral": [np.nan],
            "dsp_scf": [np.nan],
            "dsp_cis": [np.nan],
            "homo_lumo_overlap": [np.nan],
            "oscillator_strength": [np.nan],
        }
    return results

In [3]:
def opt_mmff(smiles: str, output_path: PathLike, random_seed=None):
    """Optimize molecule with MMFF."""
    mol = Chem.MolFromSmiles(smiles)
    mol = Chem.AddHs(mol)
    AllChem.EmbedMolecule(mol, randomSeed=random_seed)
    AllChem.MMFFOptimizeMolecule(mol)
    Chem.MolToXYZFile(mol, str(output_path))

In [4]:
def opt_ani(
    input_path: PathLike,
    output_path: PathLike,
    calculator,
    fmax: float = 0.001,
):
    """Optimize molecule with ANI.

    Reads a structure, attaches ``calculator`` to it, relaxes the geometry
    with BFGS and writes the result with ``plain=True``.

    Args:
        input_path: Structure file read with ``ase.io.read``.
        output_path: Destination passed to ``ase.io.write``.
        calculator: ASE calculator used for energies and forces.
        fmax: Force convergence threshold handed to ``BFGS.run``. The default
            reproduces the previously hard-coded value.
    """
    atoms = ase.io.read(input_path)
    # Assign the calculator via the attribute; Atoms.set_calculator is deprecated.
    atoms.calc = calculator
    optimizer = BFGS(atoms, logfile=None)  # logfile=None silences the optimizer log
    optimizer.run(fmax=fmax)
    ase.io.write(output_path, atoms, plain=True)



In [5]:
def opt_gfn(input_path: PathLike, output_path: PathLike, keywords=None):
    """Optimize molecule with GFN methods.

    Copies the input structure into a temporary directory, runs
    ``xtb <file> --opt <keywords...>`` there with its output discarded, and
    copies the resulting ``xtbopt.xyz`` to ``output_path``.

    Args:
        input_path: Structure file to optimize (``str`` or path object).
        output_path: Destination for the optimized structure.
        keywords: Extra command-line options. Each entry may contain several
            whitespace-separated tokens (e.g. ``"--gfn 2"``).

    Raises:
        FileNotFoundError: If the ``xtb`` executable cannot be found, or if
            the run did not produce ``xtbopt.xyz``.
    """
    input_path = Path(input_path)  # accept plain strings as well as Path objects

    # Build argv directly so a file name containing whitespace stays a single
    # argument; only the keywords themselves are split into tokens.
    extra_args = [token for keyword in (keywords or []) for token in keyword.split()]
    command = ["xtb", input_path.name, "--opt", *extra_args]

    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Copy the input file to the temporary directory
        shutil.copy(input_path, temp_path)

        # Run the command in the temporary directory
        completed = subprocess.run(
            command,
            cwd=temp_path,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        optimized = temp_path / "xtbopt.xyz"
        if not optimized.is_file():
            raise FileNotFoundError(
                f"xtb did not produce 'xtbopt.xyz' for {input_path} "
                f"(exit code {completed.returncode})"
            )
        shutil.copy(optimized, output_path)

In [6]:
def process_compound(
    idx: int, smiles: str, output_dir: PathLike
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Process a single compound with all methods.

    An MMFF geometry is generated from ``smiles`` and then re-optimized
    independently with GFN-FF, GFN2-xTB and ANI-1ccx, each starting from the
    MMFF structure. After every optimization ``run_compound`` is evaluated on
    the resulting geometry.

    Args:
        idx: Compound identifier; used in the file names and as the ``id``
            index of the returned frames.
        smiles: SMILES string of the compound.
        output_dir: Directory that receives the ``{idx}_<METHOD>.xyz`` files
            (``str`` or path object).

    Returns:
        Four single-row frames indexed by ``id`` in the order
        ``(MMFF, GFNFF, GFN2xTB, ANI1ccx)``. Each holds ``opt_time`` (wall
        time of the optimization only, in seconds) plus the columns returned
        by ``run_compound``.
    """
    output_dir = Path(output_dir)  # accept plain strings as well as Path objects

    def _timed(optimize, *args, **kwargs) -> float:
        """Run one optimization and return its wall time in seconds."""
        start = time.perf_counter()
        optimize(*args, **kwargs)
        return time.perf_counter() - start

    def _frame(opt_time: float, results: dict) -> pd.DataFrame:
        """Build the single-row result frame for one method."""
        return pd.DataFrame({"id": [idx], "opt_time": opt_time, **results}).set_index("id")

    # Optimize with MMFF
    path_mmff = output_dir / f"{idx}_MMFF.xyz"
    time_mmff = _timed(opt_mmff, smiles, path_mmff, random_seed=42)
    results_mmff = run_compound(path_mmff)

    # Optimize with GFNFF
    path_gfnff = output_dir / f"{idx}_GFNFF.xyz"
    time_gfnff = _timed(opt_gfn, path_mmff, path_gfnff, keywords=["--gfnff"])
    results_gfnff = run_compound(path_gfnff)

    # Optimize with GFN2xTB
    path_gfn2 = output_dir / f"{idx}_GFN2xTB.xyz"
    time_gfn2 = _timed(opt_gfn, path_mmff, path_gfn2, keywords=["--gfn", "2"])
    results_gfn2 = run_compound(path_gfn2)

    # Optimize with ANI1ccx (calculator construction is excluded from the timing)
    calculator = torchani.models.ANI1ccx().ase()
    path_ani = output_dir / f"{idx}_ANI1ccx.xyz"
    time_ani = _timed(opt_ani, path_mmff, path_ani, calculator)
    results_ani1ccx = run_compound(path_ani)

    # Collect output
    return (
        _frame(time_mmff, results_mmff),
        _frame(time_gfnff, results_gfnff),
        _frame(time_gfn2, results_gfn2),
        _frame(time_ani, results_ani1ccx),
    )

Extract Snakemake parameters (input, output and log paths, thread count)

In [None]:
# NOTE(review): `snakemake` is not defined anywhere in this notebook; it is
# presumably injected by Snakemake's notebook integration at run time -- confirm.
input_path = Path(snakemake.input.path)  # CSV read by the "Load reference data" cell
output_dir = Path(snakemake.output.xyz_dir)  # directory handed to process_compound for the .xyz files
log_path = Path(snakemake.log.progress)  # file that receives the tqdm progress output
n_jobs = snakemake.threads  # forwarded to joblib.Parallel(n_jobs=...)

Load reference data

In [7]:
# The first CSV column becomes the index; the run cell uses the index values as
# compound ids and reads the structures from the "smiles" column.
df = pd.read_csv(input_path, index_col=0)

Run calculations

In [8]:
# Create the output directory. parents/exist_ok keep the cell re-runnable and
# independent of whether the parent directories already exist.
output_dir.mkdir(parents=True, exist_ok=True)

# One (id, SMILES) task per row of the reference table
input_data = list(df["smiles"].items())

# Run the calculation in parallel with joblib. Progress is written to the log
# file; inner_max_num_threads=1 limits each worker to a single inner thread.
with open(log_path, "w") as log_file, parallel_config(
    backend="loky", inner_max_num_threads=1
):
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_compound)(idx, smiles, output_dir)
        for idx, smiles in tqdm(
            input_data, file=log_file, mininterval=5, maxinterval=5
        )
    )



31

2
4
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
5
6
7
8
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
9
10
11
12
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
/Users/Kjell/mambaforge/envs/coulson/lib/python3.11/site-packages/torchani/resources/
/Users/Kjell/mambaforge/env

Extract and save results

In [9]:
# Each entry of `results` is the 4-tuple returned by process_compound for one
# compound; transposing with zip groups the per-compound frames by method.
(
    results_mmff,
    results_gfnff,
    results_gfn2,
    results_ani1ccx,
) = zip(*results)

In [10]:
# Stack the single-row frames of every compound into one table per method.
(
    results_all_mmff,
    results_all_gfnff,
    results_all_gfn2,
    results_all_ani1ccx,
) = (
    pd.concat(frames)
    for frames in (results_mmff, results_gfnff, results_gfn2, results_ani1ccx)
)

In [11]:
# Write one CSV per method to the matching Snakemake output.
for table, output_name in (
    (results_all_mmff, "mmff"),
    (results_all_gfnff, "gfnff"),
    (results_all_gfn2, "gfn2"),
    (results_all_ani1ccx, "ani"),
):
    table.to_csv(getattr(snakemake.output, output_name))