In [147]:
import os
import re
import sys
import yaml
import optuna
import joblib
import copy
import subprocess
import shutil
from optuna.storages import JournalFileStorage, JournalStorage
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from ase import Atoms
from ase.io import read, write
from ase.calculators.singlepoint import SinglePointCalculator
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import get_unique_elements

multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/home/lukas/anaconda3/envs/soc/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/home/lukas/anaconda3/envs/soc/lib/python3.7/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "/home/lukas/anaconda3/envs/soc/lib/python3.7/site-packages/nequip/data/dataset.py", line 790, in _ase_dataset_reader
    if global_index in include_frames
  File "/home/lukas/anaconda3/envs/soc/lib/python3.7/site-packages/nequip/data/AtomicData.py", line 449, in from_ase
    **add_fields,
  File "/home/lukas/anaconda3/envs/soc/lib/python3.7/site-packages/nequip/data/AtomicData.py", line 318, in from_points
    pbc=pbc,
  File "/home/lukas/anaconda3/envs/soc/lib/python3.7/site-packages/nequip/data/AtomicData.py", line 777, in neighbor_list_and_relative_vec
    f"Every single atom has no neighbors within the cutoff r_max={r_max} (after elimi

In [15]:
isozymes = ["3A4", "RLM", "HLC"]
data_splits = ["train", "test"]
splitters = ["rand", "scaff", "time"]


def _build_rel_paths(isozyme_names):
    """Assemble the lookup table of project-relative input/output paths.

    For every isozyme the table holds, in this insertion order:
      * ``{isozyme}``                          - the full source .csv
      * ``{isozyme}_{split}_{splitter}``       - the train/test .csv of each splitter
      * ``{isozyme}_{split}_xyz_{splitter}``   - the matching .extxyz positions file
      * ``{isozyme}_{splitter}_config``        - the nequip .yaml config of each splitter
    """
    # short splitter tag -> directory name used on disk
    split_dirs = {"rand": "random", "scaff": "scaffold_splitter", "time": "time_split"}
    # short splitter tag -> stem used in the config file name
    config_stems = {"rand": "rand", "scaff": "scaffold", "time": "time"}

    paths = {}
    for name in isozyme_names:
        paths[name] = f"project_resources/{name}.csv"

        for tag in ("scaff", "rand", "time"):
            for part in ("train", "test"):
                paths[f"{name}_{part}_{tag}"] = (
                    f"project_resources/base_splits/{split_dirs[tag]}/{name}_{part}.csv"
                )

        for tag in ("rand", "scaff", "time"):
            for part in ("train", "test"):
                paths[f"{name}_{part}_xyz_{tag}"] = (
                    f"project_resources/nequip/positions/{split_dirs[tag]}/{name}_{part}_mol_positions.extxyz"
                )

        for tag in ("rand", "scaff", "time"):
            paths[f"{name}_{tag}_config"] = (
                f"project_resources/nequip/{name}_{config_stems[tag]}_config.yaml"
            )
    return paths


rel_paths = _build_rel_paths(isozymes)
# Lookup table of Optuna sampler classes keyed by class name. A sampler proposes the hyperparameter
# values that each new trial evaluates. The values are classes, not instances: instantiate one
# (e.g. samplers["TPESampler"]()) before handing it to optuna.create_study.
samplers = {
    'RandomSampler': optuna.samplers.RandomSampler,          # Draws every hyperparameter independently at random from the search space.
    'GridSampler': optuna.samplers.GridSampler,              # Evaluates the points of an explicit grid; the grid has to be passed to the constructor.
    'TPESampler': optuna.samplers.TPESampler,                # Tree-structured Parzen Estimator: models good and bad trials and samples where good ones are likely.
    'CmaEsSampler': optuna.samplers.CmaEsSampler,            # Covariance Matrix Adaptation Evolution Strategy; intended for numerical (non-categorical) parameters.
    'NSGAIISampler': optuna.samplers.NSGAIISampler,          # NSGA-II multi-objective evolutionary algorithm (non-dominated sorting + crowding distance).
    'QMCSampler': optuna.samplers.QMCSampler,                # Quasi-Monte Carlo sampler: low-discrepancy sequences cover the space more evenly than random sampling.
    'BoTorchSampler': optuna.integration.BoTorchSampler,     # Bayesian optimization backed by BoTorch; presumably needs the optional botorch package installed - confirm.
    'BruteForceSampler': optuna.samplers.BruteForceSampler,  # Exhaustively evaluates all combinations of the (finite) search space.
}
# Lookup table of Optuna pruner classes keyed by class name. A pruner stops unpromising trials
# early, based on the intermediate values a trial reports. As above, the values are classes that
# must be instantiated before use.
pruners = {
    'BasePruner': optuna.pruners.BasePruner,                            # Abstract base class of all pruners; subclass it for a custom strategy, do not use it directly.
    'MedianPruner': optuna.pruners.MedianPruner,                        # Prunes a trial whose best intermediate value is worse than the median of earlier trials at the same step.
    'SuccessiveHalvingPruner': optuna.pruners.SuccessiveHalvingPruner,  # Asynchronous Successive Halving: only the best fraction of trials is promoted to the next resource level.
    'HyperbandPruner': optuna.pruners.HyperbandPruner,                  # Hyperband: runs several successive-halving brackets with different resource allocations.
    'PercentilePruner': optuna.pruners.PercentilePruner,                # Prunes a trial whose best intermediate value falls outside the given percentile of earlier trials at the same step.
    'NopPruner': optuna.pruners.NopPruner,                              # Never prunes any trial.
    'ThresholdPruner': optuna.pruners.ThresholdPruner,                  # Prunes a trial whose metric exceeds an upper bound, drops below a lower bound or becomes NaN.
    'PatientPruner': optuna.pruners.PatientPruner,                      # Wraps another pruner and only lets it prune after `patience` steps without improvement.
}
# Containers filled by the cells below.
smiles = {}           # [splitter][isozyme][split] -> SMILES strings of the split
halflives = {}        # [splitter][isozyme][split] -> natural log of the "half-life" column
split_indexes = {}    # [splitter][isozyme][split] -> values of the "index" column of the split .csv
position_blocks = {}  # [splitter][isozyme][split] -> per-molecule lists of atom coordinates
rdkit_symbols = {}    # [splitter][isozyme][split] -> per-molecule lists of element symbols
unique_symbols = {}   # [isozyme] -> chemical elements returned by get_unique_elements

In [17]:
# Collect, for every splitter / isozyme / split, the values of the "index" column of the
# split .csv and print how many there are plus a preview of the first 25.
for splitter in splitters:
    print(f"\n{splitter}")
    indexes_by_isozyme = split_indexes[splitter] = {}
    for isozyme in isozymes:
        print(isozyme)
        indexes_by_split = indexes_by_isozyme[isozyme] = {}
        for split in data_splits:
            csv_key = f"{isozyme}_{split}_{splitter}"
            df = pd.read_csv(rel_paths[csv_key])
            mol_indexes = list(df["index"])
            indexes_by_split[split] = mol_indexes
            print(len(mol_indexes), mol_indexes[:25])


rand
3A4
56 [63, 31, 59, 36, 58, 48, 17, 35, 43, 29, 8, 54, 41, 45, 47, 20, 57, 40, 26, 39, 14, 51, 4, 18, 9]
14 [23, 1, 50, 5, 55, 19, 11, 34, 46, 31, 32, 10, 68, 6]
RLM
1421 [439, 199, 16, 266, 544, 1106, 799, 1609, 1672, 753, 731, 1174, 200, 712, 487, 1330, 1581, 549, 1156, 950, 1186, 586, 1038, 911, 311]
356 [66, 942, 833, 1700, 800, 1532, 1037, 240, 1205, 1478, 937, 1121, 1557, 1720, 1068, 918, 110, 1579, 555, 882, 352, 399, 1512, 1395, 323]
HLC
151 [36, 99, 105, 76, 94, 69, 30, 128, 165, 57, 32, 13, 42, 101, 127, 98, 126, 27, 39, 155, 156, 182, 86, 91, 3]
38 [185, 164, 19, 16, 68, 109, 46, 77, 17, 133, 61, 124, 43, 157, 56, 67, 116, 160, 31, 120, 70, 153, 138, 20, 52]

scaff
3A4
56 [30, 33, 38, 51, 54, 58, 60, 64, 67, 31, 63, 37, 31, 36, 37, 63, 65, 2, 12, 14, 16, 17, 20, 21, 4]
14 [29, 26, 25, 23, 18, 15, 11, 10, 9, 8, 6, 5, 3, 1]
RLM
1421 [41, 59, 70, 320, 356, 417, 590, 638, 746, 778, 791, 926, 928, 946, 1016, 1031, 1071, 1084, 1094, 1102, 1111, 1149, 1193, 1195, 1200]
356 [6

In [18]:
def _smiles_to_symbols_and_positions(smi):
    """Embed one molecule in 3D with RDKit and return its atoms.

    Args:
        smi: SMILES string of the molecule.

    Returns:
        Tuple ``(symbols, positions)``: the element symbol of every atom and the
        matching ``[x, y, z]`` coordinates parsed from RDKit's XYZ block.

    Raises:
        ValueError: if RDKit cannot parse the SMILES or cannot generate a conformer.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES {smi!r}")
    # NOTE(review): explicitOnly=True presumably leaves implicit hydrogens out of the
    # structure - confirm that this is intended
    mol = Chem.AddHs(mol, explicitOnly=True)
    # EmbedMolecule returns -1 when no conformer was generated; the XYZ block would then be
    # empty and an atom-less frame would silently end up in the dataset
    if AllChem.EmbedMolecule(mol) == -1:
        raise ValueError(f"RDKit could not embed a 3D conformer for SMILES {smi!r}")

    mol_symbols = []    # list of atoms in mol
    mol_positions = []  # coordinates of each atom of mol in 3D space
    # the first two lines of an XYZ block are the atom count and a comment line
    for line in Chem.MolToXYZBlock(mol).strip().split("\n")[2:]:
        parts = line.split()
        mol_symbols.append(parts[0])
        mol_positions.append([float(coord) for coord in parts[1:4]])
    return mol_symbols, mol_positions


# Load SMILES and log half-lives of every splitter / isozyme / split and create the matching
# .extxyz file (plus an indexed .txt copy) when it does not exist yet.
for splitter in splitters:
    print(splitter)
    smiles[splitter] = {}
    halflives[splitter] = {}
    position_blocks[splitter] = {}
    rdkit_symbols[splitter] = {}

    for isozyme in isozymes:
        # created once per isozyme: re-creating these inside the split loop would drop the
        # "train" entry as soon as "test" is processed
        smiles[splitter][isozyme] = {}
        halflives[splitter][isozyme] = {}
        position_blocks[splitter][isozyme] = {}
        rdkit_symbols[splitter][isozyme] = {}

        for split in data_splits:
            # SMILES and targets are loaded whether or not the .extxyz file already exists,
            # so later cells see the same data types in both cases
            df = pd.read_csv(rel_paths[f"{isozyme}_{split}_{splitter}"])
            smiles[splitter][isozyme][split] = list(df["smiles"])
            halflives[splitter][isozyme][split] = np.log(np.array(df["half-life"]))

            out_filename_extxyz = rel_paths[f"{isozyme}_{split}_xyz_{splitter}"]
            if os.path.isfile(out_filename_extxyz):
                print(f"{isozyme}_{split}_mol_positions.extxyz already exists")
                continue

            # embed every molecule before anything is written, so that a failing molecule
            # cannot leave a partial file behind that would be skipped on the next run
            isozyme_positions = []
            isozyme_symbols = []
            for smi in smiles[splitter][isozyme][split]:
                mol_symbols, mol_positions = _smiles_to_symbols_and_positions(smi)
                isozyme_positions.append(mol_positions)
                isozyme_symbols.append(mol_symbols)

            # save data to dicts
            position_blocks[splitter][isozyme][split] = isozyme_positions
            rdkit_symbols[splitter][isozyme][split] = isozyme_symbols
            energies = halflives[splitter][isozyme][split]  # log half-life

            # one ASE frame per molecule, with the log half-life attached as its energy
            frames = []
            for mol_positions, mol_symbols, energy in zip(isozyme_positions, isozyme_symbols, energies):
                curr_atoms = Atoms(
                    # set atomic positions
                    positions=mol_positions,
                    # set chemical symbols / species
                    symbols=mol_symbols,
                    # NOTE(review): periodic boundary conditions are switched on although these
                    # are isolated molecules; kept as-is to stay consistent with the files that
                    # were already generated - confirm whether pbc=False is the intended setting
                    pbc=True
                )
                # set calculator to assign targets
                curr_atoms.calc = SinglePointCalculator(curr_atoms, energy=energy)
                frames.append(curr_atoms)

            # generate .extxyz files for train+validation and test sets
            write(out_filename_extxyz, frames, format='extxyz')
            print(f"{out_filename_extxyz} was successfully created")

            # create .txt files which are copies of the .extxyz files
            # with molecule indexes for better cross-referencing with source .csv files
            # in order to include mol indexes, the file is no longer correctly formatted
            # therefore can't be used for training/testing a model
            out_filename_txt = out_filename_extxyz.replace("extxyz", "txt")
            # "w": the file is rebuilt from scratch, a leftover from an aborted run is replaced
            with open(out_filename_txt, "w") as txt_file:
                for mol_idx, curr_atoms in zip(split_indexes[splitter][isozyme][split], frames):
                    txt_file.write(f"molecule index: {mol_idx}\n")
                    write(txt_file, curr_atoms, format='extxyz')
            print(f"{out_filename_txt} was successfully created")

rand
3A4_train_mol_positions.extxyz already exists
3A4_test_mol_positions.extxyz already exists
RLM_train_mol_positions.extxyz already exists
RLM_test_mol_positions.extxyz already exists
HLC_train_mol_positions.extxyz already exists
HLC_test_mol_positions.extxyz already exists
scaff
3A4_train_mol_positions.extxyz already exists
3A4_test_mol_positions.extxyz already exists
RLM_train_mol_positions.extxyz already exists
RLM_test_mol_positions.extxyz already exists
HLC_train_mol_positions.extxyz already exists
HLC_test_mol_positions.extxyz already exists
time
3A4_train_mol_positions.extxyz already exists
3A4_test_mol_positions.extxyz already exists
RLM_train_mol_positions.extxyz already exists
RLM_test_mol_positions.extxyz already exists
HLC_train_mol_positions.extxyz already exists
HLC_test_mol_positions.extxyz already exists


In [20]:
# Determine which chemical elements occur in the full dataset of each isozyme.
for isozyme in isozymes:
    df = pd.read_csv(rel_paths[isozyme])
    isozyme_smiles = [smi for smi in df["smiles"]]
    unique_symbs = get_unique_elements(isozyme_smiles)
    unique_symbols[isozyme] = [*unique_symbs]
    print(f"chemical elements present in {isozyme}: {unique_symbols[isozyme]}")

chemical elements present in 3A4: ['H', 'P', 'O', 'Cl', 'C', 'F', 'S', 'N']
chemical elements present in RLM: ['H', 'Br', 'B', 'O', 'Cl', 'C', 'F', 'S', 'I', 'N']
chemical elements present in HLC: ['H', 'O', 'C', 'F', 'S', 'N']


In [167]:
def create_nequip_eval_yaml(yaml_path, root, trial_number, eval_dataset_file_name, unique_symbols):
    """Write the dataset config that nequip-evaluate uses for one trial.

    Args:
        yaml_path: path of the .yaml file to create (an existing file is overwritten).
        root: value stored under "root"; also the source of the splitter / isozyme labels
            in the log message (expected to end in ``.../{splitter}/{isozyme}``).
        trial_number: number of the trial, only used in the log message.
        eval_dataset_file_name: dataset file to evaluate on, stored under "dataset_file_name".
        unique_symbols: chemical symbols stored under "chemical_symbols".
    """
    data = {
        "root": root,
        "chemical_symbols": unique_symbols,
        "dataset_file_name": eval_dataset_file_name,
        "dataset": "ase",
    }
    with open(yaml_path, "w") as out_f:
        yaml.dump(data, out_f)

    # Take the labels from root (like create_nequip_train_yaml does) instead of reading the
    # notebook-level loop variables, which may be undefined or stale when trials run in threads.
    root_parts = root.rstrip("/").split("/")
    isozyme = root_parts[-1]
    splitter = root_parts[-2] if len(root_parts) > 1 else ""
    print(f"""The evaluation yaml file for the {trial_number}. iteration of {splitter} {isozyme}
        with {root} as root and was successfully created """)

In [163]:
def create_nequip_train_yaml(yaml_path, batch_size, learning_rate, num_layers, root, trial_number,
                       train_dataset_file_name, unique_symbols, num_mols):
    """Write the nequip-train configuration of one trial to ``yaml_path``.

    Args:
        yaml_path: path of the .yaml file to create (an existing file is overwritten).
        batch_size, learning_rate, num_layers: hyperparameters of the trial.
        root: stored under "root"; its last two path components are printed as the
            splitter and the isozyme.
        trial_number: stored (as a string) under "run_name".
        train_dataset_file_name: stored under "dataset_file_name".
        unique_symbols: stored under "chemical_symbols".
        num_mols: number of molecules in the dataset; 80 % (rounded down) become "n_train",
            the rest "n_val".
    """
    with open(yaml_path, "w") as out_f:
        # 80/20 split of the molecules into training and validation frames
        n_train = int(np.floor(0.8 * num_mols))
        n_val = int(num_mols - n_train)

        config = {
            # hyperparameters of this trial
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "num_layers": num_layers,

            # run bookkeeping
            "root": root,  # project_resources/nequip/{splitter}/{isozyme}
            "run_name": str(trial_number),  # number of the trial for a given combination of splitter and isozyme
            "dataset_file_name": train_dataset_file_name,
            "seed": 123,
            "dataset_seed": 456,
            "append": True,
            "model_builders": ["SimpleIrrepsConfig", "EnergyModel", "PerSpeciesRescale", "RescaleEnergyEtc"],

            # fixed model / training settings
            "max_epochs": 3,
            "r_max": 0.4,  # cutoff radius in Angstroms; author's note: changing it had no effect on validation_e_mae
            "l_max": 2,
            "parity": False,  # author's note: slightly worse than True, but trains much faster
            "num_features": 32,
            "num_basis": 8,

            # dataset description
            "dataset": "ase",
            "key_mapping": {"z": "atomic_numbers", "E": "total_energy", "R": "pos"},
            "npz_fixed_field_keys": "atomic_numbers",
            "chemical_symbols": unique_symbols,
            "wandb": False,  # author's note: wandb cannot be used inside a notebook (cannot choose an option)

            # train / validation split and optimisation
            "n_train": n_train,
            "n_val": n_val,
            "train_val_split": "sequential",
            "validation_batch_size": n_val,
            "loss_coeffs": "total_energy",
            "optimizer_name": "Adam",
        }

        root_parts = root.split("/")
        splitter, isozyme = root_parts[-2], root_parts[-1]

        yaml.dump(config, out_f)
        print(f"The train yaml file for the {trial_number}. iteration of {splitter} {isozyme}\n"
              f"        with {root} as root and was successfully created ")

In [140]:
def parse_nequip_xyz_out(xyz):
    """Read the predicted values from the file written by ``nequip-evaluate --output``.

    Every line that carries an ``energy=<value>`` key contributes one prediction, in file order.
    Lines without that key are ignored.

    Args:
        xyz: path of the xyz file.

    Returns:
        List of floats, one per matching line.

    Raises:
        ValueError: if the text after ``energy=`` is not a valid float.
    """
    # The look-behind keeps keys such as "free_energy=" from being mistaken for "energy=".
    # The value is captured as a whole token and converted by float(), so integers and
    # scientific notation (e.g. 3.5e-05) are read completely instead of being truncated.
    energy_key = re.compile(r'(?<![\w])energy=([^\s"]+)')
    energies = []
    with open(xyz) as f:
        for line in f:
            match = energy_key.search(line)
            if match is not None:
                energies.append(float(match.group(1)))
    return energies

In [72]:
def parse_nequip_log(log):
    """Read the energy RMSE from the file written by ``nequip-evaluate --log``.

    Args:
        log: path of the log file.

    Returns:
        The first ``e_rmse = <value>`` entry of the file as a float.

    Raises:
        ValueError: if the file holds no parsable ``e_rmse`` entry.
    """
    with open(log) as log_file:
        file_content = log_file.read()
    # accept plain decimals, integers, signed values, scientific notation and nan/inf, so that
    # very small or diverged errors are read correctly instead of failing to match
    number = r'[-+]?(?:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?|nan|inf)'
    rmse = re.search(r'\s*e_rmse\s*=\s*(' + number + r')', file_content)
    if rmse is None:
        raise ValueError(f"no e_rmse value found in {log}")
    return float(rmse.group(1))

In [176]:
class NequIPTuner():
    """Optuna objective wrapper that trains and evaluates one NequIP model per trial.

    For every trial a train and an evaluation .yaml file are generated, ``nequip-train`` and
    ``nequip-evaluate`` are run as external commands, and the RMSE found in the evaluation log
    is returned as the value to minimise.
    """

    # short splitter tag (first part of run_name) -> directory name; any other tag maps to
    # "time_split"
    _SPLITTER_DIRS = {"rand": "random", "scaff": "scaffold_splitter"}

    def __init__(self, run_name, root, unique_symbols, num_mols, train_dataset_file_name, eval_dataset_file_name):
        self.run_name = run_name  # {splitter}_{isozyme}
        self.root = root
        self.unique_symbols = unique_symbols
        self.num_mols = num_mols  # number of mols present in the train .extxyz file
        self.train_dataset_file_name = train_dataset_file_name
        self.eval_dataset_file_name = eval_dataset_file_name

    def sample_params(self, trial: "optuna.Trial"):
        """Let Optuna suggest batch size, learning rate and number of layers for ``trial``."""
        batch_size = trial.suggest_int("batch_size", 2, 50)
        learning_rate = trial.suggest_float("learning_rate", 0.001, 1)
        num_layers = trial.suggest_int("num_layers", 3, 5)
        return {
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "num_layers": num_layers
        }

    @staticmethod
    def _run_command(command):
        """Run ``command`` (list of arguments), echo its output and fail on a non-zero exit code.

        The arguments are passed without a shell, so paths are never re-interpreted, and the
        output is forwarded line by line through print so it stays visible in the notebook.

        Raises:
            subprocess.CalledProcessError: if the command exits with a non-zero status.
        """
        with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                              universal_newlines=True) as process:
            for line in process.stdout:
                print(line, end="")
        # leaving the with-block waits for the process, so the return code is set here
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, command)

    def train_test_return(self, train_yaml_name, eval_yaml_name, train_dir, return_predictions=False):
        """Train with ``train_yaml_name``, evaluate with ``eval_yaml_name`` and return the RMSE.

        Args:
            train_yaml_name: config file passed to nequip-train.
            eval_yaml_name: dataset config passed to nequip-evaluate.
            train_dir: directory of the training run; the evaluation output and log are written
                into it. It is deleted, together with both yaml files, after a successful run.
            return_predictions: when True, the predictions parsed from the evaluation output
                are returned as well.

        Returns:
            ``rmse``, or ``(rmse, predictions)`` when ``return_predictions`` is True.
        """
        # e.g. train_dir = project_resources/optuna/nequip/random/3A4/1
        xyz_out = f"{train_dir}/output.xyz"  # parsed to get the predicted half-lives
        log_file = f"{train_dir}/evaluation_log.txt"  # parsed to get the rmse

        print(f"running command: nequip-train {train_yaml_name}")
        self._run_command(["nequip-train", train_yaml_name])

        print(f"running command: nequip-evaluate --train-dir {train_dir} \n"
              f"              --dataset-config {eval_yaml_name} \n"
              f"              --output {xyz_out} \n"
              f"              --log {log_file}")
        self._run_command(["nequip-evaluate", "--train-dir", train_dir,
                           "--dataset-config", eval_yaml_name,
                           "--output", xyz_out,
                           "--log", log_file])

        # both result files live inside train_dir, so everything has to be parsed before
        # the directory is removed
        rmse = parse_nequip_log(log_file)
        predictions = parse_nequip_xyz_out(xyz_out) if return_predictions else None

        # delete the dirs and files that were created, as they are no longer needed
        shutil.rmtree(train_dir)
        os.remove(train_yaml_name)
        os.remove(eval_yaml_name)

        if return_predictions:
            return rmse, predictions
        return rmse

    def objective(self, trial=None):
        """Optuna objective: sample hyperparameters, write both yaml files, return the RMSE."""
        parameters = self.sample_params(trial)

        run_name_parts = self.run_name.split("_")
        run_splitter = run_name_parts[0]
        run_isozyme = run_name_parts[1]
        run_splitter_name = self._SPLITTER_DIRS.get(run_splitter, "time_split")

        working_dir = f"project_resources/optuna/nequip/{run_splitter_name}/{run_isozyme}"
        train_yaml_name = f"{working_dir}/{self.run_name}_nequip_train_{trial.number}.yaml"
        eval_yaml_name = f"{working_dir}/{self.run_name}_nequip_eval_{trial.number}.yaml"
        # the train yaml uses the trial number as run name and self.root as root
        train_dir = self.root + f"/{trial.number}"

        # create train yaml file for this trial
        create_nequip_train_yaml(train_yaml_name, parameters["batch_size"], parameters["learning_rate"],
                           parameters["num_layers"], self.root, trial.number, self.train_dataset_file_name,
                           self.unique_symbols, self.num_mols)

        # create evaluation yaml file for this trial
        create_nequip_eval_yaml(eval_yaml_name, self.root, trial.number,
                                self.eval_dataset_file_name, self.unique_symbols)

        return self.train_test_return(train_yaml_name, eval_yaml_name, train_dir)

In [178]:
# Run (or resume) one Optuna study per splitter / isozyme combination.
sampler = samplers["TPESampler"]
# NopPruner instead of BasePruner: BasePruner is the abstract base class and cannot serve as a
# pruner. The objective reports no intermediate values, so no trial is pruned either way.
pruner = pruners["NopPruner"]
n_trials = 1
# short splitter tag -> directory name; any other tag maps to "time_split"
splitter_dir_names = {"rand": "random", "scaff": "scaffold_splitter"}

for splitter in splitters:
    splitter_name = splitter_dir_names.get(splitter, "time_split")

    for isozyme in isozymes:
        root = f"project_resources/optuna/nequip/{splitter_name}/{isozyme}"
        # the journal file is created inside root, so the directory has to exist first
        os.makedirs(root, exist_ok=True)
        journal_path = root + "/nequip_journal.log"
        lock_obj = optuna.storages.JournalFileOpenLock(journal_path)
        storage = JournalStorage(
            JournalFileStorage(journal_path, lock_obj=lock_obj)
        )

        # create_study expects sampler / pruner *instances*; a fresh pair is built per study
        study = optuna.create_study(study_name=f"nequip_{splitter}_{isozyme}", directions=["minimize"],
                                    sampler=sampler(), pruner=pruner(), storage=storage,
                                    load_if_exists=True)

        run_name = f"{splitter}_{isozyme}"
        num_mols = len(smiles[splitter][isozyme]["train"])
        train_dataset_file_name = f"project_resources/nequip/positions/{splitter_name}/{isozyme}_train_mol_positions.extxyz"
        eval_dataset_file_name = f"project_resources/nequip/positions/{splitter_name}/{isozyme}_test_mol_positions.extxyz"
        tuner = NequIPTuner(run_name, root, unique_symbols[isozyme], num_mols, train_dataset_file_name, eval_dataset_file_name)

        study.optimize(tuner.objective, n_trials=n_trials, n_jobs=-1)
        # keep a pickled copy of the study next to its journal
        joblib.dump(study, f"{root}/nequip.pkl")

[I 2023-11-22 22:10:00,107] Using an existing study with name 'nequip_rand_3A4' instead of creating a new one.


The train yaml file for the 1. iteration of random 3A4
        with project_resources/optuna/nequip/random/3A4 as root and was successfully created 
The evaluation yaml file for the 1. iteration of rand 3A4
        with project_resources/optuna/nequip/random/3A4 as root and was successfully created 
running command: nequip-train project_resources/optuna/nequip/random/3A4/rand_3A4_nequip_train_1.yaml
Torch device: cpu
Successfully loaded the data set of type ASEDataset(56)...
Replace string dataset_per_atom_total_energy_std to 0.04218994081020355
Replace string dataset_per_atom_total_energy_mean to -0.08114522695541382
Atomic outputs are scaled by: [H, C, N, O, F, P, S, Cl: 0.042190], shifted by [H, C, N, O, F, P, S, Cl: -0.081145].
Replace string dataset_total_energy_std to tensor([1.4029])
Initially outputs are globally scaled by: tensor([1.4029]), total_energy are globally shifted by None.
Successfully built the network...
Number of weights: 240664
Number of trainable weights: 240664

[I 2023-11-22 22:10:25,662] Trial 1 finished with value: 1.626565 and parameters: {'batch_size': 25, 'learning_rate': 0.5642759064409827, 'num_layers': 4}. Best is trial 0 with value: 1.588775.
[I 2023-11-22 22:10:25,668] Using an existing study with name 'nequip_rand_RLM' instead of creating a new one.


The train yaml file for the 1. iteration of random RLM
        with project_resources/optuna/nequip/random/RLM as root and was successfully created 
The evaluation yaml file for the 1. iteration of rand RLM
        with project_resources/optuna/nequip/random/RLM as root and was successfully created 
running command: nequip-train project_resources/optuna/nequip/random/RLM/rand_RLM_nequip_train_1.yaml
Torch device: cpu
Processing dataset...


KeyboardInterrupt: 