In [6]:
import os
import re
import sys
import yaml
import optuna
import joblib
import copy
import subprocess
import shutil
from tdc.single_pred import ADME
from sklearn.preprocessing import MinMaxScaler
from optuna.storages import JournalFileStorage, JournalStorage
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from ase import Atoms
from ase.io import read, write
from ase.calculators.singlepoint import SinglePointCalculator
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import smiles_to_xyz_file, get_unique_elements

In [7]:
tdc_benchmarks = ["obach", "microsome", "hepatocyte"]
# sampler - a method used to generate new sets of hyperparameters in each iteration of the optimization process
samplers = {
    'RandomSampler': optuna.samplers.RandomSampler,          # Sampler that selects hyperparameters randomly from the search space.
    'GridSampler': optuna.samplers.GridSampler,              # Sampler that performs a grid search over the hyperparameter space.
    'TPESampler': optuna.samplers.TPESampler,                # Sampler that uses a tree-structured Parzen estimator to model the objective function and sample new points from the search space.
    'CmaEsSampler': optuna.samplers.CmaEsSampler,            # Sampler that uses the Covariance Matrix Adaptation Evolution Strategy algorithm to efficiently search the hyperparameter space.
    'NSGAIISampler': optuna.samplers.NSGAIISampler,          # Multi-objective evolutionary algorithm that generates new samples using non-dominated sorting and crowding distance selection.
    'QMCSampler': optuna.samplers.QMCSampler,                # Quasi-Monte Carlo sampler that uses low-discrepancy sequences to sample the search space in a more efficient and evenly distributed way than random sampling.
    'BoTorchSampler': optuna.integration.BoTorchSampler,     # Sampler that leverages the BoTorch library for Bayesian optimization and can handle both continuous and categorical hyperparameters.
    'BruteForceSampler': optuna.samplers.BruteForceSampler,  # Sampler that exhaustively evaluates all possible combinations of hyperparameters in the search space.
}
# pruner - a technique used to eliminate unpromising trials during the course of hyperparameter optimization.
pruners = {
    'BasePruner': optuna.pruners.BasePruner,                            # This is the base class for all pruning strategies in Optuna. It provides a skeleton for implementing custom pruning strategies.
    'MedianPruner': optuna.pruners.MedianPruner,                        # A pruner that prunes unpromising trials that have median objective values, as determined in previous steps.
    'SuccessiveHalvingPruner': optuna.pruners.SuccessiveHalvingPruner,  # This pruner repeatedly splits trials into halves, discarding the lower performing half at each iteration.
    'HyperbandPruner': optuna.pruners.HyperbandPruner,                  # This pruner implements the Hyperband algorithm, which selects promising trials and runs them with different resource allocation schemes to determine the best one.
    'PercentilePruner': optuna.pruners.PercentilePruner,                # A pruner that prunes unpromising trials based on their percentile rank relative to all completed trials.
    'NopPruner': optuna.pruners.NopPruner,                              # A pruner that does nothing and does not prune any trials.
    'ThresholdPruner': optuna.pruners.ThresholdPruner,                  # This pruner prunes trials that have not reached a certain level of performance (i.e., objective value).
    'PatientPruner': optuna.pruners.PatientPruner,                      # This pruner prunes trials that do not show improvement over a certain number of steps (or epochs).
}
tdc_datasets = {}
unique_symbols = {}

In [8]:
obach = ADME(name='Half_Life_Obach')
obach_split = obach.get_split()
tdc_datasets["obach"] = obach_split
microsome = ADME(name='Clearance_Microsome_AZ')
microsome_split = microsome.get_split()
tdc_datasets["microsome"] = microsome_split
hepatocyte = ADME(name='Clearance_Hepatocyte_AZ')
hepatocyte_split = hepatocyte.get_split()
tdc_datasets["hepatocyte"] = hepatocyte_split

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


In [9]:
for benchmark in tdc_benchmarks:
    benchmark_train_smiles = tdc_datasets[benchmark]["train"]["Drug"]
    benchmark_test_smiles = tdc_datasets[benchmark]["test"]["Drug"]
    
    benchmark_train_halflives = tdc_datasets[benchmark]["train"]["Y"]
    reshaped_train_halflife = np.array(benchmark_train_halflives).reshape(-1, 1)
    scaler = MinMaxScaler().fit(reshaped_train_halflife)
    train_halflife_scaled = scaler.transform(reshaped_train_halflife)
    train_halflives_scaled = np.array([val[0] for val in train_halflife_scaled])

    benchmark_test_halflives = tdc_datasets[benchmark]["test"]["Y"]
    reshaped_test_halflife = np.array(benchmark_test_halflives).reshape(-1, 1)
    scaler = MinMaxScaler().fit(reshaped_test_halflife)
    test_halflife_scaled = scaler.transform(reshaped_test_halflife)
    test_halflives_scaled = np.array([val[0] for val in test_halflife_scaled])
    
    file_location = "project_resources/nequip/positions"

    smiles_to_xyz_file(benchmark_train_smiles, train_halflives_scaled, f"{file_location}/{benchmark}_train.extxyz")
    smiles_to_xyz_file(benchmark_test_smiles, test_halflives_scaled, f"{file_location}/{benchmark}_test.extxyz")

obach_train.extxyz already exists
obach_test.extxyz already exists
microsome_train.extxyz already exists
microsome_test.extxyz already exists
hepatocyte_train.extxyz already exists
hepatocyte_test.extxyz already exists


In [45]:
def foo(list_smiles):
    # gets unique symbols from every mol in the list e.g. ["C1=CC=C(C=C1)O", "C1=CSC=C1"] -> ["C", "O", "S"]
    formulae = ""
    for smiles in list_smiles:
        mol = Chem.MolFromSmiles(smiles)
        chemical_formula = Chem.rdMolDescriptors.CalcMolFormula(mol)
        formulae += chemical_formula
    unique_elements = [part for part in formulae if part.isalpha()]
    for idx, element in enumerate(unique_elements):
        two_letter_element = ""
        if element.islower():
            two_letter_element += unique_elements[idx - 1]
            two_letter_element += unique_elements[idx]
            unique_elements.remove(unique_elements[idx])
            unique_elements.remove(unique_elements[idx - 1])
            unique_elements.append(two_letter_element)
    return set(unique_elements)

In [46]:
benchmark = "obach"
benchmark_train_smiles = tdc_datasets[benchmark]["train"]["Drug"]
benchmark_test_smiles = tdc_datasets[benchmark]["test"]["Drug"]
benchmark_all_smiles = list(benchmark_train_smiles) + list(benchmark_test_smiles)

print(foo(benchmark_all_smiles))

{'I', 'Li', 'O', 'H', 'P', 'N', 'C', 'B', 'F', 'S', 'Na', 'Br', 'Cl'}


In [20]:
for benchmark in tdc_benchmarks:
    benchmark_train_smiles = tdc_datasets[benchmark]["train"]["Drug"]
    benchmark_test_smiles = tdc_datasets[benchmark]["test"]["Drug"]
    benchmark_all_smiles = list(benchmark_train_smiles) + list(benchmark_test_smiles)
    
    unique_symbs = get_unique_elements(benchmark_all_smiles)
    unique_symbols[benchmark] = list(unique_symbs)
    print(f"chemical elements present in {benchmark}: {unique_symbols[benchmark]}")

chemical elements present in obach: ['Li', 'S', 'L', 'F', 'N', 'B', 'Cl', 'Br', 'O', 'P', 'H', 'I', 'Na', 'C']
chemical elements present in microsome: ['S', 'N', 'F', 'B', 'Cl', 'Br', 'O', 'P', 'H', 'I', 'C']
chemical elements present in hepatocyte: ['S', 'N', 'F', 'B', 'Cl', 'Br', 'O', 'P', 'H', 'I', 'C']


In [8]:
def create_nequip_eval_yaml(yaml_path, root, trial_number, eval_dataset_file_name, unique_symbols):
    with open(yaml_path, "w") as out_f:
        data = {}
        
        # root is the same as nequip-train, except at the end is also included the number of the run
        data["root"] = root
        data["chemical_symbols"] = unique_symbols
        data["dataset_file_name"] = eval_dataset_file_name
        data["dataset"] = "ase"
        
        yaml.dump(data, out_f)
        print(f"""The evaluation yaml file for the {trial_number}. iteration of {splitter} {isozyme}
        with {root} as root and was successfully created """)
        out_f.close()

In [21]:
def create_nequip_train_yaml(yaml_path, batch_size, learning_rate, num_layers, root, trial_number,
                       train_dataset_file_name, unique_symbols, num_mols):
    with open(yaml_path, "w") as out_f:
        data = {}

        data["batch_size"] = batch_size
        data["learning_rate"] = learning_rate
        data["num_layers"] = num_layers

        data["root"] = root  # project_resources/nequip/{dataset}
        data["run_name"] = str(trial_number)  # number of the trial for a given combination of splitter and isozyme
        data["dataset_file_name"] = train_dataset_file_name
        data["seed"] = 123
        data["dataset_seed"] = 456
        data["append"] = True
        data["model_builders"] = ["SimpleIrrepsConfig", "EnergyModel", "PerSpeciesRescale", "RescaleEnergyEtc"]

        data["max_epochs"] = 3
        data["r_max"] = 0.4  # cutoff radius in Angstroms; changing the value has no effect on validation_e_mae
        data["l_max"] = 2
        data["parity"] = False  # slighly worse performance than True, but training takes way less time
        data["num_features"] = 32
        data["num_basis"] = 8

        data["dataset"] = "ase"
        key_mapping = {"z": "atomic_numbers", "E": "total_energy", "R": "pos"}
        data["key_mapping"] = key_mapping
        data["npz_fixed_field_keys"] = "atomic_numbers"
        data["chemical_symbols"] = unique_symbols
        data["wandb"] = False  # impossible to use wandb inside a notebook (cannot choose an option)

        num_train_val = num_mols  # number of molecules in the dataset
        num_train = int(np.floor(0.8 * num_train_val))
        num_validation = int(num_train_val - num_train)
        data["n_train"] = num_train
        data["n_val"] = num_validation
        data["train_val_split"] = "sequential"
        data["validation_batch_size"] = num_validation
        data["loss_coeffs"] = "total_energy"
        data["optimizer_name"] = "Adam"

        splitter = root.split("/")[-2]
        isozyme = root.split("/")[-1]
        
        yaml.dump(data, out_f)
        print(f"""The train yaml file for the {trial_number}. iteration of {splitter} {isozyme}
        with {root} as root and was successfully created """)
        out_f.close()

In [22]:
def parse_nequip_xyz_out(xyz):
    # get the predicted half-lives from xyz file created after using nequip-evaluate --output
    with open(xyz) as f:
        energies = [float(re.search(r'energy=(-?\d+\.\d+)', line).group(1)) for line in f if "energy" in line]
    return energies

In [23]:
def parse_nequip_log(log):
    # get the rmse value from the log file created after using nequip-evaluate --log
    with open(log) as l:
        file_content = l.read()
    # use regular expressions to find the rmse value
    rmse = re.search(r'\s*e_rmse\s*=\s*(\d+\.\d+)', file_content)
    return float(rmse.group(1))

In [24]:
class NequIPTuner():
    def __init__(self, run_name, root, unique_symbols, num_mols, train_dataset_file_name, eval_dataset_file_name):
        self.run_name = run_name  # {splitter}_{isozyme}
        self.root = root
        self.unique_symbols = unique_symbols
        self.num_mols = num_mols  # number of mols present in the train .extxyz file
        self.train_dataset_file_name = train_dataset_file_name
        self.eval_dataset_file_name = eval_dataset_file_name

    def sample_params(self, trial: optuna.Trial):
        batch_size = trial.suggest_int("batch_size", 2, 50)
        learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
        num_layers = trial.suggest_int("num_layers", 3, 5)
        return {
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "num_layers": num_layers
        }

    def train_test_return(self, train_yaml_name, eval_yaml_name, train_dir, return_predictions=False):
        xyz_out = f"{train_dir}/output.xyz"
        log_file = f"{train_dir}/evaluation_log.txt"
        
        print(f"running command: nequip-train {train_yaml_name}")
        !nequip-train {train_yaml_name}
        # {train_dir} ... project_resources/optuna/nequip/random/3A4/1
        # {eval_yaml_name} ... eval_dataset.yaml
        # {xyz_out} ... project_resources/optuna/nequip/random/3A4/1/output.xyz
        # ^ Parse this to get the predicted half-lives ^
        # {log} ... project_resources/optuna/nequip/random/3A4/1/eval_log.txt
        
        print(f"""running command: nequip-evaluate --train-dir {train_dir} 
              --dataset-config {eval_yaml_name} 
              --output {xyz_out} 
              --log {log_file}""")
        !nequip-evaluate --train-dir {train_dir} --dataset-config {eval_yaml_name} --output {xyz_out} --log {log_file}

        rmse = parse_nequip_log(log_file)
        
        # delete the dirs and files that were created, as they are no longer needed
        shutil.rmtree(train_dir)
        os.remove(train_yaml_name)
        os.remove(eval_yaml_name)
        
        if return_predictions:
            predictions = parse_nequip_xyz_out(xyz_out)
            return rmse, predictions
        else:
            return rmse

    def objective(self, trial=None):
        parameters = self.sample_params(trial)
    
        run_splitter = self.run_name.split("_")[0]
        run_isozyme = self.run_name.split("_")[1]
        if run_splitter == "rand":
            run_splitter_name = "random"
        elif run_splitter == "scaff":
            run_splitter_name = "scaffold_splitter"
        else:
            run_splitter_name = "time_split"

        working_dir = f"project_resources/optuna/nequip/{run_splitter_name}/{run_isozyme}"
        train_yaml_name = f"{working_dir}/{self.run_name}_nequip_train_{trial.number}.yaml"
        eval_yaml_name = f"{working_dir}/{self.run_name}_nequip_eval_{trial.number}.yaml"
        train_dir = self.root + f"/{trial.number}"
        
        # create train yaml file for this trial
        create_nequip_train_yaml(train_yaml_name, parameters["batch_size"], parameters["learning_rate"],
                           parameters["num_layers"], self.root, trial.number, self.train_dataset_file_name,
                           self.unique_symbols, self.num_mols)

        # create evaluation yaml file for this trial
        create_nequip_eval_yaml(eval_yaml_name, self.root, trial.number,
                                self.eval_dataset_file_name, self.unique_symbols)

        return self.train_test_return(train_yaml_name, eval_yaml_name, train_dir)

In [178]:
sampler = samplers["TPESampler"]
pruner = pruners["BasePruner"]
n_trials = 1
for benchmark in tdc_benchmarks:
    root = f"project_resources/optuna/nequip/{benchmark}"
    lock_obj = optuna.storages.JournalFileOpenLock(root + "/nequip_journal.log")
    storage = JournalStorage(
        JournalFileStorage(root + "/nequip_journal.log", lock_obj=lock_obj)
    )

    study = optuna.create_study(study_name=f"nequip_{benchmark}", directions=["minimize"],
                                pruner=pruner, storage=storage, load_if_exists=True)

    run_name = benchmark
    
    num_mols = len(tdc_datasets[benchmark]["train"]["Drug"])
    train_dataset_file_name = f"project_resources/nequip/positions/{benchmark}_train.extxyz"
    eval_dataset_file_name = f"project_resources/nequip/positions/{benchmark}_test.extxyz"
    tuner = NequIPTuner(run_name, root, unique_symbols[benchmark], num_mols, train_dataset_file_name, eval_dataset_file_name)

    study.optimize(tuner.objective, n_trials=n_trials, n_jobs=-1)
    joblib.dump(study, f"{root}/nequip.pkl")

[I 2023-11-22 22:10:00,107] Using an existing study with name 'nequip_rand_3A4' instead of creating a new one.


The train yaml file for the 1. iteration of random 3A4
        with project_resources/optuna/nequip/random/3A4 as root and was successfully created 
The evaluation yaml file for the 1. iteration of rand 3A4
        with project_resources/optuna/nequip/random/3A4 as root and was successfully created 
running command: nequip-train project_resources/optuna/nequip/random/3A4/rand_3A4_nequip_train_1.yaml
Torch device: cpu
Successfully loaded the data set of type ASEDataset(56)...
Replace string dataset_per_atom_total_energy_std to 0.04218994081020355
Replace string dataset_per_atom_total_energy_mean to -0.08114522695541382
Atomic outputs are scaled by: [H, C, N, O, F, P, S, Cl: 0.042190], shifted by [H, C, N, O, F, P, S, Cl: -0.081145].
Replace string dataset_total_energy_std to tensor([1.4029])
Initially outputs are globally scaled by: tensor([1.4029]), total_energy are globally shifted by None.
Successfully built the network...
Number of weights: 240664
Number of trainable weights: 240664

[I 2023-11-22 22:10:25,662] Trial 1 finished with value: 1.626565 and parameters: {'batch_size': 25, 'learning_rate': 0.5642759064409827, 'num_layers': 4}. Best is trial 0 with value: 1.588775.
[I 2023-11-22 22:10:25,668] Using an existing study with name 'nequip_rand_RLM' instead of creating a new one.


The train yaml file for the 1. iteration of random RLM
        with project_resources/optuna/nequip/random/RLM as root and was successfully created 
The evaluation yaml file for the 1. iteration of rand RLM
        with project_resources/optuna/nequip/random/RLM as root and was successfully created 
running command: nequip-train project_resources/optuna/nequip/random/RLM/rand_RLM_nequip_train_1.yaml
Torch device: cpu
Processing dataset...


KeyboardInterrupt: 