In [1]:
import sys
import yaml
import pandas as pd
import numpy as np
from tdc.single_pred import ADME
from rdkit import Chem
from rdkit.Chem import AllChem
from ase import Atoms
from ase.io import read, write
from sklearn.preprocessing import MinMaxScaler
from ase.calculators.singlepoint import SinglePointCalculator
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import smiles_to_xyz_file, get_unique_elements

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [2]:
# Names under which the three TDC benchmark splits are stored in `tdc_datasets`.
tdc_benchmarks = ["obach", "microsome", "hepatocyte"]


def _build_rel_paths():
    """Assemble the lookup table of project-relative file paths.

    For each of the datasets "3A4", "RLM" and "HLC" the table contains, in
    this insertion order:
      * "<ds>"                        -> the full CSV,
      * "<ds>_<part>_<split>"         -> train/test CSV of a split,
      * "<ds>_<part>_xyz_<split>"     -> train/test extxyz positions file,
      * "<ds>_<split>_config"         -> NequIP YAML config of a split,
    where <part> is "train" or "test" and <split> is "scaff", "rand" or "time".

    Returns:
        dict mapping each key to its path string relative to the notebook.
    """
    base = "project_resources"
    # Short split tag used in the keys -> directory name used on disk.
    split_dirs = {"scaff": "scaffold_splitter", "rand": "random", "time": "time_split"}
    # Short split tag -> tag used inside the config file name.
    config_tags = {"rand": "rand", "scaff": "scaffold", "time": "time"}

    paths = {}
    for dataset in ("3A4", "RLM", "HLC"):
        paths[dataset] = f"{base}/{dataset}.csv"

        for split in ("scaff", "rand", "time"):
            for part in ("train", "test"):
                paths[f"{dataset}_{part}_{split}"] = (
                    f"{base}/base_splits/{split_dirs[split]}/{dataset}_{part}.csv"
                )

        for split in ("rand", "scaff", "time"):
            for part in ("train", "test"):
                paths[f"{dataset}_{part}_xyz_{split}"] = (
                    f"{base}/nequip/positions/{split_dirs[split]}/"
                    f"{dataset}_{part}_mol_positions.extxyz"
                )

        for split in ("rand", "scaff", "time"):
            paths[f"{dataset}_{split}_config"] = (
                f"{base}/nequip/{dataset}_{config_tags[split]}_config.yaml"
            )
    return paths


rel_paths = _build_rel_paths()

# Empty containers that the cells below fill in.
halflives = dict()
tdc_datasets = dict()
rmsds = dict()
ranks = dict()
y_predicted = dict()
best_trials = dict()
best_models = dict()
best_model_hyperparams = dict()
split_indexes = dict()
position_blocks = dict()
rdkit_symbols = dict()
unique_symbols = dict()

In [3]:
# Load the three TDC ADME benchmarks and register each split in
# `tdc_datasets` under the matching name from `tdc_benchmarks`.

# Half-life benchmark.
obach = ADME(name="Half_Life_Obach")
obach_split = tdc_datasets["obach"] = obach.get_split()

# Microsomal clearance benchmark.
microsome = ADME(name="Clearance_Microsome_AZ")
microsome_split = tdc_datasets["microsome"] = microsome.get_split()

# Hepatocyte clearance benchmark.
hepatocyte = ADME(name="Clearance_Hepatocyte_AZ")
hepatocyte_split = tdc_datasets["hepatocyte"] = hepatocyte.get_split()

Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!
Found local copy...
Loading...
Done!


In [4]:
# Scale the target values of every TDC benchmark to the train-set [0, 1] range
# and export SMILES + scaled targets to extxyz files.

# Directory that receives the generated extxyz files (loop-invariant).
file_location = "project_resources/nequip/positions"

for benchmark in tdc_benchmarks:
    benchmark_split = tdc_datasets[benchmark]
    benchmark_train_smiles = benchmark_split["train"]["Drug"]
    benchmark_test_smiles = benchmark_split["test"]["Drug"]

    # MinMaxScaler works on 2-D (n_samples, n_features) input, hence reshape(-1, 1).
    train_targets = np.array(benchmark_split["train"]["Y"]).reshape(-1, 1)
    test_targets = np.array(benchmark_split["test"]["Y"]).reshape(-1, 1)

    # Fit the scaler on the training targets ONLY and reuse it for the test
    # targets. Fitting a second scaler on the test set would put train and test
    # labels on different scales and leak test statistics into the labels.
    # Scaled test values may therefore fall outside [0, 1].
    scaler = MinMaxScaler().fit(train_targets)
    train_halflives_scaled = scaler.transform(train_targets).ravel()
    test_halflives_scaled = scaler.transform(test_targets).ravel()

    # NOTE(review): the recorded output suggests smiles_to_xyz_file skips files
    # that already exist; previously generated *_test.extxyz files presumably
    # have to be deleted to pick up the corrected test scaling - confirm.
    smiles_to_xyz_file(benchmark_train_smiles, train_halflives_scaled, f"{file_location}/{benchmark}_train.extxyz")
    smiles_to_xyz_file(benchmark_test_smiles, test_halflives_scaled, f"{file_location}/{benchmark}_test.extxyz")

obach_train.extxyz already exists
obach_test.extxyz already exists
microsome_train.extxyz already exists
microsome_test.extxyz already exists
hepatocyte_train.extxyz already exists
hepatocyte_test.extxyz already exists


In [5]:
# Record, per TDC benchmark, the chemical element symbols that occur in the
# train and test molecules combined.
# NOTE(review): the recorded output for "obach" lists 'L', which is not an
# element symbol - check get_unique_elements.
for benchmark in tdc_benchmarks:
    split = tdc_datasets[benchmark]
    train_smiles = split["train"]["Drug"]
    test_smiles = split["test"]["Drug"]

    # Train molecules first, then test molecules, as one flat list.
    all_smiles = [*train_smiles, *test_smiles]

    unique_symbols[benchmark] = list(get_unique_elements(all_smiles))
    print(f"chemical elements present in {benchmark}: {unique_symbols[benchmark]}")

chemical elements present in obach: ['H', 'F', 'S', 'I', 'Br', 'Na', 'Li', 'N', 'P', 'L', 'C', 'B', 'Cl', 'O']
chemical elements present in microsome: ['H', 'S', 'F', 'I', 'Br', 'N', 'P', 'C', 'B', 'Cl', 'O']
chemical elements present in hepatocyte: ['H', 'S', 'F', 'I', 'Br', 'N', 'P', 'C', 'B', 'Cl', 'O']


In [18]:
# Write one NequIP YAML config per (splitter, isozyme) pair unless the config
# file already exists.
# NOTE(review): `splitters`, `isozymes` and `smiles` are not defined in the
# visible cells, and `unique_symbols` is only filled for `tdc_benchmarks`
# there - presumably other cells provide them; confirm this cell survives
# "Restart Kernel -> Run All".
for splitter in splitters:
    print(splitter)
    for isozyme in isozymes:
        config_path = rel_paths[f"{isozyme}_{splitter}_config"]

        # Probe for an existing config; never overwrite one.
        try:
            with open(config_path):
                pass
        except FileNotFoundError:
            pass
        else:
            print(f"{config_path} already exists")
            continue

        # Build the complete config BEFORE opening the output file, so that a
        # failure here (e.g. a missing key) cannot leave an empty YAML behind
        # that later runs would report as "already exists".
        data = {}

        data["root"] = "P450"
        data["run_name"] = f"{isozyme}_{splitter}"
        # Dataset path is stored without the "project_resources/nequip/" prefix.
        data["dataset_file_name"] = rel_paths[f"{isozyme}_train_xyz_{splitter}"].replace("project_resources/nequip/", "")
        data["seed"] = 123
        data["dataset_seed"] = 456
        data["append"] = True
        data["model_builders"] = ["SimpleIrrepsConfig", "EnergyModel", "PerSpeciesRescale", "RescaleEnergyEtc"]

        # hyperparams:
        data["batch_size"] = 10  # no large effect on 3A4 (tried 5, 10, 20 - 10 works best)
        data["learning_rate"] = 0.1
        # learning_rate -> validation_e_mae
        # 0.005 -> 1.31374
        # 0.01 -> 0.96237
        # 0.1 -> 0.94379
        data["max_epochs"] = 30
        # cutoff radius in Angstroms; changing the value has no effect on validation_e_mae
        # NOTE(review): 0.4 A is shorter than typical covalent bond lengths, so
        # presumably no atom pairs fall inside the cutoff - confirm.
        data["r_max"] = 0.4
        data["num_layers"] = 4  # little to no effect on validation_e_mae
        data["l_max"] = 2
        data["parity"] = False  # slightly worse performance than True, but training takes way less time
        data["num_features"] = 32
        data["num_basis"] = 8

        data["dataset"] = "ase"
        data["key_mapping"] = {"z": "atomic_numbers", "E": "total_energy", "R": "pos"}
        data["npz_fixed_field_keys"] = "atomic_numbers"
        data["chemical_symbols"] = unique_symbols[isozyme]
        data["wandb"] = False  # impossible to use wandb inside a notebook (cannot choose an option)

        # 80/20 train/validation split of the training molecules; plain Python
        # ints are stored in the config dict.
        num_train_val = len(smiles[splitter][isozyme]["train"])
        num_train = int(np.floor(0.8 * num_train_val))
        num_validation = num_train_val - num_train
        data["n_train"] = num_train
        data["n_val"] = num_validation
        data["train_val_split"] = "sequential"
        data["validation_batch_size"] = num_validation
        data["loss_coeffs"] = "total_energy"
        data["optimizer_name"] = "Adam"

        # The context manager closes the file; no explicit close() is needed.
        with open(config_path, "w") as out_f:
            yaml.dump(data, out_f)
        print(config_path, " was successfully created")

rand
3A4_config.yaml already exists
RLM_config.yaml already exists
HLC_config.yaml already exists
scaff
3A4_config.yaml already exists
RLM_config.yaml already exists
HLC_config.yaml already exists
time
project_resources/nequip/3A4_time_config.yaml  was successfully created
project_resources/nequip/RLM_time_config.yaml  was successfully created
project_resources/nequip/HLC_time_config.yaml  was successfully created
