In [1]:
import sys
import nequip
import yaml
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from ase import Atoms
from ase.io import read, write
from ase.calculators.singlepoint import SinglePointCalculator
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import get_unique_elements

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [2]:
isozymes = ["3A4", "RLM", "HLC"]
data_splits = ["train", "test"]
splitters = ["rand", "scaff"]
rel_paths = {
    "3A4": r"project_resources/3A4.csv",
    "3A4_train_scaff": r"project_resources/base_splits/scaffold_splitter/3A4_train.csv",
    "3A4_test_scaff": r"project_resources/base_splits/scaffold_splitter/3A4_test.csv",
    "3A4_train_rand": r"project_resources/base_splits/random/3A4_train.csv",
    "3A4_test_rand": r"project_resources/base_splits/random/3A4_test.csv",
    "3A4_train_xyz_rand": r"project_resources/nequip/positions/random/3A4_train_mol_positions.extxyz",
    "3A4_test_xyz_rand": r"project_resources/nequip/positions/random/3A4_test_mol_positions.extxyz",
    "3A4_train_xyz_scaff": r"project_resources/nequip/positions/scaffold_splitter/3A4_train_mol_positions.extxyz",
    "3A4_test_xyz_scaff": r"project_resources/nequip/positions/scaffold_splitter/3A4_test_mol_positions.extxyz",
    "3A4_rand_config": r"project_resources/nequip/3A4_rand_config.yaml",
    "3A4_scaff_config": r"project_resources/nequip/3A4_scaffold_config.yaml",

    "RLM": r"project_resources/RLM.csv",
    "RLM_train_scaff": r"project_resources/base_splits/scaffold_splitter/RLM_train.csv",
    "RLM_test_scaff": r"project_resources/base_splits/scaffold_splitter/RLM_test.csv",
    "RLM_train_rand": r"project_resources/base_splits/random/RLM_train.csv",
    "RLM_test_rand": r"project_resources/base_splits/random/RLM_test.csv",
    "RLM_train_xyz_rand": r"project_resources/nequip/positions/random/RLM_train_mol_positions.extxyz",
    "RLM_test_xyz_rand": r"project_resources/nequip/positions/random/RLM_test_mol_positions.extxyz",
    "RLM_train_xyz_scaff": r"project_resources/nequip/positions/scaffold_splitter/RLM_train_mol_positions.extxyz",
    "RLM_test_xyz_scaff": r"project_resources/nequip/positions/scaffold_splitter/RLM_test_mol_positions.extxyz",
    "RLM_rand_config": r"project_resources/nequip/RLM_rand_config.yaml",
    "RLM_scaff_config": r"project_resources/nequip/RLM_scaffold_config.yaml",

    "HLC": r"project_resources/HLC.csv",
    "HLC_train_scaff": r"project_resources/base_splits/scaffold_splitter/HLC_train.csv",
    "HLC_test_scaff": r"project_resources/base_splits/scaffold_splitter/HLC_test.csv",
    "HLC_train_rand": r"project_resources/base_splits/random/HLC_train.csv",
    "HLC_test_rand": r"project_resources/base_splits/random/HLC_test.csv",
    "HLC_train_xyz_rand": r"project_resources/nequip/positions/random/HLC_train_mol_positions.extxyz",
    "HLC_test_xyz_rand": r"project_resources/nequip/positions/random/HLC_test_mol_positions.extxyz",
    "HLC_train_xyz_scaff": r"project_resources/nequip/positions/scaffold_splitter/HLC_train_mol_positions.extxyz",
    "HLC_test_xyz_scaff": r"project_resources/nequip/positions/scaffold_splitter/HLC_test_mol_positions.extxyz",
    "HLC_rand_config": r"project_resources/nequip/HLC_rand_config.yaml",
    "HLC_scaff_config": r"project_resources/nequip/HLC_scaffold_config.yaml",
}
smiles = {}
halflives = {}
split_indexes = {}
position_blocks = {}
rdkit_symbols = {}
unique_symbols = {}

In [3]:
for splitter in splitters:
    print(f"\n{splitter}")
    split_indexes[splitter] = {}
    for isozyme in isozymes:
        print(isozyme)
        split_indexes[splitter][isozyme] = {}
        for split in data_splits:
            df = pd.read_csv(rel_paths[f"{isozyme}_{split}_{splitter}"])
            split_indexes[splitter][isozyme][split] = list(df["index"])
            print(len(split_indexes[splitter][isozyme][split]), split_indexes[splitter][isozyme][split][:25])


rand
3A4
56 [63, 31, 59, 36, 58, 48, 17, 35, 43, 29, 8, 54, 41, 45, 47, 20, 57, 40, 26, 39, 14, 51, 4, 18, 9]
14 [23, 1, 50, 5, 55, 19, 11, 34, 46, 31, 32, 10, 68, 6]
RLM
2024 [158, 1791, 791, 2077, 136, 2311, 1413, 565, 2335, 2186, 2270, 1466, 568, 1447, 1104, 124, 1098, 602, 407, 743, 2299, 720, 498, 1065, 343]
507 [2309, 2308, 195, 1883, 765, 171, 1658, 2380, 991, 1135, 1934, 1476, 1451, 1524, 1643, 252, 1002, 1195, 953, 322, 1076, 2505, 135, 2213, 2313]
HLC
151 [36, 99, 105, 76, 94, 69, 30, 128, 165, 57, 32, 13, 42, 101, 127, 98, 126, 27, 39, 155, 156, 182, 86, 91, 3]
38 [185, 164, 19, 16, 68, 109, 46, 77, 17, 133, 61, 124, 43, 157, 56, 67, 116, 160, 31, 120, 70, 153, 138, 20, 52]

scaff
3A4
56 [30, 33, 38, 51, 54, 58, 60, 64, 67, 31, 63, 37, 31, 36, 37, 63, 65, 2, 12, 14, 16, 17, 20, 21, 4]
14 [29, 26, 25, 23, 18, 15, 11, 10, 9, 8, 6, 5, 3, 1]
RLM
2024 [183, 186, 187, 188, 189, 190, 195, 201, 202, 203, 204, 205, 207, 209, 795, 813, 824, 1074, 1110, 1171, 1344, 1392, 1500, 1532, 1

In [4]:
for splitter in splitters:
    print(splitter)
    smiles[splitter] = {}
    halflives[splitter] = {}
    position_blocks[splitter] = {}
    rdkit_symbols[splitter] = {}
    smiles[splitter] = {}

    for isozyme in isozymes:
        smiles[splitter][isozyme] = {}
        halflives[splitter][isozyme] = {}
        position_blocks[splitter][isozyme] = {}
        rdkit_symbols[splitter][isozyme] = {}
        smiles[splitter][isozyme] = {}

        for split in data_splits:
            try:
                read(rel_paths[f"{isozyme}_{split}_xyz_{splitter}"])
                df = pd.read_csv(rel_paths[f"{isozyme}_{split}_{splitter}"])
                smiles[splitter][isozyme][split] = df["smiles"]
                print(f"{isozyme}_{split}_mol_positions.extxyz already exists")

            except FileNotFoundError:
                # create .extxyz files for each combination of splitter, isozyme and split
                df = pd.read_csv(rel_paths[f"{isozyme}_{split}_{splitter}"])
                smiles[splitter][isozyme][split] = list(df["smiles"])
                halflives[splitter][isozyme][split] = np.log(np.array(df["half-life"]))
                isozyme_positions = []
                isozyme_symbols = []

                for smi in smiles[splitter][isozyme][split]:
                    # create mol
                    mol = Chem.MolFromSmiles(smi)
                    mol = Chem.AddHs(mol, explicitOnly=True)
                    AllChem.EmbedMolecule(mol)

                    # get x, y and z positions of each atom from mol
                    xyz_string = Chem.MolToXYZBlock(mol)

                    # for looping over the positions
                    lines = xyz_string.strip().split("\n")[2:]

                    mol_positions = []  # corrdinates of each atom of mol in 3D space
                    mol_symbols = []  # list of atoms in mol

                    # get each atom and its coordinates
                    for line in lines:
                        parts = line.split()
                        symbol = parts[0]
                        x, y, z = map(float, parts[1:4])
                        mol_symbols.append(symbol)
                        mol_positions.append([float(coord) for coord in parts[1:]])

                    isozyme_positions.append(mol_positions)
                    isozyme_symbols.append(mol_symbols)

                position_blocks[splitter][isozyme] = {}
                position_blocks[splitter][isozyme][split] = isozyme_positions
                # save data to dicts
                rdkit_symbols[splitter][isozyme] = {}
                rdkit_symbols[splitter][isozyme][split] = isozyme_symbols

                # generate .extxyz files for train+validation and test sets
                out_filename_extxyz = rel_paths[f"{isozyme}_{split}_xyz_{splitter}"]
                # get data
                positions = position_blocks[splitter][isozyme][split]  # coordinates of each atom in 3D space
                symbols = rdkit_symbols[splitter][isozyme][split]  # atomic symbols (e.g. C, O, H...)
                energies = halflives[splitter][isozyme][split]  # log half-life

                # iterate over data and write continuously to extxyz file
                for pos_idx in range(len(positions)):
                    curr_atoms = Atoms(
                    # set atomic positions
                    positions=positions[pos_idx],
                    # set chemical symbols / species
                    symbols=symbols[pos_idx],
                    # assuming data with periodic boundary conditions, set to false for e.g. for molecules in vacuum
                    pbc=True
                    )

                    # set calculator to assign targets
                    calculator = SinglePointCalculator(curr_atoms, energy=energies[pos_idx])
                    curr_atoms.calc = calculator

                    write(out_filename_extxyz, curr_atoms, format='extxyz', append=True)
                print(f"{out_filename_extxyz} was successfully created")

                # create .txt files which are copies of the .extxyz files
                # with molecule indexes for better corss-referencing with source .csv files
                # in order to include mol indexes, the file is no longer correctly formatted
                # therefore can't be used for training/testing a module
                out_filename_txt = out_filename_extxyz.replace("extxyz", "txt")
                for mol_idx, pos_idx in zip(split_indexes[splitter][isozyme][split], range(len(positions))):
                    curr_atoms = Atoms(
                    # set atomic positions
                    positions=positions[pos_idx],
                    # set chemical symbols / species
                    symbols=symbols[pos_idx],
                    # assuming data with periodic boundary conditions, set to false for e.g. for molecules in vacuum
                    pbc=True
                    )

                    # set calculator to assign targets
                    calculator = SinglePointCalculator(curr_atoms, energy=energies[pos_idx])
                    curr_atoms.calc = calculator

                    with open(out_filename_txt, "a") as file:
                        file.write(f"molecule index: {mol_idx}\n")
                        file.close()
                    write(out_filename_txt, curr_atoms, format='extxyz', append=True)
                print(f"{out_filename_txt} was successfully created")

rand
3A4_train_mol_positions.extxyz already exists
3A4_test_mol_positions.extxyz already exists
RLM_train_mol_positions.extxyz already exists
RLM_test_mol_positions.extxyz already exists
HLC_train_mol_positions.extxyz already exists
HLC_test_mol_positions.extxyz already exists
scaff
3A4_train_mol_positions.extxyz already exists
3A4_test_mol_positions.extxyz already exists
RLM_train_mol_positions.extxyz already exists
RLM_test_mol_positions.extxyz already exists
HLC_train_mol_positions.extxyz already exists
HLC_test_mol_positions.extxyz already exists


In [5]:
for isozyme in isozymes:
    df = pd.read_csv(rel_paths[isozyme])
    isozyme_smiles = list(df["smiles"])
    unique_symbs = get_unique_elements(isozyme_smiles)
    unique_symbols[isozyme] = list(unique_symbs)
    print(f"chemical elements present in {isozyme}: {unique_symbols[isozyme]}")

chemical elements present in 3A4: ['C', 'P', 'H', 'N', 'F', 'O', 'S', 'Cl']
chemical elements present in RLM: ['C', 'I', 'P', 'B', 'H', 'N', 'Br', 'F', 'O', 'S', 'Cl']
chemical elements present in HLC: ['C', 'H', 'N', 'F', 'O', 'S']


In [7]:
with open("project_resources/nequip/test.yaml", "w") as f:
    data = {}
    data["foo"] = "bar"
    yaml.dump(data, f)

In [None]:
for splitter in splitters:
    print(splitter)
    for isozyme in isozymes:
        try:
            with open(rel_paths[f"{isozyme}_{splitter}_config"]) as f:
                f.close()
            print(f"{isozyme}_config.yaml already exists")
        except FileNotFoundError:
            with open(rel_paths[f"{isozyme}_{splitter}_config"], "w") as out_f:
                data = {}

                data["root"] = "P450"
                data["run_name"] = f"{isozyme}_{splitter}"
                data["dataset_file_name"] = rel_paths[f"{isozyme}_train_xyz_{splitter}"].remove("project_resources/nequip/")
                data["seed"] = 123
                data["dataset_seed"] = 456
                data["append"] = True
                data["model_builders"] = ["SimpleIrrepsConfig", "EnergyModel", "PerSpeciesRescale", "RescaleEnergyEtc"]

                # hyperparams:
                data["batch_size"] = 10
                data["learning_rate"] = 0.005
                data["max_epochs"] = 30
                data["r_max"] = 0.4  # cutoff radius in Angstroms
                data["num_layers"] = 4
                data["l_max"] = 2
                data["parity"] = True
                data["num_features"] = 32
                data["num_basis"] = 8

                data["dataset"] = "ase"
                key_mapping = {"z": "atomic_numbers", "E": "total_energy", "R": "pos"}
                data["key_mapping"] = key_mapping
                data["npz_fixed_field_keys"] = "atomic_numbers"
                data["chemical_symbols"] = unique_symbols[isozyme]
                data["wandb"] = True
                data["wandb_project"] = f"{isozyme}_{splitter}"

                num_train_val = int(len(smiles[splitter][isozyme]["train"]))
                num_train = int(np.floor(0.8 * num_train_val))
                num_validation = int(num_train_val - num_train)
                data["n_train"] = num_train
                data["n_val"] = num_validation
                data["train_val_split"] = "sequential"
                data["validation_batch_size"] = num_validation
                data["loss_coeffs"] = "total_energy"
                data["optimizer_name"] = "Adam"

In [26]:
for splitter in splitters:
    print(splitter)
    for isozyme in isozymes:
        try:
            with open(rel_paths[f"{isozyme}_{splitter}_config"]) as f:
                f.close()
            print(f"{isozyme}_config.yaml already exists")
        except FileNotFoundError:
            # create correctly configured .yaml files
            with open("project_resources/nequip/minimal_eng.yaml") as f:
                data = yaml.safe_load(f)
                data["root"] = "P450"
                data["run_name"] = f"{isozyme}_{splitter}"
                data["dataset"] = "ase"
                data["dataset_url"] = ""
                data["max_epochs"] = 3
                data["r_max"] = 0.4

                num_train_val = int(len(smiles[splitter][isozyme]["train"]))
                num_train = int(np.floor(0.8 * num_train_val))
                num_validation = int(num_train_val - num_train)
                data["n_train"] = num_train
                data["n_val"] = num_validation

                train_dataset = rel_paths[f"{isozyme}_train_xyz_{splitter}"].remove("project_resources/nequip/")
                data["dataset_file_name"] = train_dataset

                key_mapping = {"z": "atomic_numbers", "E": "total_energy", "R": "pos"}
                data["key_mapping"] = key_mapping

                symbols = unique_symbols[isozyme]
                data["chemical_symbols"] = symbols

                data["wandb"] = "false"
                data["wandb_project"] = ""
                with open(rel_paths[f"{isozyme}_{splitter}_config"], "w") as out_f:
                    yaml.dump(data, out_f)
                print(data)
                f.close()

rand
{'root': 'P450', 'run_name': '3A4_rand', 'seed': 123, 'dataset_seed': 456, 'model_builders': ['SimpleIrrepsConfig', 'EnergyModel', 'PerSpeciesRescale', 'RescaleEnergyEtc'], 'num_basis': 8, 'r_max': 4.0, 'l_max': 2, 'parity': True, 'num_features': 16, 'dataset': 'ase', 'dataset_url': '', 'dataset_file_name': 'project_resources/nequip/positions/random/3A4_train_mol_positions.extxyz', 'key_mapping': {'z': 'atomic_numbers', 'E': 'total_energy', 'R': 'pos'}, 'npz_fixed_field_keys': ['atomic_numbers'], 'chemical_symbols': ['C', 'Cl', 'S', 'F', 'H', 'N', 'P', 'O'], 'wandb': 'false', 'wandb_project': '', 'n_train': 44, 'n_val': 12, 'batch_size': 1, 'max_epochs': 3, 'loss_coeffs': 'total_energy', 'optimizer_name': 'Adam'}
{'root': 'P450', 'run_name': 'RLM_rand', 'seed': 123, 'dataset_seed': 456, 'model_builders': ['SimpleIrrepsConfig', 'EnergyModel', 'PerSpeciesRescale', 'RescaleEnergyEtc'], 'num_basis': 8, 'r_max': 4.0, 'l_max': 2, 'parity': True, 'num_features': 16, 'dataset': 'ase', 'da