In [1]:
import sys
import nequip
import yaml
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from ase import Atoms
from ase.io import read, write
from ase.calculators.singlepoint import SinglePointCalculator
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import *

importing Jupyter notebook from C:\Users\Lukas\Documents\datacytochromy\project_resources\cytochrome_P450.ipynb


In [2]:
isozymes = ["3A4", "RLM", "HLC"]
rel_paths = {
    "3A4_source": r"project_resources/ChEMBL_3A4.csv",
    "3A4_sep": ";",
    "3A4": r"project_resources/3A4.csv",
    "3A4_nequip_train_xyz": r"project_resources/nequip/positions/3A4_test_mol_positions.extxyz",
    "3A4_nequip_yaml": r"project_resources/nequip/3A4_config.yaml",
    
    "RLM_source": r"project_resources/AID_1508591_datatable_all.csv",
    "RLM_sep": ",",
    "RLM": r"project_resources/RLM.csv",
    "RLM_nequip_train_xyz": r"project_resources/nequip/positions/RLM_test_mol_positions.extxyz",
    "RLM_nequip_yaml": r"project_resources/nequip/RLM_config.yaml",

    "HLC_source": r"project_resources/AID_1508603_datatable_all.csv",
    "HLC_sep": ",",
    "HLC": r"project_resources/HLC.csv",
    "HLC_nequip_train_xyz": r"project_resources/nequip/positions/HLC_test_mol_positions.extxyz",
    "HLC_nequip_yaml": r"project_resources/nequip/HLC_config.yaml",
}
smiles = {}
position_blocks = {}
rdkit_symbols = {}
unique_symbols= {}

In [8]:
for isozyme in isozymes:
    try:
        read(f"project_resources/nequip/positions/{isozyme}_train_mol_positions.extxyz")
        read(f"project_resources/nequip/positions/{isozyme}_test_mol_positions.extxyz")
        df = pd.read_csv(rel_paths[isozyme])
        smiles[isozyme] = df["smiles"]
        print(f"both train and test .extxyz files already exist for {isozyme}")
    except FileNotFoundError:
        # create .extxyz files for train-validation and test for isozyme
        df = pd.read_csv(rel_paths[isozyme])
        smiles[isozyme] = df["smiles"]
        isozyme_positions = []
        isozyme_symbols = []


        for smi in smiles[isozyme]:
            # create mol
            mol = Chem.MolFromSmiles(smi)
            mol = Chem.AddHs(mol, explicitOnly=True)
            AllChem.EmbedMolecule(mol)

            # get x, y and z positions of each atom from mol
            xyz_string = Chem.MolToXYZBlock(mol)

            # for looping over the positions
            lines = xyz_string.strip().split("\n")[2:]

            mol_positions = [] # corrdinates of each atom of mol in 3D space
            mol_symbols = [] # list of atoms in mol
            
            # get each atom and its coordinates
            for line in lines:
                parts = line.split()
                symbol = parts[0]
                x, y, z = map(float, parts[1:4])
                mol_symbols.append(symbol)
                mol_positions.append([float(coord) for coord in parts[1:]])

            isozyme_positions.append(mol_positions)
            isozyme_symbols.append(mol_symbols)


        # list of positions for each mol in train-validation set, --||-- in test set
        isozyme_pos_tr, isozyme_pos_test = list_splitter(isozyme_positions, 0.8)
        # list of symbols for each mol...
        isozyme_symbs_tr, isozyme_symbs_test = list_splitter(isozyme_symbols, 0.8)


        position_blocks[isozyme] = {}
        position_blocks[isozyme]["train"] = isozyme_pos_tr
        position_blocks[isozyme]["test"] = isozyme_pos_test
        # save data to dicts
        rdkit_symbols[isozyme] = {}
        rdkit_symbols[isozyme]["train"] = isozyme_symbs_tr
        rdkit_symbols[isozyme]["test"] = isozyme_symbs_test

        print(len(isozyme_pos_tr), len(isozyme_pos_test), len(isozyme_symbs_tr), len(isozyme_symbs_test))

        # load half-life values and ensure order corresponds with positions and symbols
        halflife = df["half-life"]
        log_halflife = np.log(np.array(halflife))
        tr_halflife, test_halflife = list_splitter(log_halflife, 0.8)
        halflife_dict = {"train": tr_halflife, "test": test_halflife}
        print(log_halflife[:10])

        data_splits = ["train", "test"]
        train_molecules_count = 0
        # generate .extxyz files for train+validation and test sets
        for data_split in data_splits:
            out_filename = f'project_resources/nequip/positions/{isozyme}_{data_split}_mol_positions.extxyz'
            # get data 
            positions = position_blocks[isozyme][data_split] # coordinates of each atom in 3D space
            symbols = rdkit_symbols[isozyme][data_split] # atomic symbols (e.g. C, O, H...)
            energies = halflife_dict[data_split] # log half-life

            # iterate over data and write continuously to extxyz file
            for idx in range(len(positions)):
                curr_atoms = Atoms(
                # set atomic positions
                positions=positions[idx],
                # set chemical symbols / species
                symbols=symbols[idx], 
                # assuming data with periodic boundary conditions, set to false for e.g. for molecules in vacuum
                pbc=True
                )

                # set calculator to assign targets
                calculator = SinglePointCalculator(curr_atoms, energy=energies[idx])
                curr_atoms.calc = calculator

                write(out_filename, curr_atoms, format='extxyz', append=True)
                
            # create .txt files which are copies of the .extxyz files
            # with molecule indexes for better corss-referencing with source .csv files
            # in order to include mol indexes, the file is no longer correctly formatted
            # therefore can't be used for training/testing a module
            out_filename = f"project_resources/nequip/positions/{isozyme}_{data_split}_mol_pos_visual.txt"
            for idx in range(len(positions)):
                curr_atoms = Atoms(
                # set atomic positions
                positions=positions[idx],
                # set chemical symbols / species
                symbols=symbols[idx], 
                # assuming data with periodic boundary conditions, set to false for e.g. for molecules in vacuum
                pbc=True
                )

                # set calculator to assign targets
                calculator = SinglePointCalculator(curr_atoms, energy=energies[idx])
                curr_atoms.calc = calculator

                with open(out_filename, "a") as file:
                    if data_split == "train":
                        file.write(f"molecule index: {str(idx+1)}\n")
                        train_molecules_count += 1
                    else:
                        file.write(f"molecule index: {str(idx+1+train_molecules_count)}\n")
                    file.close()
                write(out_filename, curr_atoms, format='extxyz', append=True)

both train and test .extxyz files already exist for 3A4
both train and test .extxyz files already exist for RLM
both train and test .extxyz files already exist for HLC


In [57]:
for isozyme in isozymes:
    unique_symbols[isozyme] = get_unique_symbols(smiles[isozyme])
    print(f"chemical elements present in {isozyme}: {unique_symbols[isozyme]}")

chemical elements present in 3A4: ['O', 'F', 'N', 'P', 'S', 'C', 'Cl']
chemical elements present in RLM: ['O', 'F', 'Br', 'N', 'P', 'S', 'C', 'I', 'Cl']
chemical elements present in HLC: ['F', 'O', 'N', 'S', 'C']


In [60]:
for isozyme in isozymes:
    try:
        with open(rel_paths[f"{isozyme}_nequip_yaml"]) as f:
            f.close()
        print(f"{isozyme}_config.yaml already exists")
    except FileNotFoundError:
        # create correctly configured .yaml files
        with open("project_resources/nequip/minimal_eng.yaml") as f:
            data = yaml.safe_load(f)
            data["root"] = "project_resources/nequip"
            data["run_name"] = "P450"
            data["dataset"] = "ase"
            data["dataset_url"] = ""

            train_dataset = rel_paths[f"{isozyme}_nequip_train_xyz"]
            data["dataset_file_name"] = train_dataset

            key_mapping = {"z": "atomic_numbers", "E": "total_energy", "R": "pos"}
            data["key_mapping"] = key_mapping

            symbols = unique_symbols[isozyme]
            data["chemical_symbols"] = symbols

            data["wandb"] = "true"
            data["wandb_project"] = "P450"
            with open(rel_paths[f"{isozyme}_nequip_yaml"], "w") as out_f:
                yaml.dump(data, out_f)
            print(data)
            f.close()

{'root': 'project_resources/nequip', 'run_name': 'P450', 'seed': 123, 'dataset_seed': 456, 'model_builders': ['SimpleIrrepsConfig', 'EnergyModel', 'PerSpeciesRescale', 'RescaleEnergyEtc'], 'num_basis': 8, 'r_max': 4.0, 'l_max': 2, 'parity': True, 'num_features': 16, 'dataset': 'ase', 'dataset_url': '', 'dataset_file_name': 'project_resources/nequip/positions/3A4_test_mol_positions.extxyz', 'key_mapping': {'z': 'atomic_numbers', 'E': 'total_energy', 'R': 'pos'}, 'npz_fixed_field_keys': ['atomic_numbers'], 'chemical_symbols': ['O', 'F', 'N', 'P', 'S', 'C', 'Cl'], 'wandb': 'true', 'wandb_project': 'P450', 'n_train': 5, 'n_val': 5, 'batch_size': 1, 'max_epochs': 10, 'loss_coeffs': 'total_energy', 'optimizer_name': 'Adam'}
{'root': 'project_resources/nequip', 'run_name': 'P450', 'seed': 123, 'dataset_seed': 456, 'model_builders': ['SimpleIrrepsConfig', 'EnergyModel', 'PerSpeciesRescale', 'RescaleEnergyEtc'], 'num_basis': 8, 'r_max': 4.0, 'l_max': 2, 'parity': True, 'num_features': 16, 'data

In [62]:
!nequip-train project_resources/nequip/3A4_config.yaml

wandb: Currently logged in as: anony-mouse-699608256107134109. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.15.7
wandb: Run data is saved locally in C:\Users\Lukas\Documents\Jupyter_Notebooks\wandb\run-20230729_173217-1ojplj94
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run P450
wandb:  View project at https://wandb.ai/anony-mouse-699608256107134109/P450?apiKey=ed95dc810867c669f30ba6fdfc866255e910a1e6
wandb:  View run at https://wandb.ai/anony-mouse-699608256107134109/P450/runs/1ojplj94?apiKey=ed95dc810867c669f30ba6fdfc866255e910a1e6
Torch device: cpu
Processing dataset...
Loaded data: Batch(atomic_numbers=[580, 1], batch=[580], cell=[14, 3, 3], edge_cell_shift=[6501656, 3], edge_index=[2, 6501656], pbc=[14, 3], pos=[580, 3], ptr=[15], total_energy=[14, 1])
    processed data size: ~173.63 MB
Traceback (most recent call last):
  File "C:\Users\Lukas\anaconda3\envs\test_env\lib\shutil.py", line 816, in move
    os.rename(src, r

In [69]:
!nequip-train nequip/configs/example.yaml

wandb: Currently logged in as: anony-mouse-699608256107134109. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.15.7
wandb: Run data is saved locally in C:\Users\Lukas\Documents\Jupyter_Notebooks\wandb\run-20230729_182134-9c35nbyc
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run example-run-toluene
wandb:  View project at https://wandb.ai/anony-mouse-699608256107134109/toluene-example?apiKey=ed95dc810867c669f30ba6fdfc866255e910a1e6
wandb:  View run at https://wandb.ai/anony-mouse-699608256107134109/toluene-example/runs/9c35nbyc?apiKey=ed95dc810867c669f30ba6fdfc866255e910a1e6
Torch device: cpu
Downloading http://quantum-machine.org/gdml/data/npz/toluene_ccsd_t.zip
Processing dataset...
Loaded data: Batch(batch=[15000], cell=[1000, 3, 3], edge_cell_shift=[154352, 3], edge_index=[2, 154352], forces=[15000, 3], pbc=[1000, 3], pos=[15000, 3], ptr=[1001], total_energy=[1000, 1])
    processed data size: ~4.63 MB
Traceback (most recent ca

In [1]:
!echo %cd%

C:\Users\Lukas\Documents\Jupyter_Notebooks
