In [1]:
import sys
import random
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from ase import Atoms
from ase.io import read, write
from ase.calculators.singlepoint import SinglePointCalculator
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import *

importing Jupyter notebook from C:\Users\Lukas\Documents\Jupyter_Notebooks\project_resources\cytochrome_P450.ipynb


In [2]:
isozymes = ["3A4", "RLM", "HLC"]
rel_paths = {
    "3A4_source": r"project_resources/ChEMBL_3A4.csv",
    "3A4_sep": ";",
    "3A4": r"project_resources/3A4.csv",
    "3A4_jazzy_mol_fts": r"project_resources/jazzy/3A4_jazzy_mol_fts.csv",

    "RLM_source": r"project_resources/AID_1508591_datatable_all.csv",
    "RLM_sep": ",",
    "RLM": r"project_resources/RLM.csv",
    "RLM_jazzy_mol_fts": r"project_resources/jazzy/RLM_jazzy_mol_fts.csv",

    "HLC_source": r"project_resources/AID_1508603_datatable_all.csv",
    "HLC_sep": ",",
    "HLC": r"project_resources/HLC.csv",
    "HLC_jazzy_mol_fts": r"project_resources/jazzy/HLC_jazzy_mol_fts.csv",
}
position_blocks = {}
rdkit_symbols = {}

In [5]:
for isozyme in isozymes:
    try:
        read(f"project_resources/nequip/{isozyme}_train_validation_mol_positions.extxyz")
        read(f"project_resources/nequip/{isozyme}_test_mol_positions.extxyz")
        read("foo.extxyz")
        print(f"both train-validation and test .extxyz files already exist for {isozyme}")
    except FileNotFoundError:
        # create .extxyz files for train-validation and test for isozyme
        df = pd.read_csv(rel_paths[isozyme])
        smiles = df["smiles"]
        isozyme_positions = []
        isozyme_symbols = []

        # mols in source csv files are ordered from highest half-life to lowest half-life
        # so before train test split randomize the order with seed 42 (can be any integer)
        random.Random(42).shuffle(smiles)
        for smi in smiles:
            # create mol
            mol = Chem.MolFromSmiles(smi)
            mol = Chem.AddHs(mol, explicitOnly=True)
            AllChem.EmbedMolecule(mol)

            # get x, y and z positions of each atom from mol
            xyz_string = Chem.MolToXYZBlock(mol)

            # for looping over the positions
            lines = xyz_string.strip().split("\n")[2:]

            mol_positions = [] # corrdinates of each atom of mol in 3D space
            mol_symbols = [] # list of atoms in mol
            
            # get each atom and its coordinates
            for line in lines:
                parts = line.split()
                symbol = parts[0]
                x, y, z = map(float, parts[1:4])
                mol_symbols.append(symbol)
                mol_positions.append([float(coord) for coord in parts[1:]])

            isozyme_positions.append(mol_positions)
            isozyme_symbols.append(mol_symbols)


        # list of positions for each mol in train-validation set, --||-- in test set
        isozyme_pos_tr_val, isozyme_pos_test = list_splitter(isozyme_positions, 0.8)
        # list of symbols for each mol...
        isozyme_symbs_tr_val, isozyme_symbs_test = list_splitter(isozyme_symbols, 0.8)


        position_blocks[isozyme] = {}
        position_blocks[isozyme]["train_validation"] = isozyme_pos_tr_val
        position_blocks[isozyme]["test"] = isozyme_pos_test
        # save data to dicts
        rdkit_symbols[isozyme] = {}
        rdkit_symbols[isozyme]["train_validation"] = isozyme_symbs_tr_val
        rdkit_symbols[isozyme]["test"] = isozyme_symbs_test

        print(len(isozyme_pos_tr_val), len(isozyme_pos_test), len(isozyme_symbs_tr_val), len(isozyme_symbs_test))

        # load half-life values and ensure order corresponds with positions and symbols
        halflife = df["half-life"]
        log_halflife = np.log(np.array(halflife))
        random.Random(42).shuffle(log_halflife)
        tr_val_halflife, test_halflife = list_splitter(log_halflife, 0.8)
        halflife_dict = {"train_validation": tr_val_halflife, "test": test_halflife}
        print(log_halflife[:10])

        data_splits = ["train_validation", "test"]
        # generate .extxyz files for train+validation and test sets
        for data_split in data_splits:
            out_filename = f'project_resources/nequip/{isozyme}_{data_split}_mol_positions.extxyz'
            # get data 
            positions = position_blocks[isozyme][data_split] # 
            symbols = rdkit_symbols[isozyme][data_split] # atomic symbols (e.g. C, O, H...)
            energies = halflife_dict[data_split] # log half-life

            # iterate over data and write continuously to extxyz file
            for idx in range(len(positions)):
                curr_atoms = Atoms(
                # set atomic positions
                positions=positions[idx],
                # set chemical symbols / species
                symbols=symbols[idx], 
                # assuming data with periodic boundary conditions, set to false for e.g. for molecules in vacuum
                pbc=True
                )

                # set calculator to assign targets
                calculator = SinglePointCalculator(curr_atoms, energy=energies[idx])
                curr_atoms.calc = calculator

                write(out_filename, curr_atoms, format='extxyz', append=True)

[18:48:06] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:06] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:06] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:06] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:06] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:06] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:07] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:07] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:07] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:07] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:07] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:07] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:07] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:07] Molecule does not have explicit Hs. Consider calling 

56 14 56 14
[-0.56792522 -0.53905365 -2.31942611 -2.48494665 -2.48494665 -0.97630616
 -3.56383397 -1.52924137 -0.38082128 -2.46510402]


[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] UFFTYPER: Unrecognized charge state for atom: 1
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] UFFTYPER: Unrecognized charge state for atom: 8
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:09] Molecule does not have explicit Hs. Consider calling AddHs()
[18:48:10] Molecul

2024 507 2024 507
[1.06471074 3.40119738 3.40119738 2.97450864 2.4518668  1.58923521
 1.48160454 3.40119738 0.99325177 3.40119738]


[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:03] Molecule does not have explicit Hs. Consider calling 

151 38 151 38
[2.2617631  4.78749174 3.77963382 4.78749174 3.57234564 4.550714
 4.78749174 4.78749174 4.78749174 3.8918203 ]


[18:49:05] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:05] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:05] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:05] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:05] Molecule does not have explicit Hs. Consider calling AddHs()
[18:49:05] Molecule does not have explicit Hs. Consider calling AddHs()
