In [1]:
import sys
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
from ase import Atoms
from ase.io import write
from ase.calculators.singlepoint import SinglePointCalculator
from project_resources.import_utils import NotebookFinder
sys.meta_path.append(NotebookFinder())
from project_resources.cytochrome_P450 import *

importing Jupyter notebook from C:\Users\Lukas\Documents\Jupyter_Notebooks\project_resources\cytochrome_P450.ipynb


In [2]:
isozymes = ["3A4", "RLM", "HLC"]
rel_paths = {
    "3A4_source": r"project_resources/ChEMBL_3A4.csv",
    "3A4_sep": ";",
    "3A4": r"project_resources/3A4.csv",
    "3A4_jazzy_mol_fts": r"project_resources/jazzy/3A4_jazzy_mol_fts.csv",

    "RLM_source": r"project_resources/AID_1508591_datatable_all.csv",
    "RLM_sep": ",",
    "RLM": r"project_resources/RLM.csv",
    "RLM_jazzy_mol_fts": r"project_resources/jazzy/RLM_jazzy_mol_fts.csv",

    "HLC_source": r"project_resources/AID_1508603_datatable_all.csv",
    "HLC_sep": ",",
    "HLC": r"project_resources/HLC.csv",
    "HLC_jazzy_mol_fts": r"project_resources/jazzy/HLC_jazzy_mol_fts.csv",
}
position_blocks = {}
rdkit_symbols = {}

In [7]:
for isozyme in isozymes:
    df = pd.read_csv(rel_paths[isozyme])
    smiles = df["smiles"]
    isozyme_positions = []
    isozyme_symbols = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        mol = Chem.AddHs(mol, explicitOnly=False) # SET TO TRUE
        AllChem.EmbedMolecule(mol)
        # create mol

        xyz_string = Chem.MolToXYZBlock(mol)
        # get x, y and z positions of each atom from mol
        lines = xyz_string.strip().split("\n")[2:]
        # for looping over the positions

        mol_positions = []
        # corrdinates of each atom of mol in 3D space
        mol_symbols = []
        # list of atoms in mol

        for line in lines:
            parts = line.split()
            symbol = parts[0]
            x, y, z = map(float, parts[1:4])
            mol_symbols.append(symbol)
            mol_positions.append([float(coord) for coord in parts[1:]])

        isozyme_positions.append(mol_positions)
        isozyme_symbols.append(mol_symbols)

    # list of positions for each mol in train-validation set, --||-- in test set
    isozyme_pos_tr_val, isozyme_pos_test = list_splitter(isozyme_positions, 0.8)
    
    # list of symbols for each mol...
    isozyme_symbs_tr_val, isozyme_symbs_test = list_splitter(isozyme_symbols, 0.8)
    
    position_blocks[isozyme] = {}
    position_blocks[isozyme]["train_validation"] = isozyme_pos_tr_val
    position_blocks[isozyme]["test"] = isozyme_pos_test
    
    rdkit_symbols[isozyme] = {}
    rdkit_symbols[isozyme]["train_validation"] = isozyme_symbs_tr_val
    rdkit_symbols[isozyme]["test"] = isozyme_symbs_test
    
    print(len(isozyme_pos_tr_val), len(isozyme_pos_test), len(isozyme_symbs_tr_val), len(isozyme_symbs_test))

    halflife = df["half-life"]
    log_halflife = np.log(np.array(halflife))
    tr_val_halflife, test_halflife = list_splitter(log_halflife, 0.8)
    halflife_dict = {"train_validation": tr_val_halflife, "test": test_halflife}
    print(log_halflife)
    
    data_splits = ["train_validation", "test"]


    for data_split in data_splits:
        out_filename = f'project_resources/nequip/{isozyme}_{data_split}_mol_positions.extxyz'
        # get data 
        positions = position_blocks[isozyme][data_split]
        symbols = rdkit_symbols[isozyme][data_split] # z rdkit
        energies = halflife_dict[data_split] # log half-life

        # iterate over data and write continuously to extxyz file
        for idx in range(len(positions)):
            curr_atoms = Atoms(
            # set atomic positions
            positions=positions[idx],
            # set chemical symbols / species
            symbols=symbols[idx], 
            # assuming data with periodic boundary conditions, set to false for e.g. for molecules in vacuum
            pbc=True
            )

            # set calculator to assign targets
            calculator = SinglePointCalculator(curr_atoms, energy=energies[idx])
            curr_atoms.calc = calculator

            write(out_filename, curr_atoms, format='extxyz', append=True)

56 14 56 14
[-1.09871229  0.15443635 -3.10109279 -0.76206863 -0.28768207 -0.18632958
 -2.31942611 -2.48494665  0.78070008  1.09861229 -0.62867116  0.15443635
 -2.48494665 -0.4833726  -0.40541511 -0.56792522 -0.38082128 -3.68887945
 -1.52924137 -0.45681104 -0.35667494 -0.43078292 -0.97630616 -5.70388248
 -1.32163085 -2.90042209 -2.48494665 -4.60517019 -1.41346003 -2.38956011
 -3.91202301 -3.56383397 -2.48494665 -1.52924137 -1.60943791  0.04879016
  1.79175947 -0.20248345 -0.7985077  -3.56383397 -1.69663112 -0.62867116
 -2.48494665 -0.18236156 -0.76571787 -1.32163085  0.2366519  -2.48494665
 -1.32163085 -4.24540004 -1.79155949 -3.99921622 -1.2039728  -2.46510402
 -2.09313487 -2.48494665 -0.7271177  -2.48494665 -0.69314718 -2.10619628
 -1.38629436 -2.48494665  1.79175947 -1.79155949 -1.4554301  -0.76206863
 -0.53905365 -1.26124887 -1.14980033 -0.95893731]


[12:58:56] UFFTYPER: Unrecognized charge state for atom: 7
[12:58:58] UFFTYPER: Unrecognized charge state for atom: 18
[12:58:59] UFFTYPER: Unrecognized charge state for atom: 16
[12:58:59] UFFTYPER: Unrecognized charge state for atom: 17
[12:58:59] UFFTYPER: Unrecognized charge state for atom: 18
[12:59:00] UFFTYPER: Unrecognized charge state for atom: 17
[12:59:00] UFFTYPER: Unrecognized charge state for atom: 13
[12:59:00] UFFTYPER: Unrecognized charge state for atom: 13
[12:59:00] UFFTYPER: Unrecognized charge state for atom: 13
[12:59:01] UFFTYPER: Unrecognized charge state for atom: 14
[12:59:01] UFFTYPER: Unrecognized charge state for atom: 14
[12:59:01] UFFTYPER: Unrecognized charge state for atom: 13
[12:59:01] UFFTYPER: Unrecognized charge state for atom: 13
[12:59:01] UFFTYPER: Unrecognized charge state for atom: 17
[12:59:01] UFFTYPER: Unrecognized charge state for atom: 19
[12:59:02] UFFTYPER: Unrecognized charge state for atom: 7
[12:59:03] UFFTYPER: Unrecognized charge s

2024 507 2024 507
[ 3.40119738  3.40119738  3.40119738 ... -0.09431068 -0.10536052
 -0.10536052]


[13:01:56] UFFTYPER: Unrecognized charge state for atom: 17
[13:01:58] UFFTYPER: Unrecognized charge state for atom: 17


151 38 151 38
[2.10413415 2.1517622  2.2617631  2.8507065  2.85647021 2.85647021
 3.27336401 3.50254988 3.56671182 3.60277676 3.64805746 3.72810017
 3.77963382 3.8918203  3.92789635 3.9337845  3.95124372 4.03600899
 4.0500443  4.08260931 4.14154616 4.14946386 4.25561271 4.26829787
 4.32809829 4.33336146 4.33859708 4.3719763  4.38327585 4.38576962
 4.38576962 4.41400968 4.46129982 4.49980967 4.53259949 4.53474772
 4.550714   4.55597994 4.5685062  4.57264699 4.61907309 4.63860496
 4.67376298 4.70682384 4.74666975 4.78749174 4.78749174 4.78749174
 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174
 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174
 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174
 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174
 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174
 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174 4.78749174
 4.78749174 4.78749174 4.78749174 4.78749174 4.7