# Calculate 3D and 4D features for reactants
## Note: run in `crest` kernel

In [1]:
import sys
import pathlib
sys.path.append(str(pathlib.Path("__file__").absolute().parents[1]))

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem.rdmolfiles import MolToXYZFile
import pandas as pd
import morfeus

from src.util.definitions import DATA_ROOT

In [123]:
data = pd.read_csv(DATA_ROOT / "synferm_dataset_2023-12-20_39486records.csv")


def rename_func(s):
    return s.split("_")[-1]
building_blocks = pd.concat([data[[f"{s}_long", f"{s}_smiles"]].drop_duplicates().rename(columns=rename_func).assign(bb_type=s) for s in "IMT"])
assert len(building_blocks) == 67 + 71 + 41
building_blocks.head()

Unnamed: 0,long,smiles,bb_type
0,2-Pyr003,O=C(c1cccc(Cl)n1)[B-](F)(F)F.[K+],I
553,2-Pyr006,O=C(c1ccc(Br)cn1)[B-](F)(F)F.[K+],I
1029,2-Pyr007,O=C(c1cccc(F)n1)[B-](F)(F)F.[K+],I
1534,2-Pyr008,O=C(c1ccc(F)cn1)[B-](F)(F)F.[K+],I
2441,2-Pyr009,COc1ccc(C(=O)[B-](F)(F)F)nc1.[K+],I


In [55]:
from rdkit import RDLogger
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover
RDLogger.DisableLog('rdApp.info')
from rdkit.Chem.MolStandardize import rdMolStandardize
# Function to standardise SMILES
def standardise_smiles(smiles):
    remover = SaltRemover()
    uncharger = rdMolStandardize.Uncharger()
    mol = Chem.MolFromSmiles(smiles)
    res = remover.StripMol(mol)
    # remove counterion
    largest_Fragment = rdMolStandardize.LargestFragmentChooser()
    largest_mol = largest_Fragment.choose(mol)
    # neutralize monomers and terminators (i.e. ammoniums) the Uncharger will leave the [B-] as is.
    return Chem.MolToSmiles(uncharger.uncharge(largest_mol))

In [125]:
# Update the SMILES column which contains the standardised smiles
building_blocks['smiles_standard'] = building_blocks['smiles'].apply(standardise_smiles)

In [None]:
# Make directory structure for calculations
calc_dir = DATA_ROOT / "feature_calculations"
calc_dir.mkdir(exist_ok=True)

for long in building_blocks["long"]:
    (calc_dir / long).mkdir(exist_ok=True)

In [117]:
def smiles_to_xyz(smiles, output_filename):
    """A function that generates a xyz file for a SMILES input"""
    # generate Mol
    mol = smiles_to_mol(smiles)

    # Write the molecule to an XYZ file
    MolToXYZFile(mol, output_filename)
    # Write charge to .CHRG file for xtb
    with open(output_filename.parent / ".CHRG", "w") as f:
        f.write(f"{Chem.GetFormalCharge(mol)}\n")

    return None


def smiles_to_mol(smiles):
    """A function that generates a rdkit Mol for a SMILES input, in the same way the Mol for xyz is created"""
    # Convert the SMILES string to a molecule object
    mol = Chem.MolFromSmiles(smiles)
    mol = Chem.AddHs(mol)  # Add hydrogens

    # Generate 3D coordinates
    AllChem.EmbedMolecule(mol, AllChem.ETKDG())
    AllChem.MMFFOptimizeMolecule(mol)  # Optimize the geometry

    return mol

In [None]:
# export XYZ files
for i, row in building_blocks.iterrows():
    smiles_to_xyz(row["smiles_standard"], calc_dir / row["long"] / "mmff94_out.xyz")

In [126]:
building_blocks["mol"] = building_blocks["smiles_standard"].apply(lambda x: smiles_to_mol(x))
building_blocks

Unnamed: 0,long,smiles,bb_type,smiles_standard,mol
0,2-Pyr003,O=C(c1cccc(Cl)n1)[B-](F)(F)F.[K+],I,O=C(c1cccc(Cl)n1)[B-](F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7f827d17fe60>
553,2-Pyr006,O=C(c1ccc(Br)cn1)[B-](F)(F)F.[K+],I,O=C(c1ccc(Br)cn1)[B-](F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7f827d8c5700>
1029,2-Pyr007,O=C(c1cccc(F)n1)[B-](F)(F)F.[K+],I,O=C(c1cccc(F)n1)[B-](F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7f827d17f7d0>
1534,2-Pyr008,O=C(c1ccc(F)cn1)[B-](F)(F)F.[K+],I,O=C(c1ccc(F)cn1)[B-](F)(F)F,<rdkit.Chem.rdchem.Mol object at 0x7f827d17fa00>
2441,2-Pyr009,COc1ccc(C(=O)[B-](F)(F)F)nc1.[K+],I,COc1ccc(C(=O)[B-](F)(F)F)nc1,<rdkit.Chem.rdchem.Mol object at 0x7f827d17ef80>
...,...,...,...,...,...
582,TerTH010,Cl.NNC(=S)/C=C/c1ccccc1,T,NNC(=S)/C=C/c1ccccc1,<rdkit.Chem.rdchem.Mol object at 0x7f8277f791c0>
585,TerTH020,CN(C)c1cccc(C(=S)NN)c1.Cl,T,CN(C)c1cccc(C(=S)NN)c1,<rdkit.Chem.rdchem.Mol object at 0x7f8277f79230>
586,TerTH022,Cc1noc(C)c1C(=S)N[NH3+].[Cl-],T,Cc1noc(C)c1C(=S)NN,<rdkit.Chem.rdchem.Mol object at 0x7f8277f792a0>
587,TerTH026,Cl.NNC(=S)c1cn[nH]c1,T,NNC(=S)c1cn[nH]c1,<rdkit.Chem.rdchem.Mol object at 0x7f8277f79310>


## External optimization
Structures were optimized on the cluster. Each structure was pre-optimized at the xtb-gnf2 level. Conformers were generated using crest and optimized at the xtb-gnf level, with singlepoint calculations at the xtv-gnf2 level.

Command to run on the cluster:

```bash
sbatch -n 1 --cpus-per-task=1 --time=4:00:00 --mem-per-cpu=2048 --output="stdout.txt" --error="stderr.txt" --open-mode=append --wrap="crest xtbopt.xyz --gbsa h2o -T 1 --gfn2//gfnff"
```

Optimized conformer ensembles are in `$PROJECT_ROOT/data/crest_opt/<building_block>/`.

In [109]:
from morfeus.conformer import ConformerEnsemble
from morfeus.conformer import conformers_from_rdkit

def ce_from_crest(crest_dir, mol):
    """
    Creates and refines a ConformerEnsemble from CREST output folder.
    Problem with the CREST outputs is that they do not have any connectivity information (.xyz files)
    We can assign the original RDKit Mols. After all CREST should conserve atom order.
    
    Args:
        crest_dir (str): Directory containing CREST output.
        mol (Chem.Mol): RDKit Mol that the input to crest was generated from.

    Returns:
        A ConformerEnsemble: Refined ensemble of conformers after sorting, adding RDKit Mol,
        and pruning based on RMSD and energy.
    """

    # Generate MORFEUS ConformerEnsemble object from CREST folder and sort it energetically
    ce = ConformerEnsemble.from_crest(crest_dir)
    ce.sort()

    # Add molecule representation
    ce.mol = mol
    
    # quick check: is the atom order identical?
    elem = [a.GetAtomicNum() for a in mol.GetAtoms()]
    all(elem == ce.elements)
    
    # Obtain connectivity matrix and charges from mol
    (       elements,
            conformer_coordinates,
            energies,
            connectivity_matrix,
            charges,
            _,
        ) = conformers_from_rdkit(mol)
    ce.connectivity_matrix = connectivity_matrix
    ce.formal_charges = charges
    ce.charge = int(charges.sum())

    # Prune according to rmsd and energy
    ce.prune_rmsd()
    ce.prune_energy()

    return ce

In [110]:
ensemble_dir = DATA_ROOT / "crest_opt"

In [None]:
building_blocks["ce"] = 

In [111]:
ce = ce_from_crest(ensemble_dir / "2-Pyr003", mol)

In [10]:
from morfeus import XTB

# Calculate the LUMO for every conformer of the CREST conformer ensemble and print out the Boltzmann average
for conformer in ce_crest:
    xtb = XTB(conformer.elements, conformer.coordinates)
    conformer.properties["lumo"] = xtb.get_lumo()

print(f"The average LUMO energy of 2-methoxyethylamine for the crest-derived conformer ensemble is {ce_crest.boltzmann_statistic('lumo'):.6f} eV")

The average LUMO energy of 2-methoxyethylamine for the crest-derived conformer ensemble is -0.421644 eV
