# Once Auto3D has completed, run this notebook to compute the RMSD between each generated sample and its optimised geometry

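The notebook writes `<first_exp>_to_<last_exp>_data_optim.csv` to the experiments directory: the existing data table with the new RMSD results added as an `auto3d_rmsd` column.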

In [1]:
#When the experiment finishes, the data will be here
#"geometry_stability/results/results_smiles/smiles_out.sdf"

In [2]:
#auto_3d_results_path = "geometry_stability/benchmark_100_smiles/smiles_out.sdf"
# Directory prefix for the experiment CSV files. The trailing "/" is required:
# the output path is built from it by plain string concatenation.
experiment_path = "experiments/"

In [3]:
#While the experiment is still running, use this
import glob

#results_fast
#results
#auto_3d_results_paths = glob.glob("geometry_stability/results_fast/results_fast_smiles/*/*3d.sdf")
#auto_3d_results_paths
# experiment_names = ['exp_2','exp_3','exp_4','exp_5','exp_6','exp_7','exp_8']
#TODO: need to retrieve this from the previous notebook
# SILVR + EDM, fragments 0072 + 0107
# experiment_names = ['exp_9','exp_10','exp_11','exp_12','exp_13','exp_14','exp_15']

# BRIDGE + EDM, fragments 0072 + 0107
# experiment_names = ['exp_16','exp_17','exp_18','exp_19','exp_20','exp_21','exp_22']

# BRIDGE + EDM, fragments 0072 + 0107 (250 SAMPLES)
# experiment_names = ['exp_23','exp_24','exp_25','exp_26','exp_27','exp_28','exp_29']

# BRIDGE + EDM, fragments 0072 + 0107 (Explicit Hydrogens)
# experiment_names = ['exp_30','exp_31','exp_32','exp_33','exp_34','exp_35','exp_36']

# BRIDGE + EDM, NDM-1 fragments B + C (Explicit Hydrogens)
# Active batch of experiments. Only the first and last names are used below,
# to build the "<first>_to_<last>" tag that appears in the file names.
experiment_names = ['exp_37','exp_38','exp_39','exp_40','exp_41','exp_42','exp_43']

# Auto3D output SDF for this batch; both the directory and the file name embed the "<first>_to_<last>" tag.
auto_3d_results_path = f"geometry_stability/results_fast/results_fast_{experiment_names[0]}_to_{experiment_names[-1]}_smiles/{experiment_names[0]}_to_{experiment_names[-1]}_smiles_out.sdf"

# Wrapped in a list because the RMSD cell iterates over a list of SDF paths.
auto_3d_results_paths = [auto_3d_results_path]

# Helper functions (these should really live in a shared `util` module)

In [4]:
from rdkit.Chem import Descriptors
from rdkit.Chem.MolStandardize import rdMolStandardize
from openbabel import pybel
from rdkit import Chem


def rdkit_fix_radicals(mol, add_h=False, flatten=False, uncharge=True):
    """
    Replace every radical electron in `mol` with an explicit hydrogen.

    Atoms with an unfilled valence get radicals assigned on import
    (Open Babel will already have assigned bond orders from bond lengths).
    This function assumes each radical electron should instead be a hydrogen.

    mol      - molecule whose atoms are modified in place
    add_h    - run Chem.AddHs(mol, addCoords=True)
    flatten  - round-trip through SMILES: Chem.MolFromSmiles(Chem.MolToSmiles(mol))
    uncharge - run rdMolStandardize.Uncharger().uncharge(mol)

    The optional steps run in the order flatten, add_h, uncharge.
    Returns the resulting molecule.
    """
    for atom in mol.GetAtoms():
        n_radicals = atom.GetNumRadicalElectrons()
        n_explicit_h = atom.GetNumExplicitHs()
        # Each radical electron becomes one extra explicit hydrogen
        atom.SetNumRadicalElectrons(0)
        atom.SetNumExplicitHs(n_explicit_h + n_radicals)

    if flatten:
        smiles = Chem.MolToSmiles(mol)
        mol = Chem.MolFromSmiles(smiles)

    if add_h:
        mol = Chem.AddHs(mol, addCoords=True)

    if uncharge:
        mol = rdMolStandardize.Uncharger().uncharge(mol)

    return mol


def xyz_to_mol_clean(xyz, add_h=True, flatten=False):
    """
    Convert the text of an XYZ file into a cleaned-up RDKit molecule.

    The XYZ text is read with Open Babel (pybel), written out as mol2,
    parsed by RDKit and then passed through rdkit_fix_radicals.

    xyz     - contents of an XYZ file, as a string
    add_h   - add RDKit hydrogens
    flatten - run Chem.MolFromSmiles(Chem.MolToSmiles(x)) such that geometry information is lost

    Sometimes these conversions fail.
    In these cases this function returns False instead of raising.
    """
    try:
        mol_pybel = pybel.readstring("xyz", xyz)
        mol_mol2 = mol_pybel.write("mol2")

        # RDKit - clean radicals
        mol_rdkit = Chem.MolFromMol2Block(mol_mol2)
        if mol_rdkit is None:
            # RDKit reports an unparsable mol2 block by returning None rather than raising
            return False

        return rdkit_fix_radicals(mol_rdkit, add_h=add_h, flatten=flatten)

    except Exception:
        # Best-effort conversion: any library failure maps to False.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate, so a long-running loop over samples can still be interrupted.
        return False
    
    
def get_mol_id_from_sdf(mol_sdf):
    """
    Recover the experiment id and molecule id from the "ID" property of an SDF record.

    Everything from the first "_" onwards is discarded and the remainder is split on "-".
    The first two fields form the experiment id, the remaining fields the molecule id;
    each group is re-joined with "_".

    e.g. "exp-37-mol-2024-07-17-2231283-000_1" -> ("exp_37", "mol_2024_07_17_2231283_000")

    Returns (exp_id, mol_id).
    """
    raw_id = mol_sdf.GetPropsAsDict()["ID"]
    fields = raw_id.partition("_")[0].split("-")
    return "_".join(fields[:2]), "_".join(fields[2:])

In [5]:
from rdkit.Chem import rdMolAlign
from rdkit import Chem
import numpy as np

# Collect one [mol_id, rmsd] row per optimised structure in the Auto3D output.
# rmsd is NaN when the sample could not be converted or aligned.
results = []
for sdf_path in auto_3d_results_paths:
    suppl = Chem.SDMolSupplier(sdf_path)

    for optimised_mol in suppl:
        if optimised_mol is None:
            # SDMolSupplier yields None for records it cannot parse; without a
            # molecule there is no ID property to look the sample up with.
            print("ERROR: skipping unreadable SDF record")
            continue

        exp_id, mol_id = get_mol_id_from_sdf(optimised_mol)
        with open(f"{experiment_path}{exp_id}/{mol_id}.txt", "r") as readfile:
            sample_xyz = readfile.read()
        sample_mol = xyz_to_mol_clean(sample_xyz)

        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
        rmsd = np.nan
        if sample_mol is False:
            # xyz_to_mol_clean signals a failed conversion by returning False
            print(f"ERROR ({mol_id}): sample could not be converted to a molecule")
        else:
            try:
                rmsd = rdMolAlign.AlignMol(optimised_mol, sample_mol)
            except Exception as err:
                # Alignment failed (e.g. no atom mapping between the two molecules); record NaN
                print(f"ERROR ({mol_id}): {err}")

        results.append([mol_id, rmsd])

[23:10:18] Running Uncharger
[23:10:18] Removed negative charge.
  Failed to kekulize aromatic bonds in OBMol::PerceiveBondOrders (title is stable:False satoms:48 tatoms:54 sratio:0.8888888888888888)

[23:10:18] Running Uncharger
[23:10:18] Running Uncharger
[23:10:18] Running Uncharger
[23:10:18] Running Uncharger
[23:10:18] Running Uncharger
[23:10:18] Removed negative charge.


# Merge optimisation data with data.csv. Make new file data_optim.csv

In [6]:
import pandas as pd

# Existing data table for this batch of experiments. Read from experiment_path
# so the input and output locations cannot drift apart.
old_df = pd.read_csv(experiment_path + f"{experiment_names[0]}_to_{experiment_names[-1]}_data.csv")

# One row per optimised molecule, indexed by mol_id. The RMSD column is named
# auto3d_rmsd (it was previously called rmsd).
rmsd_df = pd.DataFrame(results, columns=["mol_id", "auto3d_rmsd"]).set_index("mol_id")

# Left join keeps every row of old_df; samples with no Auto3D result get NaN in auto3d_rmsd.
# NOTE(review): if `results` holds several rows for one mol_id, the join repeats
# that sample's row - confirm Auto3D emits a single structure per molecule.
df = pd.merge(old_df, rmsd_df, on="mol_id", how='left')

df.to_csv(experiment_path + f"{experiment_names[0]}_to_{experiment_names[-1]}_data_optim.csv", index=False)

In [7]:
# Show the merged table; auto3d_rmsd is NaN for samples with no Auto3D result (left join).
df

Unnamed: 0,mol_id,total_atoms,stable_ratio,exp_id,dummy_atoms,samples,silvr,comment,crude_rmsd,is_fragmented,qed,sa,smiles,auto3d_rmsd
0,mol_2024_07_17_2231283_000,54,0.777778,exp_37,0,10,0.00,(BRIDGE + EDM) Effect of SILVR rate on samplin...,5.180670,True,0.152309,,O=[N+]([O-])O.[H]OC([H])([H])C([H])([H])[H].[H...,
1,mol_2024_07_17_2230470_000,54,0.759259,exp_37,0,10,0.00,(BRIDGE + EDM) Effect of SILVR rate on samplin...,5.124135,True,0.339522,,[H]C([H])([H])[C@@]1([H])OC1=O.[H]OC([H])([H])...,
2,mol_2024_07_17_2230482_000,54,0.703704,exp_37,0,10,0.00,(BRIDGE + EDM) Effect of SILVR rate on samplin...,5.319700,True,0.129396,,[H]N([H])C([H])([H])C([H])([H])[H].[H]NC([H])(...,
3,mol_2024_07_17_2231270_000,54,0.944444,exp_37,0,10,0.00,(BRIDGE + EDM) Effect of SILVR rate on samplin...,5.452412,True,0.383417,,[H]N([H])C([H])([H])C([H])([H])[C@]([H])(C([H]...,
4,mol_2024_07_17_2230481_000,54,0.888889,exp_37,0,10,0.00,(BRIDGE + EDM) Effect of SILVR rate on samplin...,5.527328,False,0.625930,8.305324,[H]O[C@@]1([H])C([H])([H])[C@@]([H])(N2C(=O)N(...,2.188253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,mol_2024_07_17_2249481_000,54,0.000000,exp_43,0,10,0.03,(BRIDGE + EDM) Effect of SILVR rate on samplin...,3.373933,True,0.212190,,F.F.F.F.F.F.F.F.F.F.F.F.FF.FF.FF.FF.FF.FF.FF.F...,
93,mol_2024_07_17_2248013_000,54,0.000000,exp_43,0,10,0.03,(BRIDGE + EDM) Effect of SILVR rate on samplin...,3.316221,True,0.212184,,F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.FF.FF....,
94,mol_2024_07_17_2247590_000,54,0.000000,exp_43,0,10,0.03,(BRIDGE + EDM) Effect of SILVR rate on samplin...,3.585942,True,0.212184,,F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.FF.FF....,
95,mol_2024_07_17_2248551_000,54,0.000000,exp_43,0,10,0.03,(BRIDGE + EDM) Effect of SILVR rate on samplin...,3.413229,True,0.212187,,F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.F.FF.FF.FF.FF.FF...,
