# A demo notebook

## Add local project directory to path

In [None]:
import sys
import os

# Resolve the project root two directory levels above the current working
# directory; the importable package lives in its "src" sub-folder.
two_levels_up = os.path.join(os.getcwd(), "../..")
project_root = os.path.abspath(two_levels_up)
src_path = os.path.join(project_root, "src")

# Register the src folder on sys.path exactly once so re-running this cell
# does not add duplicate entries.
already_registered = src_path in sys.path
if not already_registered:
    sys.path.append(src_path)

In [None]:
%load_ext autoreload
%autoreload 2
from confgen import example_function
from confgen.io.reader.sdf import sdf_to_mol_list
from confgen.widgets.mol_visulization import draw_overlapped_mols

This is an example of using a notebook to build documentation.  See for example {meth}`~confgen.example_function`

In [None]:
# --- Standard library ---
import re
import subprocess
from pathlib import Path

# --- Third-party ---
import numpy as np
import py3Dmol
import rdkit
from jsme_notebook import JSMENotebook
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole

# --- RDKit notebook rendering options (applied once, after all imports) ---
IPythonConsole.ipython_3d = True
IPythonConsole.drawOptions.addAtomIndices = True
IPythonConsole.drawOptions.annotationFontScale = 1
IPythonConsole.molSize = (300, 300)

In [None]:
# Instantiate the JSME notebook helper and keep a handle in `jsme`.
# NOTE(review): presumably this renders an interactive molecule sketcher; the
# cells shown below never read from `jsme` (the SMILES is typed in manually),
# so confirm whether this widget is still needed.
jsme = JSMENotebook()

In [None]:
# SMILES string of the molecule to process; every later cell derives from it.
smiles = "CCCCC"

In [None]:
# Parse the SMILES into an RDKit molecule (no explicit hydrogens yet).
mol = Chem.MolFromSmiles(smiles)

# Report whether the molecule carries a 3D conformer and how many conformers
# it currently holds.
has_3d_coords = mol.GetNumConformers() > 0 and mol.GetConformer().Is3D()
coord_kind = "3D" if has_3d_coords else "2D"
print(f"Coordinates:{coord_kind}, Number of Conformers: {mol.GetNumConformers()}")
mol

# Generate Conformers
## Add Hydrogens

In [None]:
# Rebuild the molecule from SMILES so this cell is idempotent, then attach
# explicit hydrogens before any coordinates are generated.
heavy_atom_mol = Chem.MolFromSmiles(smiles)
mol = AllChem.AddHs(heavy_atom_mol)

# Report whether the molecule carries a 3D conformer and how many conformers
# it currently holds.
has_3d_coords = mol.GetNumConformers() > 0 and mol.GetConformer().Is3D()
coord_kind = "3D" if has_3d_coords else "2D"
print(f"Coordinates:{coord_kind}, Number of Conformers: {mol.GetNumConformers()}")
mol

## Generate Conformers

In [None]:
%timeit AllChem.EmbedMultipleConfs(mol, numConfs=100,  maxAttempts=20000, pruneRmsThresh=0.2, randomSeed=0xf00d, numThreads=8)
print(f"Number of Conformers: {mol.GetNumConformers()}")

In [None]:
# Display the molecule after conformer embedding (bare expression -> rich repr).
mol

## Optimize conformers using MMFF (Optional)

In [None]:
# Optimize EVERY embedded conformer with MMFF. `MMFFOptimizeMolecule` only
# touches the default conformer, so the other conformers written to the SDF
# below would have stayed unoptimized.
# Each result is a (status, energy) tuple; status 0 means converged.
mmff_results = AllChem.MMFFOptimizeMoleculeConfs(mol, numThreads=0)
mmff_not_converged = sum(1 for status, _energy in mmff_results if status != 0)
print(f"MMFF optimized {len(mmff_results)} conformers, {mmff_not_converged} not converged")

# Report whether the molecule carries a 3D conformer and how many conformers
# it currently holds.
has_3d_coords = mol.GetNumConformers() > 0 and mol.GetConformer().Is3D()
coord_kind = "3D" if has_3d_coords else "2D"
print(f"Coordinates:{coord_kind}, Number of Conformers: {mol.GetNumConformers()}")
mol

## Write conformers to sdf file
<div class="alert alert-block alert-info">
    <p>Change <b>filename</b> below to reflect the name of molecule</p>
</div>

In [None]:
# Name of the multi-conformer SDF file; later cells derive all other output
# names from its stem.
filename = "pentane.sdf"

In [None]:
# Write one SDF record per conformer of `mol`. A copy carries the per-record
# "Conf_ID" property so the working molecule itself is left without it.
with Chem.SDWriter(filename) as w:
    cp_mol = Chem.Mol(mol)
    conformer_ids = [conf.GetId() for conf in mol.GetConformers()]
    for conformer_id in conformer_ids:
        cp_mol.SetProp("Conf_ID", str(conformer_id))
        w.write(cp_mol, confId=conformer_id)

# Optimize conformers using xTB

## Read Conformers as RDKit Mol List

In [None]:
# Start from an empty list so the name is bound before the file is parsed.
mol_list = []
# Location of the conformer SDF written above, relative to the working directory.
input_sdf_path = Path.cwd() / filename
# Load the SDF records through the project helper
# (presumably one RDKit Mol per record - confirm against sdf_to_mol_list).
mol_list = sdf_to_mol_list(input_sdf_path)

In [None]:
# Display the first entry read back from the SDF as a quick sanity check.
mol_list[0]

## Write input files for conformers

In [None]:
# The SDF stem (file name without extension) is the base name for every
# per-conformer xTB input; all of them go into "<stem>_xtbopt" under the
# current working directory.
base_filename = input_sdf_path.stem
xtb_directory = Path.cwd() / f"{base_filename}_xtbopt"
n_inputs = len(mol_list)
print(f"Write {n_inputs} input file at {xtb_directory} using {base_filename} as base name")

A separate input file in SDF format is created for each conformer for the xTB-based geometry optimization.

In [None]:
# Create the xtbopt directory (no error if it already exists).
xtb_directory.mkdir(parents=True, exist_ok=True)

# mol_name -> path of the per-conformer input file (SDF, despite the "xyz" in
# the dictionary name) and mol_name -> 1-based position in `mol_list`.
molname_to_xyz = {}
molname_to_id = {}
# The loop variable is deliberately NOT called `mol`, so the notebook-level
# `mol` from the conformer-generation cells is not silently overwritten.
for mol_id, conf_mol in enumerate(mol_list, start=1):
    has_3d = (
        conf_mol is not None
        and conf_mol.GetNumConformers() > 0
        and conf_mol.GetConformer().Is3D()
    )
    if not has_3d:
        print(f"{mol_id} mol don't have proper coordinates")
        continue

    mol_name = f"{base_filename}_{mol_id}"
    file_path = xtb_directory.joinpath(f"{mol_name}.sdf")
    # Write the SDF file; the "mol_name" property links xTB results back to
    # this input later on.
    with Chem.SDWriter(str(file_path)) as w:
        cp_mol = Chem.Mol(conf_mol)
        cp_mol.SetProp("mol_name", mol_name)
        for conf in conf_mol.GetConformers():
            w.write(cp_mol, confId=conf.GetId())

    molname_to_xyz[mol_name] = file_path
    molname_to_id[mol_name] = mol_id

## Run xTB for geometry optimization

In [None]:
# Run one xTB geometry optimization per input file.
# - The command is passed as an argument list (no shell), so paths containing
#   spaces or shell metacharacters are handled safely.
# - `cwd` replaces the former `cd ...;` prefix and stdout goes straight into
#   "<input>.out", as the shell redirect did before.
# - `check=True` makes a non-zero exit status raise CalledProcessError, so the
#   handler below actually fires (it was unreachable previously).
for mol_name, input_file_path in molname_to_xyz.items():
    print(mol_name, input_file_path)
    log_path = input_file_path.with_suffix(".out")
    xtb_command = [
        "xtb", str(input_file_path),
        "--opt", "tight",
        "--namespace", input_file_path.stem,
    ]
    try:
        with open(log_path, "w") as log_file:
            subprocess.run(
                xtb_command,
                cwd=input_file_path.parent,
                stdout=log_file,
                stderr=subprocess.PIPE,
                text=True,
                check=True,
            )
    except FileNotFoundError:
        # The xtb executable itself is missing; later inputs would fail the same way.
        print("Command failed with error: xtb executable not found on PATH")
        break
    except subprocess.CalledProcessError as e:
        print(f"Command failed with error: {e.stderr}")

# Post Process xTB
## Read Output SDF file

In [None]:
# Combined SDF that will hold all xTB-optimized geometries: same directory and
# extension as the input SDF, with "_xtbopt" appended to the stem.
xtbopt_sdf_name = f"{input_sdf_path.stem}_xtbopt{input_sdf_path.suffix}"
xtbopt_sdf_path = input_sdf_path.with_name(xtbopt_sdf_name)
xtbopt_sdf_path

In [None]:
# Collect the xTB-optimized geometries into one SDF file.
# For each input, the matching "<name>.xtbopt.sdf" is read, the energy is
# parsed from its title line, the input's properties are copied over, and the
# record is written with an extra "xtb_energy" property.
# Every skipped conformer is reported; previously a title without "energy:"
# was dropped silently.

# Pattern matching the number after 'energy:' in the xTB title line.
energy_pattern = re.compile(r'energy:\s*(-?\d+\.\d+)')

with Chem.SDWriter(str(xtbopt_sdf_path)) as w:
    for mol_name, input_file_path in molname_to_xyz.items():
        output_file = input_file_path.with_suffix(".xtbopt.sdf")
        if not output_file.is_file():
            print(f"Output file not found {output_file}")
            continue
        try:
            opt_mol_list = sdf_to_mol_list(output_file)
            if not opt_mol_list or opt_mol_list[0] is None:
                print(f"{mol_name}: no readable molecule in {output_file}")
                continue
            opt_mol = opt_mol_list[0]

            # Extract energy from title
            title = opt_mol.GetProp("_Name")
            match = energy_pattern.search(title)
            if match is None:
                print(f"{mol_name}: no 'energy:' value found in title {title!r}")
                continue
            energy_value = match.group(1)
            print(f"{mol_name} Extracted value: {energy_value}")

            # Only parse the input once we know there is a usable result, then
            # carry its properties (e.g. mol_name) over to the optimized mol.
            input_mol = sdf_to_mol_list(input_file_path)[0]
            for key, value in input_mol.GetPropsAsDict().items():
                opt_mol.SetProp(key, str(value))

            # Add Energy
            opt_mol.SetProp("xtb_energy", str(energy_value))
            w.write(opt_mol)
        except Exception as e:
            # Best effort: report the failing conformer and keep going.
            print(f"An error occurred while processing {mol_name}: {e}")

## Remove duplicates

In [None]:
# Read the combined xTB-optimized SDF back in and display its first entry as
# a quick visual check.
xtbopt_mol_list = sdf_to_mol_list(xtbopt_sdf_path)
xtbopt_mol_list[0]

### Extract energy data from xtb opt mol list

In [None]:
# Build a table with one row per optimized conformer: [numeric id, xtb_energy].
conf_data = []
mol_name_to_id = {}
# xtbopt_mol_list contains every optimized conformer as a molecule; each
# entry gets a 1-based numeric id matching its position in that list.
for mol_id, xtbopt_mol in enumerate(xtbopt_mol_list, start=1):
    keys = list(xtbopt_mol.GetPropNames())
    if "xtb_energy" in keys and "mol_name" in keys:
        mol_name_to_id[xtbopt_mol.GetProp("mol_name")] = mol_id
        conf_data.append([mol_id, float(xtbopt_mol.GetProp("xtb_energy"))])
    else:
        print(f"xtb_energy and mol_name not found in {mol_id} molecule in list")

# Force an (n, 2) float array so the column indexing below also works when no
# conformer qualified (a plain np.array([]) is 1-D and would raise IndexError).
data = np.array(conf_data, dtype=float).reshape(-1, 2)
if data.shape[0] == 0:
    print("No conformer with xtb_energy and mol_name found; energy table is empty")

# Sort rows by ascending energy, so the first row is the lowest-energy
# conformer. The energies are the raw values parsed from the xTB output and
# are treated as Hartree by the cut-off and conversion cells further down.
# A stable sort keeps the original order among exact ties.
sorted_eng = data[data[:, 1].argsort(kind="stable")]
sorted_eng

In [None]:
# Lookup tables keyed by the 1-based position in xtbopt_mol_list:
#   mol_dict          -> optimized molecule as read (with hydrogens)
#   mol_withoutH_dict -> heavy-atom-only copy of that molecule
#   mol_id_to_name    -> value of the "mol_name" property
mol_withoutH_dict = {}
mol_dict = {}
mol_id_to_name = {}
mol_id = 0
for xtbopt_mol in xtbopt_mol_list:
    mol_id += 1
    mol_withoutH_dict[mol_id] = Chem.RemoveHs(xtbopt_mol)
    mol_dict[mol_id] = xtbopt_mol
    mol_id_to_name[mol_id] = xtbopt_mol.GetProp("mol_name")
n_conformers = len(mol_withoutH_dict)
print(f"Number of Conformers: {n_conformers}")
# mol_withoutH_dict[4]

### Filter Conformer Based on Energy & RMSD

In [None]:
# Unit conversion: one Hartree is equal to 627.5 kcal/mol.
hartree_to_kcal_per_mol = 627.5

# Conformers whose energies differ by no more than this (in Hartree,
# about 0.2 kcal/mol) are treated as iso-energetic.
energy_cutoff = 0.00031873

# RMSD below which two conformers are treated as geometric duplicates.
rmsd_cutoff = 0.2

In [None]:
# Select unique conformers from the energy-sorted table.
# A candidate is discarded when, compared with any conformer already kept, it
# is either iso-energetic (|dE| <= energy_cutoff) or, if the energies differ,
# geometrically equivalent (heavy-atom best RMSD < rmsd_cutoff).
#
# Changes from the previous version:
# - GetBestRMS leaves its probe molecule in the aligned state, so it is now
#   given a copy; mol_withoutH_dict keeps the same coordinate frame as mol_dict.
# - Rows are collected in a list and stacked once (no np.vstack inside the loop).
# - The loop variable no longer reuses the name `conf_data` from the
#   energy-table cell.

# Seed the unique set with the lowest-energy conformer.
unique_rows = [sorted_eng[0]]

# Loop over the remaining conformers in order of increasing energy.
for candidate in sorted_eng[1:]:
    candidate_id = int(candidate[0])
    candidate_energy = float(candidate[1])
    # Private heavy-atom copy that GetBestRMS is allowed to move around.
    probe = Chem.Mol(mol_withoutH_dict[candidate_id])

    for kept in unique_rows:
        if abs(float(kept[1]) - candidate_energy) > energy_cutoff:
            rmsd_diff = AllChem.GetBestRMS(probe, mol_withoutH_dict[int(kept[0])])
            if rmsd_diff < rmsd_cutoff:
                print(f"Iso-geometric duplicate to {kept[0]}, removing {candidate[0]}")
                break
        else:
            print(f"Iso-energetics duplicate to {kept[0]}, removing {candidate[0]}")
            break
    else:
        # No kept conformer matched: the candidate is unique.
        unique_rows.append(candidate)

unique_confs = np.array(unique_rows)
unique_confs

In [None]:
# Append a relative-energy column: the energy (last column of unique_confs)
# minus the minimum of that column, converted with hartree_to_kcal_per_mol.
lowest_energy = unique_confs[:, -1].min()
rel_energy_kcal = (unique_confs[:, -1] - lowest_energy) * hartree_to_kcal_per_mol
unique_conf_releng = np.column_stack((unique_confs, rel_energy_kcal))
unique_conf_releng

### Re-Align Conformers To Global Minimum

In [None]:
# Align every unique conformer (with hydrogens) onto the lowest-energy one and
# record the heavy-atom RMSD to it.
#
# The heavy-atom molecules used to compute the transform are derived here from
# the current coordinates in mol_dict instead of being taken from
# mol_withoutH_dict. The duplicate filter calls GetBestRMS, which moves its
# probe molecule, so the cached heavy-atom copies may no longer share a frame
# with the molecules in mol_dict; a transform computed from them would then be
# wrong for the H-containing molecule. Deriving fresh copies also makes the
# cell safe to re-run (a second run yields a near-identity transform).
rmsd_list = []
ref_mol = None
for row_id, row in enumerate(unique_conf_releng):
    conf_id = int(row[0])  # table values are floats; the dict keys are ints
    if row_id == 0:
        # First row = global minimum = alignment reference.
        ref_mol = Chem.RemoveHs(mol_dict[conf_id])
        rmsd_list.append(0)
        continue

    # Get molecule with hydrogen and a heavy-atom copy in the same frame
    prb_molH = mol_dict[conf_id]
    prb_mol = Chem.RemoveHs(prb_molH)
    rmsd, transfrom_mat, best_atom_map = Chem.rdMolAlign.GetBestAlignmentTransform(prb_mol, ref_mol)
    rmsd_list.append(rmsd)
    # Apply transform matrix to the full molecule
    AllChem.TransformMol(prb_molH, transfrom_mat)

In [None]:
# Append the RMSD-to-reference values as the last column of the table.
rmsd_column = np.asarray(rmsd_list).reshape(-1, 1)
unique_conf_releng_rmsd = np.hstack((unique_conf_releng, rmsd_column))
unique_conf_releng_rmsd

In [None]:
# Overlay the global-minimum conformer with the next unique conformer.
# The ids come from the unique-conformer table instead of hard-coded numbers
# (previously 16 and 38), which were specific to one particular run and need
# not be among the aligned unique conformers.
ref_molH = mol_dict[int(unique_conf_releng_rmsd[0][0])]
overlay_mols = [ref_molH]
if len(unique_conf_releng_rmsd) > 1:
    prb_molH = mol_dict[int(unique_conf_releng_rmsd[1][0])]
    # Probe first, reference last - same order as before.
    overlay_mols = [prb_molH, ref_molH]
draw_overlapped_mols(overlay_mols, prop_name="mol_name")

## Write Unique Conformer To SDF File

In [None]:
# Output SDF for the unique conformers: same directory and extension as the
# input SDF, with "_xtbunique" appended to the stem.
xtbunique_sdf_name = f"{input_sdf_path.stem}_xtbunique{input_sdf_path.suffix}"
xtbunique_sdf_path = input_sdf_path.with_name(xtbunique_sdf_name)

with Chem.SDWriter(str(xtbunique_sdf_path)) as w:
    for row in unique_conf_releng_rmsd:
        print(f"writing: {row}")
        # The last two columns are the relative energy and the RMSD to the
        # reference conformer.
        rel_energy, rmsd_to_ref = row[-2], row[-1]
        opt_mol = mol_dict[int(row[0])]
        opt_mol.SetProp("Rel_dE_kcal", str(round(rel_energy, 2)))
        opt_mol.SetProp("RMSD_To_GM", str(round(rmsd_to_ref, 2)))
        w.write(opt_mol)

# Write xyz

In [None]:
# Create the xyz directory (no error if it already exists).
xyz_dir = Path.cwd().joinpath("xyz_xtbopt")
xyz_dir.mkdir(parents=True, exist_ok=True)

# Read the unique conformers back from the SDF file.
unique_mol_list = sdf_to_mol_list(xtbunique_sdf_path)
print(f" Read {len(unique_mol_list)} entry")

# Write one XYZ file per conformer, named after its "mol_name" property.
# The loop variable is deliberately NOT called `mol`, so the notebook-level
# `mol` from the conformer-generation cells is not silently overwritten.
for mol_id, unique_mol in enumerate(unique_mol_list, start=1):
    has_3d = (
        unique_mol is not None
        and unique_mol.GetNumConformers() > 0
        and unique_mol.GetConformer().Is3D()
    )
    if not has_3d:
        print(f"{mol_id} mol don't have proper coordinates")
        continue

    # Get the first conformer
    conformer = unique_mol.GetConformer()
    mol_name = unique_mol.GetProp("mol_name")
    file_path = xyz_dir.joinpath(f"{mol_name}.xyz")
    with open(str(file_path), 'w') as xyz_file:
        # Line 1: number of atoms; line 2: comment line (the file stem).
        xyz_file.write(f"{unique_mol.GetNumAtoms()}\n")
        xyz_file.write(f"{file_path.stem}\n")
        # One line per atom: element symbol and Cartesian coordinates.
        for atom in unique_mol.GetAtoms():
            pos = conformer.GetAtomPosition(atom.GetIdx())
            symbol = atom.GetSymbol()
            xyz_file.write(f"{symbol:<3} {pos.x:>10.4f} {pos.y:>10.4f} {pos.z:>10.4f}\n")

# Write CSV File

In [None]:
# Attach the molecule name to every row of the unique-conformer table.
# The ids are stored as floats in the array, so cast them back to int before
# the dictionary lookup (consistent with the SDF-writing cell).
unique_mol_name = [mol_id_to_name[int(row[0])] for row in unique_conf_releng_rmsd]
# Stack onto the table that already holds the RMSD column; the previous
# version used `unique_conf_releng`, which silently dropped that column.
# Note: adding a string column turns the whole array into strings.
unique_conf_data = np.column_stack((unique_conf_releng_rmsd, unique_mol_name))
unique_conf_data

# Pandas Tools RDKit

In [None]:
import pandas as pd
from rdkit.Chem import PandasTools

In [None]:
# Point at the unique-conformer SDF produced by this notebook. The previous
# hard-coded "test_mol_xtbunique.sdf" is not written by any cell here, so a
# fresh Run All could not find it.
xtbunique_sdf_path = input_sdf_path.with_stem(f"{input_sdf_path.stem}_xtbunique")

In [None]:
# Load the unique conformers into a DataFrame (one row per SDF record).
# The path is passed as a plain string, matching how every other RDKit
# reader/writer in this notebook is called; this avoids relying on RDKit
# accepting a pathlib.Path object as a file name.
frame = PandasTools.LoadSDF(str(xtbunique_sdf_path))
frame

In [None]:
# Keep only the summary columns and export them to CSV in the working directory.
summary_columns = ["mol_name", "xtb_energy", "Rel_dE_kcal", "RMSD_To_GM"]
unique_conf_df = frame[summary_columns]
unique_conf_df.to_csv("unique_conformers.csv")