In [1]:
# This notebook converts Gaussian .com input files from closed shell to open shell
# by removing one hydrogen from the benzylic position and changing the spin multiplicity to 2 (doublet).

In [2]:
import glob
import io
import os
import re
import sys

import matplotlib.pyplot as plt
from matplotlib.colors import ColorConverter
from PIL import Image
from rdkit import Chem
from rdkit.Chem import Draw, rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D

In [None]:
# Build the template lookup: one SMARTS query molecule per ChemDraw (*.cdxml) core
# structure, keyed by the file name without its extension. The key doubles as the
# file-name prefix used to pair calculation files with their core structure.
common_structure_list = {}

# sorted() makes the dictionary order (and therefore the grid below) reproducible;
# glob.glob() alone returns files in arbitrary, filesystem-dependent order.
common_structure_chemdraw = sorted(glob.glob("*.cdxml"))

for file in common_structure_chemdraw:
    # splitext keeps dots inside the stem ("a.b.cdxml" -> "a.b"), unlike split(".")[0]
    key = os.path.splitext(file)[0]
    parsed = Chem.MolsFromCDXMLFile(file)
    if len(parsed) == 0 or parsed[0] is None:
        print(f"WARNING: no structure could be read from {file}, skipping")
        continue
    # Round-trip through SMARTS so the stored template is a query molecule
    # built from the SMARTS text rather than the raw ChemDraw molecule.
    temp_smarts = Chem.MolToSmarts(parsed[0])
    substructure = Chem.MolFromSmarts(temp_smarts)
    if substructure is None:
        print(f"WARNING: SMARTS {temp_smarts!r} from {file} could not be re-parsed, skipping")
        continue
    common_structure_list[key] = substructure

# Molecules and matching legends (shown under each molecule) for the overview grid
mols = list(common_structure_list.values())
legends = list(common_structure_list.keys())

if mols:
    # Grid image with a fixed number of columns (4)
    img = Draw.MolsToGridImage(mols, molsPerRow=4, subImgSize=(200, 200), legends=legends)
    display(img)
else:
    print("No usable *.cdxml template structures found in the working directory")

In [None]:
# Template prefixes are the keys of the template dictionary, in insertion order
common_structure_prefix = [prefix for prefix in common_structure_list]
common_structure_prefix

In [None]:
# Group the .sdf and .com files of the working directory by template prefix
# (pyrd, pyrmd, etc.). Result: prefix -> list of file names.
# A file name is expected to look like "<prefix><number>_...".
PREFIX_PATTERN = re.compile(r"(\D+)\d+_")


def _group_files_by_prefix(files, known_prefixes):
    """Return ``{prefix: [file, ...]}`` for the files whose prefix is a known template.

    The prefix is the run of non-digit characters that precedes the first
    "<digits>_" in the file name. Only exact prefix matches are kept. Files
    without that pattern, or with an unknown prefix, are reported and skipped
    instead of raising.
    """
    known = set(known_prefixes)
    grouped = {}
    for file in files:
        match = PREFIX_PATTERN.search(file)
        if match is None:
            print(f"file: {file}, no '<prefix><number>_' pattern in the name, skipped")
            continue
        prefix = match.group(1)
        if prefix in known:
            print(f"prefix: {prefix} for file: {file}, Matched template: {prefix}")
            grouped.setdefault(prefix, []).append(file)
        else:
            print(f"prefix: {prefix} for file: {file}, no matching template, skipped")
    return grouped


sdf_files = glob.glob("*.sdf")
com_files = glob.glob("*.com")

sdf_files_dict = _group_files_by_prefix(sdf_files, common_structure_prefix)
# same for the com files
com_files_dict = _group_files_by_prefix(com_files, common_structure_prefix)

In [None]:
# Report how many files were found per prefix, then sort every file list by the
# number that follows the prefix ("pyrd2_..." before "pyrd10_...").
# Same unanchored pattern shape as the one used for grouping, so every file that
# could be grouped can also be sorted.
FILE_NUMBER_PATTERN = re.compile(r"\D+(\d+)_")


def _file_number(filename):
    """Return the integer that follows the prefix; names without one sort last."""
    match = FILE_NUMBER_PATTERN.search(filename)
    return int(match.group(1)) if match else float("inf")


def _report_and_sort(files_by_prefix):
    """Print per-prefix and total counts, then sort each file list once, in place."""
    total_count = 0
    for prefix, files in files_by_prefix.items():
        total_count += len(files)
        print(f"{prefix}: {len(files)}")
    print(f"Total count: {total_count}")
    # Re-assigning existing keys does not change the dict size, so this is safe
    for prefix in list(files_by_prefix):
        files_by_prefix[prefix] = sorted(files_by_prefix[prefix], key=_file_number)


_report_and_sort(sdf_files_dict)
# same thing for the com files
_report_and_sort(com_files_dict)

In [None]:
# Show the .sdf file names collected under each template prefix
sdf_files_dict

In [None]:
# Show the .com file names collected under each template prefix
com_files_dict

In [None]:
# For every record (conformer) of every grouped .sdf file: locate the template core,
# find the benzylic C-H next to it, store the hydrogens' 1-based indices, and show
# three diagnostic pictures.
# mols_dict maps "<sdf file stem>-<record number>" -> sorted list of 1-based hydrogen
# indices. Records without a benzylic hydrogen are reported and left out.
mols_dict = {}

CORE_COLOR = ColorConverter().to_rgba("Salmon", alpha=0.75)
BENZYLIC_CARBON_COLOR = ColorConverter().to_rgba("gold", alpha=0.75)
BENZYLIC_HYDROGEN_COLOR = ColorConverter().to_rgba("DeepSkyBlue", alpha=0.75)
ORIGINAL_INDEX_PROP = "original_atom_index"


def _find_benzylic_sites(mol, core_atoms):
    """Return ``[(carbon_idx, [hydrogen_idx, ...]), ...]`` for C-H sites next to the core.

    A site is a carbon that is bonded to one of ``core_atoms``, is not itself a
    core atom, has only single bonds, and has at least one explicit hydrogen
    neighbour. Sites are listed in discovery order (core atoms in the given
    order, then their neighbours), so a carbon can appear more than once.
    All indices are 0-based RDKit atom indices; hydrogen lists are sorted.
    """
    core_set = set(core_atoms)
    sites = []
    for core_idx in core_atoms:
        for neighbor in mol.GetAtomWithIdx(core_idx).GetNeighbors():
            if neighbor.GetSymbol() != "C" or neighbor.GetIdx() in core_set:
                continue
            if any(
                bond.GetBondType() != Chem.rdchem.BondType.SINGLE
                for bond in neighbor.GetBonds()
            ):
                continue
            hydrogens = sorted(
                nbr.GetIdx() for nbr in neighbor.GetNeighbors() if nbr.GetSymbol() == "H"
            )
            # only explicit hydrogens have an atom index that can be removed later
            if hydrogens:
                sites.append((neighbor.GetIdx(), hydrogens))
    return sites


def _draw_png(mol, highlight_atoms, highlight_colors):
    """Render ``mol`` (600x600, atom indices shown) with highlights; return PNG bytes."""
    drawer = rdMolDraw2D.MolDraw2DCairo(600, 600)
    options = drawer.drawOptions()
    options.comicMode = True
    options.addAtomIndices = True
    options.continuousHighlight = True
    drawer.DrawMolecule(
        mol,
        highlightAtoms=highlight_atoms,
        highlightAtomColors=highlight_colors,
    )
    drawer.FinishDrawing()
    return drawer.GetDrawingText()


for template, files_list in sdf_files_dict.items():
    print(f"template prefix: {template}")
    core_query = common_structure_list[template]
    for sdf_file in files_list:
        stem = os.path.splitext(sdf_file)[0]
        print(f"file: {sdf_file}, filename: {stem}")
        supplier = Chem.SDMolSupplier(sdf_file, removeHs=False)
        # the suffix differentiates the records (conformers) of one file
        for suffix, mol in enumerate(supplier, start=1):
            key = f"{stem}-{suffix}"
            print(f"key: {key}")
            if mol is None:
                # SDMolSupplier yields None for records it cannot parse
                print(f"WARNING: record {suffix} of {sdf_file} could not be parsed, skipping")
                continue

            # RemoveHs renumbers the atoms; tag each atom with its index first so the
            # core match can be translated to the hydrogen-free molecule.
            for atom in mol.GetAtoms():
                atom.SetIntProp(ORIGINAL_INDEX_PROP, atom.GetIdx())
            mol_without_H = Chem.RemoveHs(mol)
            heavy_index = {
                atom.GetIntProp(ORIGINAL_INDEX_PROP): atom.GetIdx()
                for atom in mol_without_H.GetAtoms()
            }

            # there is a possibility that there are multiple matches;
            # combine them into a single flat list
            submatches = mol.GetSubstructMatches(core_query)
            matcheslist = [idx for match in submatches for idx in match]
            print(f"submatch: {submatches}, matcheslist: {matcheslist}")

            img1 = Draw.MolToImage(
                mol_without_H,
                size=(600, 600),
                highlightAtoms=[heavy_index[idx] for idx in matcheslist if idx in heavy_index],
                highlightColor=CORE_COLOR,
            )

            sites = _find_benzylic_sites(mol, matcheslist)
            for carbon_idx, hydrogens in sites:
                #! GaussianView is 1-based, RDKit is 0-based
                hydrogens_index_GaussianView = [hydrogen + 1 for hydrogen in hydrogens]
                print(
                    f"valid neighbor index: {carbon_idx}, neighbor's H neighbors index(0-based): {hydrogens}, index at GaussianView(1-based): {hydrogens_index_GaussianView}"
                )

            img2 = img3 = None
            if sites:
                # When several sites qualify, the last one found is the one recorded and drawn
                carbon_idx, hydrogens = sites[-1]
                if len(sites) > 1:
                    print(f"NOTE: {len(sites)} candidate sites for {key}, using carbon {carbon_idx}")
                mols_dict[key] = [hydrogen + 1 for hydrogen in hydrogens]

                # hydrogens in blue, benzylic carbon in gold, core in salmon;
                # later assignments win, so the core colour has the highest priority
                highlightAtoms = matcheslist + [carbon_idx] + hydrogens
                highlightAtomColors = {h: BENZYLIC_HYDROGEN_COLOR for h in hydrogens}
                highlightAtomColors[carbon_idx] = BENZYLIC_CARBON_COLOR
                highlightAtomColors.update({idx: CORE_COLOR for idx in matcheslist})

                # picture 2: depiction laid out on the coordinates of the H-free molecule
                rdDepictor.GenerateDepictionMatching2DStructure(mol, mol_without_H)
                img2 = _draw_png(mol, highlightAtoms, highlightAtomColors)

                # picture 3: freshly computed, normalized 2D depiction
                rdDepictor.Compute2DCoords(mol)
                rdDepictor.NormalizeDepiction(mol)
                img3 = _draw_png(mol, highlightAtoms, highlightAtomColors)
            else:
                print(f"WARNING: no benzylic hydrogen found for {key}; it is left out of mols_dict")

            # display the images side by side; panels without an image stay empty
            panels = [
                (img1, "Found substructure"),
                (Image.open(io.BytesIO(img2)) if img2 else None, "Found benzylic hydrogen (in blue)"),
                (Image.open(io.BytesIO(img3)) if img3 else None, "in 2D"),
            ]
            fig, axes = plt.subplots(1, 3, figsize=(15, 5))
            for ax, (picture, title) in zip(axes, panels):
                if picture is not None:
                    ax.imshow(picture)
                ax.axis("off")
                ax.set_title(title)
            plt.show()
            plt.close(fig)

In [None]:
# Show what was recorded for each "<file stem>-<record number>" key
mols_dict

In [None]:
# For every entry of mols_dict, read "<key>.com", set the spin multiplicity to 2 on
# each charge/multiplicity line (e.g. "0 1" -> "0 2"), remove the benzylic hydrogen
# with the lowest index from the atom list that follows the first such line, and
# write the result to open_shell/<key>_open_shell.com.
# NOTE(review): connectivity or other atom-numbered sections in the input are not
# renumbered after the hydrogen is removed - confirm the inputs do not contain them.

# A line made of exactly two integers: charge, then multiplicity. Requiring the whole
# line to match keeps coordinate and connectivity lines from being rewritten.
CHARGE_MULTIPLICITY_LINE = re.compile(r"^(\s*[+-]?\d+[ \t,]+)(\d+)(\s*)$")
# First token of an atom line: element symbol or atomic number
LEADING_ATOM_TOKEN = re.compile(r"\s*([A-Za-z]+|\d+)")
OUTPUT_DIR = "open_shell"


def _make_open_shell(lines, hydrogen_number):
    """Return ``(new_lines, removed_line)`` for an open-shell copy of a .com file.

    ``lines`` are the lines of the input file (with line endings) and are not
    modified. ``hydrogen_number`` is the 1-based position of the hydrogen in
    the atom list, which is taken to start on the line right after the first
    charge/multiplicity line.

    Raises ValueError when no charge/multiplicity line exists, when the atom
    position is outside the file, or when the targeted line is not a hydrogen.
    """
    new_lines = list(lines)
    first_spec_index = None
    for index, line in enumerate(new_lines):
        if CHARGE_MULTIPLICITY_LINE.match(line):
            # keep charge, spacing and line ending; only the multiplicity becomes 2
            new_lines[index] = CHARGE_MULTIPLICITY_LINE.sub(r"\g<1>2\g<3>", line)
            print(
                f"find multiplicity line: {repr(line)} at index: {index} (0-based), change to: {repr(new_lines[index])}"
            )
            if first_spec_index is None:
                first_spec_index = index
    if first_spec_index is None:
        raise ValueError("no charge/multiplicity line found")

    benzyl_hydrogen_index = first_spec_index + hydrogen_number
    if hydrogen_number < 1 or benzyl_hydrogen_index >= len(new_lines):
        raise ValueError(f"atom number {hydrogen_number} is outside the file")
    token = LEADING_ATOM_TOKEN.match(new_lines[benzyl_hydrogen_index])
    if token is None or token.group(1).upper() not in ("H", "1"):
        raise ValueError(
            f"line {benzyl_hydrogen_index} (0-based) is not a hydrogen: {new_lines[benzyl_hydrogen_index]!r}"
        )
    removed_line = new_lines.pop(benzyl_hydrogen_index)
    print(
        f"Remove one benzylic hydrogen at index: {benzyl_hydrogen_index} (0-based, pick the lowest index), content: {repr(removed_line)}"
    )
    return new_lines, removed_line


# the output subfolder only needs to be created once
os.makedirs(OUTPUT_DIR, exist_ok=True)

for key, value in mols_dict.items():
    print(f"key: {key}, value: {value}")
    # entries without a recorded hydrogen list cannot be converted
    if not isinstance(value, (list, tuple)) or len(value) == 0:
        print(f"WARNING: no benzylic hydrogen index recorded for {key}, skipping")
        continue

    source_path = f"{key}.com"
    if not os.path.isfile(source_path):
        print(f"WARNING: {source_path} not found, skipping")
        continue
    with open(source_path, "r") as source_file:
        lines = source_file.readlines()

    try:
        open_shell_lines, _ = _make_open_shell(lines, min(value))
    except ValueError as error:
        # do not write an input file that would describe the wrong molecule
        print(f"WARNING: {source_path} not converted: {error}")
        continue

    with open(f"{OUTPUT_DIR}/{key}_open_shell.com", "w") as target_file:
        target_file.writelines(open_shell_lines)