### <p style="font-family: Arial; color: gold; "> Optionally, you can reduce the number of open-shell and anion jobs by keeping only the lowest-energy conformer of each molecule. This notebook does that based on the FINISHED DFT results.

### <p style="font-family: Arial; color: red; ">Requirement: put the finished DFT results for the closed-shell species into the subfolder named by `DFT_log_folder` in the configuration cell below (currently **1.DFT_log_files**)

### <p style="font-family: Arial; color: red; "> Use the same naming convention across all the folders being used.

In [1]:
import re, os, sys, shutil, subprocess

### <p style="font-family: Arial; color: gold; font-weight: bold;"> define your input and output folders below

In [2]:
# Define your desired folder names here.


def _reset_output_folder(folder: str) -> None:
    """Create ``folder`` if it is missing and delete the files directly inside it.

    Only regular files and symbolic links are removed. Subdirectories are
    left in place because ``os.remove`` raises when given a directory, which
    would otherwise abort this configuration cell.
    """
    os.makedirs(folder, exist_ok=True)
    for entry in os.listdir(folder):
        entry_path = os.path.join(folder, entry)
        if os.path.isfile(entry_path) or os.path.islink(entry_path):
            os.remove(entry_path)


# Folder that GoodVibes reads the finished DFT .log files from (created if missing).
DFT_log_folder = "1.DFT_log_files"
os.makedirs(DFT_log_folder, exist_ok=True)

# Anion jobs: source folder and (emptied on every run) destination folder.
filter_anion = True
input_anion_com_folder = "2.anion_com_tom"
anion_com_output = "3.anion_com_tom_filtered"
_reset_output_folder(anion_com_output)

# Open-shell jobs: source folder and (emptied on every run) destination folder.
filter_open_shell = True
input_open_shell_com_folder = "1.open_shell_com_tom"
open_shell_com_output = "3.open_shell_com_tom_filtered"
_reset_output_folder(open_shell_com_output)

# Extension appended to the conformer name when building input file names.
file_extension = ".com"

In [3]:
def parse_goodvibes_output(output: str, temp: float = 298.15) -> dict:
    """Extract the per-structure value table from GoodVibes' text output.

    The table is taken to be the lines between the first two separator lines
    made of 12 or more asterisks. The column header is the line directly
    above the first separator; its first column (the structure column) is
    dropped. Only table rows that start with ``o`` (after optional
    whitespace) are read.

    Args:
        output: Text printed by GoodVibes.
        temp: Temperature stored under the ``"T"`` key of every structure.

    Returns:
        ``{structure_name: {column_label: value, ..., "T": temp}}``. Header
        names found in ``column_mapping`` are renamed; any other header is
        used as-is. An empty dict is returned when two separator lines are
        not found.

    Raises:
        ValueError: If a table cell cannot be converted to ``float``.
        IndexError: If a row has fewer values than the header has columns.
    """
    column_mapping = {
        "E_SPC": "E_spc (Hartree)",
        "E": "E (Hartree)",
        "ZPE": "ZPE(Hartree)",
        "H_SPC": "H_spc(Hartree)",
        "T.S": "T*S",
        "T.qh-S": "T*qh_S",
        "G(T)_SPC": "G(T)_spc(Hartree)",
        "qh-G(T)_SPC": "qh_G(T)_spc(Hartree)",
    }
    separator = re.compile(r"^\s*\*{12,}\s*$")  # a line of 12 or more asterisks
    lines = output.splitlines()

    # Locate the first two separator lines; the table sits between them.
    start_index = None
    end_index = None
    for i, line in enumerate(lines):
        if separator.match(line):
            if start_index is None:
                start_index = i
            else:
                end_index = i
                break

    if start_index is None or end_index is None:
        return {}

    # The header is the line above the first separator. When the separator is
    # the very first line there is no header; indexing with -1 would silently
    # pick up the LAST line of the output instead.
    headers = []
    if start_index > 0:
        headers = lines[start_index - 1].split()[1:]  # drop the structure column

    data = {}
    for line in lines[start_index + 1 : end_index]:
        if not re.match(r"^\s*o", line):
            continue
        parts = line.split()
        structure_name = parts[1]  # parts[0] is the leading "o" marker

        structure_data = {}
        # Values start at parts[2], in the same order as the header columns.
        for position, header in enumerate(headers, start=2):
            structure_data[column_mapping.get(header, header)] = float(parts[position])

        structure_data["T"] = temp
        data[structure_name] = structure_data

    return data


def get_goodvibes_e_batch(temp: float = 298.15) -> dict:
    """Run GoodVibes on the ``*.log`` files in ``DFT_log_folder`` and parse its output.

    GoodVibes is started as ``python -m goodvibes`` with the current
    interpreter, the ``--spc link`` option and ``-t temp``. The ``*.log``
    pattern is passed as a literal argument (no shell is involved).
    NOTE(review): this presumably relies on GoodVibes expanding the pattern
    itself - confirm.

    Args:
        temp: Temperature passed to GoodVibes and recorded in the parsed data.

    Returns:
        The dictionary produced by ``parse_goodvibes_output`` from the
        captured standard output.
    """
    cmd_args = [
        sys.executable,
        "-m",
        "goodvibes",
        os.path.join(DFT_log_folder, "*.log"),
        "--spc",
        "link",
        "-t",
        str(temp),
    ]
    # Capture both streams as text so the table can be parsed from stdout.
    completed = subprocess.run(cmd_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if completed.returncode != 0:
        # Report the failure instead of quietly returning whatever could be parsed.
        print(
            f"GoodVibes exited with return code {completed.returncode}:\n{completed.stderr}",
            file=sys.stderr,
        )
    return parse_goodvibes_output(completed.stdout, temp)

In [4]:
# Run GoodVibes on the DFT logs and collect the per-conformer data
result = get_goodvibes_e_batch()


def _structure_number(conformer_name: str) -> float:
    """Return the number that precedes the first underscore, e.g. ``pyrd12_conf-1`` -> 12.

    Names that do not look like ``<non-digits><digits>_...`` return ``inf`` so
    they are sorted last instead of raising ``AttributeError`` on a failed match.
    """
    match = re.search(r"^\D+(\d+)_", conformer_name)
    return int(match.group(1)) if match else float("inf")


# Sort the result by the structure number; sorted() is stable, so entries
# sharing a number keep their original relative order
result = dict(sorted(result.items(), key=lambda item: _structure_number(item[0])))

In [5]:
# Display the parsed, sorted per-conformer data for inspection
result

{'pyrd1_conf-1': {'E_spc (Hartree)': -441.215801,
  'E (Hartree)': -441.305193,
  'ZPE(Hartree)': 0.163282,
  'H_spc(Hartree)': -441.043098,
  'T*S': 0.043006,
  'T*qh_S': 0.04266,
  'G(T)_spc(Hartree)': -441.086104,
  'qh_G(T)_spc(Hartree)': -441.085759,
  'T': 298.15},
 'pyrdz1_conf-1': {'E_spc (Hartree)': -303.585861,
  'E (Hartree)': -303.638581,
  'ZPE(Hartree)': 0.103728,
  'H_spc(Hartree)': -303.475215,
  'T*S': 0.036948,
  'T*qh_S': 0.036617,
  'G(T)_spc(Hartree)': -303.512163,
  'qh_G(T)_spc(Hartree)': -303.511832,
  'T': 298.15},
 'pyrmd1_conf-1': {'E_spc (Hartree)': -457.262567,
  'E (Hartree)': -457.34237,
  'ZPE(Hartree)': 0.151533,
  'H_spc(Hartree)': -457.101761,
  'T*S': 0.04271,
  'T*qh_S': 0.042452,
  'G(T)_spc(Hartree)': -457.144471,
  'qh_G(T)_spc(Hartree)': -457.144213,
  'T': 298.15},
 'pyrz1_conf-1': {'E_spc (Hartree)': -457.255348,
  'E (Hartree)': -457.336221,
  'ZPE(Hartree)': 0.151285,
  'H_spc(Hartree)': -457.094752,
  'T*S': 0.042943,
  'T*qh_S': 0.042501,


In [6]:
# Group the per-conformer results by substrate
def group_results_by_substrates(result: dict) -> dict:
    """Bucket conformer entries under their substrate name.

    The substrate name is everything before the last underscore of the
    conformer name (``pyrd1_conf-1`` -> ``pyrd1``). Conformer names without
    such a prefix are left out.

    Args:
        result: Mapping of conformer name to its property dictionary.

    Returns:
        ``{substrate_name: [{conformer_name: properties}, ...]}`` with the
        conformers in the order they appear in ``result``.
    """
    grouped = {}
    for name, props in result.items():
        found = re.match(r"^(.+)_", name)
        if found is None:
            continue
        # Create the substrate's list on first sight, then add this conformer
        grouped.setdefault(found.group(1), []).append({name: props})
    return grouped


# Group the per-conformer data from `result` by substrate name
result_grouped_by_substrates = group_results_by_substrates(result)

In [7]:
# Display the conformers grouped by substrate for inspection
result_grouped_by_substrates

{'pyrd1': [{'pyrd1_conf-1': {'E_spc (Hartree)': -441.215801,
    'E (Hartree)': -441.305193,
    'ZPE(Hartree)': 0.163282,
    'H_spc(Hartree)': -441.043098,
    'T*S': 0.043006,
    'T*qh_S': 0.04266,
    'G(T)_spc(Hartree)': -441.086104,
    'qh_G(T)_spc(Hartree)': -441.085759,
    'T': 298.15}}],
 'pyrdz1': [{'pyrdz1_conf-1': {'E_spc (Hartree)': -303.585861,
    'E (Hartree)': -303.638581,
    'ZPE(Hartree)': 0.103728,
    'H_spc(Hartree)': -303.475215,
    'T*S': 0.036948,
    'T*qh_S': 0.036617,
    'G(T)_spc(Hartree)': -303.512163,
    'qh_G(T)_spc(Hartree)': -303.511832,
    'T': 298.15}}],
 'pyrmd1': [{'pyrmd1_conf-1': {'E_spc (Hartree)': -457.262567,
    'E (Hartree)': -457.34237,
    'ZPE(Hartree)': 0.151533,
    'H_spc(Hartree)': -457.101761,
    'T*S': 0.04271,
    'T*qh_S': 0.042452,
    'G(T)_spc(Hartree)': -457.144471,
    'qh_G(T)_spc(Hartree)': -457.144213,
    'T': 298.15}}],
 'pyrz1': [{'pyrz1_conf-1': {'E_spc (Hartree)': -457.255348,
    'E (Hartree)': -457.336221,


In [8]:
# Pick, for every substrate, the conformer with the lowest value in the chosen column
def select_lowest_energy_conformer(substrates: dict, column: str = "E (Hartree)") -> dict:
    """Keep one conformer per substrate: the one with the smallest ``column`` value.

    A progress report is printed for every substrate. A conformer lacking
    ``column`` is treated as having an infinite value; on ties the conformer
    that comes first in the list wins.

    Args:
        substrates: ``{substrate: [{conformer_name: properties}, ...]}``.
        column: Key of the property used for the comparison.

    Returns:
        ``{substrate: {conformer_name: properties}}`` holding the selected
        conformer entry of each substrate.
    """

    def name_of(entry: dict) -> str:
        # Each entry is a single-item dict; its key is the conformer name
        return list(entry.keys())[0]

    def props_of(entry: dict) -> dict:
        # ... and its value is that conformer's property dictionary
        return list(entry.values())[0]

    print(f"filtering based on {column}")
    lowest_by_substrate = {}
    for substrate in substrates:
        candidates = substrates[substrate]
        print("--------------------------------------------------------------")
        print(f"for substrate {substrate}, we have {len(candidates)} conformers")
        for entry in candidates:
            print(f"{name_of(entry)} have {column}: {props_of(entry).get(column, None)}")
        best = min(candidates, key=lambda entry: props_of(entry).get(column, float("inf")))
        print(f"lowest energy conformer: {name_of(best)}")
        lowest_by_substrate[substrate] = best
    return lowest_by_substrate


# Keep, for every substrate, the conformer with the lowest "E (Hartree)" value
filtered_result = select_lowest_energy_conformer(
    result_grouped_by_substrates, column="E (Hartree)"
)

filtering based on E (Hartree)
--------------------------------------------------------------
for substrate pyrd1, we have 1 conformers
pyrd1_conf-1 have E (Hartree): -441.305193
lowest energy conformer: pyrd1_conf-1
--------------------------------------------------------------
for substrate pyrdz1, we have 1 conformers
pyrdz1_conf-1 have E (Hartree): -303.638581
lowest energy conformer: pyrdz1_conf-1
--------------------------------------------------------------
for substrate pyrmd1, we have 1 conformers
pyrmd1_conf-1 have E (Hartree): -457.34237
lowest energy conformer: pyrmd1_conf-1
--------------------------------------------------------------
for substrate pyrz1, we have 1 conformers
pyrz1_conf-1 have E (Hartree): -457.336221
lowest energy conformer: pyrz1_conf-1
--------------------------------------------------------------
for substrate pyrd2, we have 1 conformers
pyrd2_conf-1 have E (Hartree): -441.302704
lowest energy conformer: pyrd2_conf-1
----------------------------------

In [9]:
# Collect the conformer name selected for every substrate; these names are
# the stems used to build the .com file names
filtered_result_filenames = [
    conformer_name
    for selected in filtered_result.values()
    for conformer_name in selected
]
print(f"filter down to {len(filtered_result_filenames)} COM files")

filter down to 35 COM files


In [10]:
if filter_anion:  # copy the anion input file of every selected conformer
    print(f"filtering anion com files, save to {anion_com_output} folder")
    for filename in filtered_result_filenames:
        # Input files are named <conformer>_anion<extension>
        actual_filename = f"{filename}_anion{file_extension}"
        source_path = os.sep.join((input_anion_com_folder, actual_filename))
        target_path = os.sep.join((anion_com_output, actual_filename))
        print(f"{source_path} -> {target_path}")
        # The destination passed is the output folder itself, not target_path
        shutil.copy(source_path, anion_com_output)

filtering anion com files, save to 3.anion_com_tom_filtered folder
2.anion_com_tom\pyrd1_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrd1_conf-1_anion.com
2.anion_com_tom\pyrdz1_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrdz1_conf-1_anion.com
2.anion_com_tom\pyrmd1_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrmd1_conf-1_anion.com
2.anion_com_tom\pyrz1_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrz1_conf-1_anion.com
2.anion_com_tom\pyrd2_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrd2_conf-1_anion.com
2.anion_com_tom\pyrdz2_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrdz2_conf-1_anion.com
2.anion_com_tom\pyrmd2_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrmd2_conf-1_anion.com
2.anion_com_tom\pyrz2_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrz2_conf-1_anion.com
2.anion_com_tom\pyrd3_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrd3_conf-1_anion.com
2.anion_com_tom\pyrdz3_conf-1_anion.com -> 3.anion_com_tom_filtered\pyrdz3_conf-1_anion.com
2.anion_com_tom\pyrmd3_

In [11]:
if filter_open_shell:  # copy the open-shell input file of every selected conformer
    print(f"filtering open shell com files")
    for filename in filtered_result_filenames:
        # Input files are named <conformer>_openshell<extension>
        actual_filename = f"{filename}_openshell{file_extension}"
        source_path = os.sep.join((input_open_shell_com_folder, actual_filename))
        target_path = os.sep.join((open_shell_com_output, actual_filename))
        print(f"{source_path} -> {target_path}")
        # The destination passed is the output folder itself, not target_path
        shutil.copy(source_path, open_shell_com_output)

filtering open shell com files
1.open_shell_com_tom\pyrd1_conf-1_openshell.com -> 3.open_shell_com_tom_filtered\pyrd1_conf-1_openshell.com
1.open_shell_com_tom\pyrdz1_conf-1_openshell.com -> 3.open_shell_com_tom_filtered\pyrdz1_conf-1_openshell.com
1.open_shell_com_tom\pyrmd1_conf-1_openshell.com -> 3.open_shell_com_tom_filtered\pyrmd1_conf-1_openshell.com
1.open_shell_com_tom\pyrz1_conf-1_openshell.com -> 3.open_shell_com_tom_filtered\pyrz1_conf-1_openshell.com
1.open_shell_com_tom\pyrd2_conf-1_openshell.com -> 3.open_shell_com_tom_filtered\pyrd2_conf-1_openshell.com
1.open_shell_com_tom\pyrdz2_conf-1_openshell.com -> 3.open_shell_com_tom_filtered\pyrdz2_conf-1_openshell.com
1.open_shell_com_tom\pyrmd2_conf-1_openshell.com -> 3.open_shell_com_tom_filtered\pyrmd2_conf-1_openshell.com
1.open_shell_com_tom\pyrz2_conf-1_openshell.com -> 3.open_shell_com_tom_filtered\pyrz2_conf-1_openshell.com
1.open_shell_com_tom\pyrd3_conf-1_openshell.com -> 3.open_shell_com_tom_filtered\pyrd3_conf-1_ope