### <p style="font-family: Arial; color: gold; "> Optionally, you can reduce the number of open shell and anion jobs by keeping only the lowest energy conformer of each molecule. This notebook does that based on the FINISHED DFT results.

### <p style="font-family: Arial; color: red; ">Requirement: put the finished DFT results for the closed shell species into a subfolder named **1.DFT_log_files** (the value of `DFT_log_folder` defined below)

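### <p style="font-family: Arial; color: red; "> Use the same file naming convention in all of the folders being used: a log file named `name001_conf-1.log` must correspond to `name001_conf-1_anion.com` and `name001_conf-1_openshell.com`.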

In [1]:
import re, os, sys, shutil, subprocess

### <p style="font-family: Arial; color: gold; font-weight: bold;"> define your input and output folders below

In [2]:
# ---- user settings: switches first, then the folders they act on ----

# Set a switch to False to skip the corresponding copy step further down
filter_anion = True
filter_open_shell = True

# Folder containing the finished closed-shell DFT *.log files read by GoodVibes
DFT_log_folder = "1.DFT_log_files"

# Folders the unfiltered .com inputs are read from
input_anion_com_folder = "2.anion_com"
input_open_shell_com_folder = "1.open_shell_com"

# Folders that receive the .com inputs of the lowest energy conformers
output_anion_com = "3.anion_com_filtered"
output_open_shell_com = "3.open_shell_com_filtered"

In [3]:
def parse_goodvibes_output(output: str, temp: float = 298.15) -> dict:
    """Extract the per-structure energy table from GoodVibes' text output.

    The table is located between the first two lines that consist only of
    12 or more asterisks. The line directly above the first of them is taken
    as the column header, and every row in between that starts with the
    marker ``o`` followed by whitespace is parsed as
    ``o <structure name> <value> <value> ...``.

    Args:
        output: Full text printed by GoodVibes.
        temp: Temperature stored under the ``"T"`` key of every structure.

    Returns:
        ``{structure_name: {column_label: float, ..., "T": temp}}``.
        Header names listed in ``column_mapping`` are renamed, any other
        header is kept verbatim. An empty dict is returned when the two
        separator lines are not both present.

    Raises:
        ValueError: if a row has fewer values than the header has columns,
            or if a value cannot be converted to ``float``.
    """
    column_mapping = {
        "E_SPC": "E_spc (Hartree)",
        "E": "E (Hartree)",
        "ZPE": "ZPE(Hartree)",
        "H_SPC": "H_spc(Hartree)",
        "T.S": "T*S",
        "T.qh-S": "T*qh_S",
        "G(T)_SPC": "G(T)_spc(Hartree)",
        "qh-G(T)_SPC": "qh_G(T)_spc(Hartree)",
    }
    separator = re.compile(r"^\s*\*{12,}\s*$")  # a line made of 12 or more asterisks
    # The marker must be a standalone "o"; a line that merely begins with the
    # letter o (for example a name or a word) is not a table row.
    row_marker = re.compile(r"^\s*o\s")

    lines = output.splitlines()
    separator_rows = [idx for idx, text in enumerate(lines) if separator.match(text)]
    if len(separator_rows) < 2:
        return {}
    start_index, end_index = separator_rows[0], separator_rows[1]

    # The header sits on the line above the first separator. When the separator
    # is the very first line there is no header; indexing with -1 would wrongly
    # pick up the last line of the output instead.
    header_line = lines[start_index - 1] if start_index > 0 else ""
    # Drop the first header column, which labels the structure name
    headers = re.split(r"\s+", header_line.strip())[1:] if header_line else []

    data = {}
    for line in lines[start_index + 1 : end_index]:
        if not row_marker.match(line):
            continue
        parts = line.split()
        if len(parts) < 2:
            continue  # marker without a structure name, nothing to record
        structure_name = parts[1]
        values = parts[2:]  # parts[0] is the marker, parts[1] the name
        if len(values) < len(headers):
            raise ValueError(
                f"GoodVibes row for {structure_name!r} has {len(values)} values "
                f"but the header lists {len(headers)} columns"
            )

        # zip stops at the last header, so extra trailing tokens are ignored
        structure_data = {
            column_mapping.get(header, header): float(value)
            for header, value in zip(headers, values)
        }
        structure_data["T"] = temp  # Add temperature to each structure's data
        data[structure_name] = structure_data

    return data


def get_goodvibes_e_batch(temp: float = 298.15) -> dict:
    """Run GoodVibes on the log files in ``DFT_log_folder`` and parse its table.

    GoodVibes is started as ``python -m goodvibes`` with the interpreter that
    runs this notebook, so it has to be installed in the same environment.

    Args:
        temp: Temperature passed to GoodVibes via ``-t`` and stored with every
            parsed structure.

    Returns:
        The dictionary produced by ``parse_goodvibes_output`` for the captured
        standard output.

    Raises:
        RuntimeError: if the GoodVibes process exits with a non-zero status.
            The captured stderr is included in the message.
    """
    # Construct command-line arguments for GoodVibes
    cmd_args = [
        sys.executable,
        "-m",
        "goodvibes",
        # No shell is involved, so the wildcard reaches GoodVibes unexpanded
        os.path.join(DFT_log_folder, "*.log"),
        # NOTE(review): "--spc link" presumably reads the single point energy
        # from a linked job in the same log file - confirm in the GoodVibes docs
        "--spc",
        "link",
        "-t",
        str(temp),
    ]
    # Run the GoodVibes command and capture the output
    result = subprocess.run(cmd_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    # A failed run would otherwise look like "no structures found" downstream
    if result.returncode != 0:
        raise RuntimeError(
            f"GoodVibes exited with status {result.returncode}:\n{result.stderr.strip()}"
        )
    # Parse the output
    return parse_goodvibes_output(result.stdout, temp)

In [4]:
# Collect the GoodVibes energies of every conformer
result = get_goodvibes_e_batch()
# Re-order the entries by the structure number in each name: leading non-digit
# characters, then the digits that are captured, then an underscore.
# A name that does not follow this pattern makes re.search return None and fails here.
result = dict(
    sorted(
        result.items(),
        key=lambda entry: int(re.search(r"^\D+(\d+)_{1}", entry[0]).group(1)),
    )
)

### <p style="font-family: Arial; color: gold; font-weight: bold;"> raw output, each conformer is in a separate dictionary entry

In [5]:
# Display the sorted per-conformer dictionary as the cell output
result

{'cinnoline001_conf-1': {'E_spc (Hartree)': -916.822182,
  'E (Hartree)': -916.899135,
  'ZPE(Hartree)': 0.141515,
  'H_spc(Hartree)': -916.670241,
  'T*S': 0.045658,
  'T*qh_S': 0.045417,
  'G(T)_spc(Hartree)': -916.715899,
  'qh_G(T)_spc(Hartree)': -916.715658,
  'T': 298.15},
 'imidazopyrazine001_conf-1': {'E_spc (Hartree)': -3008.78186,
  'E (Hartree)': -3006.352654,
  'ZPE(Hartree)': 0.123636,
  'H_spc(Hartree)': -3008.648304,
  'T*S': 0.045253,
  'T*qh_S': 0.045129,
  'G(T)_spc(Hartree)': -3008.693558,
  'qh_G(T)_spc(Hartree)': -3008.693433,
  'T': 298.15},
 'isoquinoline001_conf-1': {'E_spc (Hartree)': -3014.798915,
  'E (Hartree)': -3012.402628,
  'ZPE(Hartree)': 0.154357,
  'H_spc(Hartree)': -3014.63409,
  'T*S': 0.046453,
  'T*qh_S': 0.045903,
  'G(T)_spc(Hartree)': -3014.680543,
  'qh_G(T)_spc(Hartree)': -3014.679993,
  'T': 298.15},
 'pyrazine001_conf-1': {'E_spc (Hartree)': -476.900135,
  'E (Hartree)': -476.996748,
  'ZPE(Hartree)': 0.206006,
  'H_spc(Hartree)': -476.6818

In [6]:
# grouped results by substrates
def group_results_by_substrates(result: dict) -> dict:
    """Collect conformers under the substrate they belong to.

    The substrate name is the shortest non-empty prefix of the conformer name
    that is followed by an underscore, e.g. ``pyrd1_conf-1`` -> ``pyrd1``.
    Conformer names without such a prefix are left out.

    Args:
        result: Mapping of conformer name to its property dictionary.

    Returns:
        Mapping of substrate name to a list of single-entry
        ``{conformer_name: properties}`` dictionaries, in input order.
    """
    prefix_before_underscore = re.compile(r"^(.+?)_")
    grouped = {}
    for name, props in result.items():
        found = prefix_before_underscore.match(name)
        if found is None:
            continue
        # setdefault creates the list the first time a substrate is seen
        grouped.setdefault(found.group(1), []).append({name: props})
    return grouped


# Group the sorted per-conformer results by substrate name
result_grouped_by_substrates = group_results_by_substrates(result)

### <p style="font-family: Arial; color: gold; font-weight: bold;"> group by molecule id, each entry is a list of conformers under the same molecule id

In [7]:
# Display the grouped dictionary as the cell output
result_grouped_by_substrates

{'cinnoline001': [{'cinnoline001_conf-1': {'E_spc (Hartree)': -916.822182,
    'E (Hartree)': -916.899135,
    'ZPE(Hartree)': 0.141515,
    'H_spc(Hartree)': -916.670241,
    'T*S': 0.045658,
    'T*qh_S': 0.045417,
    'G(T)_spc(Hartree)': -916.715899,
    'qh_G(T)_spc(Hartree)': -916.715658,
    'T': 298.15}}],
 'imidazopyrazine001': [{'imidazopyrazine001_conf-1': {'E_spc (Hartree)': -3008.78186,
    'E (Hartree)': -3006.352654,
    'ZPE(Hartree)': 0.123636,
    'H_spc(Hartree)': -3008.648304,
    'T*S': 0.045253,
    'T*qh_S': 0.045129,
    'G(T)_spc(Hartree)': -3008.693558,
    'qh_G(T)_spc(Hartree)': -3008.693433,
    'T': 298.15}}],
 'isoquinoline001': [{'isoquinoline001_conf-1': {'E_spc (Hartree)': -3014.798915,
    'E (Hartree)': -3012.402628,
    'ZPE(Hartree)': 0.154357,
    'H_spc(Hartree)': -3014.63409,
    'T*S': 0.046453,
    'T*qh_S': 0.045903,
    'G(T)_spc(Hartree)': -3014.680543,
    'qh_G(T)_spc(Hartree)': -3014.679993,
    'T': 298.15}}],
 'pyrazine001': [{'pyrazin

### <p style="font-family: Arial; color: gold; font-weight: bold;"> only keep the lowest energy conformer for each molecule

In [8]:
# Function to select the conformer with the lowest specified energy property for each substrate
def select_lowest_energy_conformer(substrates: dict, column: str = "G(T)_spc(Hartree)") -> dict:
    """Pick, for every substrate, the conformer with the smallest ``column`` value.

    Args:
        substrates: Mapping of substrate name to a list of
            ``{conformer_name: properties}`` dictionaries.
        column: Key of the property to compare.

    Returns:
        Mapping of substrate name to the chosen ``{conformer_name: properties}``
        entry. A conformer without ``column`` is ranked as +infinity, and the
        first conformer wins a tie.
    """

    def _name_of(entry: dict) -> str:
        # first (normally the only) key of the entry
        return list(entry.keys())[0]

    def _properties_of(entry: dict) -> dict:
        # first (normally the only) value of the entry
        return list(entry.values())[0]

    def _ranking_value(entry: dict):
        return _properties_of(entry).get(column, float("inf"))

    print(f"filtering based on {column}")
    chosen = {}
    for substrate in substrates:
        candidates = substrates[substrate]
        print("--------------------------------------------------------------")
        print(f"for substrate {substrate}, we have {len(candidates)} conformers")
        # Report every candidate before deciding
        for entry in candidates:
            reported = _properties_of(entry).get(column, None)
            print(f"{_name_of(entry)} have {column}: {reported}")
        best = min(candidates, key=_ranking_value)
        print(f"lowest energy conformer: {_name_of(best)}")
        chosen[substrate] = best
    return chosen


# Keep one conformer per substrate, ranked by the "G(T)_spc(Hartree)" column
filtered_result = select_lowest_energy_conformer(result_grouped_by_substrates, column="G(T)_spc(Hartree)")

filtering based on G(T)_spc(Hartree)
--------------------------------------------------------------
for substrate cinnoline001, we have 1 conformers
cinnoline001_conf-1 have G(T)_spc(Hartree): -916.715899
lowest energy conformer: cinnoline001_conf-1
--------------------------------------------------------------
for substrate imidazopyrazine001, we have 1 conformers
imidazopyrazine001_conf-1 have G(T)_spc(Hartree): -3008.693558
lowest energy conformer: imidazopyrazine001_conf-1
--------------------------------------------------------------
for substrate isoquinoline001, we have 1 conformers
isoquinoline001_conf-1 have G(T)_spc(Hartree): -3014.680543
lowest energy conformer: isoquinoline001_conf-1
--------------------------------------------------------------
for substrate pyrazine001, we have 4 conformers
pyrazine001_conf-1 have G(T)_spc(Hartree): -476.731287
pyrazine001_conf-2 have G(T)_spc(Hartree): -476.731275
pyrazine001_conf-3 have G(T)_spc(Hartree): -476.730052
pyrazine001_conf-4 

In [9]:
# Flatten the {substrate: {conformer_name: properties}} mapping into the list of
# selected conformer names; these are the base names of the .com files to keep
filtered_result_filenames = [
    conformer_name
    for conformer in filtered_result.values()
    for conformer_name in conformer
]
print(f"filter down to {len(filtered_result_filenames)} COM files")

filter down to 252 COM files


### <p style="font-family: Arial; color: gold; font-weight: bold;"> filter anion com files

In [10]:
if filter_anion:  # copy the anion inputs of the selected conformers only
    print(f"filtering anion com files, save to {output_anion_com} folder")
    # The destination is not created here, it has to exist before copying
    if not os.path.exists(output_anion_com):
        raise FileNotFoundError(f"{output_anion_com} folder does not exist, rerun the 0.clean_setup_folders.ipynb notebook")
    for filename in filtered_result_filenames:
        # anion inputs carry the conformer name plus an "_anion" suffix
        actual_filename = f"{filename}_anion.com"
        source_path = input_anion_com_folder + os.sep + actual_filename
        print(f"{source_path} -> {output_anion_com}{os.sep}{actual_filename}")
        shutil.copy(source_path, output_anion_com)

filtering anion com files, save to 3.anion_com_filtered folder
2.anion_com\cinnoline001_conf-1_anion.com -> 3.anion_com_filtered\cinnoline001_conf-1_anion.com
2.anion_com\imidazopyrazine001_conf-1_anion.com -> 3.anion_com_filtered\imidazopyrazine001_conf-1_anion.com
2.anion_com\isoquinoline001_conf-1_anion.com -> 3.anion_com_filtered\isoquinoline001_conf-1_anion.com
2.anion_com\pyrazine001_conf-1_anion.com -> 3.anion_com_filtered\pyrazine001_conf-1_anion.com
2.anion_com\pyrazolopyridine001_conf-1_anion.com -> 3.anion_com_filtered\pyrazolopyridine001_conf-1_anion.com
2.anion_com\pyrazolopyrimidine001_conf-1_anion.com -> 3.anion_com_filtered\pyrazolopyrimidine001_conf-1_anion.com
2.anion_com\pyridazine001_conf-2_anion.com -> 3.anion_com_filtered\pyridazine001_conf-2_anion.com
2.anion_com\pyridine001_conf-2_anion.com -> 3.anion_com_filtered\pyridine001_conf-2_anion.com
2.anion_com\pyrimidine001_conf-1_anion.com -> 3.anion_com_filtered\pyrimidine001_conf-1_anion.com
2.anion_com\quinazoline

### <p style="font-family: Arial; color: gold; font-weight: bold;"> filter radical com files

In [11]:
if filter_open_shell:  # Filter open shell com files
    print(f"filtering open shell com files")
    # The destination must be an existing directory. If it is missing,
    # shutil.copy treats the destination as a file name, so every copy would
    # overwrite one single file named like the folder.
    if not os.path.isdir(output_open_shell_com):
        raise FileNotFoundError(
            f"{output_open_shell_com} folder does not exist, rerun the 0.clean_setup_folders.ipynb notebook"
        )
    for filename in filtered_result_filenames:
        # open shell inputs carry the conformer name plus an "_openshell" suffix
        actual_filename = filename + "_openshell" + ".com"
        print(
            f"{input_open_shell_com_folder}{os.sep}{actual_filename} -> {output_open_shell_com}{os.sep}{actual_filename}"
        )
        shutil.copy(os.path.join(input_open_shell_com_folder, actual_filename), output_open_shell_com)

filtering open shell com files
1.open_shell_com\cinnoline001_conf-1_openshell.com -> 3.open_shell_com_filtered\cinnoline001_conf-1_openshell.com
1.open_shell_com\imidazopyrazine001_conf-1_openshell.com -> 3.open_shell_com_filtered\imidazopyrazine001_conf-1_openshell.com
1.open_shell_com\isoquinoline001_conf-1_openshell.com -> 3.open_shell_com_filtered\isoquinoline001_conf-1_openshell.com
1.open_shell_com\pyrazine001_conf-1_openshell.com -> 3.open_shell_com_filtered\pyrazine001_conf-1_openshell.com
1.open_shell_com\pyrazolopyridine001_conf-1_openshell.com -> 3.open_shell_com_filtered\pyrazolopyridine001_conf-1_openshell.com
1.open_shell_com\pyrazolopyrimidine001_conf-1_openshell.com -> 3.open_shell_com_filtered\pyrazolopyrimidine001_conf-1_openshell.com
1.open_shell_com\pyridazine001_conf-2_openshell.com -> 3.open_shell_com_filtered\pyridazine001_conf-2_openshell.com
1.open_shell_com\pyridine001_conf-2_openshell.com -> 3.open_shell_com_filtered\pyridine001_conf-2_openshell.com
1.open_sh