In [1]:
import re, os, sys, json, subprocess

In [2]:
def parse_goodvibes_output(output: str, temp: float = 298.15) -> dict:
    """Extract the per-structure value table from GoodVibes text output.

    The table is taken to be the rows between the first two separator lines
    (lines made of 12 or more asterisks). The line directly above the first
    separator is read as the column header; its first token (the structure
    column) is dropped. Inside the table only rows whose first token is
    exactly ``o`` are parsed, as ``o <structure> <value> <value> ...``;
    every other row is ignored.

    Args:
        output: Text captured from a GoodVibes run.
        temp: Temperature stored under the ``"T"`` key of every structure.

    Returns:
        Dict mapping each structure name to a dict of column label -> float,
        plus the ``"T"`` entry. Header tokens listed in ``column_mapping`` are
        renamed, all others are kept verbatim. An empty dict is returned when
        two separator lines are not found.

    Raises:
        IndexError: If an ``o`` row has no structure name or fewer values
            than there are header columns.
        ValueError: If a value token cannot be converted to ``float``.
    """
    column_mapping = {
        "E_SPC": "E_spc (Hartree)",
        "E": "E (Hartree)",
        "ZPE": "ZPE(Hartree)",
        "H_SPC": "H_spc(Hartree)",
        "T.S": "T*S",
        "T.qh-S": "T*qh_S",
        "G(T)_SPC": "G(T)_spc(Hartree)",
        "qh-G(T)_SPC": "qh_G(T)_spc(Hartree)",
    }
    separator = re.compile(r"^\s*\*{12,}\s*$")  # 12 or more asterisks, nothing else

    lines = output.splitlines()

    # Locate the first two separator lines; the table sits between them.
    start_index = None
    end_index = None
    header_line = None
    for i, line in enumerate(lines):
        if not separator.match(line):
            continue
        if start_index is None:
            start_index = i
            # Only look one line up when such a line exists: for i == 0 the
            # index -1 would wrap around and pick the LAST line as the header.
            header_line = lines[i - 1] if i > 0 else None
        else:
            end_index = i
            break  # only the first two separators matter

    data = {}
    if start_index is None or end_index is None:
        return data

    # Column names in table order, without the leading structure column.
    headers = header_line.split()[1:] if header_line else []

    for line in lines[start_index + 1 : end_index]:
        parts = line.split()
        # The fixed offsets below (name at 1, values from 2) are only valid
        # when the marker "o" is a token of its own, so require exactly that
        # instead of accepting any line that merely starts with the letter o.
        if not parts or parts[0] != "o":
            continue

        structure_name = parts[1]
        structure_data = {
            column_mapping.get(header, header): float(parts[col + 2])
            for col, header in enumerate(headers)
        }
        structure_data["T"] = temp  # same temperature for every structure
        data[structure_name] = structure_data

    return data

In [3]:
def get_goodvibes_e_batch(temp: float = 298.15) -> dict:
    """Run GoodVibes on ``*.log`` and return the parsed result table.

    GoodVibes is started as ``<current interpreter> -m goodvibes`` with the
    arguments ``*.log --spc link -t <temp>`` in the current working directory.

    Args:
        temp: Value passed to GoodVibes via ``-t`` and stored with every
            parsed structure.

    Returns:
        The dict produced by ``parse_goodvibes_output`` for the captured
        standard output.

    Raises:
        RuntimeError: If the GoodVibes process exits with a non-zero status.
            The captured standard error is included in the message.
    """
    cmd_args = [
        sys.executable,
        "-m",
        "goodvibes",
        # No shell is involved, so the pattern reaches GoodVibes verbatim.
        # NOTE(review): presumably GoodVibes expands it itself - confirm.
        "*.log",
        "--spc",
        "link",
        "-t",
        str(temp),
    ]
    completed = subprocess.run(
        cmd_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )
    # A failed run (e.g. GoodVibes not installed) used to be indistinguishable
    # from "no structures found": stderr was captured and then thrown away.
    if completed.returncode != 0:
        raise RuntimeError(
            f"GoodVibes exited with status {completed.returncode}:\n"
            f"{completed.stderr.strip()}"
        )
    return parse_goodvibes_output(completed.stdout, temp)

In [4]:
# Run GoodVibes and collect the parsed table
result = get_goodvibes_e_batch()


def _structure_number_key(item):
    """Sort key for ``(structure_name, properties)`` pairs.

    Names of the form ``<non-digits><number>_...`` are ordered by that number.
    Names that do not follow the pattern are placed after all numbered ones
    instead of raising ``AttributeError`` on the failed regex match.
    """
    match = re.search(r"^\D+(\d+)_", item[0])
    if match is None:
        return (1, 0)
    return (0, int(match.group(1)))


# Sort by structure number; sorted() is stable, so entries sharing a number
# keep the order in which GoodVibes reported them.
result = dict(sorted(result.items(), key=_structure_number_key))
with open("goodvibes_output.json", "w") as f:
    json.dump(result, f, indent=2)

In [5]:
# Group the parsed results by substrate
def group_results_by_substrates(result: dict) -> dict:
    """Bucket conformer entries under their substrate name.

    The substrate name is the non-empty part of the conformer name in front
    of its last underscore, e.g. ``pyrd1_conf-1`` -> ``pyrd1``. Conformer
    names without such a prefix are left out of the result.

    Args:
        result: Dict mapping conformer name -> properties.

    Returns:
        Dict mapping substrate name -> list of single-entry dicts
        ``{conformer_name: properties}``, in the iteration order of ``result``.
    """
    prefix_re = re.compile(r"^(.+)_")  # greedy: stops at the last underscore
    grouped = {}
    for name, props in result.items():
        hit = prefix_re.match(name)
        if hit is None:
            continue
        # Create the substrate's list on first sight, then append to it.
        grouped.setdefault(hit.group(1), []).append({name: props})
    return grouped

# Group the conformers per substrate and store the grouping as JSON
result_grouped_by_substrates = group_results_by_substrates(result=result)
with open(file="goodvibes_output_grouped.json", mode="w") as f:
    json.dump(obj=result_grouped_by_substrates, fp=f, indent=2)

In [6]:
# Function to select the conformer with the lowest specified energy property for each substrate
def select_lowest_energy_conformer(
    substrates: dict, column: str = "E_spc (Hartree)"
) -> dict:
    """Pick, for every substrate, the conformer with the smallest ``column``.

    Args:
        substrates: Dict mapping substrate name -> list of single-entry dicts
            ``{conformer_name: properties}``.
        column: Key of the property to minimise. A conformer whose properties
            lack this key is ranked as ``+inf``, i.e. it is only chosen when
            no conformer of the substrate has the key.

    Returns:
        Dict mapping substrate name -> the chosen ``{conformer_name:
        properties}`` entry. On ties the earliest conformer in the list wins.
        Substrates whose conformer list is empty are omitted, because there
        is nothing to choose from (``min`` would raise ``ValueError``).
    """

    def _energy(conformer: dict) -> float:
        # Each entry holds exactly one conformer; its value is the properties.
        properties = list(conformer.values())[0]
        return properties.get(column, float("inf"))

    return {
        substrate: min(conformers, key=_energy)
        for substrate, conformers in substrates.items()
        if conformers
    }


# Keep only the conformer with the lowest E_spc for every substrate
filtered_result = select_lowest_energy_conformer(
    result_grouped_by_substrates, column="E_spc (Hartree)"
)
with open("goodvibes_output_filtered.json", "w") as f:
    json.dump(filtered_result, f, indent=2)

# Placed last on purpose: a bare expression in the middle of a cell is
# evaluated and discarded, only the final one is rendered by the notebook.
filtered_result