# **Run this after all the GP notebooks**

### This copies all the summary files and merges them by appending `_anion` and `_openshell` to the corresponding species' property names
### Then it converts from the INTERNAL naming convention to the PUBLISHED naming convention

In [1]:
import os
import re
import pandas as pd

# --- Configuration -----------------------------------------------------------
# Spreadsheet that contains the internal id, the mapped (published) id and the SMILES.
smiles_spreadsheet = "smiles_mbbc_renamed.xlsx"
origin_id_header = "id_renamed"  # column used as the key of the id mapping
mapped_id_header = "ms_id"  # column used as the value of the id mapping

# Each species folder stores its summary workbook under the same relative path:
#   ../<species_folder>/<postprocessed_results_folder>/<summary_filename>
summary_filename = "Summary_Properties_all.xlsx"
postprocessed_results_folder = "5.postprocessed_results"
anion_summary_file = os.path.join(
    "..", "gp_anion", postprocessed_results_folder, summary_filename
)
closed_shell_summary_file = os.path.join(
    "..", "gp_closed_shell", postprocessed_results_folder, summary_filename
)
openshell_summary_file = os.path.join(
    "..", "gp_openshell", postprocessed_results_folder, summary_filename
)


# Workbooks written by this notebook: the merged table with internal ids and
# the same table converted to published ids.
output_summary_filename = "Summary_Properties_all_species.xlsx"
# os.path.splitext strips only the final extension, so a stem or directory
# that itself contains a dot is not truncated (str.split(".")[0] would cut at
# the first dot).
output_summary_filename_published = (
    os.path.splitext(output_summary_filename)[0] + "_published.xlsx"
)

In [2]:
# Fail fast: all three species summaries must be present before any of them
# is loaded. The first missing path (anion, closed shell, open shell order)
# is reported.
for required_summary in (
    anion_summary_file,
    closed_shell_summary_file,
    openshell_summary_file,
):
    if not os.path.exists(required_summary):
        raise FileNotFoundError(f'{required_summary} not found')

# Load each workbook and preview its first row as a quick sanity check.
anion_summary_df = pd.read_excel(anion_summary_file)
display(anion_summary_df.head(1))

closed_shell_summary_df = pd.read_excel(closed_shell_summary_file)
display(closed_shell_summary_df.head(1))

openshell_summary_df = pd.read_excel(openshell_summary_file)
display(openshell_summary_df.head(1))

Unnamed: 0,Compound_Name,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,qh_G(T)_spc(Hartree),T,HOMO_Boltz,LUMO_Boltz,...,SASA_volume(Å³)_Boltz,SASA_sphericity_Boltz,NBO_charge_C1_Boltz,NBO_charge_C2_Boltz,distance_C1_C2(Å)_Boltz,%Vbur_C1_2.0Å_Boltz,%Vbur_C2_2.0Å_Boltz,Sterimol_L_C1_C2(Å)_morfeus_Boltz,Sterimol_B1_C1_C2(Å)_morfeus_Boltz,Sterimol_B5_C1_C2(Å)_morfeus_Boltz
0,cinnoline001,-916.254603,0.126956,-916.117407,0.045383,0.044889,-916.162295,298.15,-0.06257,0.12212,...,493.651517,0.900413,-0.57425,-0.0595,1.3649,85.424199,96.826575,7.229331,1.710807,6.519865


Unnamed: 0,Compound_Name,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,qh_G(T)_spc(Hartree),T,HOMO_Boltz,LUMO_Boltz,...,NMR_shift_C1_Boltz,NMR_shift_C2_Boltz,distance_C1_C2(Å)_Boltz,%Vbur_C1_2.0Å_Boltz,%Vbur_C2_2.0Å_Boltz,Sterimol_L_C1_C2(Å)_morfeus_Boltz,Sterimol_B1_C1_C2(Å)_morfeus_Boltz,Sterimol_B5_C1_C2(Å)_morfeus_Boltz,pyramidalization_Gavrish_C1(°)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,cinnoline001,-916.822182,0.141515,-916.670241,0.045658,0.045417,-916.715658,298.15,-0.30166,-0.05758,...,169.0436,37.2257,1.50178,94.040548,96.787836,7.095002,1.888265,6.62731,6.052108,0.814133


Unnamed: 0,Compound_Name,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,qh_G(T)_spc(Hartree),T,HOMO_Boltz,LUMO_Boltz,...,SASA_volume(Å³)_Boltz,SASA_sphericity_Boltz,NBO_charge_C1_Boltz,NBO_charge_C2_Boltz,distance_C1_C2(Å)_Boltz,%Vbur_C1_2.0Å_Boltz,%Vbur_C2_2.0Å_Boltz,Sterimol_L_C1_C2(Å)_morfeus_Boltz,Sterimol_B1_C1_C2(Å)_morfeus_Boltz,Sterimol_B5_C1_C2(Å)_morfeus_Boltz
0,cinnoline001,-916.169644,0.127772,-916.031706,0.045728,0.045424,-916.07713,298.15,-0.26787,-0.04585,...,491.694011,0.901822,-0.25479,-0.12516,1.38825,85.253099,96.920196,7.24425,1.710229,6.467309


In [3]:
# Merge the three per-species summary tables into one wide table with one row
# per compound.
#
# Resulting column layout:
#   Compound_Name,
#   then, for every closed-shell column in closed-shell order:
#       <column>, <column>_anion, <column>_openshell
#       (a suffixed column is added only if that species has the column and
#        the column is not listed in `exclude_columns`),
#   then anion-only columns as <column>_anion,
#   then open-shell-only columns as <column>_openshell.
# Rows follow the closed-shell table; a compound missing from the anion or
# open-shell table gets NaN in the corresponding suffixed columns.

# Index every table by Compound_Name so columns can be aligned row-wise.
# set_index returns a new frame, so the *_summary_df inputs are left untouched.
anion_remaining = anion_summary_df.set_index('Compound_Name')
closed_shell_remaining = closed_shell_summary_df.set_index('Compound_Name')
openshell_remaining = openshell_summary_df.set_index('Compound_Name')

# Row-wise alignment is only well defined when every compound appears once per
# table; with duplicated keys a join would silently multiply rows.
for species_label, species_df in (
    ("anion", anion_remaining),
    ("closed_shell", closed_shell_remaining),
    ("openshell", openshell_remaining),
):
    if not species_df.index.is_unique:
        duplicated_names = (
            species_df.index[species_df.index.duplicated()].unique().tolist()
        )
        raise ValueError(
            f"Duplicate Compound_Name entries in the {species_label} summary: "
            f"{duplicated_names}"
        )

# Columns kept once (closed-shell value only), without _anion/_openshell copies.
exclude_columns = ["T"]

reference_index = closed_shell_remaining.index
suffixed_species = (("_anion", anion_remaining), ("_openshell", openshell_remaining))

# Collect all output columns first and concatenate once at the end, instead of
# merging the growing frame once per column.
merged_columns = []
for column in closed_shell_remaining.columns:
    merged_columns.append(closed_shell_remaining[column])
    if column in exclude_columns:
        continue
    for suffix, species_df in suffixed_species:
        if column in species_df.columns:
            # reindex == left join on Compound_Name against the closed-shell rows
            merged_columns.append(
                species_df[column].reindex(reference_index).rename(f"{column}{suffix}")
            )

# What is left in the anion / open-shell tables are the columns the closed-shell
# table does not have (shared columns, excluded or not, are dropped here).
shared_columns = set(closed_shell_remaining.columns)
anion_remaining = anion_remaining.drop(
    columns=[name for name in anion_remaining.columns if name in shared_columns]
)
openshell_remaining = openshell_remaining.drop(
    columns=[name for name in openshell_remaining.columns if name in shared_columns]
)

# Species-only columns go last: all anion ones first, then all open-shell ones.
for suffix, species_df in (("_anion", anion_remaining), ("_openshell", openshell_remaining)):
    for column in species_df.columns:
        merged_columns.append(
            species_df[column].reindex(reference_index).rename(f"{column}{suffix}")
        )

if merged_columns:
    merged_df = pd.concat(merged_columns, axis=1)
else:
    # Closed-shell table had no property columns and neither did the others.
    merged_df = pd.DataFrame(index=reference_index)

if merged_df.columns.duplicated().any():
    clashing = merged_df.columns[merged_df.columns.duplicated()].tolist()
    raise ValueError(f"Suffixing produced duplicate column names: {clashing}")

# Turn Compound_Name back into a regular first column.
merged_df = merged_df.rename_axis('Compound_Name').reset_index()

# Display the resulting dataframe
display(merged_df.head())

Unnamed: 0,Compound_Name,E_spc (Hartree),E_spc (Hartree)_anion,E_spc (Hartree)_openshell,ZPE(Hartree),ZPE(Hartree)_anion,ZPE(Hartree)_openshell,H_spc(Hartree),H_spc(Hartree)_anion,H_spc(Hartree)_openshell,...,Sterimol_L_C1_C2(Å)_morfeus_Boltz_anion,Sterimol_L_C1_C2(Å)_morfeus_Boltz_openshell,Sterimol_B1_C1_C2(Å)_morfeus_Boltz,Sterimol_B1_C1_C2(Å)_morfeus_Boltz_anion,Sterimol_B1_C1_C2(Å)_morfeus_Boltz_openshell,Sterimol_B5_C1_C2(Å)_morfeus_Boltz,Sterimol_B5_C1_C2(Å)_morfeus_Boltz_anion,Sterimol_B5_C1_C2(Å)_morfeus_Boltz_openshell,pyramidalization_Gavrish_C1(°)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,cinnoline001,-916.822182,-916.254603,-916.169644,0.141515,0.126956,0.127772,-916.670241,-916.117407,-916.031706,...,7.229331,7.24425,1.888265,1.710807,1.710229,6.62731,6.519865,6.467309,6.052108,0.814133
1,imidazopyrazine001,-3008.78186,-3008.182049,-3008.127397,0.123636,0.107956,0.110076,-3008.648304,-3008.064051,-3008.00774,...,8.274198,8.183801,1.918522,1.757723,1.760709,4.867583,4.929419,4.830913,5.947583,0.803614
2,isoquinoline001,-3014.798915,-3014.203581,-3014.146461,0.154357,0.138532,0.140381,-3014.63409,-3014.054511,-3013.995666,...,7.111922,7.225379,1.889356,2.003615,1.742754,5.691907,5.694925,5.662842,6.066947,0.815513
3,isoquinoline002,-3014.810159,-3014.224397,-3014.155448,0.153908,0.13886,0.140346,-3014.645601,-3014.074972,-3014.004721,...,8.444481,8.54958,1.892364,1.7,1.756394,5.72437,5.7655,5.729767,5.915914,0.800419
4,pyrazine001,-476.900135,-476.298659,-476.245904,0.206006,0.191131,0.192688,-476.681848,-476.095642,-476.04143,...,8.777404,8.745725,2.737626,2.593618,2.710316,3.316209,3.538838,3.373775,5.965537,0.805425


In [4]:
# Report every column name that contains characters outside the ASCII range and
# collect the offending runs of characters in `nonascii_chars`.
non_ascii_run = re.compile(r"[^\x00-\x7F]+")
nonascii_chars = set()
for column in merged_df.columns:
    found_runs = non_ascii_run.findall(column)
    if not found_runs:
        continue
    print(
        f"Non-ascii character found in column: {column}, with characters: {found_runs}"
    )
    nonascii_chars |= set(found_runs)

Non-ascii character found in column: μ_Boltz, with characters: ['μ']
Non-ascii character found in column: μ_Boltz_anion, with characters: ['μ']
Non-ascii character found in column: μ_Boltz_openshell, with characters: ['μ']
Non-ascii character found in column: η_Boltz, with characters: ['η']
Non-ascii character found in column: η_Boltz_anion, with characters: ['η']
Non-ascii character found in column: η_Boltz_openshell, with characters: ['η']
Non-ascii character found in column: ω_Boltz, with characters: ['ω']
Non-ascii character found in column: ω_Boltz_anion, with characters: ['ω']
Non-ascii character found in column: ω_Boltz_openshell, with characters: ['ω']
Non-ascii character found in column: volume(Bohr_radius³/mol)_Boltz, with characters: ['³']
Non-ascii character found in column: volume(Bohr_radius³/mol)_Boltz_anion, with characters: ['³']
Non-ascii character found in column: volume(Bohr_radius³/mol)_Boltz_openshell, with characters: ['³']
Non-ascii character found in column: SA

In [5]:
# ASCII spelling for every non-ASCII symbol used in the column names.
# NOTE: earlier versions of this cell wrote "Amgstrom" for "Å" and "^3" for
# "²"; anything that reads the generated workbooks by column name must use the
# corrected spellings below.
char_map = {
    "Å": "Angstrom",
    "²": "^2",  # squared (areas) - must not share the cubed spelling
    "³": "^3",
    "η": "eta",
    "μ": "mu",
    "ω": "omega",
    "°": "deg",
}
# Replace each symbol in every column name. regex=False makes the replacement
# a plain literal substitution regardless of the pandas version's default.
for char, replacement in char_map.items():
    merged_df.columns = merged_df.columns.str.replace(char, replacement, regex=False)

In [6]:
# Write the merged dataframe to a new Excel file and size every column to its
# longest cell (or its header, whichever is longer).
# set_column below is an XlsxWriter worksheet method, so the engine is requested
# explicitly instead of relying on whichever engine pandas picks by default.
with pd.ExcelWriter(output_summary_filename, engine="xlsxwriter") as writer:
    merged_df.to_excel(writer, sheet_name="Summary_Properties_All", index=False)
    worksheet = writer.sheets["Summary_Properties_All"]

    for col_idx, column in enumerate(merged_df.columns):
        # An empty frame has no cell lengths to take the max of (max() would
        # be NaN), so fall back to the header length alone.
        if len(merged_df):
            longest_cell = int(merged_df[column].astype(str).map(len).max())
        else:
            longest_cell = 0
        column_width = max(longest_cell, len(str(column)))
        worksheet.set_column(col_idx, col_idx, column_width)

### Convert to published format

In [7]:
# Load the id spreadsheet (`smiles_spreadsheet`) and keep only the two id columns.
df = pd.read_excel(smiles_spreadsheet, header=0)[[origin_id_header, mapped_id_header]]

# Build the lookup {internal id -> published id}, pairing the two columns row by row.
mapping_dict_from_internal_to_published = {
    internal_id: published_id
    for internal_id, published_id in zip(df[origin_id_header], df[mapped_id_header])
}

In [8]:
# Show the internal -> published id mapping for a visual sanity check.
mapping_dict_from_internal_to_published

{'pyridazine001': 'Het096',
 'pyridine001': 'Het097',
 'pyridine002': 'Het098',
 'pyridazine002': 'Het099',
 'pyridine003': 'Het100',
 'pyrazine001': 'Het101',
 'pyridazine003': 'Het102',
 'pyridine004': 'Het103',
 'pyridine005': 'Het104',
 'pyridine006': 'Het105',
 'pyridine007': 'Het106',
 'pyridine008': 'Het107',
 'pyrimidine001': 'Het108',
 'pyridine009': 'Het109',
 'pyrimidine002': 'Het110',
 'pyrimidine003': 'Het111',
 'pyrimidine004': 'Het112',
 'pyrimidine005': 'Het113',
 'pyridine010': 'Het114',
 'pyrimidine006': 'Het115',
 'pyridine011': 'Het116',
 'pyridine012': 'Het117',
 'pyridine013': 'Het118',
 'pyrimidine007': 'Het119',
 'pyridine014': 'Het120',
 'pyrimidine008': 'Het121',
 'pyrimidine009': 'Het122',
 'pyrimidine010': 'Het123',
 'pyridine015': 'Het124',
 'pyridine016': 'Het125',
 'pyrimidine011': 'Het126',
 'pyrimidine012': 'Het127',
 'pyridine017': 'Het128',
 'pyridine018': 'Het129',
 'pyrimidine013': 'Het130',
 'pyridine019': 'Het131',
 'pyridine020': 'Het132',
 'pyri

In [9]:
# Workbooks whose Compound_Name column is converted to the published naming;
# here only the merged all-species summary written earlier in this notebook.
files = [output_summary_filename]

In [10]:
# Convert every workbook in `files` from internal to published compound ids,
# sort the rows by the number contained in the (converted) Compound_Name and
# save the result as a new workbook.

# One regular expression that matches any internal id as a whole id:
#   * not preceded by a letter/digit, so "pyrazine001" is not found inside
#     "imidazopyrazine001";
#   * not followed by a digit, so "pyridine001" is not found inside
#     "pyridine0010".
# Longer ids are listed first so the most specific alternative is tried first.
internal_ids = sorted(mapping_dict_from_internal_to_published, key=len, reverse=True)
if internal_ids:
    internal_id_pattern = re.compile(
        r"(?<![A-Za-z0-9])(?:"
        + "|".join(re.escape(internal_id) for internal_id in internal_ids)
        + r")(?![0-9])"
    )
else:
    # An empty alternation would match the empty string everywhere.
    internal_id_pattern = None

for file in files:
    content = pd.read_excel(file, header=0)

    # Only the Compound_Name column carries ids; replace each internal id found
    # in it with its published counterpart (names without a known id are kept).
    if internal_id_pattern is not None:
        content["Compound_Name"] = content["Compound_Name"].str.replace(
            internal_id_pattern,
            lambda match: mapping_dict_from_internal_to_published[match.group(0)],
            regex=True,
        )

    # Sort ascending by the first run of digits in Compound_Name. A stable sort
    # keeps the existing order of rows that share the same number.
    content["Compound_numbering"] = (
        content["Compound_Name"].str.extract(r"(\d+)", expand=False).astype(int)
    )
    content = (
        content.sort_values(by="Compound_numbering", kind="stable")
        .drop(columns=["Compound_numbering"])
        .reset_index(drop=True)
    )

    # The merged summary keeps its configured published name; any other input
    # gets "<stem>_published<ext>" so several inputs do not overwrite each other.
    if file == output_summary_filename:
        published_file = output_summary_filename_published
    else:
        file_stem, file_ext = os.path.splitext(file)
        published_file = f"{file_stem}_published{file_ext}"

    display(f"saving to {published_file}")
    # set_column below is an XlsxWriter worksheet method, so request that engine
    # explicitly instead of relying on pandas' default choice.
    with pd.ExcelWriter(published_file, engine="xlsxwriter") as writer:
        content.to_excel(writer, sheet_name="Summary_Properties_All", index=False)
        worksheet = writer.sheets["Summary_Properties_All"]

        # Size every column to its longest cell or its header, whichever is longer.
        for col_idx, column in enumerate(content.columns):
            if len(content):
                longest_cell = int(content[column].astype(str).map(len).max())
            else:
                longest_cell = 0
            column_width = max(longest_cell, len(str(column)))
            worksheet.set_column(col_idx, col_idx, column_width)

'saving to Summary_Properties_all_species_published.xlsx'