### <p style="font-family: Arial; color: gold; font-weight: bold;">**Created by Tom Tan on 8.30.2024** </p>
##### This single notebook now handles all prefixes; the downside is that the properties defined below are the same for every prefix.

***
# **1. Imports**

In [1]:
import os, re, glob
import pandas as pd

import get_properties_functions_for_WI as gp

# Project sub-folders, written as paths relative to the current working directory.
common_structure_folder = "1.common_structure"
# prepended to every "log_name" entry when the atom maps are loaded
log_files_folder = "2.log_files"
sdf_files_folder = "3.sdf_files"
# the "<prefix>_extracted_properties.xlsx" workbooks are written to and re-read from here
temp_folder = "temp"
# searched for "<prefix>_atom_map.xlsx" workbooks to discover the available prefixes
atom_mappings_folder = "4.atom_mappings"
# NOTE(review): presumably the destination of the post-processed results — confirm against later cells
output_folder = "5.postprocessed_results"

***
# **2. Import the atom map from preprocess notebook**
### <p style="font-family: Arial; color: gold; font-weight: bold;"> **Find all available prefixes based on the atom map files** </p>

In [2]:
# Map every compound prefix (the leading non-digit text before "_atom_map") to the
# list of atom-map workbooks found for it in atom_mappings_folder.
# NOTE: glob.glob(..., root_dir=...) requires Python 3.10 or newer.
prefixs = {}
for file in glob.glob("*.xlsx", root_dir=atom_mappings_folder):
    key = re.search(r"^(\D+)_atom_map", file)
    if key is None:
        # Workbook does not follow the "<prefix>_atom_map" naming pattern: skip it
        # rather than calling .group() on None, which raises AttributeError.
        continue
    prefixs.setdefault(key.group(1), []).append(file)

In [3]:
# Load the atom map of every prefix into a dict of DataFrames keyed by prefix.
atom_map_df_all = {}
for prefix in prefixs:
    # workbook written by the preprocess notebook for this prefix
    atom_mappings = f"{atom_mappings_folder}{os.sep}{prefix}_atom_map.xlsx"
    atom_map_df = pd.read_excel(
        atom_mappings,
        sheet_name="Sheet1",
        index_col=0,
        header=0,
        engine="openpyxl",
    )

    # turn the bare log names into paths inside log_files_folder
    atom_map_df["log_name"] = log_files_folder + os.sep + atom_map_df["log_name"]

    # store an independent copy so later edits to atom_map_df do not leak into the dict
    atom_map_df_all[prefix] = atom_map_df.copy()

    print(f"prefix: {prefix}")
    display(atom_map_df.head(3))

prefix: pyrdz


Unnamed: 0,log_name,N3,N4,C5,C6,C7,C2,C1,H1
0,2.log_files\pyrdz1_conf-1_openshell,N7,N6,C5,C4,C3,C2,C1,H8
1,2.log_files\pyrdz2_conf-1_openshell,C3,N4,N5,C6,C7,C2,C1,H8
2,2.log_files\pyrdz3_conf-1_openshell,N8,N7,C6,C5,C4,C3,C2,H12


prefix: pyrd


Unnamed: 0,log_name,C3,C4,N5,C6,C7,C2,C1,H1
0,2.log_files\pyrd1_conf-1_openshell,C5,C4,N3,C11,C10,C2,C1,H12
1,2.log_files\pyrd2_conf-1_openshell,C11,N10,C9,C4,C3,C2,C1,H12
2,2.log_files\pyrd3_conf-1_openshell,C3,C4,N5,C6,C7,C2,C1,H12


prefix: pyrmd


Unnamed: 0,log_name,N3,C4,C5,C6,N7,C2,C1,H1
0,2.log_files\pyrmd1_conf-1_openshell,N11,C10,C5,C4,N3,C2,C1,H12
1,2.log_files\pyrmd2_conf-1_openshell,C10,N9,C8,C3,N11,C2,C1,H12
2,2.log_files\pyrmd3_conf-2_openshell,N4,C5,C10,C11,N12,C3,C2,H16


prefix: pyrz


Unnamed: 0,log_name,C3,N4,C5,C6,N7,C2,C1,H1
0,2.log_files\pyrz1_conf-1_openshell,C11,N10,C5,C4,N3,C2,C1,H12
1,2.log_files\pyrz2_conf-1_openshell,C12,N11,C10,C5,N4,C3,C2,H16
2,2.log_files\pyrz3_conf-1_openshell,C7,N6,C5,C4,N3,C2,C1,H8


# **3. Define Properties to Collect**
### <p style="font-family: Arial; color: gold"> !!!User input required: edit or comment out the property blocks below so that only the properties you want to collect are active. </p>

In [4]:
# Collect the selected properties for every prefix.
# Each gp.get_* call takes the running dataframe and returns it with new columns added,
# so the order of the calls below fixes the column order of the result.

# show every column when the wide result tables are displayed (set once, not per prefix)
pd.set_option("display.max_columns", None)

# iterate over a snapshot of the keys because the dict values are replaced inside the loop
for prefix in list(atom_map_df_all):
    print(f"processing prefix: {prefix}")
    # ---------------GoodVibes Energies----------------
    # uses the GoodVibes 2021 Branch (Jupyter Notebook Compatible)
    # calculates the quasi harmonic corrected G(T) and single point corrected G(T) as well as other thermodynamic properties
    # inputs: dataframe, temperature
    df = atom_map_df_all[prefix].copy(deep=True)
    df = gp.get_goodvibes_e(df, 298.15)

    # ---------------Frontier Orbitals-----------------
    # E(HOMO), E(LUMO), mu(chemical potential or negative of molecular electronegativity), eta(hardness/softness), omega(electrophilicity index)
    df = gp.get_frontierorbs(df)

    # ---------------Polarizability--------------------
    # Exact polarizability
    df = gp.get_polarizability(df)

    # ---------------Dipole----------------------------
    # Total dipole moment magnitude in Debye
    df = gp.get_dipole(df)

    # ---------------Volume----------------------------
    # Molar volume
    # requires the Gaussian keyword = "volume" in the .com file
    df = gp.get_volume(df)

    # ---------------SASA------------------------------
    # Uses morfeus to calculate solvent accessible surface area and the volume under the SASA
    df = gp.get_SASA(df)

    # ---------------NBO-------------------------------
    # natural charge from NBO
    # requires the Gaussian keyword = "pop=nbo7" in the .com file
    nbo_list = ["C1", "C2"]
    df = gp.get_nbo(df, nbo_list)

    # ---------------NMR-------------------------------
    # isotropic NMR shift
    # requires the Gaussian keyword = "nmr=giao" in the .com file
    nmr_list = ["C1", "C2"]
    df = gp.get_nmr(df, nmr_list)

    # ---------------Distance--------------------------
    # distance between 2 atoms
    dist_list_of_lists = [["C1", "C2"]]
    df = gp.get_distance(df, dist_list_of_lists)

    # ---------------Angle-----------------------------
    # angle between 3 atoms
    # angle_list_of_lists = [["C5", "N1", "C1"]]
    # df = gp.get_angles(df, angle_list_of_lists)

    # ---------------Dihedral--------------------------
    # dihedral angle between 4 atoms
    # dihedral_list_of_lists = [["C4", "C5", "N1", "C1"], ["C2", "C1", "N1", "C5"]]
    # df = gp.get_dihedral(df, dihedral_list_of_lists)

    # ---------------Vbur Scan-------------------------
    # uses morfeus to calculate the buried volume at a series of radii (including hydrogens)
    # inputs: dataframe, list of atoms, start_radius, end_radius, and step_size
    # if you only want a single radius, put the same value for start_radius and end_radius (keep step_size > 0)
    vbur_list = ["C1", "C2"]
    df = gp.get_vbur_scan(df, vbur_list, 2, 2, 0.5)

    # ---------------Sterimol morfeus------------------
    # uses morfeus to calculate Sterimol L, B1, and B5 values
    # NOTE: this is much faster than the corresponding DBSTEP function (recommendation: use as default/if you don't need Sterimol2Vec)
    sterimol_list_of_lists = [["C1", "C2"]]
    df = gp.get_sterimol_morfeus(df, sterimol_list_of_lists)

    # ---------------Buried Sterimol-------------------
    # uses morfeus to calculate Sterimol L, B1, and B5 values within a given sphere of radius r_buried
    # atoms outside the sphere + 0.5 vdW radius are deleted and the Sterimol vectors are calculated
    # for more information: https://kjelljorner.github.io/morfeus/sterimol.html
    # inputs: dataframe, list of atom pairs, r_buried
    # sterimol_list_of_lists = [["C1", "C2"]]
    # df = gp.get_buried_sterimol(df, sterimol_list_of_lists, 5.5)

    # ---------------Sterimol DBSTEP-------------------
    # uses DBSTEP to calculate Sterimol L, B1, and B5 values
    # default grid point spacing (0.05 Angstrom) is used (can use custom spacing or vdw radii in the get_properties_functions script)
    # more info here: https://github.com/patonlab/DBSTEP
    # NOTE: this takes longer than the morfeus function (recommendation: only use this if you need Sterimol2Vec)
    # sterimol_list_of_lists = [["N1", "C1"], ["N1", "C5"]]
    # df = gp.get_sterimol_dbstep(df, sterimol_list_of_lists)

    # ---------------Sterimol2Vec----------------------
    # uses DBSTEP to calculate Sterimol Bmin and Bmax values at intervals from 0 to end_radius, with a given step_size
    # default grid point spacing (0.05 Angstrom) is used (can use custom spacing or vdw radii in the get_properties_functions script)
    # more info here: https://github.com/patonlab/DBSTEP
    # inputs: dataframe, list of atom pairs, end_radius, and step_size
    # sterimol2vec_list_of_lists = [["N1", "C1"], ["N1", "C5"]]
    # df = gp.get_sterimol2vec(df, sterimol2vec_list_of_lists, 1, 1.0)

    # ---------------Pyramidalization------------------
    # uses morfeus to calculate pyramidalization based on the 3 atoms in closest proximity to the defined atom
    # collects values based on two definitions of pyramidalization
    # details on these values can be found here: https://kjelljorner.github.io/morfeus/pyramidalization.html
    pyr_list = ["C1"]
    df = gp.get_pyramidalization(df, pyr_list)

    # ---------------Plane Angle-----------------------
    # !plane angle between 2 planes (each defined by 6 atoms)
    # planeangle_list_of_lists = [["N1", "C1", "C5"], ["C2", "C3", "C4"]]
    # df = gp.get_planeangle(df, planeangle_list_of_lists)

    # --------------LP energy - custom from first cell---------------
    # lp_list = ["N1"]
    # df = gp.get_one_lp_energy(df, lp_list)

    # ---------------Time----------------------------------
    # returns the total CPU time and total Wall time (not per subjob) because we are pioneers
    # if used in summary df, will give the average (not Boltzmann average) in the Boltzmann average column
    # df = gp.get_time(df)

    # ---------------ChelpG----------------------------
    # ChelpG ESP charge
    # requires the Gaussian keyword = "pop=chelpg" in the .com file
    # a_list = ["C1", "C2", "C3", "C4", "C5", "N1"]
    # df = gp.get_chelpg(df, a_list)

    # ---------------Hirshfeld-------------------------
    # Hirshfeld charge, CM5 charge, Hirshfeld atom dipole
    # requires the Gaussian keyword = "pop=hirshfeld" in the .com file
    # a_list = ["C1", "C2", "C3", "C4", "C5", "N1"]
    # df = gp.get_hirshfeld(df, a_list)

    # !new functions below!
    # ---------------Natural Bond Order (total/covalent/ionic)-------------------------
    # Natural Bond Order (total/covalent/ionic) between 2 atoms, might return non-numerical values
    # requires Natural Resonance Theory Analysis in Gaussian input file ("$nbo nrt $end" in the .com file)
    natural_bond_order_list = [["C1", "C2"]]
    df = gp.get_natural_bond_order(df, natural_bond_order_list)

    # ---------------Natural Atomic Valencies, Electron Counts, and Charges-------------------------
    # Natural Atomic Valencies, Electron Counts, and Charges of an atom
    # requires Natural Resonance Theory Analysis in Gaussian input file ("$nbo nrt $end" in the .com file)
    natural_atomic_valencies_list = ["C1", "C2"]
    df = gp.get_natural_atomic_valencies(df, natural_atomic_valencies_list)

    display(df)
    # copy the changes back to atom_map_df_all
    atom_map_df_all[prefix] = df.copy(deep=True)

# delete the "Goodvibes_output.dat" temp file
if os.path.exists("Goodvibes_output.dat"):
    os.remove("Goodvibes_output.dat")

processing prefix: pyrdz
Goodvibes function has completed
Frontier orbitals function has completed
Polarizability function has completed
Dipole function has completed
Volume function has completed
SASA function has completed
NBO function has completed for ['C1', 'C2']
****no NMR data found in file: 2.log_files\pyrdz1_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrdz2_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrdz3_conf-1_openshell.log
NMR function has completed for ['C1', 'C2']
Distance function has completed for [['C1', 'C2']]
Vbur scan function has completed for ['C1', 'C2'] from 2 to 2
Morfeus Sterimol function has completed for [['C1', 'C2']]
Pyramidalization function has completed for ['C1']
****No Natural Bond Order section found in: 2.log_files\pyrdz1_conf-1_openshell.log
****No Natural Bond Order section found in: 2.log_files\pyrdz2_conf-1_openshell.log
****No Natural Bond Order section found in: 2.log_files\pyrdz3_conf-1_openshell.log

Unnamed: 0,log_name,N3,N4,C5,C6,C7,C2,C1,H1,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,G(T)_spc(Hartree),qh_G(T)_spc(Hartree),T,HOMO,LUMO,μ,η,ω,polar_iso(Debye),polar_aniso(Debye),dipole(Debye),volume(Bohr_radius³/mol),SASA_surface_area(Å²),SASA_volume(Å³),SASA_sphericity,NBO_charge_C1,NBO_charge_C2,NMR_shift_C1,NMR_shift_C2,distance_C1_C2(Å),%Vbur_C1_2.0Å,%Vbur_C2_2.0Å,Sterimol_L_C1_C2(Å)_morfeus,Sterimol_B1_C1_C2(Å)_morfeus,Sterimol_B5_C1_C2(Å)_morfeus,pyramidalization_Gavrish_C1(°),pyramidalization_Agranat-Radhakrishnan_C1,C1_C2_Bond_Order,Natural_Valency_C1,Natural_Valency_C2
0,2.log_files\pyrdz1_conf-1_openshell,N7,N6,C5,C4,C3,C2,C1,H8,-302.930027,0.090329,-302.833243,0.036086,0.036089,-302.869329,-302.869332,298.15,-0.26295,-0.02245,-0.1427,0.2405,0.04234,71.903,55.6405,4.0676,835.507,246.631338,330.617365,0.937532,-0.25254,0.02923,no data,no data,1.4073,84.75594,95.861312,6.652501,1.700245,3.261611,0.001161,0.000175,no data,no data,no data
1,2.log_files\pyrdz2_conf-1_openshell,C3,N4,N5,C6,C7,C2,C1,H8,-302.930289,0.090406,-302.833406,0.036145,0.036148,-302.869551,-302.869554,298.15,-0.27375,-0.01057,-0.14216,0.26318,0.03839,71.7672,53.425,4.6732,812.23,245.641237,330.094241,0.940318,-0.24646,-0.15475,no data,no data,1.40238,84.949638,96.920196,6.157302,1.70052,3.270417,0.006961,0.001048,no data,no data,no data
2,2.log_files\pyrdz3_conf-1_openshell,N8,N7,C6,C5,C4,C3,C2,H12,-342.243845,0.118646,-342.117063,0.040414,0.04016,-342.157476,-342.157223,298.15,-0.24596,-0.01835,-0.132155,0.22761,0.03837,86.0281,67.1591,3.7834,853.3,276.211315,379.377533,0.917537,-0.04725,0.04263,no data,no data,1.41493,92.03577,95.851627,6.671775,1.83477,3.266725,0.003601,0.00054,no data,no data,no data


processing prefix: pyrd
Goodvibes function has completed
Frontier orbitals function has completed
Polarizability function has completed
Dipole function has completed
Volume function has completed
SASA function has completed
NBO function has completed for ['C1', 'C2']
****no NMR data found in file: 2.log_files\pyrd1_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrd2_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrd3_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrd4_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrd5_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrd6_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrd7_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrd8_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrd9_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrd10_conf-1_openshell.log
****no NMR data found in file: 

Unnamed: 0,log_name,C3,C4,N5,C6,C7,C2,C1,H1,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,G(T)_spc(Hartree),qh_G(T)_spc(Hartree),T,HOMO,LUMO,μ,η,ω,polar_iso(Debye),polar_aniso(Debye),dipole(Debye),volume(Bohr_radius³/mol),SASA_surface_area(Å²),SASA_volume(Å³),SASA_sphericity,NBO_charge_C1,NBO_charge_C2,NMR_shift_C1,NMR_shift_C2,distance_C1_C2(Å),%Vbur_C1_2.0Å,%Vbur_C2_2.0Å,Sterimol_L_C1_C2(Å)_morfeus,Sterimol_B1_C1_C2(Å)_morfeus,Sterimol_B5_C1_C2(Å)_morfeus,pyramidalization_Gavrish_C1(°),pyramidalization_Agranat-Radhakrishnan_C1,C1_C2_Bond_Order,Natural_Valency_C1,Natural_Valency_C2
0,2.log_files\pyrd1_conf-1_openshell,C5,C4,N3,C11,C10,C2,C1,H12,-440.561372,0.150098,-440.402347,0.042178,0.042101,-440.444526,-440.444448,298.15,-0.24438,-0.02207,-0.133225,0.22231,0.03992,131.949,128.771,1.7941,1146.863,320.84286,463.582193,0.902837,-0.25346,0.07856,no data,no data,1.40455,84.79468,95.94202,8.780835,1.700435,4.634242,0.003385,0.0005095253,no data,no data,no data
1,2.log_files\pyrd2_conf-1_openshell,C11,N10,C9,C4,C3,C2,C1,H12,-440.56109,0.150065,-440.402068,0.042236,0.042144,-440.444305,-440.444213,298.15,-0.23505,-0.02556,-0.130305,0.20949,0.04053,132.41,133.341,2.0008,1399.336,319.532785,462.70445,0.905394,-0.28585,-0.17875,no data,no data,1.39808,85.00452,96.916968,8.964221,1.700709,4.320231,0.003771,0.000567407,no data,no data,no data
2,2.log_files\pyrd3_conf-1_openshell,C3,C4,N5,C6,C7,C2,C1,H12,-440.560235,0.150252,-440.401044,0.042278,0.042163,-440.443322,-440.443207,298.15,-0.24968,-0.01785,-0.133765,0.23183,0.03859,126.302,98.8392,2.4175,1335.951,314.4928,457.418032,0.912884,-0.26452,-0.08789,no data,no data,1.39449,85.30798,96.852402,7.014938,1.701592,5.719376,0.001438,0.0002161625,no data,no data,no data
3,2.log_files\pyrd4_conf-1_openshell,C5,C4,N3,C11,C6,C2,C1,H12,-440.559301,0.150202,-440.40014,0.042404,0.042207,-440.442543,-440.442347,298.15,-0.24069,-0.01824,-0.129465,0.22245,0.03767,127.017,98.3662,2.2854,1073.205,314.392814,457.166165,0.912839,-0.26571,0.10001,no data,no data,1.39445,85.114282,95.861312,6.997686,1.700262,5.730849,0.00169,0.0002541405,no data,no data,no data
4,2.log_files\pyrd5_conf-1_openshell,C3,N11,C10,C9,C4,C2,C1,H12,-440.560694,0.150126,-440.401621,0.042258,0.042156,-440.443879,-440.443778,298.15,-0.23259,-0.02678,-0.129685,0.20581,0.04086,131.954,129.704,2.2544,1076.634,319.602793,462.822473,0.90535,-0.27655,0.0508,no data,no data,1.39878,84.836648,95.906508,8.950891,1.700104,4.388775,0.000516,7.763637e-05,no data,no data,no data
5,2.log_files\pyrd6_conf-1_openshell,C3,N4,C5,C6,C11,C2,C1,H12,-440.559622,0.150164,-440.400479,0.042348,0.042242,-440.442826,-440.442721,298.15,-0.23123,-0.0232,-0.127215,0.20803,0.0389,126.242,98.7696,2.5005,1177.015,313.262735,456.64905,0.915441,-0.30031,-0.14146,no data,no data,1.38985,85.317665,96.871772,7.039688,1.701501,5.71331,0.001028,0.0001545262,no data,no data,no data
6,2.log_files\pyrd7_conf-1_openshell,C10,C5,N4,C12,C11,C3,C2,H16,-479.875564,0.178347,-479.686583,0.046391,0.045993,-479.732974,-479.732575,298.15,-0.23188,-0.01792,-0.1249,0.21396,0.03646,146.864,138.605,1.4327,1519.222,350.002835,512.199125,0.884515,-0.04446,0.0901,no data,no data,1.41213,92.042226,95.822572,8.816115,1.836963,4.555283,0.001691,0.0002540082,no data,no data,no data
7,2.log_files\pyrd8_conf-1_openshell,C12,N11,C10,C5,C4,C3,C2,H16,-479.873294,0.178133,-479.684366,0.047165,0.046332,-479.731531,-479.730698,298.15,-0.22683,-0.02062,-0.123725,0.20621,0.03712,146.358,141.665,2.5201,1489.138,347.402625,510.501323,0.889165,-0.06647,-0.1746,no data,no data,1.40546,92.019628,96.81689,8.980805,1.812052,4.28312,0.0,9.149531e-11,no data,no data,no data
8,2.log_files\pyrd9_conf-1_openshell,C4,C5,N6,C7,C12,C3,C2,H16,-479.867345,0.178945,-479.677916,0.046678,0.045712,-479.724594,-479.723628,298.15,-0.23629,-0.01229,-0.12429,0.224,0.03448,140.523,105.283,3.1276,1483.783,338.812617,501.255002,0.900666,-0.05642,-0.07975,no data,no data,1.40503,92.41671,96.810434,7.171004,1.821431,5.67146,0.000772,0.0001143745,no data,no data,no data
9,2.log_files\pyrd10_conf-1_openshell,C6,C5,N4,C12,C7,C3,C2,H16,-479.866207,0.17883,-479.676895,0.046674,0.0457,-479.72357,-479.722595,298.15,-0.22825,-0.01278,-0.120515,0.21547,0.0337,141.397,106.326,2.1339,1396.107,339.472645,502.069789,0.899889,-0.0599,0.10711,no data,no data,1.4054,92.319861,95.767691,7.142878,1.759066,5.687589,0.164326,0.02428145,no data,no data,no data


processing prefix: pyrmd
Goodvibes function has completed
Frontier orbitals function has completed
Polarizability function has completed
Dipole function has completed
Volume function has completed
SASA function has completed
NBO function has completed for ['C1', 'C2']
****no NMR data found in file: 2.log_files\pyrmd1_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrmd2_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrmd3_conf-2_openshell.log
****no NMR data found in file: 2.log_files\pyrmd4_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrmd5_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrmd6_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrmd7_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrmd8_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrmd9_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrmd10_conf-1_openshell.log
NMR function has com

Unnamed: 0,log_name,N3,C4,C5,C6,N7,C2,C1,H1,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,G(T)_spc(Hartree),qh_G(T)_spc(Hartree),T,HOMO,LUMO,μ,η,ω,polar_iso(Debye),polar_aniso(Debye),dipole(Debye),volume(Bohr_radius³/mol),SASA_surface_area(Å²),SASA_volume(Å³),SASA_sphericity,NBO_charge_C1,NBO_charge_C2,NMR_shift_C1,NMR_shift_C2,distance_C1_C2(Å),%Vbur_C1_2.0Å,%Vbur_C2_2.0Å,Sterimol_L_C1_C2(Å)_morfeus,Sterimol_B1_C1_C2(Å)_morfeus,Sterimol_B5_C1_C2(Å)_morfeus,pyramidalization_Gavrish_C1(°),pyramidalization_Agranat-Radhakrishnan_C1,C1_C2_Bond_Order,Natural_Valency_C1,Natural_Valency_C2
0,2.log_files\pyrmd1_conf-1_openshell,N11,C10,C5,C4,N3,C2,C1,H12,-456.606644,0.138347,-456.459507,0.04203,0.041922,-456.501537,-456.501429,298.15,-0.25541,-0.03607,-0.14574,0.21934,0.04842,124.978,120.848,2.0432,1106.548,315.25242,452.627008,0.904314,-0.23463,0.24652,no data,no data,1.40552,84.649406,94.731405,8.797604,1.700282,4.41369,0.001162,0.000175,no data,no data,no data
1,2.log_files\pyrmd2_conf-1_openshell,C10,N9,C8,C3,N11,C2,C1,H12,-456.587224,0.136659,-456.44179,0.042023,0.041969,-456.483813,-456.483758,298.15,-0.29927,-0.04383,-0.17155,0.25544,0.05761,113.601,82.1634,2.9119,1280.507,312.392031,449.555075,0.908461,-0.28547,0.20413,no data,no data,1.4785,86.266787,95.774148,7.016737,2.019846,5.717709,0.979818,0.147138,no data,no data,no data
2,2.log_files\pyrmd3_conf-2_openshell,N4,C5,C10,C11,N12,C3,C2,H16,-495.921385,0.166582,-495.744299,0.046263,0.045809,-495.790562,-495.790108,298.15,-0.2414,-0.03198,-0.13669,0.20942,0.04461,140.157,132.349,1.3446,1209.26,344.922397,501.517104,0.88502,-0.03348,0.25924,no data,no data,1.41233,92.000258,94.650697,8.826077,1.839073,4.376254,0.001028,0.000154,no data,no data,no data
3,2.log_files\pyrmd4_conf-1_openshell,C11,N10,C9,C4,N12,C3,C2,H16,-495.914512,0.167157,-495.736987,0.0467,0.045643,-495.783687,-495.78263,298.15,-0.25374,-0.02168,-0.13771,0.23206,0.04086,134.236,100.658,3.654,1298.212,333.76221,490.949083,0.901719,-0.02981,0.13124,no data,no data,1.41066,92.345687,95.787061,7.182108,1.821027,5.662686,0.00031,4.6e-05,no data,no data,no data
4,2.log_files\pyrmd5_conf-1_openshell,N7,C6,C5,C4,N3,C2,C1,H8,-302.96869,0.091298,-302.87104,0.035927,0.035927,-302.906967,-302.906967,298.15,-0.26667,-0.0102,-0.138435,0.25647,0.03736,71.4904,55.5458,1.5056,621.066,247.651549,330.877904,0.934161,-0.23155,0.25241,no data,no data,1.41083,84.542872,94.744318,6.689929,1.700415,3.25475,0.000189,2.8e-05,no data,no data,no data
5,2.log_files\pyrmd6_conf-1_openshell,C4,N5,C6,C7,N3,C2,C1,H8,-302.968962,0.091419,-302.871175,0.035955,0.035957,-302.907129,-302.907132,298.15,-0.27603,-0.00258,-0.139305,0.27345,0.03548,70.8663,51.754,2.7819,931.5,246.411463,330.035241,0.937267,-0.23062,0.08952,no data,no data,1.40995,84.79468,95.945248,6.127653,1.7,3.266518,3.1e-05,5e-06,no data,no data,no data
6,2.log_files\pyrmd7_conf-1_openshell,C5,N4,C3,C7,N6,C2,C1,H8,-302.967347,0.091131,-302.869786,0.036057,0.036061,-302.905843,-302.905847,298.15,-0.25574,-0.01669,-0.136215,0.23905,0.03881,71.6151,55.9694,2.5609,749.118,245.131401,329.205287,0.940581,-0.27768,-0.21648,no data,no data,1.40245,85.059401,96.875,6.692529,1.700463,3.262785,0.001018,0.000153,no data,no data,no data
7,2.log_files\pyrmd8_conf-1_openshell,N4,C5,C6,C7,N8,C3,C2,H12,-342.283197,0.119588,-342.155586,0.040193,0.039967,-342.195779,-342.195552,298.15,-0.24836,-0.00639,-0.127375,0.24197,0.03353,85.7987,67.457,0.9607,1013.087,277.69153,380.121828,0.91384,-0.03525,0.26549,no data,no data,1.41694,92.006715,94.679752,6.699442,1.835322,3.295654,0.000217,3.3e-05,no data,no data,no data
8,2.log_files\pyrmd9_conf-1_openshell,C5,N4,C8,C7,N6,C3,C2,H12,-342.283768,0.11978,-342.155962,0.040197,0.039986,-342.196159,-342.195948,298.15,-0.25726,0.00186,-0.1277,0.25912,0.03147,85.0945,63.1066,2.995,1077.581,276.101439,378.944683,0.917204,-0.03117,0.10299,no data,no data,1.41634,92.045455,95.851627,6.140002,1.833993,3.270054,0.000698,0.000105,no data,no data,no data
9,2.log_files\pyrmd10_conf-1_openshell,C6,N7,C8,C4,N5,C3,C2,H12,-342.279133,0.119311,-342.151605,0.040943,0.040376,-342.192548,-342.191981,298.15,-0.24318,-0.0128,-0.12799,0.23038,0.03555,84.8523,64.1415,3.1797,984.507,273.051241,377.16224,0.924539,-0.06471,-0.20856,no data,no data,1.41069,92.038998,96.865315,6.701352,1.808426,3.264944,0.000427,6.4e-05,no data,no data,no data


processing prefix: pyrz
Goodvibes function has completed
Frontier orbitals function has completed
Polarizability function has completed
Dipole function has completed
Volume function has completed
SASA function has completed
NBO function has completed for ['C1', 'C2']
****no NMR data found in file: 2.log_files\pyrz1_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrz2_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrz3_conf-1_openshell.log
****no NMR data found in file: 2.log_files\pyrz4_conf-1_openshell.log
NMR function has completed for ['C1', 'C2']
Distance function has completed for [['C1', 'C2']]
Vbur scan function has completed for ['C1', 'C2'] from 2 to 2
Morfeus Sterimol function has completed for [['C1', 'C2']]
Pyramidalization function has completed for ['C1']
****No Natural Bond Order section found in: 2.log_files\pyrz1_conf-1_openshell.log
****No Natural Bond Order section found in: 2.log_files\pyrz2_conf-1_openshell.log
****No Natural Bond

Unnamed: 0,log_name,C3,N4,C5,C6,N7,C2,C1,H1,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,G(T)_spc(Hartree),qh_G(T)_spc(Hartree),T,HOMO,LUMO,μ,η,ω,polar_iso(Debye),polar_aniso(Debye),dipole(Debye),volume(Bohr_radius³/mol),SASA_surface_area(Å²),SASA_volume(Å³),SASA_sphericity,NBO_charge_C1,NBO_charge_C2,NMR_shift_C1,NMR_shift_C2,distance_C1_C2(Å),%Vbur_C1_2.0Å,%Vbur_C2_2.0Å,Sterimol_L_C1_C2(Å)_morfeus,Sterimol_B1_C1_C2(Å)_morfeus,Sterimol_B5_C1_C2(Å)_morfeus,pyramidalization_Gavrish_C1(°),pyramidalization_Agranat-Radhakrishnan_C1,C1_C2_Bond_Order,Natural_Valency_C1,Natural_Valency_C2
0,2.log_files\pyrz1_conf-1_openshell,C11,N10,C5,C4,N3,C2,C1,H12,-456.601997,0.138128,-456.455073,0.042009,0.041906,-456.497082,-456.496978,298.15,-0.25412,-0.03678,-0.14545,0.21734,0.04867,128.021,131.357,0.319,1225.531,315.002406,452.324429,0.904629,-0.25215,0.03199,no data,no data,1.39879,84.839876,95.890367,8.807194,1.700283,4.366581,0.001238,0.000186,no data,no data,no data
1,2.log_files\pyrz2_conf-1_openshell,C12,N11,C10,C5,N4,C3,C2,H16,-495.916761,0.166411,-495.739848,0.046219,0.045792,-495.786067,-495.78564,298.15,-0.24152,-0.03164,-0.13658,0.20988,0.04444,143.216,142.668,1.18,1370.972,344.992386,501.652251,0.885,-0.04119,0.04064,no data,no data,1.40526,92.116477,95.858084,8.831332,1.837124,4.332953,0.000774,0.000116,no data,no data,no data
2,2.log_files\pyrz3_conf-1_openshell,C7,N6,C5,C4,N3,C2,C1,H8,-302.962267,0.091034,-302.86485,0.035983,0.035984,-302.900832,-302.900833,298.15,-0.25894,-0.01748,-0.13821,0.24146,0.03956,73.127,59.8841,0.5359,748.31,246.511473,330.220215,0.937237,-0.25895,0.02573,no data,no data,1.40265,84.885072,95.838714,6.685713,1.700294,3.27037,6e-05,9e-06,no data,no data,no data
3,2.log_files\pyrz4_conf-1_openshell,C8,N7,C6,C5,N4,C3,C2,H12,-342.27414,0.119119,-342.146813,0.041151,0.040403,-342.187964,-342.187216,298.15,-0.24589,-0.01522,-0.130555,0.23067,0.03695,86.4422,69.3871,1.1923,845.886,274.791318,378.490891,0.920841,-0.05304,0.03713,no data,no data,1.41171,91.974432,95.932335,6.698011,1.808452,3.266763,0.000772,0.000116,no data,no data,no data


## 3.1 Save collected properties to an Excel file

In [5]:
# Write one "<prefix>_extracted_properties.xlsx" workbook per prefix into temp_folder.
# Create the folder first: the Excel writer does not create missing directories,
# so a fresh checkout without the folder would otherwise fail here.
os.makedirs(temp_folder, exist_ok=True)
for prefix, df in atom_map_df_all.items():
    # save the pandas dataframe to a xlsx file
    with pd.ExcelWriter(temp_folder + os.sep + prefix + "_extracted_properties.xlsx") as writer:
        df.to_excel(writer)

# **4. Post-processing**

In [6]:
import re
import pandas as pd
import numpy as np
from tabulate import tabulate

In [7]:
# Post-processing configuration.
#
# Log files are named "<prefix><number><suffix>...": `prefix` is the text common to all
# compounds BEFORE the number and `suffix` is the text that directly follows the number,
# e.g. "pyrdz1_conf-1_openshell" -> prefix "pyrdz", number "1", suffix "_".
suffix = "_"

# Rebuild the mapping {prefix: [atom-map workbooks]} from the atom-mapping folder.
prefixs = {}
for file in glob.glob("*.xlsx", root_dir=atom_mappings_folder):
    # Excel leaves "~$<name>.xlsx" lock files next to an open workbook; they are not atom maps
    if file.startswith("~$"):
        continue
    key = re.search(r"^(\D+)_atom_map", file)
    # workbooks that do not follow the "<prefix>_atom_map" naming are skipped
    # (calling key.group() on a failed match would raise AttributeError)
    if key is None:
        continue
    prefixs.setdefault(key.group(1), []).append(file)

# Atom-mapping columns to drop before post-processing. This can stay empty: columns whose
# cells cannot be converted to float are dropped automatically during post-processing.
# e.g. atom_columns_to_drop = ["C3", "C4", "C5", "N1", "C1", "C2"]
atom_columns_to_drop = []

# title of the column for the energy you want to use for boltzmann averaging and lowest E conformer determination
energy_col_header = "G(T)_spc(Hartree)"

### Option to import an Excel sheet if you're using properties or energies collected outside of this notebook

##### If you would like to use post-processing functionality (i.e. Boltzmann averaging, lowest E conformers, etc.) you can read in a dataframe with properties (e.g. QikProp properties) or energies (e.g. if you don't/can't run linked jobs) collected outside of this notebook. 

In [8]:
# Reload the extracted-property workbook of every prefix from the temp folder and
# keep an independent copy per prefix for the post-processing steps that follow.
atom_map_df_all = {}

for prefix in prefixs:
    workbook_path = temp_folder + os.sep + f"{prefix}_extracted_properties.xlsx"
    extracted = pd.read_excel(
        workbook_path,
        sheet_name="Sheet1",
        index_col=0,
        header=0,
        engine="openpyxl",
    )
    # quick visual check of the first two rows
    display(extracted.head(2))
    atom_map_df_all[prefix] = extracted.copy(deep=True)

Unnamed: 0,log_name,N3,N4,C5,C6,C7,C2,C1,H1,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,G(T)_spc(Hartree),qh_G(T)_spc(Hartree),T,HOMO,LUMO,μ,η,ω,polar_iso(Debye),polar_aniso(Debye),dipole(Debye),volume(Bohr_radius³/mol),SASA_surface_area(Å²),SASA_volume(Å³),SASA_sphericity,NBO_charge_C1,NBO_charge_C2,NMR_shift_C1,NMR_shift_C2,distance_C1_C2(Å),%Vbur_C1_2.0Å,%Vbur_C2_2.0Å,Sterimol_L_C1_C2(Å)_morfeus,Sterimol_B1_C1_C2(Å)_morfeus,Sterimol_B5_C1_C2(Å)_morfeus,pyramidalization_Gavrish_C1(°),pyramidalization_Agranat-Radhakrishnan_C1,C1_C2_Bond_Order,Natural_Valency_C1,Natural_Valency_C2
0,2.log_files\pyrdz1_conf-1_openshell,N7,N6,C5,C4,C3,C2,C1,H8,-302.930027,0.090329,-302.833243,0.036086,0.036089,-302.869329,-302.869332,298.15,-0.26295,-0.02245,-0.1427,0.2405,0.04234,71.903,55.6405,4.0676,835.507,246.631338,330.617365,0.937532,-0.25254,0.02923,no data,no data,1.4073,84.75594,95.861312,6.652501,1.700245,3.261611,0.001161,0.000175,no data,no data,no data
1,2.log_files\pyrdz2_conf-1_openshell,C3,N4,N5,C6,C7,C2,C1,H8,-302.930289,0.090406,-302.833406,0.036145,0.036148,-302.869551,-302.869554,298.15,-0.27375,-0.01057,-0.14216,0.26318,0.03839,71.7672,53.425,4.6732,812.23,245.641237,330.094241,0.940318,-0.24646,-0.15475,no data,no data,1.40238,84.949638,96.920196,6.157302,1.70052,3.270417,0.006961,0.001048,no data,no data,no data


Unnamed: 0,log_name,C3,C4,N5,C6,C7,C2,C1,H1,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,G(T)_spc(Hartree),qh_G(T)_spc(Hartree),T,HOMO,LUMO,μ,η,ω,polar_iso(Debye),polar_aniso(Debye),dipole(Debye),volume(Bohr_radius³/mol),SASA_surface_area(Å²),SASA_volume(Å³),SASA_sphericity,NBO_charge_C1,NBO_charge_C2,NMR_shift_C1,NMR_shift_C2,distance_C1_C2(Å),%Vbur_C1_2.0Å,%Vbur_C2_2.0Å,Sterimol_L_C1_C2(Å)_morfeus,Sterimol_B1_C1_C2(Å)_morfeus,Sterimol_B5_C1_C2(Å)_morfeus,pyramidalization_Gavrish_C1(°),pyramidalization_Agranat-Radhakrishnan_C1,C1_C2_Bond_Order,Natural_Valency_C1,Natural_Valency_C2
0,2.log_files\pyrd1_conf-1_openshell,C5,C4,N3,C11,C10,C2,C1,H12,-440.561372,0.150098,-440.402347,0.042178,0.042101,-440.444526,-440.444448,298.15,-0.24438,-0.02207,-0.133225,0.22231,0.03992,131.949,128.771,1.7941,1146.863,320.84286,463.582193,0.902837,-0.25346,0.07856,no data,no data,1.40455,84.79468,95.94202,8.780835,1.700435,4.634242,0.003385,0.00051,no data,no data,no data
1,2.log_files\pyrd2_conf-1_openshell,C11,N10,C9,C4,C3,C2,C1,H12,-440.56109,0.150065,-440.402068,0.042236,0.042144,-440.444305,-440.444213,298.15,-0.23505,-0.02556,-0.130305,0.20949,0.04053,132.41,133.341,2.0008,1399.336,319.532785,462.70445,0.905394,-0.28585,-0.17875,no data,no data,1.39808,85.00452,96.916968,8.964221,1.700709,4.320231,0.003771,0.000567,no data,no data,no data


Unnamed: 0,log_name,N3,C4,C5,C6,N7,C2,C1,H1,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,G(T)_spc(Hartree),qh_G(T)_spc(Hartree),T,HOMO,LUMO,μ,η,ω,polar_iso(Debye),polar_aniso(Debye),dipole(Debye),volume(Bohr_radius³/mol),SASA_surface_area(Å²),SASA_volume(Å³),SASA_sphericity,NBO_charge_C1,NBO_charge_C2,NMR_shift_C1,NMR_shift_C2,distance_C1_C2(Å),%Vbur_C1_2.0Å,%Vbur_C2_2.0Å,Sterimol_L_C1_C2(Å)_morfeus,Sterimol_B1_C1_C2(Å)_morfeus,Sterimol_B5_C1_C2(Å)_morfeus,pyramidalization_Gavrish_C1(°),pyramidalization_Agranat-Radhakrishnan_C1,C1_C2_Bond_Order,Natural_Valency_C1,Natural_Valency_C2
0,2.log_files\pyrmd1_conf-1_openshell,N11,C10,C5,C4,N3,C2,C1,H12,-456.606644,0.138347,-456.459507,0.04203,0.041922,-456.501537,-456.501429,298.15,-0.25541,-0.03607,-0.14574,0.21934,0.04842,124.978,120.848,2.0432,1106.548,315.25242,452.627008,0.904314,-0.23463,0.24652,no data,no data,1.40552,84.649406,94.731405,8.797604,1.700282,4.41369,0.001162,0.000175,no data,no data,no data
1,2.log_files\pyrmd2_conf-1_openshell,C10,N9,C8,C3,N11,C2,C1,H12,-456.587224,0.136659,-456.44179,0.042023,0.041969,-456.483813,-456.483758,298.15,-0.29927,-0.04383,-0.17155,0.25544,0.05761,113.601,82.1634,2.9119,1280.507,312.392031,449.555075,0.908461,-0.28547,0.20413,no data,no data,1.4785,86.266787,95.774148,7.016737,2.019846,5.717709,0.979818,0.147138,no data,no data,no data


Unnamed: 0,log_name,C3,N4,C5,C6,N7,C2,C1,H1,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,G(T)_spc(Hartree),qh_G(T)_spc(Hartree),T,HOMO,LUMO,μ,η,ω,polar_iso(Debye),polar_aniso(Debye),dipole(Debye),volume(Bohr_radius³/mol),SASA_surface_area(Å²),SASA_volume(Å³),SASA_sphericity,NBO_charge_C1,NBO_charge_C2,NMR_shift_C1,NMR_shift_C2,distance_C1_C2(Å),%Vbur_C1_2.0Å,%Vbur_C2_2.0Å,Sterimol_L_C1_C2(Å)_morfeus,Sterimol_B1_C1_C2(Å)_morfeus,Sterimol_B5_C1_C2(Å)_morfeus,pyramidalization_Gavrish_C1(°),pyramidalization_Agranat-Radhakrishnan_C1,C1_C2_Bond_Order,Natural_Valency_C1,Natural_Valency_C2
0,2.log_files\pyrz1_conf-1_openshell,C11,N10,C5,C4,N3,C2,C1,H12,-456.601997,0.138128,-456.455073,0.042009,0.041906,-456.497082,-456.496978,298.15,-0.25412,-0.03678,-0.14545,0.21734,0.04867,128.021,131.357,0.319,1225.531,315.002406,452.324429,0.904629,-0.25215,0.03199,no data,no data,1.39879,84.839876,95.890367,8.807194,1.700283,4.366581,0.001238,0.000186,no data,no data,no data
1,2.log_files\pyrz2_conf-1_openshell,C12,N11,C10,C5,N4,C3,C2,H16,-495.916761,0.166411,-495.739848,0.046219,0.045792,-495.786067,-495.78564,298.15,-0.24152,-0.03164,-0.13658,0.20988,0.04444,143.216,142.668,1.18,1370.972,344.992386,501.652251,0.885,-0.04119,0.04064,no data,no data,1.40526,92.116477,95.858084,8.831332,1.837124,4.332953,0.000774,0.000116,no data,no data,no data


## 4.1 Generating a list of compounds that have conformational ensembles

**ONLY RUN THE AUTOMATED OR THE MANUAL CELL, NOT BOTH**

**AUTOMATED:** if your compounds are named consistently, this section generates your compound list based on the shared naming structure

In [9]:
# For every prefix, collect the identifiers of the compounds that appear in its table.
# A log name looks like "<folder><sep><prefix><compound><suffix>..."; the identifier is the
# text between the prefix and the first occurrence of `suffix`.
compound_list_all = {}


def _compound_sort_key(compound):
    """Order identifiers by their first run of digits; identifiers without digits go last."""
    digits = re.search(r"\d+", compound)
    return (digits is None, int(digits.group()) if digits else 0, compound)


for prefix, df in atom_map_df_all.items():
    print(f"processing prefix: {prefix}")
    compounds = set()  # a set removes the duplicates that result from having several conformers

    for log_file in df["log_name"]:
        # keep only the file name; accept both "\" and "/" so that a table written on
        # another operating system is still parsed correctly
        file_name = re.split(r"[\\/]", str(log_file))[-1]
        prefix_and_compound = file_name.split(str(suffix))[0]
        if not prefix_and_compound.startswith(str(prefix)):
            # splitting on the prefix here would give a wrong identifier or an IndexError
            print(f"skipping {log_file}: file name does not start with prefix {prefix}")
            continue
        # strip the leading prefix only, even if the same text appears again later in the name
        compounds.add(prefix_and_compound[len(str(prefix)):])

    compound_list = sorted(compounds, key=_compound_sort_key)  # numeric order, e.g. '2' before '10'
    print(f"items numbering: {compound_list}")
    compound_list_all[prefix] = compound_list

    # this should generate a list that looks like this: ['24', '27', '34', '48']

processing prefix: pyrdz
items numbering: ['1', '2', '3']
processing prefix: pyrd
items numbering: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18']
processing prefix: pyrmd
items numbering: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
processing prefix: pyrz
items numbering: ['1', '2', '3', '4']


## 4.2 Post-processing to get properties for each compound

##### Changes made on 8/30/2024 <br> 1. Avoid the divide-by-zero error in the Boltzmann-weighted standard deviation: the original code had the if/else branches in the reverse order, which caused the error for single-conformer ensembles. <br> 2. Data cleaning: columns that contain cells which cannot be converted to float are removed. <br> 3. All data for a compound are gathered into one row before being concatenated into the final dataframe. The original code modified individual cells, which fragmented the dataframe and raised a performance warning.

In [10]:
# Post-process every prefix. For each compound the conformers are gathered, Boltzmann
# weights are computed from `energy_col_header`, ensemble summary rows are appended and
# two results are collected per prefix:
#   all_df_master_for_all_prefixes[prefix]        -> conformer rows + summary rows of every compound
#   properties_df_master_for_all_prefixes[prefix] -> one summary row per compound

# ---- settings shared by every compound ------------------------------------------------------

# columns that won't be Boltzmann averaged because it makes no sense for them:
# any column whose label CONTAINS one of these strings gets blank summary cells
non_boltz_columns = [
    "G(Hartree)",
    "∆G(Hartree)",
    "∆G(kcal/mol)",
    "e^(-∆G/RT)",
    "Mole Fraction",
]
# not Boltzmann averaged either; the plain mean is reported in the Boltzmann row instead
reg_avg_columns = [
    "CPU_time_total(hours)",
    "Wall_time_total(hours)",
]
# energy-like columns that are blanked/dropped unless they are the selected energy column;
# filtering (instead of list.remove) also works when energy_col_header is not in this list
gv_extra_columns = [
    column for column in ["G(T)_spc(Hartree)"] if column != str(energy_col_header)
]

# columns placed first in the per-conformer output
explicit_order_front_columns = [
    "log_name",
    energy_col_header,
    "∆G(Hartree)",
    "∆G(kcal/mol)",
    "e^(-∆G/RT)",
    "Mole Fraction",
]

# properties reported from the lowest-energy conformer instead of the Boltzmann average
# NOTE(review): the selected energy column is part of explicit_order_front_columns and is
# therefore never written to the summary table, although it is listed here - confirm intended.
min_property_list = [
    "E_spc (Hartree)",
    "H_spc(Hartree)",
    "T",
    "T*S",
    "T*qh_S",
    "ZPE(Hartree)",
    "qh_G(T)_spc(Hartree)",
    "G(T)_spc(Hartree)",
]

# ---- EDIT THIS IF YOU WANT A SPECIFIC CONFORMER ---------------------------------------------
# the conformer with the minimum value of this property is appended as an extra row
conformer_property = "%Vbur_C1_2.0Å"

# constants of the Boltzmann weighting
hartree_to_kcal = 627.5
gas_constant = 1.987204  # R in cal/(K*mol)
temperature = 298.15  # K; NOTE(review): fixed value, the "T" column of the table is not used


def _weighted_stdev(values, weights):
    """Standard deviation of `values` weighted by `weights` (the mole fractions).

    Formula: https://www.statology.org/weighted-standard-deviation-excel/
    Returns 0 for a single-conformer ensemble, where the (M - 1) / M factor in the
    denominator would otherwise cause a division by zero.
    """
    if len(weights) == 1:
        return 0
    weighted_mean = (values * weights).sum()
    squared_deviation = (values - weighted_mean) ** 2
    # np.average(..., weights=w) is the sum of each (squared deviation * mole fraction) / sum(w)
    return np.sqrt(
        np.average(squared_deviation, weights=weights)
        / (((len(weights) - 1) / len(weights)) * np.sum(weights))
    )


all_df_master_for_all_prefixes = {}
properties_df_master_for_all_prefixes = {}

for prefix, df in atom_map_df_all.items():
    compound_list = compound_list_all[prefix]
    # per-compound results are collected in lists and combined once after the loop,
    # instead of growing a DataFrame with pd.concat on every iteration
    all_conformer_frames = []
    summary_records = []

    for compound in compound_list:
        # defines the common start to all files of this compound
        substring = log_files_folder + os.sep + str(prefix) + str(compound) + str(suffix)

        # makes a data frame for one compound at a time for post-processing;
        # errors="ignore" because the atom columns are not the same for every prefix
        valuesdf = df[df["log_name"].str.startswith(substring)]
        valuesdf = valuesdf.drop(columns=atom_columns_to_drop, errors="ignore")
        valuesdf = valuesdf.reset_index(drop=True)  # re-index so every compound starts at row 0

        if valuesdf.empty:
            print(f"No log files found for {prefix}{compound} (expected names starting with {substring})")
            continue

        # convert every property column to numbers; columns that cannot be converted are dropped
        for column in list(valuesdf.columns):
            if column == "log_name":
                continue
            try:
                valuesdf[column] = pd.to_numeric(valuesdf[column])
            except (ValueError, TypeError):
                print(f"Column {column} contains non-numeric values")
                valuesdf = valuesdf.drop(columns=column)

        # Boltzmann weights relative to the lowest-energy conformer
        valuesdf["∆G(Hartree)"] = (
            valuesdf[energy_col_header] - valuesdf[energy_col_header].min()
        )
        valuesdf["∆G(kcal/mol)"] = valuesdf["∆G(Hartree)"] * hartree_to_kcal
        valuesdf["e^(-∆G/RT)"] = np.exp(
            (valuesdf["∆G(kcal/mol)"] * -1000) / (gas_constant * temperature)
        )
        valuesdf["Mole Fraction"] = valuesdf["e^(-∆G/RT)"] / valuesdf["e^(-∆G/RT)"].sum()
        mole_fraction = valuesdf["Mole Fraction"]

        # summary rows (Boltzmann average, weighted std, minimum, maximum, range), one cell per column
        values_boltz_row = []
        values_min_row = []
        values_max_row = []
        values_boltz_stdev_row = []
        values_range_row = []
        values_exclude_columns = []  # used later to build the final column order
        blank_phrases = non_boltz_columns + gv_extra_columns

        for column in valuesdf.columns:
            if "log_name" in column:
                # the name column carries the label of each summary row
                values_boltz_row.append("Boltzmann Averages")
                values_min_row.append("Ensemble Minimum")
                values_max_row.append("Ensemble Maximum")
                values_boltz_stdev_row.append("Boltzmann Standard Deviation")
                values_range_row.append("Ensemble Range")
                values_exclude_columns.append(column)
            elif any(phrase in column for phrase in blank_phrases):
                values_boltz_row.append("")
                values_min_row.append("")
                values_max_row.append("")
                values_boltz_stdev_row.append("")
                values_range_row.append("")
            elif any(phrase in column for phrase in reg_avg_columns):
                # plain average (e.g. CPU/wall time) is reported in the Boltzmann row
                values_boltz_row.append(valuesdf[column].mean())
                values_min_row.append("")
                values_max_row.append("")
                values_boltz_stdev_row.append("")
                values_range_row.append("")
            else:
                values = valuesdf[column]
                values_boltz_row.append((values * mole_fraction).sum())
                values_min_row.append(values.min())
                values_max_row.append(values.max())
                values_range_row.append(values.max() - values.min())
                values_boltz_stdev_row.append(_weighted_stdev(values, mole_fraction))

        valuesdf.loc[len(valuesdf)] = values_boltz_row
        valuesdf.loc[len(valuesdf)] = values_boltz_stdev_row
        valuesdf.loc[len(valuesdf)] = values_min_row
        valuesdf.loc[len(valuesdf)] = values_max_row
        valuesdf.loc[len(valuesdf)] = values_range_row

        # reorders the dataframe using the front columns defined above
        valuesdf = valuesdf[
            explicit_order_front_columns
            + [
                col
                for col in valuesdf.columns
                if col not in explicit_order_front_columns
                and col not in values_exclude_columns
            ]
        ]

        # copy the lowest-energy conformer (∆G == 0) to a new row named "Lowest E Conformer"
        low_e_index = valuesdf[valuesdf["∆G(Hartree)"] == 0].index.tolist()
        new_row = pd.DataFrame(valuesdf.loc[low_e_index[0]]).T
        new_row["log_name"] = "Lowest E Conformer"
        valuesdf = pd.concat([valuesdf, new_row], ignore_index=True, axis=0)

        # copy the conformer with the minimum `conformer_property` to a new row;
        # skipped when the property was dropped or never collected for this compound
        if conformer_property in valuesdf.columns:
            ensemble_min_index = valuesdf[
                valuesdf["log_name"] == "Ensemble Minimum"
            ].index.tolist()
            # one-entry list holding the ensemble minimum of the property
            min_value = valuesdf.loc[ensemble_min_index, conformer_property].tolist()
            # conformer rows come first, so the first match is the conformer itself
            property_min_index = valuesdf[
                valuesdf[conformer_property] == min_value[0]
            ].index.tolist()
            if property_min_index:
                new_row = pd.DataFrame(valuesdf.loc[property_min_index[0]]).T
                new_row["log_name"] = f"{conformer_property}_min_Conformer"
                valuesdf = pd.concat([valuesdf, new_row], ignore_index=True, axis=0)
        else:
            print(f"Column {conformer_property} not available for {prefix}{compound}, no min-conformer row added")

        # rows that feed the per-compound summary
        Low_E_Conformer_row = valuesdf.loc[valuesdf["log_name"] == "Lowest E Conformer"]
        Boltz_Avg_row = valuesdf.loc[valuesdf["log_name"] == "Boltzmann Averages"]

        # keep the complete frame (conformers + summary rows) for the master output
        all_conformer_frames.append(valuesdf)

        # the summary uses every column except the weighting helpers and the extra/regular-average
        # columns; labels that are absent are simply ignored
        dropped_columns = explicit_order_front_columns + gv_extra_columns + reg_avg_columns
        summary_columns = [col for col in valuesdf.columns if col not in dropped_columns]

        # ---------------------THIS MAY NEED TO CHANGE DEPENDING ON HOW YOU LABEL YOUR COMPOUNDS------------------------------
        compound_name = prefix + str(compound)
        # --------------------------------------------------------------------------------------------------------------------

        # build the summary of this compound as one record
        summary_record = {"Compound_Name": compound_name}
        for column in summary_columns:
            if column in min_property_list:
                # properties we only want from the lowest-energy conformer keep their plain header
                summary_record[f"{column}"] = Low_E_Conformer_row[column].values[0]
            else:
                summary_record[f"{column}_Boltz"] = Boltz_Avg_row[column].values[0]
        summary_records.append(summary_record)

    # combine the per-compound results once per prefix
    if all_conformer_frames:
        all_df_master = pd.concat(all_conformer_frames, ignore_index=True)
    else:
        all_df_master = pd.DataFrame()
    properties_df_master = pd.DataFrame(summary_records)

    all_df_master_for_all_prefixes[prefix] = all_df_master.copy(deep=True)
    properties_df_master_for_all_prefixes[prefix] = properties_df_master.copy(deep=True)

    # Print in tabulated format
    print(tabulate(properties_df_master_for_all_prefixes[prefix], headers="keys", tablefmt="pretty"))
    print(tabulate(all_df_master_for_all_prefixes[prefix], headers="keys", tablefmt="pretty"))

Column N3 contains non-numeric values
Column N4 contains non-numeric values
Column C5 contains non-numeric values
Column C6 contains non-numeric values
Column C7 contains non-numeric values
Column C2 contains non-numeric values
Column C1 contains non-numeric values
Column H1 contains non-numeric values
Column NMR_shift_C1 contains non-numeric values
Column NMR_shift_C2 contains non-numeric values
Column C1_C2_Bond_Order contains non-numeric values
Column Natural_Valency_C1 contains non-numeric values
Column Natural_Valency_C2 contains non-numeric values
Column N3 contains non-numeric values
Column N4 contains non-numeric values
Column C5 contains non-numeric values
Column C6 contains non-numeric values
Column C7 contains non-numeric values
Column C2 contains non-numeric values
Column C1 contains non-numeric values
Column H1 contains non-numeric values
Column NMR_shift_C1 contains non-numeric values
Column NMR_shift_C2 contains non-numeric values
Column C1_C2_Bond_Order contains non-num

In [11]:
# Merge the per-prefix summary tables into a single dataframe; columns with the same
# name are combined, columns missing for a prefix are filled with NaN.
for prefix, df in properties_df_master_for_all_prefixes.items():
    display(df.head(1))

# concatenate once instead of growing the frame inside the loop (which also started from
# an empty DataFrame, a pattern newer pandas versions warn about)
summary_frames = list(properties_df_master_for_all_prefixes.values())
if summary_frames:
    properties_df_master_for_all_prefixes_merged = pd.concat(
        summary_frames, axis=0, ignore_index=True
    )
else:
    # pd.concat raises on an empty list, so fall back to an empty frame
    properties_df_master_for_all_prefixes_merged = pd.DataFrame()

print("Combine summary properties for all prefixes: ")
display(properties_df_master_for_all_prefixes_merged.head(5))

Unnamed: 0,Compound_Name,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,qh_G(T)_spc(Hartree),T,HOMO_Boltz,LUMO_Boltz,μ_Boltz,η_Boltz,ω_Boltz,polar_iso(Debye)_Boltz,polar_aniso(Debye)_Boltz,dipole(Debye)_Boltz,volume(Bohr_radius³/mol)_Boltz,SASA_surface_area(Å²)_Boltz,SASA_volume(Å³)_Boltz,SASA_sphericity_Boltz,NBO_charge_C1_Boltz,NBO_charge_C2_Boltz,distance_C1_C2(Å)_Boltz,%Vbur_C1_2.0Å_Boltz,%Vbur_C2_2.0Å_Boltz,Sterimol_L_C1_C2(Å)_morfeus_Boltz,Sterimol_B1_C1_C2(Å)_morfeus_Boltz,Sterimol_B5_C1_C2(Å)_morfeus_Boltz,pyramidalization_Gavrish_C1(°)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,pyrdz1,-302.930027,0.090329,-302.833243,0.036086,0.036089,-302.869332,298.15,-0.26295,-0.02245,-0.1427,0.2405,0.04234,71.903,55.6405,4.0676,835.507,246.631338,330.617365,0.937532,-0.25254,0.02923,1.4073,84.75594,95.861312,6.652501,1.700245,3.261611,0.001161,0.000175


Unnamed: 0,Compound_Name,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,qh_G(T)_spc(Hartree),T,HOMO_Boltz,LUMO_Boltz,μ_Boltz,η_Boltz,ω_Boltz,polar_iso(Debye)_Boltz,polar_aniso(Debye)_Boltz,dipole(Debye)_Boltz,volume(Bohr_radius³/mol)_Boltz,SASA_surface_area(Å²)_Boltz,SASA_volume(Å³)_Boltz,SASA_sphericity_Boltz,NBO_charge_C1_Boltz,NBO_charge_C2_Boltz,distance_C1_C2(Å)_Boltz,%Vbur_C1_2.0Å_Boltz,%Vbur_C2_2.0Å_Boltz,Sterimol_L_C1_C2(Å)_morfeus_Boltz,Sterimol_B1_C1_C2(Å)_morfeus_Boltz,Sterimol_B5_C1_C2(Å)_morfeus_Boltz,pyramidalization_Gavrish_C1(°)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,pyrd1,-440.561372,0.150098,-440.402347,0.042178,0.042101,-440.444448,298.15,-0.24438,-0.02207,-0.133225,0.22231,0.03992,131.949,128.771,1.7941,1146.863,320.84286,463.582193,0.902837,-0.25346,0.07856,1.40455,84.79468,95.94202,8.780835,1.700435,4.634242,0.003385,0.00051


Unnamed: 0,Compound_Name,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,qh_G(T)_spc(Hartree),T,HOMO_Boltz,LUMO_Boltz,μ_Boltz,η_Boltz,ω_Boltz,polar_iso(Debye)_Boltz,polar_aniso(Debye)_Boltz,dipole(Debye)_Boltz,volume(Bohr_radius³/mol)_Boltz,SASA_surface_area(Å²)_Boltz,SASA_volume(Å³)_Boltz,SASA_sphericity_Boltz,NBO_charge_C1_Boltz,NBO_charge_C2_Boltz,distance_C1_C2(Å)_Boltz,%Vbur_C1_2.0Å_Boltz,%Vbur_C2_2.0Å_Boltz,Sterimol_L_C1_C2(Å)_morfeus_Boltz,Sterimol_B1_C1_C2(Å)_morfeus_Boltz,Sterimol_B5_C1_C2(Å)_morfeus_Boltz,pyramidalization_Gavrish_C1(°)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,pyrmd1,-456.606644,0.138347,-456.459507,0.04203,0.041922,-456.501429,298.15,-0.25541,-0.03607,-0.14574,0.21934,0.04842,124.978,120.848,2.0432,1106.548,315.25242,452.627008,0.904314,-0.23463,0.24652,1.40552,84.649406,94.731405,8.797604,1.700282,4.41369,0.001162,0.000175


Unnamed: 0,Compound_Name,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,qh_G(T)_spc(Hartree),T,HOMO_Boltz,LUMO_Boltz,μ_Boltz,η_Boltz,ω_Boltz,polar_iso(Debye)_Boltz,polar_aniso(Debye)_Boltz,dipole(Debye)_Boltz,volume(Bohr_radius³/mol)_Boltz,SASA_surface_area(Å²)_Boltz,SASA_volume(Å³)_Boltz,SASA_sphericity_Boltz,NBO_charge_C1_Boltz,NBO_charge_C2_Boltz,distance_C1_C2(Å)_Boltz,%Vbur_C1_2.0Å_Boltz,%Vbur_C2_2.0Å_Boltz,Sterimol_L_C1_C2(Å)_morfeus_Boltz,Sterimol_B1_C1_C2(Å)_morfeus_Boltz,Sterimol_B5_C1_C2(Å)_morfeus_Boltz,pyramidalization_Gavrish_C1(°)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,pyrz1,-456.601997,0.138128,-456.455073,0.042009,0.041906,-456.496978,298.15,-0.25412,-0.03678,-0.14545,0.21734,0.04867,128.021,131.357,0.319,1225.531,315.002406,452.324429,0.904629,-0.25215,0.03199,1.39879,84.839876,95.890367,8.807194,1.700283,4.366581,0.001238,0.000186


Combine summary properties for all prefixes: 


Unnamed: 0,Compound_Name,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,qh_G(T)_spc(Hartree),T,HOMO_Boltz,LUMO_Boltz,μ_Boltz,η_Boltz,ω_Boltz,polar_iso(Debye)_Boltz,polar_aniso(Debye)_Boltz,dipole(Debye)_Boltz,volume(Bohr_radius³/mol)_Boltz,SASA_surface_area(Å²)_Boltz,SASA_volume(Å³)_Boltz,SASA_sphericity_Boltz,NBO_charge_C1_Boltz,NBO_charge_C2_Boltz,distance_C1_C2(Å)_Boltz,%Vbur_C1_2.0Å_Boltz,%Vbur_C2_2.0Å_Boltz,Sterimol_L_C1_C2(Å)_morfeus_Boltz,Sterimol_B1_C1_C2(Å)_morfeus_Boltz,Sterimol_B5_C1_C2(Å)_morfeus_Boltz,pyramidalization_Gavrish_C1(°)_Boltz,pyramidalization_Agranat-Radhakrishnan_C1_Boltz
0,pyrdz1,-302.930027,0.090329,-302.833243,0.036086,0.036089,-302.869332,298.15,-0.26295,-0.02245,-0.1427,0.2405,0.04234,71.903,55.6405,4.0676,835.507,246.631338,330.617365,0.937532,-0.25254,0.02923,1.4073,84.75594,95.861312,6.652501,1.700245,3.261611,0.001161,0.000175
1,pyrdz2,-302.930289,0.090406,-302.833406,0.036145,0.036148,-302.869554,298.15,-0.27375,-0.01057,-0.14216,0.26318,0.03839,71.7672,53.425,4.6732,812.23,245.641237,330.094241,0.940318,-0.24646,-0.15475,1.40238,84.949638,96.920196,6.157302,1.70052,3.270417,0.006961,0.001048
2,pyrdz3,-342.243845,0.118646,-342.117063,0.040414,0.04016,-342.157223,298.15,-0.24596,-0.01835,-0.132155,0.22761,0.03837,86.0281,67.1591,3.7834,853.3,276.211315,379.377533,0.917537,-0.04725,0.04263,1.41493,92.03577,95.851627,6.671775,1.83477,3.266725,0.003601,0.00054
3,pyrd1,-440.561372,0.150098,-440.402347,0.042178,0.042101,-440.444448,298.15,-0.24438,-0.02207,-0.133225,0.22231,0.03992,131.949,128.771,1.7941,1146.863,320.84286,463.582193,0.902837,-0.25346,0.07856,1.40455,84.79468,95.94202,8.780835,1.700435,4.634242,0.003385,0.00051
4,pyrd2,-440.56109,0.150065,-440.402068,0.042236,0.042144,-440.444213,298.15,-0.23505,-0.02556,-0.130305,0.20949,0.04053,132.41,133.341,2.0008,1399.336,319.532785,462.70445,0.905394,-0.28585,-0.17875,1.39808,85.00452,96.916968,8.964221,1.700709,4.320231,0.003771,0.000567


# 5. Export the data

In [12]:
# Write one workbook with two sheets per prefix: all conformer properties and the
# per-compound summary properties.


def _sheet_name(base, prefix):
    """Return base + prefix, shortening `base` so the name fits Excel's 31-character limit."""
    max_length = 31
    return (base[: max(0, max_length - len(prefix))] + prefix)[:max_length]


def _autofit_columns(worksheet, frame):
    """Set each column width to the longest cell text or the header, whichever is longer."""
    for col_idx, column in enumerate(frame.columns):
        cell_lengths = frame.iloc[:, col_idx].astype(str).map(len)
        # an empty frame has no cells; .max() would be NaN, which is not a valid width
        longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0
        worksheet.set_column(col_idx, col_idx, max(longest_cell, len(str(column))))


# the output folder may not exist yet on a fresh checkout
os.makedirs(output_folder, exist_ok=True)

with pd.ExcelWriter(output_folder + os.sep + "Properties_postprocessed_all_prefixes.xlsx", engine="xlsxwriter") as writer:
    for prefix, properties_df_master in properties_df_master_for_all_prefixes.items():
        print(f"Writing to Excel file for prefix: {prefix}")
        all_df_master = all_df_master_for_all_prefixes[prefix]

        sheets = (
            ("All_Conformer_Properties_", all_df_master),
            ("Summary_Properties_", properties_df_master),
        )
        for base_name, frame in sheets:
            sheet_name = _sheet_name(base_name, prefix)
            frame.to_excel(writer, sheet_name=sheet_name, index=False)
            # automatically adjusts the width of the columns
            _autofit_columns(writer.sheets[sheet_name], frame)

Writing to Excel file for prefix: pyrdz
Writing to Excel file for prefix: pyrd
Writing to Excel file for prefix: pyrmd
Writing to Excel file for prefix: pyrz


In [13]:
# Write the merged summary properties of all prefixes to a single-sheet workbook.
# The output folder may not exist yet on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

with pd.ExcelWriter(output_folder + os.sep + "Summary_Properties_all.xlsx", engine="xlsxwriter") as writer:
    summary_all = properties_df_master_for_all_prefixes_merged
    summary_all.to_excel(writer, sheet_name="Summary_Properties", index=False)
    worksheet = writer.sheets["Summary_Properties"]
    # automatically adjusts the width of the columns: longest cell text or header, whichever is longer
    for col_idx, column in enumerate(summary_all.columns):
        cell_lengths = summary_all.iloc[:, col_idx].astype(str).map(len)
        # an empty frame has no cells; .max() would be NaN, which is not a valid width
        longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0
        worksheet.set_column(col_idx, col_idx, max(longest_cell, len(str(column))))