# Imports

Brittany C. Haas and Melissa A. Hardy's Jupyter notebook for automated collection of molecular descriptors and post-processing (e.g., Boltzmann averages, min/max values, etc.).

**NOTE: Make sure to use the get_properties_environment file to set your conda environment.**

In [1]:
import os,re,sys,pickle,datetime,time,random,itertools,glob
from itertools import permutations
import warnings
warnings.filterwarnings("ignore")
import numpy as np
np.set_printoptions(threshold=sys.maxsize) #print out full arrays
import openpyxl
from openpyxl import load_workbook
import pandas as pd
from pandas import ExcelWriter
import xlsxwriter

import math
randomstate = 42

from rdkit import Chem

import goodvibes.GoodVibes as gv
import goodvibes.thermo as thermo
import goodvibes.io as io
import goodvibes.pes as pes
from morfeus import ConeAngle
from morfeus import Sterimol
import get_properties_functions_for_WI as gp

# Section headers that are searched for in the log files.
npa_pattern = re.compile("Summary of Natural Population Analysis:")
nbo_os_pattern = re.compile("beta spin orbitals")
nborbs_pattern = "NATURAL BOND ORBITALS (Summary):" #"Natural Bond Orbitals (Summary)"
# re.escape is required here: left unescaped, "(Summary)" is a regex group and the
# compiled pattern would never match the literal parentheses of the header line.
nborbs2_pattern = re.compile(re.escape(nborbs_pattern))

def get_one_lp_energy(dataframe, a_list): #a function to get the NBO lone pair (LP) occupancy and energy for all atoms (a_list, form ["C1", "C4", "O2"]) in a dataframe that contains file name and atom number
    """Append the NBO lone-pair (LP) occupancy and energy of labelled atoms.

    For every row of ``dataframe`` the log file named in ``log_name`` is read
    with ``gp.get_filecont``. The first line (after line 0) that contains
    ``nborbs_pattern`` marks the start of the search region; from there to the
    end of the file, every line that contains "LP" and the atom number of a
    requested atom counts as one lone pair of that atom.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs a ``log_name`` column plus one column per label in ``a_list``
        holding the atom number of that atom.
    a_list : list
        Atom labels (column headers of ``dataframe``), e.g. ``["N1"]``.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` with the columns ``NBO_LP_occupancy_<label>`` and
        ``NBO_LP_energy_<label>`` appended for every label. Exactly one result
        row is produced per input row and it carries the same index label, so
        the two frames stay aligned. Values are the number strings parsed from
        the log line; an atom with zero or more than one LP line gets
        "no data", and a row whose file cannot be read or parsed gets
        "no data" in every LP column.
    """
    no_data = "no data"
    atom_token_re = re.compile(r"([A-Z][a-z]? *[0-9]+)") #element symbol followed by a number, e.g. "N  1"
    decimal_re = re.compile(r"[-+]?\d*\.\d+") #decimal numbers on a line; first is used as occupancy, second as energy

    columns = []
    for atom in a_list:
        columns.extend(['NBO_LP_occupancy_' + str(atom), 'NBO_LP_energy_' + str(atom)])

    rows = [] #one dict per input row; the result frame is built once at the end (DataFrame.append no longer exists in pandas 2)
    for index, row in dataframe.iterrows(): #iterate over the dataframe
        row_i = dict.fromkeys(columns, no_data) #start from "no data" so every exit path yields a complete row
        rows.append(row_i)
        log_name = row.get('log_name') #read file name from df
        try: #try to get the data
            #the atom number (i.e. 16) is the df entry of this row for the labeled atom (i.e. "C1")
            atom_numbers = [(atom, str(row[str(atom)])) for atom in a_list]

            filecont, error = gp.get_filecont(log_name) #read the contents of the log file
            if error != "":
                print(error)
                continue

            #this section finds the line (nborbsstart) where the nbo data is located
            nborbsstart = 0
            for i in range(1, len(filecont)):
                if nborbs_pattern in filecont[i]: #first line holding the phrase which indicates the start of the NB orbitals section
                    nborbsstart = i
                    break
            if nborbsstart == 0:
                print("****no Natural Bond Orbitals found in: " + str(log_name) + ".log")
                continue

            for atom, atom_num in atom_numbers:
                lp_lines = []
                for line in filecont[nborbsstart:]:
                    if "LP" not in line:
                        continue
                    if atom_num in " ".join(atom_token_re.findall(line)).split():
                        lp_lines.append(line)
                if not lp_lines:
                    print("****no LPs for atom " + str(atom) + " in: " + str(log_name) + ".log")
                    continue
                if len(lp_lines) > 1: #ambiguous: this function only reports atoms with exactly one LP
                    print("****more than one LP for atom " + str(atom) + " in: " + str(log_name) + ".log")
                    continue
                occ_energy = decimal_re.findall(lp_lines[0])
                row_i['NBO_LP_occupancy_' + str(atom)] = occ_energy[0]
                row_i['NBO_LP_energy_' + str(atom)] = occ_energy[1]
            print(row_i)
        except Exception: #best effort: report the file and keep going with the remaining rows
            print('****Unable to acquire NBO orbitals for:', log_name, ".log")
            row_i.update(dict.fromkeys(columns, no_data)) #discard any partially collected values
    print("NBOrbs function has completed for", a_list)
    #reuse the input index so that concat pairs each result row with the row it was computed from
    nborbs_dataframe = pd.DataFrame(rows, index=dataframe.index, columns=columns)
    return(pd.concat([dataframe, nborbs_dataframe], axis = 1))


D3 import failed


# Atom Inputs Dataframe

Portions of this section were adapted from code written by Jordan P. Liles.

## Generate dataframe with atom numbers

### Use command line to prepare files

To create files: navigate to folder that contains all the log files you wish to analyze.

    module load openbabel
    obabel *.log -osdf -m  
    ls *.log > log_ids.txt
    cat *.sdf >> molecules.sdf

You will use the log_ids.txt and molecules.sdf files in the rest of 2.1.

### Define SMARTS substructure


We recommend drawing the common substructure (with general atoms) in ChemDraw and copying it as SMILES (this will generate a SMARTS string).

More information about SMARTS and available characters here: https://www.daylight.com/dayhtml/doc/theory/theory.smarts.html


In [9]:
#this is the common smarts substructure for the molecules you will analyze
#you have to explicitly draw hydrogens into the SMARTS structure if you want to collect properties for hydrogen atoms
substructure_smarts = '[H]N([*])[H]'
substructure = Chem.MolFromSmarts(substructure_smarts)
#MolFromSmarts returns None (it does not raise) when the string cannot be parsed,
#so fail here with a clear message instead of later inside the substructure search
if substructure is None:
    raise ValueError("Could not parse SMARTS substructure: " + substructure_smarts)

### Generate preliminary dataframe

In [10]:
#generate a list of molecules using RDkit
all_compounds = Chem.SDMolSupplier('molecules.sdf', removeHs=False)
#molecules.sdf is generated with the instructions above
#it is a single sdf that contains the structures/atom numbers etc. for every molecule you will analyze

#uses RDKit to search for the substructure in each compound you will analyze
atoms = []
for molecule in all_compounds:
    if molecule is not None: #entries that RDKit could not read are skipped
        submatch = molecule.GetSubstructMatches(substructure) #find substructure
        #flatten all matches and change from 0-indexed to 1-indexed atom numbers (for Gaussian)
        match_idx = [atom_idx + 1 for match in submatch for atom_idx in match]
        atoms.append(match_idx) #append 1-indexed list to atoms (a list of lists)

#this section extracts log names from log_ids and splits them to the desired format
#log_ids.txt is generated with the instructions above
#it is a text file that contains the file name for every molecule you will analyze
with open("log_ids.txt", "r") as filenames: #the with-block closes the file even if reading fails
    list_of_filenames = [line.split() for line in filenames if line.strip()] #blank lines are ignored
list_of_files = [filename[0].split(".")[0] for filename in list_of_filenames] #file name up to the first "."

#every log name needs exactly one list of atom numbers, otherwise rows would be paired with the wrong file
if len(atoms) != len(list_of_files):
    raise ValueError("molecules.sdf yielded " + str(len(atoms)) + " readable molecules but log_ids.txt lists "
                     + str(len(list_of_files)) + " files; regenerate both files from the same set of logs")

#put the atom numbers for the substructure for each log file into a dataframe
prelim_df = pd.DataFrame(atoms)
prelim_df.insert(0,column='log_name',value=list_of_files)
display(prelim_df)

Unnamed: 0,log_name,0,1,2,3
0,an1000_conf-1,17,6,5,18
1,an1000_conf-3,17,6,5,18
2,an1000_conf-4,17,6,5,18
3,an103_conf-1,13,1,2,14
4,an106_conf-1,28,14,13,29
...,...,...,...,...,...
2200,an998_conf-4,16,1,2,17
2201,an998_conf-5,16,1,2,17
2202,an999_conf-1,16,7,6,17
2203,an999_conf-2,16,7,6,17


### Define column headers using GaussView

Using the preliminary dataframe displayed above, open one of your files and check the atom numbers. 

Assign labels to each column using the cell below. You will call these column headers when you select your properties. 

**User input required:**

In [11]:
#labels for the numbered columns of the preliminary dataframe, listed in column order (column 0 first)
substructure_atom_labels = ['H3', 'N1', 'C2', 'H4']

#maps every current column name to its new header; 'log_name' keeps its name
atom_labels = {'log_name': 'log_name'}
atom_labels.update(enumerate(substructure_atom_labels))

### Generate labeled dataframe

**NOTE: it is very important that you assign these correctly; otherwise, the properties you collect will be for the wrong atoms and will not produce meaningful correlations.** 

We recommend checking the numbering/headers for at least two different compounds. 

Numbering for different conformers of the same compounds will likely be the same (but may not be for some symmetrical groups).

In [12]:
#rename columns using the user input above
atom_map_df = prelim_df.rename(columns=atom_labels)
display(atom_map_df)

#you can use this to clean up the table if you have more atoms in your substructure than you want to collect descriptors for
#atom_map_df = atom_map_df.drop(columns= ['C4', 'C1']) 
#display(atom_map_df.head())

#df is what properties will be appended to
#.copy() is needed to actually preserve the original: plain assignment would only create a second name for the same dataframe
df = atom_map_df.copy()

Unnamed: 0,log_name,H3,N1,C2,H4
0,an1000_conf-1,17,6,5,18
1,an1000_conf-3,17,6,5,18
2,an1000_conf-4,17,6,5,18
3,an103_conf-1,13,1,2,14
4,an106_conf-1,28,14,13,29
...,...,...,...,...,...
2200,an998_conf-4,16,1,2,17
2201,an998_conf-5,16,1,2,17
2202,an999_conf-1,16,7,6,17
2203,an999_conf-2,16,7,6,17


### Save atom map to Excel (if desired)

In [13]:
#the with-block saves and closes the workbook on exit (ExcelWriter.save() was removed in pandas 2.0)
with pd.ExcelWriter('atom_map_an1_an3.xlsx') as writer:
    atom_map_df.to_excel(writer)

## Import a manually-generated atom mapping dataframe

If you need to manually generate the atom mapping dataframe, check out the atom_map_sample.xlsx to make sure you have the desired format. 

In [7]:
#atom_map_df = pd.read_excel('atom_map_an1_an1000.xlsx','Sheet1',index_col=0,header=0,engine='openpyxl')
#display(atom_map_df.head())
#df = atom_map_df.copy() #df is what properties will be appended to; .copy() is what keeps the original preserved (plain assignment does not copy)

FileNotFoundError: [Errno 2] No such file or directory: 'atom_map_an1_an1000.xlsx'

# Define Properties to Collect

## Available property functions and how to call them: 

In [10]:
#this box has functions to choose from
#every property call is commented out here; as written, the cell only displays the atom map
#NOTE: this assignment does not copy; df and atom_map_df are two names for the same dataframe
df = atom_map_df

#---------------GoodVibes Energies----------------
#uses the GoodVibes 2021 Branch (Jupyter Notebook Compatible)
#calculates the quasi harmonic corrected G(T) and single point corrected G(T) as well as other thermodynamic properties
#inputs: dataframe, temperature
#df = gp.get_goodvibes_e(df, 298.15)

#---------------Frontier Orbitals-----------------
#E(HOMO), E(LUMO), mu(chemical potential or negative of molecular electronegativity), eta(hardness/softness), omega(electrophilicity index)
#df = gp.get_frontierorbs(df)

#---------------Polarizability--------------------
#Exact polarizability
#df = gp.get_polarizability(df)

#---------------Dipole----------------------------
#Total dipole moment magnitude in Debye
#df = gp.get_dipole(df)

#---------------Volume----------------------------
#Molar volume
#requires the Gaussian keyword = "volume" in the .com file
#df = gp.get_volume(df)

#---------------SASA------------------------------
#Uses morfeus to calculate solvent accessible surface area and the volume under the SASA
#df = gp.get_SASA(df)

#---------------NBO-------------------------------
#natural charge from NBO
#requires the Gaussian keyword = "pop=nbo7" in the .com file
#nbo_list = ["C1", "N", "H1", "H2"]
#df = gp.get_nbo(df, nbo_list) 

#---------------NMR-------------------------------
#isotropic NMR shift
#requires the Gaussian keyword = "nmr=giao" in the .com file
#nmr_list = ["C1", "H1", "N1"]
#df = gp.get_nmr(df, nmr_list) 

#---------------Distance--------------------------
#distance between 2 atoms
#dist_list_of_lists = [["N", "C1"], ["N", "H1"]]
#df = gp.get_distance(df, dist_list_of_lists) 

#---------------Angle-----------------------------
#angle between 3 atoms
#angle_list_of_lists = [["H1", "N", "C1"], ["N", "C1", "C2"]]
#df = gp.get_angles(df, angle_list_of_lists) 

#---------------Dihedral--------------------------
#dihedral angle between 4 atoms
#dihedral_list_of_lists = [["O2", "C1", "O3", "H5"], ["C4", "C1", "O3", "H5"]]
#df = gp.get_dihedral(df, dihedral_list_of_lists) 

#---------------Vbur Scan-------------------------
#uses morfeus to calculate the buried volume at a series of radii (including hydrogens)
#inputs: dataframe, list of atoms, start_radius, end_radius, and step_size
#if you only want a single radius, put the same value for start_radius and end_radius (keep step_size > 0)
#vbur_list = ["N", "H1"]
#df = gp.get_vbur_scan(df, vbur_list, 2, 4, 0.5)
    
#---------------Sterimol morfeus------------------
#uses morfeus to calculate Sterimol L, B1, and B5 values
#NOTE: this is much faster than the corresponding DBSTEP function (recommendation: use as default/if you don't need Sterimol2Vec)
#sterimol_list_of_lists = [["N1", "H1"], ["N1", "C1"]]
#df = gp.get_sterimol_morfeus(df, sterimol_list_of_lists) 

#---------------Buried Sterimol-------------------
#uses morfeus to calculate Sterimol L, B1, and B5 values within a given sphere of radius r_buried
#atoms outside the sphere + 0.5 vdW radius are deleted and the Sterimol vectors are calculated
#for more information: https://kjelljorner.github.io/morfeus/sterimol.html
#inputs: dataframe, list of atom pairs, r_buried
#sterimol_list_of_lists = [["N", "C1"]]
#df = gp.get_buried_sterimol(df, sterimol_list_of_lists, 5.5) 

#---------------Sterimol DBSTEP-------------------
#uses DBSTEP to calculate Sterimol L, B1, and B5 values
#default grid point spacing (0.05 Angstrom) is used (can use custom spacing or vdw radii in the get_properties_functions script)
#more info here: https://github.com/patonlab/DBSTEP
#NOTE: this takes longer than the morfeus function (recommendation: only use this if you need Sterimol2Vec)
#sterimol_list_of_lists = [["N", "H1"]]
#df = gp.get_sterimol_dbstep(df, sterimol_list_of_lists) 

#---------------Sterimol2Vec----------------------
#uses DBSTEP to calculate Sterimol Bmin and Bmax values at intervals from 0 to end_radius, with a given step_size 
#default grid point spacing (0.05 Angstrom) is used (can use custom spacing or vdw radii in the get_properties_functions script)
#more info here: https://github.com/patonlab/DBSTEP
#inputs: dataframe, list of atom pairs, end_radius, and step_size
#sterimol2vec_list_of_lists = [["C1", "C4"]]
#df = gp.get_sterimol2vec(df, sterimol2vec_list_of_lists, 1, 1.0) 

#---------------Pyramidalization------------------
#uses morfeus to calculate pyramidalization based on the 3 atoms in closest proximity to the defined atom
#collects values based on two definitions of pyramidalization
#details on these values can be found here: https://kjelljorner.github.io/morfeus/pyramidalization.html
#pyr_list = ["C1", "C4"]
#df = gp.get_pyramidalization(df, pyr_list)

#---------------Plane Angle-----------------------
#plane angle between 2 planes (each defined by 3 atoms)
#planeangle_list_of_lists = [["O2", "C1", "O3", "H5", "C1", "C4"], ["O2", "C1", "O3", "H5", "C1", "C4"]]
#df = gp.get_planeangle(df, planeangle_list_of_lists) 

#--------------LP energy - custom from first cell---------------
#lp_list = ["N1"]
#df = get_one_lp_energy(df, lp_list) 

#show every column when the dataframe is displayed
pd.options.display.max_columns = None
display(df)

****Failed or incomplete jobs for an1_conf-1.log
****Failed or incomplete jobs for an2_conf-1.log
****Failed or incomplete jobs for an3_conf-1.log
****Failed or incomplete jobs for an3_conf-2.log
****Failed or incomplete jobs for an4_conf-1.log
****Failed or incomplete jobs for an5_conf-1.log
****Failed or incomplete jobs for an6_conf-1.log
****Failed or incomplete jobs for an7_conf-1.log
****Failed or incomplete jobs for an8_conf-1.log
****Failed or incomplete jobs for an9_conf-1.log
NMR function has completed for ['C1', 'H1', 'N1']


Unnamed: 0,log_name,H1,N1,H2,C1,C2,C3,C4,C5,C6,NMR_shift_C1,NMR_shift_H1,NMR_shift_N1
0,an1_conf-1,2,7,6,5,4,3,1,9,8,no data,no data,no data
1,an2_conf-1,5,4,3,2,8,7,6,15,14,no data,no data,no data
2,an3_conf-1,6,8,9,3,4,5,7,16,15,no data,no data,no data
3,an3_conf-2,6,5,4,3,9,8,7,15,16,no data,no data,no data
4,an4_conf-1,2,3,4,5,7,8,1,9,10,no data,no data,no data
5,an5_conf-1,2,3,4,5,7,8,1,10,9,no data,no data,no data
6,an6_conf-1,3,2,8,7,6,5,4,13,12,no data,no data,no data
7,an7_conf-1,2,8,7,5,4,3,1,10,9,no data,no data,no data
8,an8_conf-1,6,5,4,3,2,8,7,16,15,no data,no data,no data
9,an9_conf-1,2,8,6,5,4,3,1,10,9,no data,no data,no data


## Copy and modify available property functions above to customize

We recommend copying the entire cell above. You will need to change the atom number lists to match your desired column headers and delete (or comment out) any properties you don't want to collect.

In [26]:
#collects the selected properties one after another; each call receives df and its result is assigned back to df
#NOTE: this assignment does not copy; df and atom_map_df are two names for the same dataframe
df = atom_map_df

#---------------GoodVibes Energies----------------
#uses the GoodVibes 2021 Branch (Jupyter Notebook Compatible)
#calculates the quasi harmonic corrected G(T) and single point corrected G(T) as well as other thermodynamic properties
#inputs: dataframe, temperature
df = gp.get_goodvibes_e(df, 298.15)

#--------------LP energy - custom from first cell---------------
lp_list = ["N1"]
df = get_one_lp_energy(df, lp_list) 

#---------------Frontier Orbitals-----------------
#E(HOMO), E(LUMO), mu(chemical potential or negative of molecular electronegativity), eta(hardness/softness), omega(electrophilicity index)
df = gp.get_frontierorbs(df)

#---------------Polarizability--------------------
#Exact polarizability
df = gp.get_polarizability(df)

#---------------Dipole----------------------------
#Total dipole moment magnitude in Debye
df = gp.get_dipole(df)

#---------------Volume----------------------------
#Molar volume
#requires the Gaussian keyword = "volume" in the .com file
df = gp.get_volume(df)

#---------------SASA------------------------------
#Uses morfeus to calculate solvent accessible surface area and the volume under the SASA
df = gp.get_SASA(df)

#---------------NBO-------------------------------
#natural charge from NBO
#requires the Gaussian keyword = "pop=nbo7" in the .com file
nbo_list = ["H3", "N1", "C2", "H4"]
df = gp.get_nbo(df, nbo_list) 

#---------------NMR-------------------------------
#isotropic NMR shift
#requires the Gaussian keyword = "nmr=giao" in the .com file
nmr_list = ["H3", "N1", "C2", "H4"]
df = gp.get_nmr(df, nmr_list) 

#---------------Distance--------------------------
#distance between 2 atoms
dist_list_of_lists = [["N1", "C2"]]
df = gp.get_distance(df, dist_list_of_lists) 

#---------------Angle-----------------------------
#angle between 3 atoms
angle_list_of_lists = [["H3", "N1", "C2"], ["H4", "N1", "C2"]]
df = gp.get_angles(df, angle_list_of_lists) 

#---------------Pyramidalization------------------
#uses morfeus to calculate pyramidalization based on the 3 atoms in closest proximity to the defined atom
#collects values based on two definitions of pyramidalization
#details on these values can be found here: https://kjelljorner.github.io/morfeus/pyramidalization.html
#NOTE(review): pyr_list is defined but never used in this cell, so no pyramidalization columns are collected;
#the template cell pairs it with "df = gp.get_pyramidalization(df, pyr_list)" - confirm whether that call was dropped on purpose
pyr_list = ["N1"]

#show every column when the dataframe is displayed
pd.options.display.max_columns = None
display(df)



   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory

   Using vibrational scale factor 1.0 for B3LYP/6-31G(d,p) level of theory
Goodvibes function has completed


Unnamed: 0,log_name,H3,N1,C2,H4,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,T*qh_S,G(T)_spc(Hartree),qh_G(T)_spc(Hartree),T
0,an1_conf-1,8,1,2,9,-287.584401,0.117332,-287.460336,0.035876,0.035883,-287.496212,-287.49622,298.15
1,an1_conf-1_UT,8,1,2,9,-287.584401,0.117332,-287.460336,0.035876,0.035883,-287.496212,-287.49622,298.15
2,an2_conf-1,14,6,5,15,-326.893704,0.144741,-326.740336,0.041163,0.040286,-326.7815,-326.780622,298.15
3,an3_conf-1,15,7,6,16,-402.106943,0.149949,-401.947683,0.042109,0.041816,-401.989792,-401.989499,298.15
4,an3_conf-2,15,7,6,16,-402.1035,0.149307,-401.944582,0.044417,0.043049,-401.988999,-401.987631,298.15
5,an4_conf-1,9,1,2,10,-747.188054,0.107792,-747.072342,0.039331,0.039286,-747.111674,-747.111628,298.15
6,an5_conf-1,9,1,2,10,-2861.18187,0.107425,-2861.066324,0.040604,0.040524,-2861.106928,-2861.106848,298.15
7,an6_conf-1,12,4,3,13,-326.895871,0.145294,-326.742354,0.039161,0.039165,-326.781515,-326.781519,298.15
8,an7_conf-1,9,1,2,10,-386.836288,0.109111,-386.719615,0.03806,0.038055,-386.757675,-386.75767,298.15
9,an8_conf-1,15,7,6,16,-326.894812,0.144782,-326.741451,0.040836,0.040127,-326.782287,-326.781579,298.15


In [36]:
#the with-block saves and closes the workbook on exit (ExcelWriter.save() was removed in pandas 2.0)
with pd.ExcelWriter('properties_1.xlsx') as writer:
    df.to_excel(writer)

In [30]:
#collects the morfeus steric descriptors (buried volume scan and Sterimol), starting again from the atom map
#NOTE: this assignment does not copy; df and atom_map_df are two names for the same dataframe
df = atom_map_df

#---------------Vbur Scan-------------------------
#uses morfeus to calculate the buried volume at a series of radii (including hydrogens)
#inputs: dataframe, list of atoms, start_radius, end_radius, and step_size
#if you only want a single radius, put the same value for start_radius and end_radius (keep step_size > 0)
vbur_list = ["N1"]
df = gp.get_vbur_scan(df, vbur_list, 2, 6, 0.5)

#---------------Sterimol morfeus------------------
#uses morfeus to calculate Sterimol L, B1, and B5 values
#NOTE: this is much faster than the corresponding DBSTEP function (recommendation: use as default/if you don't need Sterimol2Vec)
sterimol_list_of_lists = [["N1", "C2"]]
df = gp.get_sterimol_morfeus(df, sterimol_list_of_lists) 


#show every column when the dataframe is displayed
pd.options.display.max_columns = None
display(df)

Vbur scan function has completed for ['N1'] from 2  to  6
Morfeus Sterimol function has completed for [['N1', 'C2']]


Unnamed: 0,log_name,H3,N1,C2,H4,%Vbur_N1_2.0Å,%Vbur_N1_2.5Å,%Vbur_N1_3.0Å,%Vbur_N1_3.5Å,%Vbur_N1_4.0Å,%Vbur_N1_4.5Å,%Vbur_N1_5.0Å,%Vbur_N1_5.5Å,%Vbur_N1_6.0Å,Sterimol_L_N1_C2(Å)_morfeus,Sterimol_B1_N1_C2(Å)_morfeus,Sterimol_B5_N1_C2(Å)_morfeus
0,an1_conf-1,8,1,2,9,83.057851,63.121826,48.081794,38.270476,30.816678,24.810047,20.396905,16.857042,13.623294,6.797181,1.7,3.250754
1,an1_conf-1_UT,8,1,2,9,83.057851,63.121826,48.081794,38.270476,30.816678,24.810047,20.396905,16.857042,13.623294,6.797181,1.7,3.250754
2,an2_conf-1,14,6,5,15,82.818957,62.973694,47.975641,38.211657,30.763919,24.766629,20.356895,16.844643,13.937027,7.845465,1.856756,3.249106
3,an3_conf-1,15,7,6,16,82.528409,62.760451,47.828516,38.085865,30.676634,24.713171,20.324966,16.826486,13.98102,8.891597,1.710944,3.251603
4,an3_conf-2,15,7,6,16,82.809272,63.027412,48.0073,38.227381,30.775169,24.79078,20.383108,17.019125,14.25042,8.729512,1.7,3.300866
5,an4_conf-1,9,1,2,10,83.109504,63.180427,48.095761,38.291441,30.826376,24.824429,20.405774,16.875495,13.924781,8.10317,1.7,3.250927
6,an5_conf-1,9,1,2,10,83.122417,63.173916,48.099486,38.287947,30.819006,24.826329,20.407548,16.879037,13.961744,8.353476,1.7,3.251523
7,an6_conf-1,12,4,3,13,83.490444,65.61564,52.054157,43.013884,35.812876,29.488321,24.275094,19.859115,15.930995,6.791071,1.776538,4.476997
8,an7_conf-1,9,1,2,10,82.79313,62.988345,47.985883,38.21981,30.783704,24.789152,20.37621,16.8318,13.748696,7.411719,1.7,3.250182
9,an8_conf-1,15,7,6,16,82.880294,63.089269,48.057583,38.27397,30.805816,25.064041,20.999811,17.834853,15.023244,6.787992,1.777491,4.486953


In [31]:
#the with-block saves and closes the workbook on exit (ExcelWriter.save() was removed in pandas 2.0)
with pd.ExcelWriter('properties_2.xlsx') as writer:
    df.to_excel(writer)

In [32]:
#collects the DBSTEP Sterimol descriptors, starting again from the atom map
#NOTE: this assignment does not copy; df and atom_map_df are two names for the same dataframe
df = atom_map_df

#---------------Sterimol DBSTEP-------------------
#uses DBSTEP to calculate Sterimol L, B1, and B5 values
#default grid point spacing (0.05 Angstrom) is used (can use custom spacing or vdw radii in the get_properties_functions script)
#more info here: https://github.com/patonlab/DBSTEP
#NOTE: this takes longer than the morfeus function (recommendation: only use this if you need Sterimol2Vec)
sterimol_list_of_lists = [["N1", "C2"]]
df = gp.get_sterimol_dbstep(df, sterimol_list_of_lists) 

#show every column when the dataframe is displayed
pd.options.display.max_columns = None
display(df)

   an1_conf-1.log / Bmin:  1.65 / Bmax:  3.22 / L:  6.35
   an1_conf-1_UT.log / Bmin:  1.65 / Bmax:  3.22 / L:  6.35
   an2_conf-1.log / Bmin:  1.85 / Bmax:  3.22 / L:  7.40
   an3_conf-1.log / Bmin:  1.71 / Bmax:  3.23 / L:  8.45
   an3_conf-2.log / Bmin:  1.65 / Bmax:  3.29 / L:  8.30
   an4_conf-1.log / Bmin:  1.65 / Bmax:  3.22 / L:  7.70
   an5_conf-1.log / Bmin:  1.65 / Bmax:  3.22 / L:  7.90
   an6_conf-1.log / Bmin:  1.75 / Bmax:  4.46 / L:  6.35
   an7_conf-1.log / Bmin:  1.65 / Bmax:  3.22 / L:  7.00
   an8_conf-1.log / Bmin:  1.77 / Bmax:  4.47 / L:  6.35
   an9_conf-1.log / Bmin:  1.67 / Bmax:  4.46 / L:  6.35
DBSTEP Sterimol function has completed for [['N1', 'C2']]


Unnamed: 0,log_name,H3,N1,C2,H4,Sterimol_B1_N1_C2(Å)_dbstep,Sterimol_B5_N1_C2(Å)_dbstep,Sterimol_L_N1_C2(Å)_dbstep
0,an1_conf-1,8,1,2,9,1.65,3.219084,6.35
1,an1_conf-1_UT,8,1,2,9,1.65,3.219084,6.35
2,an2_conf-1,14,6,5,15,1.85,3.219084,7.4
3,an3_conf-1,15,7,6,16,1.712924,3.231099,8.45
4,an3_conf-2,15,7,6,16,1.65,3.289757,8.3
5,an4_conf-1,9,1,2,10,1.65,3.219084,7.7
6,an5_conf-1,9,1,2,10,1.65,3.219084,7.9
7,an6_conf-1,12,4,3,13,1.754804,4.463743,6.35
8,an7_conf-1,9,1,2,10,1.65,3.219084,7.0
9,an8_conf-1,15,7,6,16,1.77339,4.472695,6.35


In [33]:
#the with-block saves and closes the workbook on exit (ExcelWriter.save() was removed in pandas 2.0)
with pd.ExcelWriter('properties_3.xlsx') as writer:
    df.to_excel(writer)

In [34]:
#collects the Sterimol2Vec descriptors, starting again from the atom map
#NOTE: this assignment does not copy; df and atom_map_df are two names for the same dataframe
df = atom_map_df

#---------------Sterimol2Vec----------------------
#uses DBSTEP to calculate Sterimol Bmin and Bmax values at intervals from 0 to end_radius, with a given step_size 
#default grid point spacing (0.05 Angstrom) is used (can use custom spacing or vdw radii in the get_properties_functions script)
#more info here: https://github.com/patonlab/DBSTEP
#inputs: dataframe, list of atom pairs, end_radius, and step_size
sterimol2vec_list_of_lists = [["N1", "C2"]]
df = gp.get_sterimol2vec(df, sterimol2vec_list_of_lists, 3, .5) 

#show every column when the dataframe is displayed
pd.options.display.max_columns = None
display(df)

   an1_conf-1.log / R:  0.00 / Bmin:  1.53 / Bmax:  2.30 
   an1_conf-1.log / R:  0.50 / Bmin:  1.64 / Bmax:  3.06 
   an1_conf-1.log / R:  1.00 / Bmin:  1.65 / Bmax:  3.21 
   an1_conf-1.log / R:  1.50 / Bmin:  1.65 / Bmax:  3.21 
   an1_conf-1.log / R:  2.00 / Bmin:  1.65 / Bmax:  3.21 
   an1_conf-1.log / R:  2.50 / Bmin:  1.65 / Bmax:  3.15 
   an1_conf-1.log / R:  3.00 / Bmin:  1.60 / Bmax:  3.06 

   L parameter is  6.35 Ang
   an1_conf-1_UT.log / R:  0.00 / Bmin:  1.53 / Bmax:  2.30 
   an1_conf-1_UT.log / R:  0.50 / Bmin:  1.64 / Bmax:  3.06 
   an1_conf-1_UT.log / R:  1.00 / Bmin:  1.65 / Bmax:  3.21 
   an1_conf-1_UT.log / R:  1.50 / Bmin:  1.65 / Bmax:  3.21 
   an1_conf-1_UT.log / R:  2.00 / Bmin:  1.65 / Bmax:  3.21 
   an1_conf-1_UT.log / R:  2.50 / Bmin:  1.65 / Bmax:  3.15 
   an1_conf-1_UT.log / R:  3.00 / Bmin:  1.60 / Bmax:  3.06 

   L parameter is  6.35 Ang
   an2_conf-1.log / R:  0.00 / Bmin:  1.53 / Bmax:  2.25 
   an2_conf-1.log / R:  0.50 / Bmin:  1.64 / Bmax: 

Unnamed: 0,log_name,H3,N1,C2,H4,Sterimol_Bmin_N1_C2_0.0Å(Å),Sterimol_Bmax_N1_C2_0.0Å(Å),Sterimol_Bmin_N1_C2_0.5Å(Å),Sterimol_Bmax_N1_C2_0.5Å(Å),Sterimol_Bmin_N1_C2_1.0Å(Å),Sterimol_Bmax_N1_C2_1.0Å(Å),Sterimol_Bmin_N1_C2_1.5Å(Å),Sterimol_Bmax_N1_C2_1.5Å(Å),Sterimol_Bmin_N1_C2_2.0Å(Å),Sterimol_Bmax_N1_C2_2.0Å(Å),Sterimol_Bmin_N1_C2_2.5Å(Å),Sterimol_Bmax_N1_C2_2.5Å(Å),Sterimol_Bmin_N1_C2_3.0Å(Å),Sterimol_Bmax_N1_C2_3.0Å(Å)
0,an1_conf-1,8,1,2,9,1.534741,2.302173,1.640901,3.05655,1.65,3.209751,1.65,3.214032,1.65,3.214032,1.65,3.150397,1.6,3.060229
1,an1_conf-1_UT,8,1,2,9,1.534741,2.302173,1.640901,3.05655,1.65,3.209751,1.65,3.214032,1.65,3.214032,1.65,3.150397,1.6,3.060229
2,an2_conf-1,14,6,5,15,1.53384,2.250555,1.640901,3.05655,1.65,3.209751,1.65,3.214032,1.65,3.214032,1.65,3.151587,1.6,3.05655
3,an3_conf-1,15,7,6,16,1.52028,2.247221,1.63415,3.067572,1.6787,3.225678,1.690831,3.231099,1.690831,3.231099,1.666258,3.148412,1.632197,3.05655
4,an3_conf-2,15,7,6,16,1.52028,1.998124,1.6,3.010399,1.65,3.209751,1.65,3.209751,1.65,3.209751,1.65,3.151587,1.6,3.020348
5,an4_conf-1,9,1,2,10,1.534741,2.302173,1.644616,3.05655,1.65,3.209751,1.65,3.214032,1.65,3.214032,1.65,3.151587,1.6,3.05655
6,an5_conf-1,9,1,2,10,1.534741,2.302173,1.644616,3.05655,1.65,3.209751,1.65,3.214032,1.65,3.214032,1.65,3.151587,1.6,3.05655
7,an6_conf-1,12,4,3,13,1.534741,3.98152,1.640901,4.180012,1.671078,4.28544,1.726792,4.440721,1.671045,4.463743,1.654347,4.463743,1.619127,4.341659
8,an7_conf-1,9,1,2,10,1.534741,2.300543,1.640901,3.05655,1.65,3.209751,1.65,3.214032,1.65,3.214032,1.65,3.110064,1.6,3.064719
9,an8_conf-1,15,7,6,16,1.534741,2.36326,1.640901,3.070016,1.65,3.224903,1.668673,3.224903,1.666056,3.224903,1.653185,4.219005,1.615639,4.440721


In [35]:
#the with-block saves and closes the workbook on exit (ExcelWriter.save() was removed in pandas 2.0)
with pd.ExcelWriter('properties_4.xlsx') as writer:
    df.to_excel(writer)

## Other available property functions that are less common

In [4]:
#df = atom_map_df
#---------------Time----------------------------------
#returns the total CPU time and total Wall time (not per subjob) because we are pioneers
#if used in summary df, will give the average (not Boltzmann average) in the Boltzmann average column
#df = gp.get_time(df)

#---------------ChelpG----------------------------
#ChelpG ESP charge 
#requires the Gaussian keyword = "pop=chelpg" in the .com file
#a_list = ['C1']
#df = gp.get_chelpg(df, a_list) 

#---------------Hirshfeld-------------------------
#Hirshfeld charge, CM5 charge, Hirshfeld atom dipole
#requires the Gaussian keyword = "pop=hirshfeld" in the .com file
#a_list = ['C1']
#df = gp.get_hirshfeld(df, a_list) 

#---------------IR--------------------------------
#CAUTION: CANNOT ACCURATELY IDENTIFY ATOM STRETCHES IN MOST CASES (struggles if there is more than one stretch in the defined range)
#IR frequencies and intensities in a specific range (for specific atoms)
#requires the Gaussian keyword = "freq=noraman" in the .com file
#inputs: dataframe, atom1, atom2, frequency_min, frequency_max, intensity_min, intensity_max, threshold
#if you want to collect multiple IR frequencies, you will need to copy/paste this function for each stretch
#we recommend a threshold of 0.0 (may have to adjust)
#df = gp.get_IR(df, "C1", "O2", 1700, 1900, 100, 800, 0.0)

#display(df)

Time function has completed
****Unable to acquire ChelpG charges for: an1_conf-1 .log
****Unable to acquire ChelpG charges for: an2_conf-1 .log
****Unable to acquire ChelpG charges for: an3_conf-1 .log
****Unable to acquire ChelpG charges for: an1_conf-1_UT .log
ChelpG function has completed for ['C1']
****Unable to acquire Hirshfeld properties for: an1_conf-1 .log
****Unable to acquire Hirshfeld properties for: an2_conf-1 .log
****Unable to acquire Hirshfeld properties for: an3_conf-1 .log
****Unable to acquire Hirshfeld properties for: an1_conf-1_UT .log
Hirshfeld function has completed for ['C1']
****Unable to acquire IR frequencies for: an1_conf-1 .log
****Unable to acquire IR frequencies for: an2_conf-1 .log
****Unable to acquire IR frequencies for: an3_conf-1 .log
****Unable to acquire IR frequencies for: an1_conf-1_UT .log
IR function has completed for C1 and O2


Unnamed: 0,log_name,H3,N1,H4,C2,CPU_time_total(hours),Wall_time_total(hours),ChelpG_charge_C1,Hirsh_charge_C1,Hirsh_CM5_charge_C1,Hirsh_atom_dipole_C1,IR_freq_C1_O2
0,an1_conf-1,9,1,8,2,0.49834,0.032889,no data,no data,no data,no data,no data
1,an2_conf-1,15,6,14,5,1.08514,0.069972,no data,no data,no data,no data,no data
2,an3_conf-1,16,7,15,6,1.31833,0.084778,no data,no data,no data,no data,no data
3,an1_conf-1_UT,9,1,8,2,0.60122,0.019917,no data,no data,no data,no data,no data


## Save collected properties to Excel

Helpful to save here in case the Notebook crashes or if you want to add more properties before post-processing. Can be read in at 5.1.1.

In [25]:
#writer = pd.ExcelWriter('all_properties.xlsx')
#df.to_excel(writer)
#writer.save()

# Post-processing

## User input for data processing

In [12]:
#naming template: our files are all named "AcXXX_clust-X.log" or "AcXXX_conf-X.log"
#for numerically named compounds:
#   prefix = any text common to all file names BEFORE the compound number
#   suffix = any text common to all file names AFTER the compound number
prefix, suffix = "an", "_"

#atom mapping columns; these are dropped during post-processing
atom_columns_to_drop = ["H3", "N1", "H4", "C2"]

#header of the energy column used for boltzmann averaging and for determining the lowest E conformer
energy_col_header = "G(T)_spc(Hartree)"

### Option to import an Excel sheet if you're using properties or energies collected outside of this notebook

If you would like to use post-processing functionality (i.e. Boltzmann averaging, lowest E conformers, etc.) you can read in a dataframe with properties (e.g. QikProp properties) or energies (e.g. if you don't/can't run linked jobs) collected outside of this notebook. 

Check out the dataframe_sample.xlsx to make sure you have the desired format. 

In [None]:
#df = pd.read_excel('manual_properties_sample.xlsx','Sheet1',index_col=0,header=0,engine='openpyxl')
#display(df.head())

## Generating a list of compounds that have conformational ensembles

**ONLY RUN THE AUTOMATED OR THE MANUAL CELL, NOT BOTH**

**AUTOMATED:** if your compounds are named consistently, this section generates your compound list based on the similar naming structure

In [13]:
#this is a template for our files that are all named "AcXXX_clust-X.log"
#builds the list of unique compound identifiers ("XXX") from the log_name column of df,
#using the prefix and suffix defined in the user input cell

def _compound_sort_key(compound_id):
    #purely numeric identifiers sort by numeric value (so '10' comes after '9', not after '1');
    #anything else sorts alphabetically after the numeric ones
    if compound_id.isdecimal():
        return (0, int(compound_id), compound_id)
    return (1, 0, compound_id)

unique_compounds = set() #a set removes the duplicates that result from having several conformers of each structure

for log_file in df['log_name']: #read each file name from df
    prefix_and_compound = str(log_file).split(str(suffix), 1)[0] #keeps "AcXXX" (we don't use the "clust-X" part)
    pieces = prefix_and_compound.split(str(prefix), 1) #splits once to get ["", "XXX"]; splitting only once keeps identifiers that contain the prefix text again intact
    if len(pieces) < 2:
        raise ValueError("log_name '" + str(log_file) + "' does not contain the prefix '" + str(prefix) + "'")
    unique_compounds.add(pieces[1])

compound_list = sorted(unique_compounds, key=_compound_sort_key)
print(compound_list)

#this should generate a list that looks like this: ['24', '27', '34', '48']

['1', '2', '3', '4', '5', '6', '7', '8', '9']


**MANUAL:** if your compound naming scheme is not consistent or you have trouble with the template above, you can manually define your compound list

In [None]:
#compound_list = [24, 27, 34, 48]

## Post-processing to get properties for each compound

In [11]:
#Post-processing: for each compound in compound_list, collect its conformers from df and compute
#ensemble summary rows (Boltzmann average, Boltzmann-weighted std, min, max, range, lowest E conformer).
#Results:
#   all_df_master        - every conformer row followed by the summary rows, for all compounds
#   properties_df_master - one row per compound with the summary properties as columns
#Uses df, compound_list, prefix, suffix, atom_columns_to_drop and energy_col_header from the cells above.

#constants used to convert relative energies into Boltzmann weights
hartree_to_kcal = 627.5 #kcal/mol per Hartree
gas_constant = 1.987204 #R is in cal/(K*mol)
temperature = 298.15 #K

#define columns that won't be included in summary properties or are treated differently because they don't make sense to Boltzmann average
non_boltz_columns = ["G(Hartree)","∆G(Hartree)","∆G(kcal/mol)", "e^(-∆G/RT)","Mole Fraction"] #don't boltzman average columns containing these strings in the column label
reg_avg_columns = ['CPU_time_total(hours)', 'Wall_time_total(hours)'] #don't boltzmann average these either, we average them in case that is helpful
#the energy column used for averaging is handled separately, so it is filtered out here
#(filtering instead of list.remove() means an energy header that is not in this list no longer raises ValueError)
gv_extra_columns = [col for col in ['E_spc (Hartree)', 'H_spc(Hartree)', 'T', 'T*S', 'T*qh_S', 'ZPE(Hartree)', 'qh_G(T)_spc(Hartree)', "G(T)_spc(Hartree)"] if col != str(energy_col_header)]
#NOTE(review): the phrases above are matched as SUBSTRINGS of the column labels below, so 'T' excludes
#every column whose label contains a capital T - confirm this is intended before adding such properties

#final output format: these columns are placed at the front of each per-compound table
explicit_order_front_columns = ["log_name", energy_col_header,"∆G(Hartree)","∆G(kcal/mol)","e^(-∆G/RT)","Mole Fraction"]

#labels written to the log_name column of the summary rows, in the order the rows are added
summary_labels = ["Boltzmann Averages", "Boltzmann Standard Deviation", "Ensemble Minimum", "Ensemble Maximum", "Ensemble Range"]

all_frames = [] #per-compound tables (conformers + summary rows), concatenated once after the loop
properties_frames = [] #per-compound one-row summary tables, concatenated once after the loop

for compound in compound_list:
    #defines the common start to all files using the input above
    substring = str(prefix) + str(compound) + str(suffix)

    #makes a data frame for one compound at a time for post-processing
    valuesdf = df[df["log_name"].str.startswith(substring)]
    valuesdf = valuesdf.drop(columns = atom_columns_to_drop)
    valuesdf = valuesdf.reset_index(drop = True)  #you must re-index otherwise the 2nd, 3rd, etc. compounds fail

    if valuesdf.empty: #nothing to average (e.g. a manually defined compound that matches no log file)
        print("****No log files found for compound:", substring)
        continue

    #calculate the relative energies and the mole fractions used as Boltzmann weights
    valuesdf["∆G(Hartree)"] = valuesdf[energy_col_header] - valuesdf[energy_col_header].min()
    valuesdf["∆G(kcal/mol)"] = valuesdf["∆G(Hartree)"] * hartree_to_kcal
    valuesdf["e^(-∆G/RT)"] = np.exp((valuesdf["∆G(kcal/mol)"] * -1000) / (gas_constant * temperature))
    valuesdf["Mole Fraction"] = valuesdf["e^(-∆G/RT)"] / valuesdf["e^(-∆G/RT)"].sum()

    #one list per summary row; each gets exactly one entry per column of valuesdf
    values_boltz_row = []
    values_boltz_stdev_row = []
    values_min_row = []
    values_max_row = []
    values_range_row = []
    summary_rows = [values_boltz_row, values_boltz_stdev_row, values_min_row, values_max_row, values_range_row] #same order as summary_labels
    values_exclude_columns = []

    mole_fractions = valuesdf["Mole Fraction"].to_numpy() #weights for the Boltzmann statistics
    n_conformers = len(valuesdf)

    for column in valuesdf:
        if "log_name" in column:
            for summary_row, label in zip(summary_rows, summary_labels):
                summary_row.append(label)
            values_exclude_columns.append(column) #used later to build final dataframe
        elif any(phrase in column for phrase in non_boltz_columns) or any(phrase in column for phrase in gv_extra_columns):
            for summary_row in summary_rows:
                summary_row.append("")
        elif any(phrase in column for phrase in reg_avg_columns):
            values_boltz_row.append(valuesdf[column].mean()) #intended to print the average CPU/wall time in the boltz column
            for summary_row in summary_rows[1:]:
                summary_row.append("")
        else:
            valuesdf[column] = pd.to_numeric(valuesdf[column]) #to hopefully solve the error that sometimes occurs where the float(Mole Fraction) cannot be mulitplied by the string(property)
            boltz = (valuesdf[column] * valuesdf["Mole Fraction"]).sum() #Boltzmann average (a number)
            values_boltz_row.append(boltz)
            values_min_row.append(valuesdf[column].min())
            values_max_row.append(valuesdf[column].max())
            values_range_row.append(valuesdf[column].max() - valuesdf[column].min())

            # this section generates the weighted std deviation (weighted by mole fraction)
            # formula: https://www.statology.org/weighted-standard-deviation-excel/
            if n_conformers == 1: #if there is only one conformer in the ensemble, set the weighted standard deviation to 0
                wstdev = 0 #checked first so the (n-1)/n term below never divides by zero
            else:
                #squared "deviation" of each conformer from the Boltzmann average
                delta_values_sq = ((valuesdf[column] - boltz) ** 2).to_numpy()
                #np.average(delta_values_sq, weights=...) generates sum of each (delta_value_sq * mole fraction)
                wstdev = np.sqrt( (np.average(delta_values_sq, weights=mole_fractions)) / (((n_conformers-1)/n_conformers)*np.sum(mole_fractions)) )

            values_boltz_stdev_row.append(wstdev)

    #add the summary rows below the conformers (Boltz, Boltz stdev, min, max, range)
    for summary_row in summary_rows:
        valuesdf.loc[len(valuesdf)] = summary_row

    #reorders the dataframe using front columns defined above
    valuesdf = valuesdf[explicit_order_front_columns + [col for col in valuesdf.columns if col not in explicit_order_front_columns and col not in values_exclude_columns]]

    #determine the index of the lowest energy conformer
    low_e_index = valuesdf[valuesdf["∆G(Hartree)"] == 0].index.tolist()

    #copy the row to a new_row with the name of the log changed to Lowest E Conformer
    #.copy() makes sure the original conformer row is not modified
    new_row = valuesdf.loc[low_e_index[0]].copy()
    new_row['log_name'] = "Lowest E Conformer"
    #DataFrame.append no longer exists in pandas >= 2.0; concat of a one-row frame is the equivalent
    valuesdf = pd.concat([valuesdf, new_row.to_frame().T.infer_objects()], ignore_index=True)

    #------------------------------EDIT THIS SECTION IF YOU WANT A SPECIFIC CONFORMER----------------------------------
    #if you want all properties for a conformer with a particular property (i.e. all properties for the Vbur_min conformer)
    #this template can be adjusted for min/max/etc.

    #find the index for the min or max column:
    #ensemble_min_index = valuesdf[valuesdf["log_name"] == "Ensemble Minimum"].index.tolist()

    #find the min or max value of the property (based on index above)
    #saves the value in a list (min_value) with one entry (this is why we call min_value[0])
    #min_value = valuesdf.loc[ensemble_min_index, "%Vbur_C4_3.0Å"].tolist()
    #vbur_min_index = valuesdf[valuesdf["%Vbur_C4_3.0Å"] == min_value[0]].index.tolist()

    #copy the row to a new_row with the name of the log changed to Property_min_conformer
    #new_row = valuesdf.loc[vbur_min_index[0]].copy()
    #new_row['log_name'] = "%Vbur_C4_3.0Å_min_Conformer"
    #valuesdf = pd.concat([valuesdf, new_row.to_frame().T.infer_objects()], ignore_index=True)
    #--------------------------------------------------------------------------------------------------------------------

    #keeps the frame for the master output
    all_frames.append(valuesdf)

    #drop all the individual conformers
    dropindex = valuesdf[valuesdf["log_name"].str.startswith(substring)].index
    valuesdf = valuesdf.drop(dropindex)
    valuesdf = valuesdf.reset_index(drop = True)

    #display(valuesdf)

    #drop the columns created to determine the mole fraction and the energy/timing columns that are not summarized
    valuesdf = valuesdf.drop(columns = explicit_order_front_columns)
    #errors="ignore" skips columns that were never collected and still drops the ones that are present
    valuesdf = valuesdf.drop(columns = gv_extra_columns + reg_avg_columns, errors = "ignore")

    #---------------------THIS MAY NEED TO CHANGE DEPENDING ON HOW YOU LABEL YOUR COMPOUNDS------------------------------
    compound_name = prefix + str(compound)
    #--------------------------------------------------------------------------------------------------------------------

    #builds a one-row dataframe (for each compound) with the summary properties as columns
    #collected in a dict first so the dataframe is created in one step
    properties = {'Compound_Name': [compound_name]}
    for (columnName, columnData) in valuesdf.items():
        #the indexes need to match the rows of the values dataframe - display it to double check if you need to make changes
        #(uncomment the display(valuesdf) line above in this cell)
        #row order: 0 = Boltz, 1 = Boltz stdev, 2 = min, 3 = max, 4 = range, 5 = lowest E conformer
        properties[str(columnName) + "_Boltz"] = [columnData.values[0]]
        properties[str(columnName) + "_Boltz_stdev"] = [columnData.values[1]]
        properties[str(columnName) + "_min"] = [columnData.values[2]]
        properties[str(columnName) + "_max"] = [columnData.values[3]]
        #properties[str(columnName) + "_range"] = [columnData.values[4]]
        properties[str(columnName) + "_low_E"] = [columnData.values[5]]

        #if you're collecting properties for a specific conformer, add these here (note the index)
        #example:
        #properties[str(columnName) + "_V_bur_min"] = [columnData.values[6]]

        #if you only want a table with Boltz, you can comment out the other summary properties to generate a Boltz spreadsheet
        #or if you don't want to collect range, etc.
    properties_df = pd.DataFrame(properties)
    properties_frames.append(properties_df)

#concatenates the per-compound frames into the master dataframes (left empty if no compound was processed)
all_df_master = pd.concat(all_frames) if all_frames else pd.DataFrame(columns=[])
properties_df_master = pd.concat(properties_frames, axis = 0) if properties_frames else pd.DataFrame(columns=[])

all_df_master = all_df_master.reset_index(drop = True)
properties_df_master = properties_df_master.reset_index(drop = True)


KeyError: 'G(T)_spc(Hartree)'

### Peek at your new dataframes

In [8]:
display(properties_df_master.head())
display(all_df_master)

Unnamed: 0,Compound_Name,HOMO_Boltz,HOMO_Boltz_stdev,HOMO_min,HOMO_max,HOMO_low_E,LUMO_Boltz,LUMO_Boltz_stdev,LUMO_min,LUMO_max,...,NBO_LP_occupancy_N1_Boltz,NBO_LP_occupancy_N1_Boltz_stdev,NBO_LP_occupancy_N1_min,NBO_LP_occupancy_N1_max,NBO_LP_occupancy_N1_low_E,NBO_LP_energy_N1_Boltz,NBO_LP_energy_N1_Boltz_stdev,NBO_LP_energy_N1_min,NBO_LP_energy_N1_max,NBO_LP_energy_N1_low_E
0,an1,-0.25981,0.0,-0.25981,-0.25981,-0.25981,0.02524,0.0,0.02524,0.02524,...,1.86652,0.0,1.86652,1.86652,1.86652,-0.35096,0.0,-0.35096,-0.35096,-0.35096
1,an2,-0.25202,0.0,-0.25202,-0.25202,-0.25202,0.0251,0.0,0.0251,0.0251,...,1.87208,0.0,1.87208,1.87208,1.87208,-0.34997,0.0,-0.34997,-0.34997,-0.34997
2,an3,-0.24275,0.0,-0.24275,-0.24275,-0.24275,0.02215,0.0,0.02215,0.02215,...,1.88317,0.0,1.88317,1.88317,1.88317,-0.35251,0.0,-0.35251,-0.35251,-0.35251


Unnamed: 0,log_name,G(T)_spc(Hartree),∆G(Hartree),∆G(kcal/mol),e^(-∆G/RT),Mole Fraction,E_spc (Hartree),ZPE(Hartree),H_spc(Hartree),T*S,...,HOMO,LUMO,μ,η,ω,Sterimol_B1_N1_H3(Å)_dbstep,Sterimol_B5_N1_H3(Å)_dbstep,Sterimol_L_N1_H3(Å)_dbstep,NBO_LP_occupancy_N1,NBO_LP_energy_N1
0,an1_conf-1,-287.496212,0.0,0.0,1.0,0.5,-287.584401,0.117332,-287.460336,0.035876,...,-0.25981,0.02524,-0.117285,0.28505,0.02413,1.52028,5.873883,2.2,1.86652,-0.35096
1,an1_conf-1_UT,-287.496212,0.0,0.0,1.0,0.5,-287.584401,0.117332,-287.460336,0.035876,...,-0.25981,0.02524,-0.117285,0.28505,0.02413,1.52028,5.873883,2.2,1.86652,-0.35096
2,Boltzmann Averages,,,,,,,,,,...,-0.25981,0.02524,-0.117285,0.28505,0.02413,1.52028,5.873883,2.2,1.86652,-0.35096
3,Boltzmann Standard Deviation,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Ensemble Minimum,,,,,,,,,,...,-0.25981,0.02524,-0.117285,0.28505,0.02413,1.52028,5.873883,2.2,1.86652,-0.35096
5,Ensemble Maximum,,,,,,,,,,...,-0.25981,0.02524,-0.117285,0.28505,0.02413,1.52028,5.873883,2.2,1.86652,-0.35096
6,Ensemble Range,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Lowest E Conformer,-287.496212,0.0,0.0,1.0,0.5,-287.584401,0.117332,-287.460336,0.035876,...,-0.25981,0.02524,-0.117285,0.28505,0.02413,1.52028,5.873883,2.2,1.86652,-0.35096
8,an2_conf-1,-326.7815,0.0,0.0,1.0,1.0,-326.893704,0.144741,-326.740336,0.041163,...,-0.25202,0.0251,-0.11346,0.27712,0.02323,1.52028,6.965092,2.2,1.87208,-0.34997
9,Boltzmann Averages,,,,,,,,,,...,-0.25202,0.0251,-0.11346,0.27712,0.02323,1.52028,6.965092,2.2,1.87208,-0.34997


### Save to Microsoft Excelᵀᴹ 

In [9]:
#write both master dataframes to their Excel files without the index column
for frame, filename in (
    (all_df_master, 'all_properties_master.xlsx'),
    (properties_df_master, 'summary_properties.xlsx'),
):
    frame.to_excel(filename, index = False)