In [84]:
# create YAML data file for molecule
import yaml, re, sys, os
import pandas as pd
import numpy as np

sys.path.insert(0, '../../karlib')
import molpro_subs as mpr
import chem_subs as chem
import gaussian_subs as gau

In [85]:
# Load the reference tables used later for identifiers and reference values
atct = pd.read_csv(r'./refdata/ATcT_1p122r_gases.csv')
webbook = pd.read_csv(r'./refdata/gas-enthalpies_webbook.csv')
# the first row of the spreadsheet is skipped
soc = pd.read_excel(r'./refdata/spin_orbit_correction.xlsx', skiprows=1)


def _standard_formula(species):
    """Rewrite a species formula in the standard order produced by chem.formula()."""
    return chem.formula(chem.formula_to_atomlist(species))


# add a 'Formula' column so SOC entries can be matched against computed formulas
soc['Formula'] = soc['Species'].apply(_standard_formula)

In [86]:
# Label of the species to process.  It selects the input files
# ./geomfreq/<molec>_geom.out and ./energysp/<molec>_cc.pro and names the
# output file <molec>.yml.
molec = 'o'

In [87]:
# Start the output document with the charge and spin multiplicity taken
# from the Gaussian geom/freq output file.
doc = {}
fgau = r'./geomfreq/{:s}_geom.out'.format(molec)   # Gaussian geometry/frequency output
fpro = r'./energysp/{:s}_cc.pro'.format(molec)     # Molpro single-point output (read in a later cell)
FGAU = open(fgau, 'r')  # kept open: the following cells read from this handle, and it is closed later
df_chmult = gau.read_charge_mult(FGAU)
# use the last row of the returned table
charge = df_chmult['Charge'].iloc[-1]
mult = df_chmult['Mult'].iloc[-1]
doc = {'Charge': int(charge), 'Spin_mult': int(mult)}

In [88]:
# Describe how the geometry was computed: command line, comment, software version
full_command = str(gau.read_command(FGAU).at[0, 'Command'])
cmd = full_command.split('#', 1)[1].strip()  # keep only what follows the first '#'
comment = str(gau.read_comments(FGAU).at[0, 'Comment'])
rev = gau.read_version(FGAU)
# elements 0 and 2 of the version info are combined into the software label
vers = 'Gaussian{:s} {:s}'.format(rev[0], rev[2])
geom = {'command': cmd, 'comment': comment, 'software': vers}

In [89]:
def find_line_number(file, search_string):
    """Return the line numbers of all lines that contain `search_string`.

    Parameters
    ----------
    file : str, path-like, or open text file object
        A path is opened (and closed again) here.  An already-open file
        object is rewound to its beginning first, so the returned numbers
        always count from the top of the file regardless of earlier reads;
        it is left open for the caller.
    search_string : str
        Plain substring to look for (no regular expressions).

    Returns
    -------
    list of int
        0-based line numbers, in file order; empty if nothing matches.
    """
    try:
        F = open(file, 'r')
    except TypeError:
        # not a path: hopefully already a file object
        F = file
        opened_here = False
    else:
        opened_here = True
    try:
        if not opened_here:
            try:
                F.seek(0)
            except (AttributeError, OSError):
                # non-seekable stream: scan from the current position
                pass
        return [i for i, line in enumerate(F) if search_string in line]
    finally:
        if opened_here:
            # do not leak the handle we created
            F.close()

In [90]:
# find the optimized geometry, assuming an SCF method
stationary = find_line_number(FGAU, 'Stationary point found')
if not stationary:
    # fail with a readable message instead of an IndexError on [0]
    raise ValueError('No "Stationary point found" line in {:s}'.format(fgau))
lineno = stationary[0]  # first announcement of a converged optimization
dfscf = gau.read_scfe(FGAU)
dfcrd = gau.read_std_orient(FGAU)
dfscf = dfscf[dfscf.line < lineno].sort_values('line')
scfE = float(dfscf['Energy'].iloc[-1])  # last energy before the "Optimized" announcement
geom['E_scf'] = scfE
dfcrd = dfcrd[dfcrd.line > lineno].sort_values('line')
unit = str(dfcrd['Unit'].iloc[0])  # first geometry listed after the "Optimized" announcement
# remove trailing "s" from name of unit (endswith is safe for an empty string)
if unit.endswith('s'):
    unit = unit[:-1]
geom['unit'] = unit
crd = dfcrd['Coordinates'].iloc[0]  # coordinates of that same first geometry after the announcement
# one [symbol, x, y, z] entry per atom
coords = [[chem.elz(row.Z, 'symbol')] + list(row[['x', 'y', 'z']])
          for _, row in crd.iterrows()]
geom['coordinates'] = coords
geom['basis_functions'] = gau.read_nbfn(FGAU)[0][0]
natom = len(coords)
if natom > 1:
    # rotational constants (skipped for a single atom)
    dfrot = gau.read_rotational(FGAU)
    dfrot = dfrot[dfrot.line > lineno].sort_values('line')
    rotat = {s: float(x) for s, x in zip(['A','B','C'], dfrot['Rotational Constants'].iloc[0])}
    pg = gau.read_pointgroup(FGAU)['point group'].iloc[-1]  # last PG (freq calc) is probably tightest
    if '*' in pg:
        # linear molecule should have only one rotational constant
        rotat = {'A': rotat['A']}
    rotat['point_group'] = str(pg)
    rotat['unit'] = str(dfrot['Unit'].iloc[0])
    # external symmetry number
    rotat['symmetry_number'] = gau.read_symno(FGAU)
    doc['Rotational'] = rotat

In [91]:
# Build the formula from the element symbols (first item of each coordinate entry)
atom_symbols = [entry[0] for entry in geom['coordinates']]
formula = chem.formula(atom_symbols)
print('formula:', formula)

formula: O


In [92]:
# Gather identifiers: the local label and formula, the chem.CIRconvert lookups
# (only when natom > 2), and IDs/names from the WebBook and ATcT tables.
identifier = {'local': molec, 'formula': formula}  # 'local' is the label used internally here
if natom > 2:
    iupac = chem.CIRconvert(formula, 'iupac_name')
    if '\n' not in iupac:
        # a multi-line answer means several names: then show no IUPAC name at all
        identifier['IUPAC'] = iupac
    identifier['InChI'] = chem.CIRconvert(formula, 'stdinchi').split('=')[1]
    # multiple CAS values are spliced into one comma-separated string
    cas_values = chem.CIRconvert(formula, 'cas').split('\n')
    identifier['CASRN'] = ', '.join(cas_values)

# rows of each reference table that match this formula
formula_match = atct.Formula == formula
hill_match = atct.Hill.str.lower() == formula
df_atct = atct[formula_match | hill_match]
display(df_atct)
df_wb = webbook[webbook.Formula == formula]
display(df_wb)

lookups = [('WebBook', 'WB', df_wb), ('ATcT', 'ATcT', df_atct)]
for source, prefix, hits in lookups:
    nhits = len(hits)
    if nhits == 0:
        print(f'Notice: "{molec}" not found in {source} list')
    elif nhits == 1:
        # unique match: record its ID; the name from the later source overwrites the earlier one
        identifier[source] = hits[prefix+'_ID'].iloc[0]
        identifier['name'] = hits['Species'].iloc[0]
    else:
        # ambiguous: show the candidates and record nothing
        print(f'Multiple {source} matches for "{molec}"')
        display(hits)
        print('Formula = {:s}'.format(identifier['formula']))
doc['Identifiers'] = identifier

Unnamed: 0,Species,Formula,EoF0,EoF298,Unc,MW,ATcT_ID,Hill
13,Oxygen atom,O,246.844,249.229,0.0021,15.9994,17778-80-2*0,O


Unnamed: 0,Species,Formula,EoF298,Unc,WB_ID,Squib,Method
1362,"Oxygen, atomic",O,249.18,0.1,C17778802,1984COX/WAG1B,Review # CODATA Review value


In [93]:
# collect reference enthalpy of formation
refdata = {}
for source, df in zip(['ATcT', 'WebBook'], [df_atct, df_wb]):
    if len(df) != 1:
        # only an unambiguous single match is recorded
        continue
    data = {}
    try:
        data['EoF0'] = float(df['EoF0'].iloc[0])
    except (KeyError, TypeError, ValueError):
        # column absent or value not numeric; stay quiet for WebBook
        if source != 'WebBook':
            print(f'No EoF0 found in {source}')
    # ATcT source cannot be further specified (no such column)
    if 'Squib' in df.columns:
        data['source'] = str(df['Squib'].iloc[0])
    # ATcT method cannot be further specified (no such column)
    if 'Method' in df.columns:
        data['comment'] = str(df['Method'].iloc[0])
    data['EoF298'] = float(df['EoF298'].iloc[0])
    data['unc'] = float(df['Unc'].iloc[0])
    data['unit'] = 'kJ/mol'  # for both WebBook and ATcT
    refdata[source] = data
# look for spin-orbit correction
subsoc = soc[soc.Formula == formula]
# was `len(subsoc == 1)`, which is true for ANY non-empty match
if len(subsoc) == 1:
    socdata = {'value': float(subsoc['Value'].iloc[0])}
    socdata['unc'] = float(subsoc['Unc'].iloc[0])
    socdata['unit'] = subsoc['Unit'].iloc[0]
    refdata['spin_orbit'] = socdata
elif len(subsoc) > 1:
    # ambiguous: report instead of silently taking the first row
    print(f'Multiple spin-orbit corrections match "{formula}"; none recorded')
    display(subsoc)
doc['Refdata'] = refdata

In [94]:
# harmonic vibrational frequencies (a single atom has none)
if natom > 1:
    doc['Frequencies'] = gau.read_freqs(FGAU)[-1][2]['Freq'].values.tolist()
    # put the number of imaginary frequencies in the geometry block
    geom['nimag'] = int(np.sum(np.array(doc['Frequencies']) < 0))
    print(geom['nimag'])
# store the geometry block unconditionally, so atoms keep their geometry
# metadata in the output as well
doc['Geometry'] = geom

In [95]:
FGAU.close()
# read single-point CCSD(T)-F12b energy
regx = re.compile(r'CCSD\(T\)-F12b energy')
rx_nbf = re.compile(r'NUMBER OF CONTRACTIONS:')
rx_hf = re.compile(r'HF STATE\s*\d+\.\d Energy')
# start from None so a missing line is detected instead of raising NameError
# or silently reusing a value left in the kernel from a previous molecule
ecc = nbf = ehf = None
with open(fpro, 'r') as FPRO:
    for line in FPRO:
        # if a pattern occurs more than once, the last occurrence wins
        if regx.search(line):
            ecc = float(line.split()[-1])
        if rx_nbf.search(line):
            nbf = int(line.split()[3])  # 4th whitespace-separated field
        if rx_hf.search(line):
            ehf = float(line.split()[-1])
missing = [label for label, value in (('CCSD(T)-F12b energy', ecc),
                                      ('NUMBER OF CONTRACTIONS', nbf),
                                      ('HF STATE energy', ehf))
           if value is None]
if missing:
    raise ValueError('{:s} does not contain: {:s}'.format(fpro, ', '.join(missing)))
energy = {'CCSD(T)-F12b': ecc, 'basis_functions': nbf, 'HF': ehf}
doc['Energy'] = energy

In [96]:
# Write the collected document to <molec>.yml in the working directory
fout = molec + '.yml'
with open(fout, 'w') as yaml_file:
    yaml_text = yaml.dump(doc)
    yaml_file.write(yaml_text)
print('YAML file {:s} created.'.format(fout))

YAML file o.yml created.
