In [129]:
# create YAML data file for molecule
import yaml, re, sys, os
import pandas as pd
import numpy as np

sys.path.insert(0, '../atomic_SOC')
import molpro_subs as mpr
import chem_subs as chem
sys.path.insert(0, '../karlib')
import gaussian_subs as gau
import multirx_subs as mrx

In [89]:
# sub-directory names
GDIR = mrx.GDIR    # Gaussian files, geom opt and harmonic freqs
EDIR = mrx.EDIR    # Molpro files, single-point CCSD(T)-F12
REFD = mrx.REFDAT     # Reference data
DATA = mrx.MDAT  # Processed molecular YAML files
JUNK = 'discard/energysp'  # CC outputs rejected because of old filename format

In [90]:
# Import reference data
atct = pd.read_csv(os.sep.join([REFD, 'ATcT_1p122r_gases.tsv']), sep='\t')
webbook = pd.read_csv(os.sep.join([REFD, 'gas-enthalpies_webbook.tsv']), sep='\t')
soc = pd.read_excel(os.sep.join([REFD, 'spin_orbit_correction.xlsx']), skiprows=1)
dflabel = pd.read_csv(os.sep.join([REFD, 'label_meanings.tsv']), sep='\t')
# put SOC formulas into standard order
soc['Formula'] = soc['Species'].apply(lambda x: chem.formula(chem.formula_to_atomlist(x)))

In [91]:
molec = 'etoc2h3'

In [92]:
display(dflabel[dflabel.Label == molec])
CASRN = dflabel[dflabel.Label == molec].CASRN.values[0]

Unnamed: 0,Label,Name,CASRN
423,etoc2h3,ethyl vinyl ether,109-92-2


In [136]:
dflabel

Unnamed: 0,Label,Name,CASRN
0,c,carbon atom,7440-44-0
1,c2f4,tetrafluoroethylene,116-14-3
2,c2h2,acetylene,74-86-2
3,c2h4,ethylene,74-85-1
4,c2h3cl,vinyl chloride,75-01-4
...,...,...,...
449,pyridine-N-oxide,pyridine-N-oxide,694-59-7
450,pyridine,pyridine,110-86-1
451,ch3nch2,N-methyl methanimine,1761-67-7
452,ch3chnh,ethanimine,20729-41-3


In [93]:
doc = {}
# get charge and spin multiplicity from Gaussian geom/freq file
fgau = os.sep.join([GDIR, molec + '.out'])
fpro = os.sep.join([EDIR, molec + '.pro'])
#fgau = r'./geomfreq/{:s}_geom.out'.format(molec)
#fpro = r'./energysp/{:s}_cc.pro'.format(molec)
FGAU = open(fgau, 'r')
df = gau.read_charge_mult(FGAU)
charge = df['Charge'].iloc[-1]
mult = df['Mult'].iloc[-1]
doc = {'Charge': int(charge)}
doc['Spin_mult'] = int(mult)

In [94]:
# collect meta-information about computed geometry
cmd = str(gau.read_command(FGAU).at[0, 'Command']).split('#', 1)[1].strip()
comment = str(gau.read_comments(FGAU).at[0, 'Comment'])
geom = {'command': cmd, 'comment': comment}
rev = gau.read_version(FGAU)
vers = 'Gaussian{:s} {:s}'.format(rev[0], rev[2])
geom['software'] = vers

In [95]:
def find_line_number(file, search_string):
    # return a list of line numbers for lines that include the search string
    try:
        F = open(file, 'r')
    except TypeError:
        # hopefully already a file object
        F = file
    lineno = []
    for i, line in enumerate(F):
        if search_string in line:
            lineno.append(i)
    return lineno

In [96]:
# find the optimized geometry, assuming an SCF method
lineno = find_line_number(FGAU, 'Stationary point found')[0]
dfscf = gau.read_scfe(FGAU)
dfcrd = gau.read_std_orient(FGAU)
dfscf = dfscf[dfscf.line < lineno].sort_values('line')
scfE = float(dfscf['Energy'].iloc[-1])  # last energy before the "Optimized" announcement
geom['E_scf'] = scfE
dfcrd = dfcrd[dfcrd.line > lineno].sort_values('line')
unit = str(dfcrd['Unit'].iloc[0])  # first geometry listed after the "Optimized" announcement
# remove trailing "s" from name of unit
if unit[-1] == 's':
    unit = unit[:-1]
geom['unit'] = unit
crd = dfcrd['Coordinates'].iloc[0]  # last geom before the lowest energy
coords = []
for i, row in crd.iterrows():
    c = [chem.elz(row.Z, 'symbol')] + list(row[['x', 'y', 'z']])
    coords.append(c)
geom['coordinates'] = coords
geom['basis_functions'] = gau.read_nbfn(FGAU)[0][0]
natom = len(coords)
if natom > 1:
    # rotational constants
    dfrot = gau.read_rotational(FGAU)
    dfrot = dfrot[dfrot.line > lineno].sort_values('line')
    rotat = {s: float(x) for s, x in zip(['A','B','C'], dfrot['Rotational Constants'].iloc[0])}
    pg = gau.read_pointgroup(FGAU)['point group'].iloc[-1]  # last PG (freq calc) is probably tightest
    if '*' in pg:
        # linear molecule should have only one rotational constant
        rotat = {'A': rotat['A']}
    rotat['point_group'] = str(pg)
    rotat['unit'] = str(dfrot['Unit'].iloc[0])
    # external symmetry number
    rotat['symmetry_number'] = gau.read_symno(FGAU)
    doc['Rotational'] = rotat

In [97]:
# create formula from list of atoms
formula = chem.formula([x[0] for x in geom['coordinates']])
print('formula:', formula)

formula: C4H8O


In [137]:
atct

Unnamed: 0,Species,Formula,EoF0,EoF298,Unc,MW,ATcT_ID,Hill
0,Dihydrogen,H2,0.000,0.000,-999.0000,2.015880,1333-74-0*0,H2
1,Helium,He,0.000,0.000,-999.0000,4.002602,7440-59-7*0,He
2,Dinitrogen,N2,0.000,0.000,-999.0000,28.013480,7727-37-9*0,N2
3,Dioxygen,O2,0.000,0.000,-999.0000,31.998800,7782-44-7*0,O2
4,Difluorine,F2,0.000,0.000,-999.0000,37.996806,7782-41-4*0,F2
...,...,...,...,...,...,...,...,...
1247,Argon dimer,Ar2,-1.013,-6.347,0.0051,79.896000,12595-59-4*0,Ar2
1248,Helium cation,He+,2372.322,2372.322,0.0000,4.002053,14234-48-1*0,He+
1249,Helium dication,[He]+2,7622.839,7622.839,0.0000,4.001505,12587-46-1*0,He+2
1250,Helium anion,He-,51.000,51.000,130.0000,4.003151,14452-58-5*0,He-


In [99]:
# collect identifiers
identifier = {
    'local': molec,  # the label used internally here
    'formula': formula,
}
if natom > 2:
    try:
        iup = chem.CIRconvert(formula, 'iupac_name')
        if len(iup.split('\n')) == 1:
            identifier['IUPAC'] = iup
    except:
        # conversion presumably failed
        chem.print_err('', f'CIRconvert to IUPAC failed for {formula}', halt=False)
    try:
        identifier['InChI'] = chem.CIRconvert(formula, 'stdinchi').split('=')[1]
    except:
        # conversion presumably failed
        chem.print_err('', f'CIRconvert to InChI failed for {formula}', halt=False)
    try:
        cas = ', '.join(chem.CIRconvert(formula, 'cas').split('\n'))  # multiple values will be spliced into one string
        identifier['CASRN'] = cas
    except:
        # conversion presumably failed
        chem.print_err('', f'CIRconvert to CASRN failed for {formula}', halt=False)



In [100]:
def compress_CASRN(strcas):
    # Return a string of digits (only), extracted from input string
    # Actual CASRN also contains hyphens
    # ATcT identifier has trailing '*0' or similar
    s = strcas.split('*')[0]
    ccas = ''
    for c in s:
        if c in '0123456789':
            ccas += c
    return ccas

In [101]:
cCASRN = compress_CASRN(CASRN)  # for the target, from local file

In [107]:
# collect identifiers
identifier = {
    'local': molec,  # the label used internally here
    'formula': formula,
}
if CASRN:
    identifier['CASRN'] = CASRN
df_atct = atct[(atct.Formula == formula) | (atct.Hill == formula)].copy()
df_atct['ccas'] = [compress_CASRN(casrn) for casrn in df_atct.ATcT_ID]
#display(df_atct)
df_wb = webbook[(webbook.Formula == formula)].copy()
df_wb['ccas'] = [compress_CASRN(casrn) for casrn in df_wb.CASRN]
#display(df_wb)
for source, prefix, df in zip(['WebBook', 'ATcT'], ['WB', 'ATcT'], [df_wb, df_atct]):
    if len(df) == 1:
        identifier[source] = df[prefix+'_ID'].iloc[0]
        identifier['name'] = df['Species'].iloc[0]
    elif len(df) == 0:
        print(f'Notice: "{formula}" not found in {source} list')
    else:
        # more than one
        print(f'Multiple {formula} matches for "{molec}" in {source}--checking for CASRN {CASRN}')
        if cCASRN:
            subdf = df[df.ccas == cCASRN]
            if len(subdf) == 0:
                print('\tNothing matches')
            else:
                df = subdf.copy()
                if len(df) > 1:
                    print('\tMultiple matches for {cCASRN}')
        display(df)
        print('Formula = {:s}'.format(identifier['formula']))
doc['Identifiers'] = identifier
print(identifier)

Multiple C4H8O matches for "etoc2h3" in WebBook--checking for CASRN 109-92-2


Unnamed: 0,Species,Formula,EoF298,Unc,WB_ID,CASRN,Squib,Method,ccas
488,"Ethene, ethoxy-",C4H8O,-140.164,0.96232,C109922,109-92-2,1963PIL/SKI316-330,Cm,109922


Formula = C4H8O
Multiple C4H8O matches for "etoc2h3" in ATcT--checking for CASRN 109-92-2


Unnamed: 0,Species,Formula,EoF0,EoF298,Unc,MW,ATcT_ID,Hill,ccas
1135,Ethoxyethene,CH2CHOCH2CH3,-117.7,-136.2,1.5,72.1057,109-92-2*0,C4H8O,109922


Formula = C4H8O
{'local': 'etoc2h3', 'formula': 'C4H8O', 'CASRN': '109-92-2'}


In [108]:
# collect reference enthalpy of formation
refdata = {}
for source, df in zip(['ATcT', 'WebBook'], [df_atct, df_wb]):
    if len(df) == 1:
        data = {}
        try:
            data['EoF0'] = float(df['EoF0'].iloc[0])
        except:
            if source != 'WebBook':
                print(f'No EoF0 found in {source}')
        try:
            data['source'] = str(df['Squib'].iloc[0])
        except:
            # ATcT source cannot be further specified
            pass
        try:
            data['comment'] = str(df['Method'].iloc[0])
        except:
            # ATcT method cannot be further specified
            pass
        data['EoF298'] = float(df['EoF298'].iloc[0])
        data['unc'] = float(df['Unc'].iloc[0])
        data['unit'] = 'kJ/mol'  # for both WebBook and ATcT
        refdata[source] = data
# look for spin-orbit correction
subsoc = soc[soc.Formula == formula]
if len(subsoc == 1):
    socdata = {'value': float(subsoc['Value'].iloc[0])}
    socdata['unc'] = float(subsoc['Unc'].iloc[0])
    socdata['unit'] = subsoc['Unit'].iloc[0]
    refdata['spin_orbit'] = socdata
doc['Refdata'] = refdata

In [109]:
# harmonic vibrational frequencies
if natom > 1:
    doc['Frequencies'] = gau.read_freqs(FGAU)[-1][2]['Freq'].values.tolist()
    # put the number of imaginary frequencies in the geometry block
    geom['nimag'] = int(np.sum(np.array(doc['Frequencies']) < 0))
    print(geom['nimag'])
    doc['Geometry'] = geom

0


In [110]:
FGAU.close()
# read single-point CCSD(T)-F12b energy 
regx = re.compile(r'CCSD\(T\)-F12b energy')
rx_nbf = re.compile(r'NUMBER OF CONTRACTIONS:')
rx_hf = re.compile(r'HF STATE\s*\d+\.\d Energy')
with open(fpro, 'r') as FPRO:
    for line in FPRO:
        if regx.search(line):
            w = line.split()
            ecc = float(w[-1])
        if rx_nbf.search(line):
            w = line.split()
            nbf = int(w[3])
        if rx_hf.search(line):
            w = line.split()
            ehf = float(w[-1])
energy = {'CCSD(T)-F12b': ecc, 'basis_functions': nbf, 'HF': ehf}
doc['Energy'] = energy

In [127]:
# Detect and add functional groups
G = chem.Geometry(geom['coordinates'])
G.spinmult = int(mult)
fungroups = G.find_functional_group(spin=True)
doc['Functional_groups'] = {k: str(v) for k, v in fungroups.items()}

In [128]:
# save to YAML file
fout = os.sep.join([DATA, molec + '.yml'])
with open(fout, 'w') as F:
    F.write(yaml.dump(doc))
print('YAML file {:s} created.'.format(fout))

YAML file molec_data\etoc2h3.yml created.


In [133]:
# Read it back, as a check
ydat = mrx.read_molec_yaml(molec)
ydat

{'Charge': 0,
 'Energy': {'CCSD(T)-F12b': -232.134313983044,
  'HF': -231.059686617081,
  'basis_functions': 409},
 'Frequencies': [102.5694,
  211.7058,
  229.5532,
  274.9195,
  389.3822,
  608.4256,
  716.5658,
  831.122,
  845.5862,
  859.4505,
  985.8119,
  1000.2239,
  1051.0457,
  1136.4019,
  1182.1848,
  1236.8372,
  1300.2236,
  1354.9634,
  1402.3189,
  1424.805,
  1450.5496,
  1484.7706,
  1500.5604,
  1521.7051,
  1677.4945,
  3002.4324,
  3034.9799,
  3042.7226,
  3108.756,
  3114.6793,
  3172.5802,
  3185.4675,
  3258.3418],
 'Functional_groups': {'C-C': [(2, 4)],
  'C=C': [(1, 3)],
  'ether': [(1, 0, 2)],
  'methyl': [(4, 10, 11, 12)]},
 'Geometry': {'E_scf': -232.535068238,
  'basis_functions': 262,
  'command': 'b3lyp/gen opt freq',
  'comment': 'ethyl vinyl ether, B3LYP/pcseg-2',
  'coordinates': [['O', 0.017454, 0.677844, 0.0],
   ['C', -1.325374, 0.519774, 0.0],
   ['C', 0.823721, -0.499965, 0.0],
   ['C', -2.024876, -0.613478, 0.0],
   ['C', 2.27692, -0.078463, 0.