In [24]:
# create YAML data file for molecule
import yaml, re, sys, os
import pandas as pd
import numpy as np

sys.path.insert(0, '../atomic_SOC')
import molpro_subs as mpr
import chem_subs as chem
sys.path.insert(0, '../karlib')
import gaussian_subs as gau
import multirx_subs as mrx

In [25]:
# Sub-directory names.  All but JUNK are taken from multirx_subs.
GDIR, EDIR = mrx.GDIR, mrx.EDIR    # Gaussian files (geom opt, harmonic freqs); Molpro files (single-point CCSD(T)-F12)
REFD, DATA = mrx.REFDAT, mrx.MDAT  # reference data; processed molecular YAML files
JUNK = 'discard/energysp'          # CC outputs rejected because of old filename format

In [26]:
# Import reference data; every table is a file inside the REFD directory
def _ref_path(fname):
    """Return the path of `fname` inside the reference-data directory REFD."""
    return os.sep.join([REFD, fname])

atct = pd.read_csv(_ref_path('ATcT_1p122r_gases.tsv'), sep='\t')
webbook = pd.read_csv(_ref_path('gas-enthalpies_webbook.tsv'), sep='\t')
soc = pd.read_excel(_ref_path('spin_orbit_correction.xlsx'), skiprows=1)
dflabel = pd.read_csv(_ref_path('label_meanings.tsv'), sep='\t')
# put SOC formulas into standard order, so they can be compared with
# formulas produced by chem.formula()
soc['Formula'] = [chem.formula(chem.formula_to_atomlist(species))
                  for species in soc['Species']]

In [27]:
# Label of the molecule to process.  It selects the row of the label table
# and is the stem of the Gaussian (.out), Molpro (.pro) and YAML (.yml) file names.
molec = 'cf3cl'

In [28]:
# show the label-table entry for this molecule and keep its CAS number
label_rows = dflabel[dflabel.Label == molec]
display(label_rows)
CASRN = label_rows.CASRN.values[0]  # first matching row

Unnamed: 0,Label,Name,CASRN
172,cf3cl,chlorotrifluoromethane,75-72-9


In [29]:
# get charge and spin multiplicity from Gaussian geom/freq file
# (an older naming scheme was './geomfreq/<label>_geom.out' and './energysp/<label>_cc.pro')
fgau = os.sep.join([GDIR, '{:s}.out'.format(molec)])
fpro = os.sep.join([EDIR, '{:s}.pro'.format(molec)])
# FGAU is deliberately left open: the following cells keep reading from it
# until it is closed explicitly
FGAU = open(fgau)
df = gau.read_charge_mult(FGAU)
# use the last charge/multiplicity pair reported in the file
charge, mult = (df[column].iloc[-1] for column in ('Charge', 'Mult'))
# start the document that will be written to YAML
doc = {'Charge': int(charge), 'Spin_mult': int(mult)}

In [30]:
# collect meta-information about computed geometry
# command: text of the first command entry that follows its first '#'
cmd = str(gau.read_command(FGAU).at[0, 'Command']).split('#', 1)[1].strip()
comment = str(gau.read_comments(FGAU).at[0, 'Comment'])
geom = {'command': cmd, 'comment': comment}
rev = gau.read_version(FGAU)
vers = 'Gaussian{:s} {:s}'.format(rev[0], rev[2])
geom['software'] = vers
# get ROHF/cc-pVTZ-F12 energy used to cross-check with Molpro
dfscf = gau.read_scfe(FGAU)
# float() converts the array element to a plain Python float, as is done for
# the other numbers stored in the document, so that it is dumped to YAML as
# an ordinary number
rohf = float(dfscf.loc[dfscf.Method == 'ROHF', 'Energy'].values[0])
geom['HF_check'] = rohf

In [31]:
def find_line_number(file, search_string):
    """Return a list of line numbers for lines that include the search string.

    Parameters
    ----------
    file : str, path-like, or open text file object
        A path is opened here and closed again before returning.
        A file object is rewound to its beginning (when it supports seeking)
        so that the line numbers refer to the whole file; it is left
        open and positioned at end-of-file.
    search_string : str
        Plain substring to look for (not a regular expression).

    Returns
    -------
    list of int
        0-based line numbers, in file order; empty if nothing matches.
    """
    def _scan(handle):
        return [i for i, line in enumerate(handle) if search_string in line]

    try:
        F = open(file, 'r')
    except TypeError:
        # hopefully already a file object
        try:
            # count lines from the top, regardless of earlier reads
            file.seek(0)
        except (AttributeError, OSError, ValueError):
            # not seekable: scan from the current position
            pass
        return _scan(file)
    # the file was opened here, so close it here
    with F:
        return _scan(F)

In [32]:
# find the optimized geometry, assuming an SCF method
# 'lineno' is the first line containing "Stationary point found": the energy is
# taken from before it, the geometry and rotational constants from after it
lineno = find_line_number(FGAU, 'Stationary point found')[0]
dfscf = gau.read_scfe(FGAU)
dfcrd = gau.read_std_orient(FGAU)

# last energy before the stationary-point announcement
dfscf = dfscf[dfscf.line < lineno].sort_values('line')
scfE = float(dfscf['Energy'].iloc[-1])
geom['E_scf'] = scfE

# first geometry listed after the stationary-point announcement
dfcrd = dfcrd[dfcrd.line > lineno].sort_values('line')
unit = str(dfcrd['Unit'].iloc[0])
# remove trailing "s" from name of unit
unit = unit[:-1] if unit[-1] == 's' else unit
geom['unit'] = unit
crd = dfcrd['Coordinates'].iloc[0]
# one [element symbol, x, y, z] entry per atom
coords = [[chem.elz(atom.Z, 'symbol')] + list(atom[['x', 'y', 'z']])
          for _, atom in crd.iterrows()]
geom['coordinates'] = coords
geom['basis_functions'] = gau.read_nbfn(FGAU)[0][0]

natom = len(coords)
if natom > 1:
    # rotational constants: first set reported after the announcement
    dfrot = gau.read_rotational(FGAU)
    dfrot = dfrot[dfrot.line > lineno].sort_values('line')
    rotat = dict(zip('ABC', map(float, dfrot['Rotational Constants'].iloc[0])))
    pg = gau.read_pointgroup(FGAU)['point group'].iloc[-1]  # last PG (freq calc) is probably tightest
    if '*' in pg:
        # linear molecule should have only one rotational constant
        rotat = {'A': rotat['A']}
    rotat['point_group'] = str(pg)
    rotat['unit'] = str(dfrot['Unit'].iloc[0])
    # external symmetry number
    rotat['symmetry_number'] = gau.read_symno(FGAU)
    doc['Rotational'] = rotat

In [34]:
# harmonic vibrational frequencies
if natom > 1:
    # 'Freq' column of the last frequency table found in the file
    doc['Frequencies'] = gau.read_freqs(FGAU)[-1][2]['Freq'].values.tolist()
    # put the number of imaginary frequencies in the geometry block
    # (imaginary frequencies are counted as negative values)
    geom['nimag'] = int(np.sum(np.array(doc['Frequencies']) < 0))
    print(geom['nimag'])
# The geometry block does not depend on having frequencies, so store it for
# every species (it used to be stored only when natom > 1).
doc['Geometry'] = geom

0


In [35]:
# done with the Gaussian output file
FGAU.close()
# read single-point CCSD(T)-F12b energy, the number of basis functions and
# the HF energy from the Molpro output file
regx = re.compile(r'CCSD\(T\)-F12b energy')
rx_nbf = re.compile(r'NUMBER OF CONTRACTIONS:')
rx_hf = re.compile(r'HF STATE\s*\d+\.\d Energy')
# Start from "not found".  Without this, a missing line would either raise a
# NameError or silently reuse a value left in the kernel by an earlier run.
ecc = nbf = ehf = None
with open(fpro, 'r') as FPRO:
    for line in FPRO:
        # when a pattern occurs more than once, the last occurrence wins
        if regx.search(line):
            w = line.split()
            ecc = float(w[-1])
        if rx_nbf.search(line):
            w = line.split()
            nbf = int(w[3])  # fourth word, i.e. the one after 'CONTRACTIONS:'
        if rx_hf.search(line):
            w = line.split()
            ehf = float(w[-1])
missing = [label for label, value in
           (('CCSD(T)-F12b', ecc), ('basis_functions', nbf), ('HF', ehf))
           if value is None]
if missing:
    raise ValueError('{:s} not found in {:s}'.format(', '.join(missing), fpro))
energy = {'CCSD(T)-F12b': ecc, 'basis_functions': nbf, 'HF': ehf}
doc['Energy'] = energy

In [36]:
# inspect the document assembled so far (charge, multiplicity, rotational data,
# frequencies, geometry and energies)
doc

{'Charge': 0,
 'Spin_mult': 1,
 'Rotational': {'A': 5.6941225,
  'B': 3.2814068,
  'C': 3.2814068,
  'point_group': 'C3V',
  'unit': 'GHZ',
  'symmetry_number': 3},
 'Frequencies': [342.0463,
  342.0465,
  460.5266,
  555.1357,
  555.1362,
  773.6733,
  1075.374,
  1189.4556,
  1189.4577],
 'Geometry': {'command': 'b3lyp/gen opt freq CPHF(Grid=OneStep)',
  'comment': 'CF3Cl, B3LYP/pcseg-2',
  'software': 'Gaussian16 B.01',
  'HF_check': -795.86436156,
  'E_scf': -798.018040724,
  'unit': 'Angstrom',
  'coordinates': [['C', -0.0, -0.0, -0.35312],
   ['Cl', -0.0, -0.0, 1.418259],
   ['F', 0.0, 1.247889, -0.814507],
   ['F', 1.080704, -0.623945, -0.814507],
   ['F', -1.080704, -0.623944, -0.814507]],
  'basis_functions': 154,
  'nimag': 0},
 'Energy': {'CCSD(T)-F12b': -797.16294807079,
  'basis_functions': 274,
  'HF': -795.864361923957}}

In [37]:
# create formula from list of atoms
# (the first item of every coordinate entry is the element symbol)
atom_symbols = [atom[0] for atom in geom['coordinates']]
formula = chem.formula(list(atom_symbols))
hill = chem.formula(list(atom_symbols), Hill=True)
for tag, text in (('formula:', formula), ('Hill   :', hill)):
    print(tag, text)

formula: CClF3
Hill   : CClF3


In [38]:
# look at the ATcT reference table (pandas abbreviates the display of long tables)
atct

Unnamed: 0,Species,Formula,EoF0,EoF298,Unc,MW,ATcT_ID,Hill
0,Dihydrogen,H2,0.000,0.000,-999.0000,2.015880,1333-74-0*0,H2
1,Helium,He,0.000,0.000,-999.0000,4.002602,7440-59-7*0,He
2,Dinitrogen,N2,0.000,0.000,-999.0000,28.013480,7727-37-9*0,N2
3,Dioxygen,O2,0.000,0.000,-999.0000,31.998800,7782-44-7*0,O2
4,Difluorine,F2,0.000,0.000,-999.0000,37.996806,7782-41-4*0,F2
...,...,...,...,...,...,...,...,...
1247,Argon dimer,Ar2,-1.013,-6.347,0.0051,79.896000,12595-59-4*0,Ar2
1248,Helium cation,He+,2372.322,2372.322,0.0000,4.002053,14234-48-1*0,He+
1249,Helium dication,[He]+2,7622.839,7622.839,0.0000,4.001505,12587-46-1*0,He+2
1250,Helium anion,He-,51.000,51.000,130.0000,4.003151,14452-58-5*0,He-


In [39]:
# collect identifiers from chem.CIRconvert()
# NOTE: 'identifier' is built again from the local reference tables in a later
# cell of this notebook, which replaces whatever is collected here.
identifier = {
    'local': molec,  # the label used internally here
    'formula': formula,
}
if natom > 2:
    # Each lookup is best-effort: a failure is reported and the notebook goes on.
    # 'except Exception' (not a bare 'except') keeps a kernel interrupt working
    # while a lookup is in progress.
    try:
        iup = chem.CIRconvert(formula, 'iupac_name')
        # accept only an unambiguous (single-line) answer
        if len(iup.split('\n')) == 1:
            identifier['IUPAC'] = iup
    except Exception:
        # conversion presumably failed
        chem.print_err('', f'CIRconvert to IUPAC failed for {formula}', halt=False)
    try:
        # keep the part after the first '='
        identifier['InChI'] = chem.CIRconvert(formula, 'stdinchi').split('=')[1]
    except Exception:
        # conversion presumably failed
        chem.print_err('', f'CIRconvert to InChI failed for {formula}', halt=False)
    try:
        cas = ', '.join(chem.CIRconvert(formula, 'cas').split('\n'))  # multiple values will be spliced into one string
        identifier['CASRN'] = cas
    except Exception:
        # conversion presumably failed
        chem.print_err('', f'CIRconvert to CASRN failed for {formula}', halt=False)



In [40]:
def compress_CASRN(strcas):
    """Return only the decimal digits of a CAS-number-like string.

    An actual CASRN also contains hyphens, and an ATcT identifier has a
    trailing '*0' or similar.  Everything from the first '*' onward is
    discarded, then every character other than 0-9 is dropped.
    """
    head = strcas.split('*')[0]
    return ''.join(ch for ch in head if ch in '0123456789')

In [41]:
# digits-only form of the CASRN, for comparison with identifiers that are
# formatted differently
cCASRN = compress_CASRN(CASRN)  # for the target, from local file

In [42]:
# collect identifiers from the local reference tables
identifier = {
    'local': molec,  # the label used internally here
    'formula': formula,
}
if CASRN:
    identifier['CASRN'] = CASRN
# candidate rows: same formula (ATcT is also matched on its Hill-order formula)
df_atct = atct[(atct.Formula == formula) | (atct.Hill == formula)].copy()
df_atct['ccas'] = [compress_CASRN(casrn) for casrn in df_atct.ATcT_ID]
#display(df_atct)
df_wb = webbook[(webbook.Formula == formula)].copy()
df_wb['ccas'] = [compress_CASRN(casrn) for casrn in df_wb.CASRN]
#display(df_wb)
for source, prefix, df in zip(['WebBook', 'ATcT'], ['WB', 'ATcT'], [df_wb, df_atct]):
    if len(df) == 1:
        # unambiguous match: record the source's ID and species name
        # (sources are processed in order, so the name from ATcT, when present,
        # replaces the one from WebBook)
        identifier[source] = df[prefix+'_ID'].iloc[0]
        identifier['name'] = df['Species'].iloc[0]
    elif len(df) == 0:
        print(f'Notice: "{formula}" not found in {source} list')
    else:
        # more than one
        print(f'Multiple {formula} matches for "{molec}" in {source}--checking for CASRN {CASRN}')
        if cCASRN:
            subdf = df[df.ccas == cCASRN]
            if len(subdf) == 0:
                print('\tNothing matches')
            else:
                df = subdf.copy()
                if len(df) > 1:
                    # f-string: the braces used to be printed literally
                    print(f'\tMultiple matches for {cCASRN}')
        # NOTE(review): the candidates are only displayed here; no identifier is
        # recorded in this branch, even if the CASRN narrows them to one row --
        # confirm that manual inspection is what is intended
        display(df)
        print('Formula = {:s}'.format(identifier['formula']))
doc['Identifiers'] = identifier
print(identifier)

{'local': 'cf3cl', 'formula': 'CClF3', 'CASRN': '75-72-9', 'WebBook': 'C75729', 'name': 'Chlorotrifluoromethane', 'ATcT': '75-72-9*0'}


In [43]:
# collect reference enthalpy of formation
refdata = {}
for source, df in zip(['ATcT', 'WebBook'], [df_atct, df_wb]):
    # only an unambiguous (single-row) match is used
    if len(df) == 1:
        data = {}
        try:
            data['EoF0'] = float(df['EoF0'].iloc[0])
        except (KeyError, TypeError, ValueError):
            # column absent or value not numeric; not reported for WebBook
            if source != 'WebBook':
                print(f'No EoF0 found in {source}')
        try:
            data['source'] = str(df['Squib'].iloc[0])
        except KeyError:
            # ATcT source cannot be further specified
            pass
        try:
            data['comment'] = str(df['Method'].iloc[0])
        except KeyError:
            # ATcT method cannot be further specified
            pass
        data['EoF298'] = float(df['EoF298'].iloc[0])
        data['unc'] = float(df['Unc'].iloc[0])
        data['unit'] = 'kJ/mol'  # for both WebBook and ATcT
        refdata[source] = data
# look for spin-orbit correction
subsoc = soc[soc.Formula == formula]
# The test used to be `len(subsoc == 1)`, which is just the number of rows and
# therefore true for any non-empty match.
if len(subsoc) == 1:
    socdata = {'value': float(subsoc['Value'].iloc[0])}
    socdata['unc'] = float(subsoc['Unc'].iloc[0])
    socdata['unit'] = subsoc['Unit'].iloc[0]
    refdata['spin_orbit'] = socdata
elif len(subsoc) > 1:
    print(f'Notice: {len(subsoc)} spin-orbit entries match "{formula}"; none was used')
doc['Refdata'] = refdata

In [44]:
# Detect and add functional groups
# build a chem.Geometry from the stored coordinates and give it the spin multiplicity
G = chem.Geometry(geom['coordinates'])
G.spinmult = int(mult)
fungroups = G.find_functional_group(spin=True)
# store every value as a string
doc['Functional_groups'] = dict(
    (group, str(found)) for group, found in fungroups.items()
)

In [None]:
# save to YAML file: <DATA>/<label>.yml
fout = os.sep.join([DATA, '{:s}.yml'.format(molec)])
with open(fout, 'w') as F:
    yaml_text = yaml.dump(doc)
    F.write(yaml_text)
print(f'YAML file {fout:s} created.')

In [None]:
# Read it back, as a check
# (loaded through mrx.read_molec_yaml() using the molecule label, then displayed)
ydat = mrx.read_molec_yaml(molec)
ydat

In [None]:
def _words_equivalent(inwords, outwords):
    # True if two tokenized lines have the same number of words and agree word
    # by word; words that differ as text still agree if both are numbers of
    # equal value (so '-0.0' matches '0.0').
    if len(inwords) != len(outwords):
        return False
    for iw, ow in zip(inwords, outwords):
        if iw == ow:
            continue
        try:
            if float(iw) == float(ow):
                continue
        except ValueError:
            # at least one of the words is not a number
            pass
        return False
    return True


def compare_pro_in(inpro, outpro):
    """Return True if output file 'outpro' was generated using
    the same input as shown in 'inpro'.

    Every line of 'inpro' (blank lines included) must be matched by some line
    of 'outpro' that precedes the first line containing
    'PROGRAM SYSTEM MOLPRO'; the rest of 'outpro' is not read.
    The order of the lines is not checked.

    Comparison is word by word within each line.
    Special: in comment, '***,' input may be '***, ' output
    Special: floating '-0.0' should match '0.0'

    Parameters
    ----------
    inpro : str
        path of the input file
    outpro : str
        path of the output file

    Returns
    -------
    bool
    """
    def _tokenize(line):
        # split a line into words, ignoring a leading '***,' marker
        ln = line.strip()
        if ln[:4] == '***,':
            ln = ln[4:].strip()
        return ln.split()

    with open(inpro, 'r') as F:
        inbuf = [_tokenize(line) for line in F]
    with open(outpro, 'r') as F:
        for line in F:
            if 'PROGRAM SYSTEM MOLPRO' in line:
                break
            if not inbuf:
                # every input line is already accounted for
                break
            wrds = _tokenize(line)
            # drop every input line that this output line accounts for
            inbuf = [inline for inline in inbuf
                     if not _words_equivalent(inline, wrds)]
    return len(inbuf) == 0

In [None]:
# try compare_pro_in() on one molecule
testmol = 'hco2'
fin = os.sep.join([EDIR, testmol + '.in'])
# Use a separate name for the test output file: 'fpro' holds the Molpro output
# of the target molecule 'molec' and is read by the energy cell above, so it
# must not be redirected to the test molecule.
fpro_test = os.sep.join([EDIR, testmol + '.pro'])
compare_pro_in(fin, fpro_test)