In [None]:
# Extract SO-CI information from MOLPRO outputs for atoms
#   Read exptl data from Excel file, combine with weights to get E_so
#   The experimental Excel file is generated by get_NIST_atomic_data.ipynb
# More robust J assignments (attempted)
# KKI 8/30/2024
# Working for difficult Ta case (9/25/2024)
#    (1) get it working for even-electron case (Kr test)
#    (2) transparent handling of missing exptl levels in eq. (
import re, sys, os
import numpy as np
import pandas as pd
from collections import Counter
#import random
#import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

import chem_subs as chem
#import molpro_subs as mpr
import molpro_subs2 as m2

pd.set_option('display.max_rows', None)
np.set_printoptions(suppress=True)

In [None]:
# Display formats for DataFrame columns, keyed by column name
fmt = {'Eshift': '{:.1f}', 'degen': '{:d}'}
# columns shown with one decimal place (same format as 'Eshift')
one_decimal = ['J', 'Ecalc', 'E_dif', 'Erel', 'Eshift', 'err', 'Eterm', 'cm-1', 'S',
               'wmean', 'wstds', 'uwmean', 'uwstds', 'change', 'rwmse', 'Erel_spread',
               'Edav_range_cm']
fmt.update(dict.fromkeys(one_decimal, fmt['Eshift']))
# columns shown with two decimal places
two_decimal = ['dif', 'Theory', 'ecm', 'SOC', 'RMSE']
fmt.update({name: '{:.2f}' for name in two_decimal})
fmt['weight'] = '{:.6f}'

### Specify Molpro SO-CI output file

In [None]:
# Name of the MOLPRO output file to analyze. A later cell takes the text before
# the first '_' as the element symbol, so the name must start with "<element>_".
#fname = 'Mo_S1Q18_acvqz-pp.out'  # degenerate levels are a problem
fname = 'at_2PDPS4PDS_dactzpp.pro'
#fname = 'Tc_6SQ26_acvt-pp.out'

In [None]:
# my atom subdirectory names look like "Ar_I" (for neutral argon)
el = fname.split('_')[0].capitalize()
# Root directory that holds one "<El>_I" subdirectory per atom.
# Set the environment variable ATOMIC_SOC_DIR to use a different location
# (e.g. r'C:\Users\dagbaglo\Desktop\So-ci_energy') without editing this cell.
calc_root = os.environ.get(
    'ATOMIC_SOC_DIR',
    r'C:\Users\irikura\OneDrive - NIST\Karl\atomic_SOC\calculations')
fdir = os.sep.join([calc_root, '{:s}_I'.format(el)])

In [None]:
# Full path of the MOLPRO output file
fsoc = os.sep.join((fdir, fname))
print('Reading MOLPRO file')
print(fsoc)

In [None]:
# Split the output file into its major sections and report how many
# text blocks were found for each
major_sections, linenos = m2.identify_sections(fsoc)
print('Major sections:')
for secname, blocks in major_sections.items():
    print(f'   {secname:<11s}   {len(blocks)} text blocks')

In [None]:
# section "header" (last text block)
basisset = m2.basisset_name(major_sections['header'][-1])
# section "integrals" (last text block)
integrals_text = major_sections['integrals'][-1]
PG = m2.point_group(integrals_text)
print(f'Computational point group = {PG}')
if PG != 'Ci':
    chem.print_err('', 'Ci point group is required for this analysis')
nprim = m2.nbf_primitive(integrals_text)
nbf = m2.nbf(integrals_text)
print(f'{basisset} basis set')
print(f'    {nprim} primitives')
print(f'    {nbf} contracted basis functions')
# the atom read from the file must agree with the element in the filename
crd = m2.coordinates(integrals_text)
atom = crd[-1]['el']
if atom != el:
    chem.print_err('', f'This looks like the wrong atom ({atom}) for the filename ({el})')
Qtot = m2.nuclear_charge_total(integrals_text)
print(f'Atom "{atom}" with nuclear charge = {Qtot}')
# a nuclear charge smaller than Z is reported as a pseudopotential
Zel = chem.elz(atom, 'Z')
if Zel > Qtot:
    print(f'    pseudopotential replaces {Zel - Qtot} core electrons')

In [None]:
# section "rhf" (optional; a missing section is reported, not fatal)
try:
    rhf_text = major_sections['rhf'][-1]
    occup_hf = m2.hf_occup(rhf_text)
    print('HF occupations: ', occup_hf)
    hf_results = m2.scf_result('RHF', rhf_text)
    print('HF energy = {:.6f} for state {:s}'.format(hf_results['E'], hf_results['Label']))
    orbtitle, dfHForb = m2.parse_orbitals(rhf_text)
    # 'alpha/beta' occupations count two electrons each, all others one
    nel_HF = sum((2 if otype == 'alpha/beta' else 1) * sum(occs)
                 for otype, occs in occup_hf.items())
    print(f'HF has {nel_HF} electrons (charge = {Qtot - nel_HF})')
    print(orbtitle)
    m2.color_by_orb(dfHForb)
except KeyError:
    print('No Hartree-Fock step found')
    nel_HF = 0

In [None]:
# break section "multi" into sub-sections
# (only the last "multi" text block is used)
multisec = m2.multi_sections(major_sections['multi'][-1])
# show the names of the sub-sections that were found
multisec.keys()

In [None]:
# Parse the MULTI sub-sections
# -- "top": dynamic weighting, orbital spaces, state symmetries
top_text = multisec['top'][-1]
dynfac = m2.get_dynfac(top_text)
orbspace = m2.orbital_spaces(top_text)
statesym = m2.state_symmetry_groups(top_text)
# -- "iterations": convergence, state weights, iteration table
iter_text = multisec['iterations'][-1]
convergence = m2.multi_convergence(iter_text)
weights = m2.multi_weights(iter_text)
dfiter = m2.multi_iterations(iter_text)
# -- "results": state energies
dfstates = m2.multi_results(multisec['results'])
# -- "trans": expectation values and transition moments
trans_text = multisec['trans'][-1]
dfexpec = m2.multi_expec(trans_text)
dftrans = m2.multi_transmom(trans_text)
# -- natural orbitals and CI vectors
orbtitle, dfNO = m2.parse_orbitals(multisec['natorb'][-1])
ddfcivec, dEcas = m2.multi_civecs(multisec['civector'])

In [None]:
# Report the CASSCF active space, electron count, states and weights
nel_CAS = nactel = statesym[0]['nelec']
nactorb = sum(orbspace['active'])
print(f'CASSCF active space is ({nactel}/{nactorb}) with active orbitals {orbspace["active"]}')
# inactive spaces contribute two electrons per orbital to the total count
for key, label in (('closed-shell', 'closed'), ('frozen', 'frozen')):
    if key in orbspace:
        print(f'    {label} orbitals are {orbspace[key]}')
        nel_CAS += 2 * sum(orbspace[key])
    else:
        print(f'    There are no "{label}" orbitals')
print(f'    charge = {Qtot - nel_CAS}')
# Count the states, in total and per spin multiplicity
mult_count = {}
ncas = 0
for st in statesym:
    mult = st['spin']
    mult_count[mult] = st['nstates'] + mult_count.get(mult, 0)
    ncas += st['nstates']
print(f'{ncas} CASSCF states:')
for mult, n in mult_count.items():
    print(f'   {n:3d} {mult}')

# Show the state weights, renormalized for reading convenience
print('CASSCF relative state weights (subject to rounding error):')
uweights = m2.unnormalize_cas_weights(weights)
for wts in uweights.values():
    print('    ', np.round(wts, 1))
    # warn when some state is weighted more heavily than the first one listed
    if max(wts) > wts[0] + 0.01:
        print('    *** putting a heavy weight on an excited state can cause trouble ***')

# Are <L**2> values clean?
ilsq = np.rint(dfexpec['L**2'])
maxdev = np.abs(ilsq - dfexpec['L**2']).max()
if maxdev:
    print(f'Largest deviation of <L**2> from integer = {maxdev:.1e}')
else:
    print('Values of <L**2> are clean')
# Compare with Hartree-Fock only when both describe the same number of electrons
# (nel_HF is 0 when no HF step was found, so hf_results is not touched then)
if nel_CAS == nel_HF:
    CAS_rel_HF = dfstates.E.min() - hf_results['E']
    print(f'For the ground state, [E(CASSCF) - E(HF)] = {CAS_rel_HF:.6f}')
    if CAS_rel_HF >= 0:
        print('   *** this difference is usually negative')
        print('   *** consider using a heavier CASSCF weight on the ground term')
elif nel_HF > 0:
    print(f'CASSCF and HF have different numbers of electrons ({nel_CAS} and {nel_HF})')
print()
print(orbtitle)
orb_styler = m2.color_by_orb(dfNO)

In [None]:
# Optional debugging dump of everything parsed from the MULTI output.
# Disabled by default; change the condition to True to inspect the parsed data.
if False:
    # print results from parsing MULTI output
    print(f'DYNW = {dynfac}')
    print('Spaces: ', orbspace)
    print('CASSCF state groups:')
    for g in statesym:
        print('   ', g)
    print(convergence)
    print('CASSCF state weights:')
    for k, v in weights.items():
        print(f'  {k:>2s}: ', v)
    display(dfiter)
    display(dfstates)
    display(dfexpec)
    # one transition-moment table per operator
    for op, df in dftrans.items():
        print(f'Operator {op}')
        display(df)
    print(orbtitle)
    display(dfNO)
    # first rows of each CI-vector table, with the matching entry of dEcas
    for k, df in ddfcivec.items():
        print(k, dEcas[k])
        display(df.head())

In [None]:
# Negative <L**2> values are unphysical; report them and zero out the ones
# that are numerically negligible (|value| < 1e-5).
# Boolean masks are used so that selection does not depend on the row labels.
neg_mask = dfexpec['L**2'] < 0
ineg = np.argwhere(neg_mask)  # positions of negative values
if neg_mask.any():
    print('*** Negative values of <L**2> ***')
    display(dfexpec[neg_mask])
    small_mask = dfexpec['L**2'].abs() < 1.e-5
    if small_mask.any():
        print('Setting small values to zero')
        dfexpec.loc[small_mask, 'L**2'] = 0

In [None]:
# Negative <LZLZ> values are unphysical; report them and zero out the ones
# that are numerically negligible (|value| < 1e-5).
# Boolean masks are used so that selection does not depend on the row labels.
neg_mask = dfexpec['LZLZ'] < 0
ineg = np.argwhere(neg_mask)  # positions of negative values
if neg_mask.any():
    print('*** Negative values of <LZLZ> ***')
    display(dfexpec[neg_mask])
    small_mask = dfexpec['LZLZ'].abs() < 1.e-5
    if small_mask.any():
        print('Setting small values to zero')
        dfexpec.loc[small_mask, 'LZLZ'] = 0

In [None]:
# Summarize CASSCF results: one row per state
dfcas = dfstates[['Label', 'irrep', 'E']].copy()
# spin quantum number of every state, in the order of the state-symmetry groups
Svals = [chem.MULTSPIN[g['spin']] for g in statesym for _ in range(g['nstates'])]
dfcas.insert(2, 'S', Svals)
dfcas['L**2'] = dfexpec['L**2']
dfcas['LZ'] = np.sqrt(dfexpec['LZLZ'])
# integer truncation of sqrt(<L**2>); presumably <L**2> = L(L+1), which gives L
dfcas['L'] = np.sqrt(dfexpec['L**2']).astype(int)
# term symbols; parity is +1 for irrep 1 and -1 for irrep 2
dfcas['term'] = [chem.term_symbol(L, S, 3 - 2*irr, linear=False)
                 for S, L, irr in zip(dfcas.S, dfcas.L, dfcas.irrep)]
print('CASSCF states')
dfcas

In [None]:
# Group the CASSCF states into atomic terms
dfcasterm = m2.collect_atomic_terms(dfcas, 'E')
nterm = len(dfcasterm)
print(f'There are {ncas} CASSCF states in {nterm} terms')
# Add the J values that each term can give rise to
dfcasterm['J_vals'] = [chem.possible_J_from_term(trm) for trm in dfcasterm['term']]
display(dfcasterm)

In [None]:
# Parse MRCI results and summarize in DataFrame
mrci_frames = []  # one DataFrame per MRCI calculation; concatenated once below
for imrci, sec in enumerate(major_sections['mrci']):
    print(f'MRCI calculation #{imrci+1}')
    mrcisec = m2.mrci_sections(sec)
    mrci_meta = m2.mrci_info(mrcisec['top'][0])
    mrci_iter = m2.mrci_iterations(mrcisec['iterations'][0])
    mrci_results = m2.mrci_results(mrcisec['results'][0])
    nstate = len(mrci_results['state'])
    print(f'    {mrci_meta["smult"]}, irrep {mrci_meta["irrep"]}')
    print(f'    {nstate} states')
    # Report on orbital spaces in the MRCI
    print('    orbital spaces, by irrep')
    for sp in ['core', 'closed', 'active', 'external']:
        print('\t{:10s} {}'.format(sp, mrci_meta['spaces'].get(sp, [])))
    lbll =  []  # list of state labels
    c0rot = []  # list of C0 (rotated) values
    El =    []  # list of energies
    davl =  []  # list of Davidson-corrected energies (rotated ref)
    erefl = []  # list of reference energies
    spinmult = mrci_meta['smult']
    S = chem.MULTSPIN[spinmult]
    irrep = mrci_meta['irrep']
    for lbl, v in mrci_results['state'].items():
        lbll.append(lbl)
        try:
            c0rot.append(v['C0']['rotated'])
            davl.append(v['Energy']['davidson']['rotated'])
        except KeyError:
            # no "rotated" values if there is only one state
            c0rot.append(v['C0']['relaxed'])
            davl.append(v['Energy']['davidson']['relaxed'])
        El.append(v['Energy']['total'])
        erefl.append(v['Energy']['ref E'])

    # Get CASSCF (fixed) term composition of each MRCI state
    fixref = m2.coefficients_of_refs(mrcisec['results'][0])  # coeffs of CASSCF refs
    subcas = dfcas[dfcas.S == S]
    if nstate == 1:
        # 'fixref' is [0] because that text block is not available
        # just assign 100% to the only CASSCF reference state
        ilead = np.array([0])
        fixref = np.array([])  # empty, so the loop below is skipped
        casterm = subcas.iloc[0]['term']
        cascomp = [{casterm: 1.}]
    else:
        # one leading CASSCF reference per MRCI state (row of fixref)
        ilead = np.zeros(fixref.shape[0], dtype=int)
        cascomp = []
    for ist in range(fixref.shape[0]):
        # leading reference = largest |coefficient| in this row
        ilead[ist] = np.argmax(np.abs(fixref[ist, :]))
        # term weights = sum of squared coefficients over references of that term
        termd = {}
        for jcas, c in enumerate(fixref[ist, :]):
            casterm = subcas.iloc[jcas]['term']
            termd[casterm] = termd.get(casterm, 0) + c*c
        cascomp.append(termd)
    if len(set(ilead)) < len(ilead):
        print('    ** Warning: one CASSCF state leads more than one MRCI state')
    reflbl = [subcas.iloc[i]['Label'] for i in ilead]
    lz = [subcas.iloc[i]['LZ'] for i in ilead]
    x = list(mrci_iter['init_ref'].values())[:nstate]
    init_refE = [x[i] for i in ilead]
    terml = [subcas.iloc[i]['term'] for i in ilead]
    data = {'Label': lbll, 'irrep': irrep, 'S': S, 'E': El, 'Edav': davl,
            'C0': c0rot, 'Eref': erefl, 'init_ref': init_refE, 'iref_nr': ilead + 1,
            'irlbl': reflbl, 'term': terml, 'LZ': lz, 'CASterm': cascomp}
    mrci_frames.append(pd.DataFrame(data))
# concatenate once instead of growing the frame inside the loop
dfmrci = pd.concat(mrci_frames, ignore_index=True) if mrci_frames else pd.DataFrame()
# add column for difference between reference energy and corresponding CASSCF
d_ref = dfmrci.Eref - dfmrci.init_ref
dfmrci.insert(8, 'D_ref', d_ref)

In [None]:
# Check for term contamination of MRCI states
# NOTE: the dicts in column 'CASterm' are edited in place (the leading term
# and zero entries are removed), which is why the column is renamed at the end.
maxcontam = 0  # largest term contamination across all MRCI states
for ici, row in dfmrci.iterrows():
    tlead = row.term
    twt = row.CASterm
    tmax = max(twt, key=twt.get)
    if tlead != tmax:
        print('*** Term confusion for MRCI state! ***')
        display(row.to_frame().T)
    del twt[tlead]  # what is left is contamination
    for trm in [t for t, wt in twt.items() if wt == 0]:
        del twt[trm]  # omit zero contaminants
    maxcontam = max([maxcontam] + list(twt.values()))
print(f'Largest MRCI term contamination = {maxcontam:.1e}')
# Rename CASterm to CAScontam
dfmrci.rename(columns={'CASterm': 'CAScontam'}, inplace=True)

In [None]:
# Group the MRCI states into atomic terms (using Davidson-corrected energies)
nmrci = len(dfmrci)
dfciterm = m2.collect_atomic_terms(dfmrci, 'Edav')
nciterm = len(dfciterm)
print(f'There are {nmrci} MRCI states in {nciterm} terms')
if nciterm != nterm:
    chem.print_err('', 'Different number of terms from CASSCF and from MRCI')
# Make prefixes enumerative
dfciterm['term'] = chem.enumerative_prefix(dfciterm.term.values, always=False)
termsIn = set(dfciterm.term)
print('MRCI terms:')


def _flag_wide_range(column):
    # yellow background where the magnitude exceeds 5
    return ["background: yellow" if abs(v) > 5 else "" for v in column]


styler = dfciterm.style
styler = styler.apply(_flag_wide_range, subset=pd.IndexSlice[['Edav_range_cm']])
styler.format(fmt)

In [None]:
# section "SOintegrals" (optional; parsed only when present)
if 'SOintegrals' in major_sections:
    SOintgrl = m2.SO_integrals(major_sections['SOintegrals'][0])

In [None]:
# break section "soci" into sub-sections
# (only the first "soci" text block is used)
sosec = m2.soci_sections(major_sections['soci'][0])
# show the names of the sub-sections that were found
sosec.keys()

In [None]:
# SOCI sub-section 'matel_comput'
hlsdiag = m2.soci_replacements(sosec['matel_comput'][0])
n_cistates = sum(x['nstate'] for x in hlsdiag.values())
print(f'There are {n_cistates} states in the HLSDIAG list:')
# Present a table of old and new diagonal energies, as a check
data = {'sym': [], 'S': [], 'old': [], 'HLSDIAG': []}
for calc in hlsdiag.values():
    for old, new in zip(calc['old'], calc['new']):
        data['sym'].append(calc['sym'])
        data['S'].append(calc['S'])
        data['old'].append(old)
        data['HLSDIAG'].append(new)
# build the table once, after all records are collected
# (this also keeps it defined when the HLSDIAG list is empty)
dfhlsdiag = pd.DataFrame(data=data)
dfhlsdiag['diff'] = dfhlsdiag.HLSDIAG - dfhlsdiag.old
styler = dfhlsdiag.style
warnThresh = 0.5  # (Hartree) highlight changes in diagonal larger than this
# positive changes are highlighted too, whatever their size
styler = styler.apply(lambda x: ["background: yellow" if (abs(v) > warnThresh) or (v > 0) else "" for v in x],
              subset=pd.IndexSlice[['diff']])
display(styler.format(fmt))
# check for zeros
if (np.array(data['HLSDIAG']) == 0).any():
    chem.print_err('', 'Zero values in HLSDIAG')
mat_elems = m2.soci_matelems(sosec['matel_comput'][0])
if mat_elems:
    print(mat_elems)

In [None]:
# SOCI sub-section 'basis_prop' (optional)
if 'basis_prop' in sosec:
    basprop = m2.soci_basis_prop(sosec['basis_prop'][0], n_cistates)
    # first row of the 'DMZ' array
    print(basprop['DMZ'][0,:])

In [None]:
# SOCI sub-section 'so_calc': reference energy and spin-orbit matrix
so_calc_text = sosec['so_calc'][0]
E0 = m2.soci_E0(so_calc_text)
print(f'E0 = {E0:.6f} in the SO-CI')
somat = m2.soci_matrix(so_calc_text)
# dimension of the matrix = number of SO-CI states
dimen = somat['matrix'].shape[0]
print(f'There are {dimen} SO-CI states')

In [None]:
# Check for zero spin-orbit coupling: stop if every off-diagonal
# element of the spin-orbit matrix vanishes
offdiag = somat['matrix'].copy()
np.fill_diagonal(offdiag, 0)
amax = np.abs(offdiag).max()
if amax == 0:
    for msg in ('*** Off-diagonal elements of spin-orbit matrix are all zero ***',
                'There is no spin-orbit coupling'):
        print(msg)
    sys.exit(1)

In [None]:
# Add to basis-state info: MRCI and term parentage, LZ etc.
# Each dict in somat['basis'] gains the keys
#   'ici', 'iterm', 'LZ', 'JZ', 'Jterm' and 'Jposs'.
for i, bas in enumerate(somat['basis']):
    S = bas['S']
    lbl = bas['State']
    # the MRCI state with the same label and spin (first match is used)
    subdf = dfmrci[(dfmrci.Label == lbl) & (dfmrci.S == S)]
    #display(subdf)
    ici = subdf.index[0]
    bas['ici'] = ici
    # find the term whose list of member states ('idx') contains this MRCI state
    # NOTE(review): if no term contains it, 'iterm' is never set and the
    #   lookup of bas['iterm'] below raises KeyError
    for iterm, trow in dfciterm.iterrows():
        if ici in trow.idx:
            bas['iterm'] = iterm
            break
    # Add LZ
    bas['LZ'] = np.round(subdf.loc[ici, 'LZ'], 5)
    # the two candidate |Jz| values, |Sz - LZ| and |Sz + LZ|
    bas['JZ'] = set([abs(bas['Sz'] - bas['LZ']), abs(bas['Sz'] + bas['LZ'])])
    # find the minimum Jz
    minJ = min(bas['JZ'])
    # round minJ up
    # NOTE(review): np.rint rounds to the nearest integer, not up -- confirm intent
    minJ = np.rint(minJ - bas['Sz']) + bas['Sz']
    tlbl = dfciterm.loc[bas['iterm'], 'term']  # term label
    jposs = np.array(chem.possible_J_from_term(tlbl))
    # 'Jterm': every J allowed by the term
    bas['Jterm'] = set(jposs)
    # J cannot be less than Jz
    jposs = jposs[jposs >= minJ]
    # 'Jposs': the J values of the term that are also >= minJ
    bas['Jposs'] = set(jposs)

In [None]:
# SOCI sub-section 'so_levels'
# Energies of the spin-orbit levels, collected in a DataFrame (one row per level)
so_energies = m2.soci_energies(sosec['so_levels'][0])
df_soE = pd.DataFrame(so_energies)
print(f'There are {len(df_soE)} spin-orbit levels')
# display the table
df_soE

In [None]:
# Raw spin-orbit correction: the smallest 'Eshift' among the SO-CI levels
SOCraw = min(so_energies['Eshift'])
print(f'From lowest level and lowest uncoupled term energy, raw theoretical SOCraw = {SOCraw:.2f} cm-1')

In [None]:
# SO-CI sub-section 'so_vectors'
# In case of symmetry blocking, the last one should be the summary
so_vecs = m2.soci_vectors(sosec['so_vectors'][-1])
# show the keys of the parsed result
so_vecs.keys()

In [None]:
# check eigenvectors for normality
#    eigenvectors are columns of so_vecs['matrix']
#    a column is reported when its squared norm differs from 1 by more than tol
tol = 1.e-7
mat = so_vecs['matrix']
for i in range(dimen):
    vec = mat[:, i]
    prod = np.dot(np.conjugate(vec), vec)
    if np.abs(1 - prod) <= tol:
        continue
    print(i, i, ':  ', prod)

In [None]:
# check eigenvectors for orthogonality
#    eigenvectors are columns of so_vecs['matrix']
#    each pair (i, j) with j < i is reported when |overlap| exceeds tol
#    ('tol' comes from the normality check)
mat = so_vecs['matrix']
for i in range(dimen):
    bra = np.conjugate(mat[:, i])
    for j in range(i):
        prod = np.dot(bra, mat[:, j])
        overlap = np.abs(prod)
        if overlap > tol:
            print(i, j, ':  ', overlap)

In [None]:
# SO-CI sub-section 'so_compos'
# Compositions of the SO-CI states as printed by Molpro
so_compos = m2.soci_composition(sosec['so_compos'][0])
# show the keys of the parsed result
so_compos.keys()

In [None]:
# check that all listings of basis states are consistent
#    only keys present in all three listings are compared;
#    the three descriptions are printed for every mismatching key
for a, b, c in zip(somat['basis'], so_vecs['basis'], so_compos['basis']):
    shared = [k for k in a if (k in b) and (k in c)]
    for k in shared:
        if any(a[k] != other[k] for other in (b, c)):
            for descr in (a, b, c):
                print(descr)
            print('----------------')

In [None]:
# check that composition is consistent with eigenvectors
#    squared magnitudes of the eigenvector coefficients
vecs = so_vecs['matrix']
magnit = np.conjugate(vecs) * vecs
# get differences in percent (printed by Molpro to 0.01% precision)
pct_from_vecs = magnit * 100
difmat = pct_from_vecs - so_compos['matrix']
dmax = np.abs(difmat).max()
print(f'Largest inconsistency between composition and eigenvectors = {dmax:.2f} %')

In [None]:
# Convert basis-state compositions (percent) to term compositions
use_printed = False  # use composition % as printed by Molpro (less precise)
if not use_printed:
    # eigenvectors are orthonormal and printed to 1e-8 precision
    print('Using compositions derived from eigenvectors')
    magpct = np.real(magnit * 100)
else:
    # compositions are printed to 0.001% precision
    print('Using rounded compositions as printed by Molpro')
    magpct = so_compos['matrix']
# rows = terms, columns = SO-CI states; add each basis state's weights
# to the row of the term it belongs to
term_compos = np.zeros((nterm, dimen))
for ibas in range(dimen):
    iterm = somat['basis'][ibas]['iterm']
    term_compos[iterm, :] = term_compos[iterm, :] + magpct[ibas, :]

In [None]:
# Optional debugging aid: basis-state composition of a single SO-CI state.
# Disabled by default; change the condition to True and set 'iso' to use it.
if False:
    # Display composition of one SO-CI state
    iso = 26
    wts = magpct[:, iso]  # basis-state weights
    dflevcomp = pd.DataFrame({'wt_bas': np.round(wts, 6)})
    # copy descriptive fields from the basis-state dicts
    dflevcomp['bas_Jposs'] = [x['Jposs'] for x in somat['basis']]
    dflevcomp['S'] = [x['S'] for x in somat['basis']]
    dflevcomp['Sz'] = [x['Sz'] for x in somat['basis']]
    dflevcomp['ici'] = [x['ici'] for x in somat['basis']]
    dflevcomp['iterm'] = [x['iterm'] for x in somat['basis']]
    dflevcomp['term'] = [dfmrci.loc[x['ici'], 'term'] for x in somat['basis']]
    dflevcomp['term_Jposs'] = [chem.possible_J_from_term(trm) for trm in dflevcomp['term']]
    # largest contributions first
    display(dflevcomp.sort_values('wt_bas', ascending=False))

In [None]:
# Add J values to dfciterm: the J values obtainable from each term
dfciterm['J'] = [chem.possible_J_from_term(term) for term in dfciterm.term]

In [None]:
def term_index(x):
    '''
    Two-way lookup between term symbols and row labels of dfciterm
        Given a row label of dfciterm, return the term symbol in that row
        Given a term symbol, return the label of the first row holding it
    Return None if 'x' is neither
    Reads the notebook-level DataFrame 'dfciterm' (column 'term')
    '''
    try:
        # first interpretation: 'x' is a row label
        return dfciterm.at[x, 'term']
    except (KeyError, ValueError, TypeError):
        # not a row label (the exception type depends on the pandas version
        # and on the type of 'x'); fall through and treat it as a term symbol
        pass
    matches = dfciterm.index[dfciterm.term == x].tolist()
    if matches:
        return matches[0]
    # invalid term
    return None

In [None]:
# Get target J counts corresponding to the CASSCF terms
#   allJ   flat list of every J from every term (one entry per level)
#   J_all  {J: number of levels with that J}
allJ = [Jval for jl in dfcasterm['J_vals'] for Jval in jl]
J_all = dict(Counter(allJ))
print('Required level counts     :', J_all)
nlevels = len(allJ)
print(f'    There are {nlevels} J-levels')
# each level with quantum number J has (2J + 1) sublevels
Jxg = {Jval: int(count * (2*Jval + 1)) for Jval, count in J_all.items()}
J_left = dict(Jxg)  # copy to be decremented
print('Required sublevel counts:', Jxg)

### Assign values of <em>J</em>  to levels

In [None]:
def find_possible_J(df_soE, basis_descr, magpct, style='term', thrpct=10.):
    '''
    Add set of possible J values to each level/row of df_soE (column 'J_poss')
        also add their count (column 'nposs')
    The possible J for a level are those common to all basis states that
        contribute more than 'thrpct' percent to it; if no basis state does,
        the single largest contributor is used.
    Return nothing (df_soE is modified in place)
    Args: df_soE       DataFrame of SO-CI levels; its row labels are used as
                         column indices into 'magpct'
          basis_descr  list of dict; info about each basis state
          magpct       square array, pct weights of basis states, indices [ibas, iso]
          style        either 'term' or 'basis state'; which to use in determining J poss
          thrpct       pct threshold for including terms or basis states
    Raises: ValueError if 'style' is invalid (and chem.print_err() returns)
    '''
    key_for_style = {'term': 'Jterm',          # key in basis-state description
                     'basis state': 'Jposs'}
    if style not in key_for_style:
        msg = '*** Argument "style" must be either "term" or "basis state"'
        chem.print_err('', msg)
        # do not continue with an undefined key if print_err() returns
        raise ValueError(msg)
    choice = key_for_style[style]
    Jpossl = []  # list of sets
    for iso in df_soE.index:
        bascomp = magpct[:, iso]  # composition vector for this level
        idx = np.flatnonzero(bascomp > thrpct)
        if len(idx) == 0:
            # in rare case that no basis state contributes 'thrpct' %
            idx = [np.argmax(bascomp)]
        # take intersection among sets of significant terms or basis states;
        # the result is always a new set, never one owned by 'basis_descr'
        Jposs = set.intersection(*(set(basis_descr[i][choice]) for i in idx))
        Jpossl.append(Jposs)
    df_soE['J_poss'] = Jpossl
    df_soE['nposs'] = [len(s) for s in Jpossl]
    return

In [None]:
# Percent threshold for a term to count as significant when listing the
# possible J of a level. It is kept in a notebook-level variable because the
# diagnostic messages printed after the possibility checks report its value.
thrpct = 10.
find_possible_J(df_soE, somat['basis'], magpct, 'term', thrpct=thrpct)
# Add term composition vector (complete and also rounded)
df_soE['TC_approx'] = list(np.round(term_compos, 1).T)
df_soE['term_comp'] = list(term_compos.T)
# make column for final J values
df_soE['J'] = None  # yet unassigned

In [None]:
# Check for 0-possibility problems
subdf = df_soE[df_soE.nposs < 1]
n0 = len(subdf)  # number of levels with no possible values of J
# Check for insufficient possibilities
inadequate = []  # list of J with insufficient possible values
for J, needed in J_left.items():
    dfJ = m2.Jposs_subdf(df_soE, J)
    nposs = len(dfJ)
    if nposs >= needed:
        continue
    inadequate.append(J)
    display(dfJ)
if n0:
    print('*** Some levels have all possibilities eliminated ***')
if inadequate:
    print(f'*** Some J values have insufficient possible levels: {inadequate}')
if n0 or inadequate:
    # NOTE(review): 'thrpct' is read from the notebook namespace here;
    #   confirm that it is defined before this cell runs
    print(f'\nThis means that the term weights are defective at the {thrpct}% level.')
    print('If you don\'t care, you can try increasing that threshold ("thrpct").')

In [None]:
# Snapshot the level table and the remaining sublevel counts, so that a
# failed first attempt at assigning J can be discarded
df_soE_bak, J_left_bak = df_soE.copy(), J_left.copy()

In [None]:
# To assign J, first try Kmeans clustering based upon energy and term composition
#     as done in atomic_soc8.ipynb
# Each cluster is taken to be one level; its J follows from its size (2J + 1).
# Sets J_fail (True if the assignment is unusable) and impossl (rows whose
# assigned J is not among their possible values).

KMEANS_SEED = 0  # fixed seed, so that re-running the notebook gives the same clusters

# Assemble vectors needed by Kmeans()
Escale = 10.  # this many cm-1 is on par with 1% composition
Xwt = np.transpose(term_compos)
xe = df_soE.Erel.values / Escale
xe = xe.reshape((-1,1))
X = np.hstack((Xwt, xe))  # one row per sublevel: term composition + scaled energy
nlevel = sum(J_all.values())
print(f'Assigning J using {nlevel} clusters/levels...')
Kmean = KMeans(n_clusters=nlevel, n_init=10, random_state=KMEANS_SEED)
try:
    Kmean.fit(X)
    J_fail = False
except Exception as err:
    # report the reason instead of hiding it; the fallback cells take over
    print('*** Clustering failed ***')
    print(f'    {type(err).__name__}: {err}')
    J_fail = True
impossl = []
if not J_fail:
    xmeans = [x[-1] for x in Kmean.cluster_centers_]  # cluster mean (Erel/Escale) values
    Emeans = np.array(xmeans) * Escale
    df_soE['ilev'] = Kmean.labels_
    Jfound = []
    for ilev, grp in df_soE.groupby('ilev'):
        g = len(grp)  # cluster size = degeneracy (2J + 1)
        J = np.round((g-1)/2, 1)
        df_soE.loc[df_soE.ilev == ilev, 'J'] = J
        Jfound.append(J)
    # Error-checking of assignments
    if Counter(Jfound) != J_all:
        print('*** J values found are not J values expected! ***')
        print('Expected:', sorted(J_all.items()))
        J_clust = Counter(Jfound)
        print('Found   :', sorted(J_clust.items()))
        J_fail = True
    for irow, row in df_soE.iterrows():
        if row.J not in row.J_poss:
            print(f'Level {irow} has J = {row.J} but that is not among ' +
                  f'possibilities {sorted(row.J_poss)}')
            J_fail = True
            impossl.append(irow)
    if len(impossl):
        print('\n*** These "impossible" assignments may mean that term compositions are unreliable ***')
        display(df_soE.loc[impossl, ['Erel', 'J_poss', 'TC_approx', 'J']].style.format(fmt))
if not J_fail:
    print('Success!')

In [None]:
# After a failed clustering, show the untouched (backup) level table
if J_fail:
    for msg in ('*** Kmeans failed to assign reasonable values of J ***',
                'Examine the calculated levels for possible assignments of J'):
        print(msg)
    review_cols = ['Erel', 'J_poss', 'nposs', 'TC_approx']
    display(df_soE_bak[review_cols])

In [None]:
def J_best_group_for_J(df_soE, J_left, J):
    '''
    For the given value of J, find the tightest groups of (2J + 1) consecutive
        unassigned levels that are reasonably isolated in energy
    Return a list (tightest first, at most J_left[J] // (2J + 1) entries) of:
        [indices, marginal DataFrame, E-spread, TC-spread, E-margin, TC-margin]
      indices            row labels (in df_soE) of the levels in the group
      marginal DataFrame the group plus its neighbor on each side, for display
      E-spread/TC-spread spreads within the group, from m2.spreads_ETC()
      E-margin/TC-margin smallest increase of the spread when one neighbor is
                         added (inf if the group has no neighbor)
    Args: df_soE   DataFrame of SO-CI levels; rows with null 'J' are unassigned
          J_left   dict {J: number of sublevels still to be assigned}
          J        the value of J to look for
    KKI 11/1/2024
    '''
    degen = int(2*J + 1)
    ngroup = J_left[J] // degen
    if ngroup < 1:
        # not enough sublevels left for even one group
        return []
    df_unassigned = df_soE[df_soE['J'].isnull()]
    dfJ = m2.Jposs_subdf(df_unassigned, J)
    showcols = ['Erel', 'J_poss', 'nposs', 'TC_approx', 'J']
    sprE = []
    sprTC = []
    margE = []
    margTC = []
    subdf_list = []
    for i in range(len(dfJ) - degen + 1):
        # candidate group: 'degen' consecutive rows starting at position i
        Espr, _, maxTC = m2.spreads_ETC(dfJ.iloc[i : i+degen])
        if Espr is None:
            # set to zero
            Espr = maxTC = 0.
        sprE.append(Espr)
        sprTC.append(maxTC)
        Esep = TCsep = np.inf  # separation from adjacent levels
        llim = i
        ulim = i + degen
        if i > 0:
            # include the neighbor below
            llim -= 1
            spr, _, maxcspr = m2.spreads_ETC(dfJ.iloc[i-1 : i+degen])
            Esep = min(Esep, spr - Espr)
            TCsep = min(TCsep, maxcspr - maxTC)
        if (i + degen) < len(dfJ):
            # include the neighbor above
            ulim += 1
            spr, _, maxcspr = m2.spreads_ETC(dfJ.iloc[i : i+degen+1])
            Esep = min(Esep, spr - Espr)
            TCsep = min(TCsep, maxcspr - maxTC)
        margE.append(Esep)
        margTC.append(TCsep)
        subdf_list.append(dfJ.iloc[llim:ulim][showcols])
    retval = []
    for i in np.argsort(sprE):
        # loop over groupings from tightest to loosest
        if margE[i] < (2 * sprE[i]):
            # degenerate with adjacent level
            continue
        # add to the list
        idx = dfJ.index.values[i : i+degen]
        retval.append([idx, subdf_list[i], sprE[i], sprTC[i], margE[i], margTC[i]])
        if len(retval) == ngroup:
            break
    return retval
##
##

In [None]:
# Fallback when clustering failed: assign J by simple rules, then by
# user-approved groups of (2J + 1) well-separated levels
if J_fail:
    # discard any (failed) assignments
    # (work on copies, so that the backups stay pristine if this cell is re-run)
    df_soE = df_soE_bak.copy()
    J_left = J_left_bak.copy()

    # try the simplest tricks
    thr_degen = 5 # cm-1
    thr_tcomp = 3 # percent
    df_soE['ilev'] = -1

    def highlight_rows(row):
        # Highlight the rows of the candidate group; 'idx' is the loop
        # variable of the approval loop below
        if row.name in idx:
            return ['background-color: lightgreen'] * len(row)
        else:
            return [''] * len(row)

    nass_singl = m2.singletons_J(df_soE, J_left, thr_degen, thr_tcomp, verbose=True)
    nass_goldi = m2.poss_count_just_right(df_soE, J_left, thr_degen, thr_tcomp, verbose=True)
    print('\n--- Look for best-separated groups of degeneracy (2J + 1) ---')
    for J in sorted(J_left.keys(), reverse=True):
        if J_left[J] < 1:
            # no more assignments to make
            print(f'J = {J} is fully assigned')
            continue
        degen = int(2 * J + 1)
        print(f'--- look for {J_left[J] // degen} groups with J = {J}')
        candi = J_best_group_for_J(df_soE, J_left, J)
        for idx, dfx, sprE, sprTC, margE, margTC in candi:
            styler = dfx.style.apply(highlight_rows, axis=1)
            display(styler)
            # any answer that does not contain 'n' counts as approval
            ok = input(f'Do you approve assigning this group to J={chem.halves(J)}? ')
            if 'n' not in ok.lower():
                m2.record_J_assignment(df_soE, J_left, idx, J)
                print('OK')
                # try the easy things again, now that changes were made
                nass_singl = m2.singletons_J(df_soE, J_left, thr_degen, thr_tcomp, verbose=True)
                nass_goldi = m2.poss_count_just_right(df_soE, J_left, thr_degen, thr_tcomp,
                                                      verbose=True)
            else:
                print('discarding possible assignment')
    # the assignment is complete when no J has sublevels left to assign
    J_fail = any(unk > 0 for unk in J_left.values())
    if J_fail:
        n_unassigned = int(df_soE['J'].isnull().sum())
        print(f'\n*** There are still {n_unassigned} unassigned levels ***')
    else:
        print('\n=== All levels have been assigned ===')

In [None]:
# Give up if J assignment is still incomplete: report what is missing,
# show the levels with the unassigned ones highlighted, and stop
if J_fail:
    print('**** J assignments failed, cannot proceed ****')
    for J in J_left:
        if J_left[J]:
            print(f'J = {J} needs {J_left[J]} levels assigned')

    def highlight_unassigned(row):
        # yellow background for rows whose J is still None
        css = 'background-color: yellow' if row.J is None else ''
        return [css] * len(row)

    # Add energy increment column, display, and quit
    erel = df_soE.Erel.values
    increments = erel[1:] - erel[:-1]
    df_soE['dE'] = [0] + list(increments)
    report_cols = ['Erel', 'dE', 'ilev', 'J', 'J_poss', 'TC_approx']
    styler = df_soE[report_cols].style.apply(highlight_unassigned, axis=1)
    display(styler.format({'Erel': '{:.1f}', 'dE': '{:.1f}'}))

    sys.exit(1)

### After successful J assignment

In [None]:
# Add leading terms to df_soE (all levels from the SO-CI)
# The leading term of a sublevel is the term with the largest weight in
# its term composition.
Tlead = []
ileadl = []
for tc in df_soE.term_comp:
    ilead = np.argmax(tc)
    Tlead.append(dfciterm.loc[ilead, 'term'])
    ileadl.append(ilead)
df_soE['Lead'] = Tlead
df_soE['ilead'] = ileadl
# Create new DataFrame of aggregated levels
# One row per value of 'ilev'. For each column the group mean is used where
# it can be computed; otherwise the value from the first row of the group.
avg = {}
cols = list(df_soE.columns) + ['idx']
for col in cols:
    avg[col] = []
for ilev, grp in df_soE.groupby('ilev'):
    #print('ilev =', ilev)
    #display(grp)
    for col in cols:
        try:
            avg[col].append(grp[col].mean())
        except TypeError:
            # string such as symbol of leading term
            avg[col].append(grp[col].values[0])
        except KeyError:
            # 'idx' is not a column of df_soE; store the row labels of the
            # sublevels that make up this level
            avg['idx'].append(list(grp.index.values))
dflev = pd.DataFrame({k: avg[k] for k in cols})
# adjust values of TC_approx
# (recomputed by rounding the aggregated term composition to 0.1)
for i, row in dflev.iterrows():
    #dflev.at[i, 'TC_approx'] = np.round(row.term_comp, 1) # fails when length=1
    dflev.at[i, 'TC_approx'] = [np.round(x, 1) for x in row.term_comp]
# sort by energy
dflev.sort_values('E', inplace=True, ignore_index=True)
# Create labels from Term + J
Jlbl = []
for trm, J in zip(dflev.Lead, dflev.J):
    Jlbl.append(f'{trm}_{chem.halves(J)}')
dflev['Jlbl'] = Jlbl

In [None]:
# Add information about scatter among supposedly degenerate sublevels
#   Erel_spread  range (max - min) of Erel over the sublevels of a level
#   TC_spread    range of each term weight over the sublevels, rounded to 0.1
espread = []
tcspread = []
for idx in dflev.idx:
    espread.append(np.ptp(df_soE.loc[idx, 'Erel']))
    tc = np.vstack(df_soE.loc[idx, 'term_comp'])
    tcspread.append(np.round(np.ptp(tc, axis=0), 1))
dflev['Erel_spread'] = espread
dflev['TC_spread'] = tcspread
for msg in ('Check the table below for questionable grouping of levels',
            '"Erel_spread" shows how much the energies differ within a level (cm-1)',
            '"TC_spread" shows how much the term compositions differ within a level (%)',
            '"idx" shows which magnetic sublevels in "df_soE" compose each level'):
    print(msg)
dflev[['J', 'Erel', 'Erel_spread', 'TC_spread', 'idx']].style.format(fmt)

In [None]:
# Turn each 'TC_approx' (sequence of term weights) into a dict keyed by term label,
#    keeping only weights >= 0.1 and ordering from largest to smallest weight.
#    A missing composition (None) is passed through unchanged.
twl = []
for TC in dflev.TC_approx:
    if TC is not None:
        kept = {dfciterm.at[k, 'term']: w for k, w in enumerate(TC) if w >= 0.1}
        TC = chem.sort_dict_by_value(kept, reverse=True)
    twl.append(TC)
dflev['termwt'] = twl

In [None]:
# Columns displayed in the summary of calculated level assignments
showcols = ['Lead', 'J', 'Jlbl', 'Erel', 'Eshift', 'termwt']
print('Level assignments from the calculation:')
display(dflev[showcols].style.format(fmt))

In [None]:
# Are there duplicated leading terms?
#   Within one value of J, two levels should not share the same leading term.
#   'dups' flags that at least one J has such duplicates.
#   'unlead' collects, over ALL values of J with duplicates, the terms that
#   contribute to the duplicated levels but lead no level of that J; the next
#   cell offers them as replacement labels.  (It is accumulated here rather
#   than re-created per J, so that earlier J groups are not forgotten.)
dups = False
unlead = set()
for j, grp in dflev.groupby('J'):
    # leading terms that occur more than once for this J
    leads = [lead for lead, n in Counter(grp.Lead).items() if n > 1]
    if not leads:
        continue
    print(f'*** Duplicate leading term for J = {j} ***')
    dups = True
    dfdup = grp[grp.Lead.isin(leads)].copy()
    # show all levels of this J, highlighting the duplicated leading terms
    styler = grp[showcols].style
    styler = styler.apply(lambda x: ["background: yellow" if v in leads else "" for v in x],
          subset=pd.IndexSlice[['Lead']])
    display(styler.format(fmt))
    # terms present in the duplicated levels that do not lead any level of this J
    leading_here = set(grp.Lead)
    unlead_J = set()
    for twd in dfdup.termwt:
        for trm in twd:
            if trm not in leading_here:
                unlead_J.add(trm)
    print('Terms not leading: ', unlead_J)
    unlead |= unlead_J

In [None]:
# Change assignments of any duplicates
#   For each non-leading term, the user may name the level (row label of dflev)
#   that should carry it as leading term.  Blank input skips the term.
if dups:
    print('Correct the duplicate term assignments, if you wish')
    for trm in sorted(unlead):
        ifix = input(f'Level to label with {trm} (blank to ignore)? ').strip()
        if not ifix:
            continue
        try:
            ifix = int(ifix)
        except ValueError:
            print(f'    "{ifix}" is not an integer level index; ignored')
            continue
        if ifix not in dflev.index:
            # .loc assignment with an unknown label would silently append a row
            print(f'    there is no level {ifix}; ignored')
            continue
        dflev.loc[ifix, 'Lead'] = trm
        iterm = dfciterm.index[dfciterm.term == trm]
        # store the scalar index of the term, not a one-element Index object
        dflev.loc[ifix, 'ilead'] = iterm[0]
    # rebuild 'Jlbl' values
    jlbl = [f'{t}_{chem.halves(j)}' for t, j in zip(dflev.Lead, dflev.J)]
    dflev['Jlbl'] = chem.enumerative_prefix(jlbl)
    display(dflev[['Lead', 'J', 'Jlbl', 'Erel', 'Eshift']].style.format(fmt))

In [None]:
# Sanity checks on the assignments:
#   (1) the number of distinct leading terms should equal the number of CI terms
#   (2) the sublevel count implied by the J values should equal 'dimen'
nAssign, nTerm = len(set(dflev.Lead)), len(dfciterm)
dropT = False  # replaced by the set of dropped terms, if any (doubles as a flag)
if nAssign != nTerm:
    print(f'*** I started with {nTerm} terms but have {nAssign} leading terms ***')
    print('Starting: ', sorted(termsIn))
    termsOut = set(dflev.Lead)
    print('Leading : ', sorted(termsOut))
    if nAssign < nTerm:
        # some input terms lead no level
        dropT = termsIn - termsOut
        print('Dropped terms: ', dropT)
        # one new column per dropped term: its weight in every level (0 if negligible)
        for term in dropT:
            dflev[term] = [comp.get(term, 0) for comp in dflev.termwt]
            fmt[term] = '{:.1f}'
        print('Weights (%) of dropped terms in levels:')
        display(dflev[['Lead', 'J', 'Jlbl', 'Erel', 'Eshift'] + list(dropT)].style.format(fmt))
    else:
        # more leading terms than input terms
        addT = termsOut - termsIn
        print('Added terms: ', addT)
nlvl = (2 * dflev.J + 1).sum()  # number of sublevels
if nlvl != dimen:
    # NOTE(review): the comparison uses 'dimen' but the message reports 'nSO';
    #   both are defined outside this cell -- confirm they are the same quantity
    print(f'*** I started with {nSO} (sub)levels but now have {nlvl} ***')

In [None]:
# Manually assign any dropped terms
#   For each dropped term, the user may name the level (row label of dflev)
#   that should carry it as leading term.  Blank input skips the term.
if dropT:
    for drt in dropT:
        ia = input(f'Which level do you want to assign to term {drt} (blank to ignore)? ').strip()
        if not ia:
            continue
        try:
            ia = int(ia)
        except ValueError:
            print(f'    "{ia}" is not an integer level index; ignored')
            continue
        if ia not in dflev.index:
            # .loc assignment with an unknown label would silently append a row
            print(f'    there is no level {ia}; ignored')
            continue
        dflev.loc[ia, 'Lead'] = drt
        iterm = dfciterm.index[dfciterm.term == drt]
        # store the scalar index of the term, not a one-element Index object
        dflev.loc[ia, 'ilead'] = iterm[0]
    # rebuild 'Jlbl' values
    jlbl = [f'{t}_{chem.halves(j)}' for t, j in zip(dflev.Lead, dflev.J)]
    dflev['Jlbl'] = chem.enumerative_prefix(jlbl)
    display(dflev[['Lead', 'J', 'Jlbl', 'Erel', 'Eshift'] + list(dropT)].style.format(fmt))

In [None]:
# Inversion parity of the calculated levels
#   It is taken from the CI irreps when the point group is Ci and all terms
#   share one irrep (irrep 1 -> even, otherwise odd); else the user is asked.
irreps_ci = set(dfciterm.irrep)
if PG == 'Ci' and len(irreps_ci) == 1:
    parity = 'even' if 1 in irreps_ci else 'odd'
else:
    # ask user for parity of interest
    parity = input('Please choose "even" or "odd" parity: ')
print(f'Experimental states will be restricted to parity = {parity}')

### Read experimental energy levels

In [None]:
labels_ordinated = False  # flag to prevent multiple (1)(1)(1) etc.
# net charge: Qtot minus the number of electrons in the last MRCI
charge = Qtot - mrci_meta['nelec']
if charge < 0:
    # anion
    atstr = atom + '_neg'
else:
    # spectroscopic suffix: I for the neutral, II for the cation, ...
    # NOTE(review): repeating 'I' yields 'IIII' rather than 'IV' for charge 3 --
    #   confirm against the file names written by get_NIST_atomic_data.ipynb
    atstr = atom + '_' + 'I' * (charge + 1)
fxl = f'{atstr}_exptl_levels.xlsx'
xlpath = os.sep.join([fdir, fxl])
dfexpt = pd.read_excel(xlpath)
# J may have been read as strings (e.g. fractions); format it accordingly
if type(dfexpt.J.values[0]) is str:
    fmt['J'] = '{:s}'
print(f'Experimental energy levels read from {fxl}')
# An optional "comment" column gets '' in place of NaN
if 'comment' in dfexpt.columns:
    dfexpt['comment'] = dfexpt['comment'].fillna('')

In [None]:
# Find the number of decimal places in the level energies
Ecol = 'Level (cm-1)'  # the exptl energy column
ndecim = 0
for e in dfexpt[Ecol]:
    # text after the last '.'; a value without any '.' has no decimals
    #   (previously all of its digits were counted as decimals)
    _, dot, frac = str(e).rpartition('.')
    if not dot:
        continue
    # count numeric digits only, ignoring any trailing annotation characters
    n = sum(c.isdigit() for c in frac)
    ndecim = max(n, ndecim)
print(f'Experimental energies are provided to {ndecim} decimal digits')
# display formatting
fmt[Ecol] = '{:.' + str(ndecim) + 'f}'

In [None]:
# Delete any ionization limit: the "Limit" row and everything past it
n1 = len(dfexpt)
limit_rows = dfexpt.index[dfexpt.Term == 'Limit']
if len(limit_rows):
    ilim = limit_rows.min()
    dfexpt = dfexpt.truncate(after=ilim-1)
else:
    # no ionization limit listed; keep every row
    ilim = None
n2 = len(dfexpt)
if n2 < n1:
    print(f'Discarding {n1-n2} ionized or metastable states')
oddstr = r'\*$|°' # characters to identify terms of odd parity
# Sometimes parity is shown in configuration alone?
#dfeven = dfexpt[~(dfexpt.Term.str.contains(oddstr) | dfexpt.Configuration.str.contains(oddstr))].copy()
#dfodd = dfexpt[dfexpt.Term.str.contains(oddstr) | dfexpt.Configuration.str.contains(oddstr)].copy()
is_odd = dfexpt.Term.str.contains(oddstr)
dfeven = dfexpt[~is_odd].copy()
dfodd = dfexpt[is_odd].copy()
print(f'{len(dfexpt)} experimental levels ({len(dfeven)} even and {len(dfodd)} odd)')
# Select by parity
if parity == 'even':
    # discard odd levels ('Term' field ends with '*')
    dfexpt = dfeven.copy()
elif parity == 'odd':
    dfexpt = dfodd.copy()
else:
    chem.print_err('', f'Parity of "{parity}" is not recognized')
n3 = len(dfexpt)
print(f'{n3} levels accepted for parity = {parity}')
# Reject bad values of J (those that chem.halves_to_float cannot convert)
for i in dfexpt.index:
    try:
        chem.halves_to_float(dfexpt.loc[i, 'J'])
    except ValueError:
        dfexpt.at[i, 'J'] = np.nan
nbad = dfexpt.J.isna().sum()
if nbad:
    print(f'** Rejecting {nbad} levels with malformed J values')
    # drop only the rows whose J is missing; NaN in other columns is not a
    #   reason to reject a level
    dfexpt = dfexpt.dropna(subset=['J'])
    n4 = len(dfexpt)
    print(f'{n4} level retained')
# Assign unique term symbols
if not labels_ordinated:
    dfexpt = chem.unique_labels_exptl_terms(dfexpt, verbose=True, always=True)
    labels_ordinated = True
# Add column for degeneracy
dfexpt['degen'] = (2 * dfexpt.J.apply(chem.halves_to_float)).astype(int) + 1
# Remove any troublesome non-ascii whitespace characters
dfexpt.Term = [' '.join(t.split()) for t in dfexpt.Term]
dfexpt

## Match experimental and theoretical levels

In [None]:
def match_term_symbol(symb_expt, symb_calc):
    '''
    Return True if the experimental and calculated term symbols agree, else False.
    Both arguments are compared as str.  Besides an exact match, the
      experimental symbol may carry an extra prefix: when it contains '(1)'
      ahead of its last len(symb_calc) characters, or contains 'a' anywhere,
      agreement of those last characters with symb_calc also counts.
    '''
    expt = str(symb_expt)
    calc = str(symb_calc)
    if expt == calc:
        # exact match
        return True
    ncalc = len(calc)
    has_prefix = ('a' in expt) or ('(1)' in expt[:-ncalc])
    # compare only the trailing characters when a tolerated prefix is present
    return has_prefix and (expt[-ncalc:] == calc)

In [None]:
def termwt_to_dict(termwt):
    '''
    Given a list/Series of term weight arrays,
    Return a corresponding list of dict where key = term label
      (taken from dfciterm by position) and value = weight.
    Weights below 0.1 are omitted and each dict is sorted from largest to
      smallest weight.  Entries that are None are passed through as None.
    '''
    def as_sorted_dict(weights):
        if weights is None:
            return None
        kept = {dfciterm.at[k, 'term']: w for k, w in enumerate(weights) if w >= 0.1}
        return chem.sort_dict_by_value(kept, reverse=True)

    return [as_sorted_dict(TC) for TC in termwt]

In [None]:
def match_theory_to_expt(dfth, dfx, bigerr=3000):
    '''
    Match experimental levels to theoretical
    Args:
      dfth:   DataFrame of theoretical levels; columns used here are
              'J', 'Lead', 'Erel', 'TC_approx' and 'term_comp'
      dfx:    DataFrame of experimental levels (column 'J' may hold floats or
              str fractions); the matching helper also reads its term and
              energy columns
      bigerr: in cm-1; passed to match_th_expt_1J_V2(), where it triggers
              extra scrutiny
    Return a DataFrame containing both theory and expt (a copy of 'dfx' with
      added columns 'Tcalc', 'leadwt', 'Ecalc', 'termwt', 'Composition'), and
      the index of the highest level matched
    Experimental rows without a theoretical partner keep the initial values
      ('' / NaN / None) in the added columns.
    '''
    print('Matching experimental levels with theoretical levels')
    Jlist = sorted(set(dfth.J))
    # work on a copy of the experimental table that was passed in
    #   (previously the global 'dfexpt' was copied, ignoring the argument)
    dfcomp = dfx.copy()
    dfcomp['Tcalc'] = ''  # term assignment in computation
    dfcomp['leadwt'] = ''
    dfcomp['Ecalc'] = np.nan
    dfcomp['termwt'] = None
    dfcomp['Composition'] = None
    imax = 0  # index of highest exptl level matched
    for J in Jlist:
        print(f'J = {J}')
        # get the indices of the exptl levels that best match theoretical
        dfJth = dfth[dfth.J == J]
        dfJx = dfx[dfx.J == J]
        if len(dfJx) < 1:
            # try str representation
            hJstr = chem.halves(J)  # as str fraction
            dfJx = dfx[dfx.J == hJstr]
        idx = match_th_expt_1J_V2(dfJth, dfJx, bigerr=bigerr)
        # idx[k] is paired with the k-th theoretical level of this J.
        # NOTE(review): this pairing assumes the helper returned one entry for
        #   every theoretical level up to the last one matched -- confirm
        for i, ix in enumerate(idx):
            rowth = dfJth.iloc[i]
            dfcomp.at[ix, 'Tcalc'] = rowth.Lead
            dfcomp.at[ix, 'leadwt'] = rowth.TC_approx[term_index(rowth.Lead)]
            dfcomp.at[ix, 'Ecalc'] = rowth.Erel
            dfcomp.at[ix, 'termwt'] = rowth.TC_approx
            dfcomp.at[ix, 'Composition'] = [x for x in rowth.term_comp] # fix when length=1
            imax = max(imax, ix)
    # Convert approx. 'termwt' from numpy arrays to dicts, sorted from largest to
    #    smallest weight, and with negligible weights omitted
    dfcomp['termwt'] = termwt_to_dict(dfcomp.termwt)
    return dfcomp, imax

In [None]:
def match_th_expt_1J_V2(dfJth, dfJx, bigerr=3000):
    '''
    Given DataFrame's of theoretical and experimental levels for the same J,
    Return a list of indices into dfJx that match those in dfJth
        based upon energy, but with preference for same leading term
    'bigerr' is in cm-1 and triggers extra scrutiny

    Columns read: 'Erel' and 'Lead' from dfJth; 'uTerm', 'Term' and the
        energy column named by the module-level variable Ecol from dfJx.
    The theoretical levels are processed in the row order of dfJth; each one
        appends at most one index label of dfJx to the result.
    NOTE(review): when a theoretical level finds no unused exptl level, nothing
        is appended for it, so the returned list can be shorter than dfJth and
        later entries no longer line up position-by-position with dfJth rows.
    NOTE(review): the unique-term branch appends without checking whether that
        exptl level was already taken by an earlier theoretical level.
    '''
    idx = []  # index of exptl level that matches each theor level
    for i, row in dfJth.iterrows():
        Eth = row.Erel
        thTerm = row.Lead

        def same_Lead(xTerm):
            # Does the exptl "uTerm" match the theoretical "Lead"?
            return match_term_symbol(xTerm, thTerm)
    
        # Is there an exptl level with the same leading term?
        termatch = dfJx.uTerm.apply(same_Lead)
        subx = dfJx.loc[termatch]
        if len(subx) == 1:
            # The nicest situation, but check the energy
            err = abs(Eth - subx[Ecol].values[0])
            if err < bigerr:
                # tolerable
                print(f'    exptl term "{subx.Term.iloc[0]}" matches leading "{thTerm}"')
                j = subx.index.values[0]
                idx.append(j)
                continue
            else:
                # energy error too large: fall through to energy-order matching,
                #   which (since len(subx) == 1) will consider all levels of dfJx
                pass
                #print(f'    exptl term "{subx.Term.iloc[0]}" rejected as match for' +
                #      f' leading "{thTerm}" because energy error = {err:.0f} cm-1')
        # There are multiple or no matching terms
        #   rely upon energy ordering (among matching terms, if any)
        if len(subx) > 1:
            # Exptl data assign the same leading term to multiple levels of this J (unusual)
            #print(f'    leading "{thTerm}" matches multiple exptl terms "{subx.Term.values}"')
            pass
        else:
            # No matching term symbols
            #print(f'    leading "{thTerm}" matches no exptl term labels')
            subx = dfJx  # consider all levels because none match the term symbol
        # j stays None only if subx is empty; otherwise it ends as the label at
        #   which the loop stopped (first unused level, or the last row if all are used)
        j = None
        for j, rowx in subx.iterrows():
            # expect energy ordering to be similar
            #    there is no way to detect inversions
            if j in idx:
                # this exptl level already matched
                continue
            # we get here if this level has not yet been matched; take the first one
            print(f'    leading "{thTerm}" matched with exptl "{rowx.Term}"')
            # check the energy (a large error is reported but the match is kept)
            err = Eth - rowx[Ecol]
            if abs(err) > bigerr:
                print(f'\t*** but large energy error of {err:.0f} cm-1 ***')
                if err < 0:
                    print('\t*** maybe an experimental level is missing')
            break
        # record the match unless every candidate was already used
        if (j is not None) and (j not in idx):
            idx.append(j)
    return idx

In [None]:
# Pair the calculated levels (dflev) with the experimental ones (dfexpt).
#   imax is the index of the highest-energy level matched;
#   'err' is the calculated minus the experimental level energy (cm-1).
dfdiff, imax = match_theory_to_expt(dflev, dfexpt)
dfdiff['err'] = dfdiff['Ecalc'].sub(dfdiff[Ecol])

In [None]:
# Describe the highest experimental level that found a theoretical partner
t = dfdiff.loc[imax, 'Tcalc']
J = dfdiff.loc[imax, 'J']
levmax = f'{t}_{J}'
emax = dfdiff.loc[imax, Ecol]
showcols = ['Configuration', 'uTerm', 'Tcalc', 'J', Ecol, 'Ecalc', 'err', 'termwt']
print(f'Highest level matched is {levmax} at {emax} cm-1 (exptl energy)')
# Notify about any exptl levels that the calculation skipped over:
#   rows up to and including 'imax' that have no calculated energy.
#   Slice first (the label 'imax' exists in dfdiff) and filter afterwards;
#   the filtered frame never contains 'imax', so slicing it by that label
#   only works while the index happens to be sorted.
dfskipped = dfdiff.loc[:imax].loc[lambda d: d.Ecalc.isna()]
if len(dfskipped):
    print('\n*** The following experimental levels are skipped over in the calculation ***')
    print(  '***     Consider doing another calculation with these levels included     ***')
    display(dfskipped[showcols])

In [None]:
# Convert str values of J to float
dfdiff['J'] = dfdiff.J.apply(chem.halves_to_float)
fmt['J'] = '{:.1f}'
warnThresh = 1000  # highlight errors larger than this (cm-1)
# keep a copy that still includes the unmatched experimental levels
dfdiff_all = dfdiff.copy()
# drop rows with no matching level in the calculation (no 'Ecalc');
#   NaN in unrelated experimental columns must not discard a matched level
dfdiff.dropna(axis=0, subset=['Ecalc'], inplace=True)
# sort by increasing calculated energy, and re-index
dfdiff.sort_values('Ecalc', inplace=True, ignore_index=True)
# Update the exptl term enumerative prefixes to reflect this subset of terms
dfdiff['uTerm'] = chem.update_enumerative_prefix(dfdiff.uTerm)
selcols = ['Configuration', 'uTerm', 'J', Ecol, 'Tcalc', 'Ecalc', 'err']
# Print a warning if experimental levels are missing
nth = len(dflev); ndiff = len(dfdiff)
# number of theoretical levels without experimental partner (never negative,
#   because it is also used as a flag in the next cell)
expt_missing = max(nth - ndiff, 0)
if expt_missing > 0:
    print(f'\n**** There are {nth} theoretical levels but only {ndiff} ' +
          'matching experimental levels ****\n')
    # unmatched theoretical levels are those whose energy 'Erel' was never
    #   copied into 'Ecalc'
    setdiff = set(dfdiff.Ecalc)
    setth = set(dflev.Erel)
    missing = setth - setdiff
    dfunmatched = dflev[dflev.Erel.isin(missing)].copy()
    print('Unmatched theoretical levels:')
    display(dfunmatched[['Erel', 'J', 'Lead', 'Jlbl', 'termwt']].style.format(fmt))

In [None]:
# Theoretical levels without experimental partner are added to dfdiff, with the
#   calculated energy standing in for the experimental one (so 'err' = 0)
if expt_missing:
    print('*** Installing theoretical levels where no exptl levels could be matched ***')
    renames = {'Lead': 'Tcalc', 'Erel': 'Ecalc', 'term_comp': 'Composition'}
    unwanted = ['Eshift', 'J_poss', 'nposs', 'TC_approx', 'ilev', 'ilead',
                'idx', 'Jlbl', 'Erel_spread', 'TC_spread', 'E']
    dfu = dfunmatched.rename(columns=renames)
    dfu['degen'] = (2 * dfu.J + 1).astype(int)
    # weight of the leading term, looked up in each level's term-weight dict
    lwt = [twt[lt] for lt, twt in zip(dfunmatched.Lead, dfunmatched.termwt)]
    dfu['leadwt'] = lwt
    # install theoretical energies as "experimental"
    dfu[Ecol] = dfu.Ecalc
    dfu['err'] = 0
    # no experimental labels exist for these levels
    for blank_col in ['Configuration', 'Term', 'uTerm']:
        dfu[blank_col] = ''
    dfu = dfu.drop(columns=unwanted)
    # merge and order by (pseudo-)experimental energy
    dfdiff = pd.concat([dfdiff, dfu]).sort_values(Ecol).reset_index(drop=True)

In [None]:
# Show the pairing of theoretical and experimental levels for inspection
if ('comment' in dfdiff.columns) and ('comment' not in selcols):
    # guard against appending again when this cell is re-run
    selcols.append('comment')
print(f'Please inspect the following pairing of theory ("Ecalc") with expt ("{Ecol}")')
print(f'Errors > {warnThresh} cm-1 are highlighted in yellow')
show_skipped = False  # display the skipped levels (if any) in the table below
if len(dfskipped) and show_skipped:
    # also show the skipped/unmatched exptl levels
    styler = dfdiff_all[showcols].style
else:
    styler = dfdiff[showcols].style
# yellow background for |err| above the warning threshold
styler = styler.apply(lambda x: ["background: yellow" if abs(v) > warnThresh else "" for v in x], 
              subset=pd.IndexSlice[['err']])
if (len(dfskipped) == 0) or (not show_skipped):
    # only when the table is built from dfdiff itself, so that rows align
    print('Disagreements in term assignments are highlighted in red')
    styler = styler.apply(lambda x: (x != dfdiff['uTerm']).map({True: "background-color: red; \
                  color: white", False: ""}), subset=['Tcalc'])
display(styler.format(fmt))

## Compute spin-orbit corrections, <em>E</em><sub>so</sub>

In [None]:
# No theoretical calculations are needed to use eq. (1)
# I.e., accept experimental term assignments and assume that each level
#    derives from a single term (100% term weight)
# distinct term labels in order of first appearance
xterms = list(dict.fromkeys(dfdiff.uTerm))   # exptl term labels
thterms = list(dict.fromkeys(dfdiff.Tcalc))  # theoretical term labels
eterms = []  # list of term energies
for Term in xterms:
    subdf = dfdiff[dfdiff.uTerm == Term]
    # degeneracy-weighted mean of the experimental level energies
    emean = np.dot(subdf.degen, subdf[Ecol]) / subdf.degen.sum()
    eterms.append(emean)
dfeq1 = pd.DataFrame({'Term': xterms, 'Eterm': eterms}).sort_values('Eterm').reset_index(drop=True)
print('In the naive model of eq. (1), experimental term assignments are accepted')
print('    and each level is assumed to derive from a single term (100% term weight).')
print('Term energies (cm-1) using eq. (1) [no theoretical input]:')
display(dfeq1.style.format(fmt))
SOC1 = -1 * np.round(dfeq1.at[0, 'Eterm'], 3)
lowterm = dfeq1.at[0, 'Term']
print(f'The exptl term of lowest energy is *** {lowterm} *** with SOC1 = {SOC1} cm-1')
# exptl term of the first row of dfdiff (the lowest level)
levterm = dfdiff.uTerm.values[0]

target = levterm

if lowterm != levterm:
    # The lowest term is not the leading term of the lowest level
    SOC1alt = SOC1
    SOC1 = -1 * np.round(dfeq1[dfeq1.Term == levterm]['Eterm'].values[0], 3)
    print(f'The lowest level belongs to \t{levterm} \twith SOC1 = {SOC1} cm-1')

# Remember the experimental label for which SOC1 was evaluated.  It is always
#   defined, so that later cells can refer to it even when the target is not
#   changed below.
old_target = target
# Check for rare situation
th_term = dfdiff[dfdiff.uTerm == target].Tcalc.values[0]
if th_term != target:
    # hopefully just the enumerative prefix
    print(f'    *** changing target term from {target} to {th_term}')
    target = th_term
print()
print(f'Term {target} is selected for calculating the spin-orbit correction')
print('    to change this, assign the variable "target" to another term')

In [None]:
def term_energy_from_levels(df, term, returnDF=False):
    '''
    Given a DataFrame with columns ['J', 'Composition', 'degen', Ecol],
      where (variable) Ecol is the header for the column of exptl level energies,
    Return the term's average energy as derived from the levels [eq. (2) in pub.]:
      the mean of the level energies weighted by (degeneracy * weight of the
      term in the level's composition).
    'term' is from theory; its position in each 'Composition' is obtained
      from term_index().
    If 'returnDF', also return a DataFrame for the selected term, with the
      term weight (as a fraction, assuming 'Composition' is in percent) in a
      leading column 'weight'.
    For an invalid term (term_index() returns None) the result is None, or
      (None, None) if 'returnDF', so that the shape of the return value
      depends only upon 'returnDF'.
    '''
    iterm = term_index(term)
    if iterm is None:
        # invalid term
        return (None, None) if returnDF else None
    # weight of term "term" in each level
    termwt = np.array([tc[iterm] for tc in df.Composition.values])
    dweight = df.degen.values * termwt  # weights should include degeneracies
    # take product of experimental energies and theoretical term weights
    Eterm = np.dot(df[Ecol], dweight) / dweight.sum()
    if not returnDF:
        return Eterm
    # Construct DF showing distribution of term among levels
    cols = ['Configuration', 'uTerm', 'Tcalc', 'J', 'degen', Ecol, 'Ecalc', 'err']
    df_distrib = df[cols].copy()
    df_distrib.insert(0, 'weight', termwt / 100)  # fraction instead of percent
    return Eterm, df_distrib

In [None]:
# Report the term selected for the spin-orbit correction (variable "target")
print(f'Term of interest is {target}')

In [None]:
# Show how the target term is spread over the levels
print(f'\nDistribution of target term {target} among levels:')
Eterm, df_distrib = term_energy_from_levels(dfdiff, target, returnDF=True)
# New leading column named after the term: weight combined with degeneracy
df_distrib.insert(0, target, df_distrib.weight * df_distrib.degen)
# keep only levels with a non-negligible share, without the redundant columns
has_share = df_distrib[target] > 1.e-6
subdf = df_distrib[has_share].drop(columns=['weight', 'degen'])
styler = subdf.sort_values(target, ascending=False).style.apply(
    lambda errs: ["background: yellow" if abs(e) > warnThresh else "" for e in errs],
    subset=pd.IndexSlice[['err']])
display(styler.format(fmt))
# levels with very large errors
ibad = subdf[subdf.err.abs() > 3000]
print(f'Sum of displayed weights = {subdf[target].sum():.5f}')

In [None]:
# In case of very large errors, allow replacement of exptl energy with theoretical
#   'subdf' is the target-term table from the preceding cell; its row labels
#   are those of dfdiff.
ibad = subdf[subdf.err.abs() > 3000]
if len(ibad):
    print('Some errors in level energies are very large:')
    display(ibad.style.format(fmt))
    idxl = []
    while True:
        idx = input('Level to replace exptl energy with theoretical (blank to end): ').strip()
        if not idx:
            break
        try:
            i = int(idx)
        except ValueError:
            print(f'    "{idx}" is not an integer level index; ignored')
            continue
        if i not in dfdiff.index:
            # .at assignment with an unknown label would silently append a row
            print(f'    there is no level {i}; ignored')
            continue
        idxl.append(i)
    for i in idxl:
        dfdiff.at[i, Ecol] = dfdiff.at[i, 'Ecalc']
        dfdiff.at[i, 'err'] = 0
    if len(idxl):
        print('Energies replaced')
        ibad = ibad.index.values
        display(dfdiff.loc[ibad].style.format(fmt))

In [None]:
# In case of large errors of opposite sign, consider different match with expt
#   'subdf' is the target-term table from an earlier cell; its row labels are
#   those of dfdiff.  NOTE: this cell re-assigns the variable 'imax'.
ibad = subdf[subdf.err.abs() > 3000]
swapped = False
if len(ibad):
    # Is max ~ -min?
    if abs(ibad.err.max() + ibad.err.min()) < 1000:
        imax = ibad.err.idxmax()
        imin = ibad.err.idxmin()
        if ibad.at[imin, 'J'] == ibad.at[imax, 'J']:
            # can only switch equal J
            q = input(f'Swap the matches for rows {imin} and {imax}? ').strip().lower()
            if (not q) or q.startswith('n'):
                # blank or "no": leave the matching untouched
                print('Keeping old match')
            else:
                print('Swapping matches')
                # exchange the theoretical partners of the two exptl levels
                for col in ['Tcalc', 'Ecalc', 'leadwt', 'termwt', 'Composition']:
                    x = dfdiff.at[imin, col]
                    dfdiff.at[imin, col] = dfdiff.at[imax, col]
                    dfdiff.at[imax, col] = x
                # errors must be re-evaluated for the new pairing
                for i in [imin, imax]:
                    dfdiff.at[i, 'err'] = dfdiff.at[i, 'Ecalc'] - dfdiff.at[i, Ecol]
                swapped = True

In [None]:
# Eq. (2) of the publication: experimental level energies combined with the
#   theoretical term weights.  SOC2 is minus the resulting term energy.
Eterm, df_distrib = term_energy_from_levels(dfdiff, target, returnDF=True)
SOC2 = -Eterm
if swapped:
    # display the modified matching scheme
    subdf = df_distrib[df_distrib.weight > 1.e-6]
    styler = subdf.sort_values('weight', ascending=False).style.apply(
        lambda errs: ["background: yellow" if abs(e) > warnThresh else "" for e in errs],
        subset=pd.IndexSlice[['err']])
    print('Matching scheme after swapping:')
    display(styler.format(fmt))
print('Applying eq. (2) (experimental energies and theoretical term weights).')
print(f'For term {target}, SOC2 = {SOC2:.2f} cm-1')


In [None]:
# Summary of the alternative spin-orbit corrections
print(f'Molpro source file: {fname}\n')
print(f'Alternative values for E_so[{target}] of atom {atom}:')
print('-' * 25)
# SOC1 is evaluated for the experimental term of the lowest level, 'levterm',
#   which is always defined ('old_target' exists only if the target was renamed)
print('{:12s} {:.2f} cm-1\t (term {:s})'.format('eq (1)', SOC1, levterm))
print('{:12s} {:.2f} cm-1'.format('raw theory', SOCraw))
print('{:12s} {:.2f} cm-1'.format('eq (2)', SOC2))
print('-' * 25)
print(f'\nDifference [eq. (2)] - [eq. (1)]    = {SOC2-SOC1:.1f} cm-1')
print(f'Difference [eq. (2)] - [raw theory] = {SOC2-SOCraw:.1f} cm-1')

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [None]:
# Term energy errors as inferred from all levels
#   For every theoretical term, the level errors are averaged with weights
#   (term weight * degeneracy): signed, unsigned (absolute) and RMS.
termlist = []
wmean, wstds = [], []     # signed errors
uwmean, uwstds = [], []   # unsigned (absolute value) errors
trmse = []                # root of the weighted mean squared error
for term in set(dfdiff.Tcalc):
    eterm, df_distro = term_energy_from_levels(dfdiff, term, returnDF=True)
    if eterm is None:
        print(f'No theoretical term matches exptl {term}')
        continue
    wts = df_distro.weight.values * df_distro.degen
    uerr = np.abs(df_distro.err.values)
    m, s = chem.weighted_mean(df_distro.err, wts)
    um, us = chem.weighted_mean(uerr, wts)
    umsq, ussq = chem.weighted_mean(uerr ** 2, wts)
    for acc, val in ((termlist, term), (wmean, m), (wstds, s),
                     (uwmean, um), (uwstds, us), (trmse, np.sqrt(umsq))):
        acc.append(val)
dftermerr = pd.DataFrame({'Term': termlist, 'wmean': wmean, 'wstds': wstds,
                         'uwmean': uwmean, 'uwstds': uwstds, 'rwmse': trmse})
print('Errors in term energies (cm-1) as inferred from the full distribution')
print('    of each term over all levels')
# order the rows like the list of theoretical terms 'thterms'
dftermerr['Term'] = pd.Categorical(dftermerr['Term'], thterms)
dftermerr = dftermerr.sort_values('Term')
styler = dftermerr.style.apply(
    lambda errs: ["background: yellow" if abs(e) > warnThresh else "" for e in errs],
    subset=pd.IndexSlice[['wmean']])
styler.format(fmt)

### Optional distribution of some other term

In [None]:
# Choose any other (theoretical) term to see how it is spread over the levels
other_term = '7S' #'4P°'
Eterm, df_distrib = term_energy_from_levels(dfdiff, other_term, returnDF=True)
if Eterm is not None:
    print(f'Distribution of term {other_term} among levels:')
    # levels with non-negligible weight, largest weight first
    subdf = df_distrib[df_distrib.weight >= 1.e-6]
    display(subdf.sort_values('weight', ascending=False).style.format(fmt))
    # total weight, without and with the level degeneracies
    wtot = df_distrib.weight.sum()
    dwtot = np.dot(df_distrib.weight, df_distrib.degen)
    print('Total weight = {:.3f} ({:.3f} including degeneracies)'.format(wtot, dwtot))
    print(f'Eq. (2) term energy = {Eterm:.1f} cm-1')
else:
    # the term is not among the theoretical terms
    print(f'Cannot evaluate errors for levels within term "{other_term}"')