In [1]:
# Generate working reactions
import yaml, sys, os, random, re, glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sympy
sys.path.insert(0, '../../karlib')
import chem_subs as chem
import multirx_subs as mrx

In [2]:
# Process all available molecular YAML files
moldata, Gdict = mrx.read_all_molec_yamls()



In [3]:
target = 'dioxolane'
rxns = mrx.generate_reactions(target, moldata, Gdict)
dfrx, fmtrx = mrx.build_reactions_DF(rxns, moldata, target)

Of 15 reactions, 15 are balanced.


In [4]:
dfrx.style.format(fmtrx)

Unnamed: 0,EoF,dH(rxn),uexp,Reaction,rho_E,rho_T,rho_c
0,-271.1,-120.1,0.3,dioxolane + 3 ch4 + 2 h2o = 4 ch3oh + c2h6,120.1,5.0,17.5
1,-277.3,-3.0,0.3,dioxolane = ch4 + chocho,3.0,2.2,27.5
2,-278.0,-3.5,0.3,dioxolane = 4/3 c2h3oh + 1/3 hcooh,3.5,2.3,11.3
3,-275.0,5.3,0.2,dioxolane = co + DME,5.3,3.4,55.7
4,-281.0,12.6,0.6,dioxolane = ch3chco + h2o,12.6,3.5,9.9
5,-276.5,-16.1,0.1,dioxolane = ch2o + ch3cho,16.1,0.2,33.4
6,-277.9,26.9,0.4,dioxolane = 3/4 mefo + 1/2 meoc2h3,26.9,3.1,12.0
7,-277.0,-27.7,0.2,dioxolane = 1/2 acoh + oxirane,27.7,1.8,1.4
8,-278.7,-43.3,0.1,dioxolane = ch3oh + ketene,43.3,1.9,24.8
9,-279.0,96.3,0.6,dioxolane = etfo,96.3,2.9,8.2


In [6]:
crxns = mrx.cull_too_similar_reactions(target, trxns, moldata, Gdict, disjoint=True)
dftest, fmttest = mrx.build_reactions_DF(crxns, moldata, target)
dftest.sort_values('rho_E').style.format(fmttest)

NameError: name 'trxns' is not defined

In [None]:
def print_delta(rxns):
    # just print stuff for this notebook
    global target, moldata
    # must be a list of reactions
    if not rxns:
        # empty list
        print('No reactions')
        return
    if not isinstance(rxns[0][0], list):
        rlist = [ rxns ]
    else:
        rlist = rxns
    # are the reactions balanced?
    oklist = mrx.check_reactions_balance(rlist, Gdict)
    okrx = [r for i, r in enumerate(rlist) if oklist[i]]
    # mrx.eq5_sums() returns dH/x0, not dH
    print(f'Of {len(rlist)} reactions, {len(okrx)} are balanced.')
    calcH, calcS = mrx.eq5_sums(okrx, target, moldata)
    for rx, H in zip(okrx, calcH):
        #print(rx)   # this is ugly
        lhs = []
        rhs = []
        for pair in rx:
            mol = pair[0]
            c = pair[1]
            if c < 0:
                # left side of equation
                # present coefficients as rational numbers
                crat = sympy.Rational(-c).limit_denominator(1000)
                cstr = f'{crat}'
                lhs.append('{:s} {:s}'.format(cstr, mol))
            else:
                # right side
                crat = sympy.Rational(c).limit_denominator(1000)
                cstr = f'{crat}'
                rhs.append('{:s} {:s}'.format(cstr, mol))
        lhs = ' + '.join(lhs)
        rhs = ' + '.join(rhs)
        print('{:s} = {:s}'.format(lhs, rhs))
        print('\tchange in calc. H = {:.1f} kJ/mol'.format(-H))  # assuming x0 = -1
    return

In [None]:
target = 'phno2'
G = Gdict[target]
tstoich = G.stoichiometry(asdict=True)

In [None]:
print('Component = atom')
print(G.stoichiometry(asdict=True))
#G.printMOL('test.mol', title='molecule_components.ipynb')

In [None]:
rxn = mrx.reaction_bond_separation(target, G)
print_delta(rxn)

In [None]:
rxns = mrx.reaction_isomerization(target, Gdict)
print_delta(rxns)

In [None]:
rxn = mrx.reaction_hydration(target, Gdict)
print_delta(rxn)

In [None]:
rxn = mrx.reaction_hydrofluorination(target, Gdict)
print_delta(rxn)

In [None]:
rxn = mrx.reaction_hydrochlorination(target, Gdict)
print_delta(rxn)

In [None]:
rxn = mrx.reaction_to_elements(target, Gdict)
print_delta(rxn)

In [None]:
rxn = mrx.reaction_hydrogenation(target, Gdict)
print_delta(rxn)

In [None]:
rxn = mrx.reaction_oxygenation(target, Gdict)
print_delta(rxn)

In [None]:
df = G.list_bonds()
df

In [None]:
from sympy import nsimplify

def cull_to_disjoint_educts(rxns):
    # 'rxns' is a list of reactions, where a reaction is a list of [educt, coeff] pairs
    # reduce the list until no educt (besides the target) occurs in more than one reaction
    # the target is the first educt listed
    # Give preference to reactions with fewer educts, small integer coefficients
    nin = len(rxns)
    # score by coefficient ugliness
    ftol = 1.e-6
    rank = []
    for rxn in rxns:
        r = (len(rxn) - 1) * 2   # penalty for more educts
        for pair in rxn[1:]:
            denom = nsimplify(pair[1]).q
            if abs(denom) > 1 + ftol:
                # penalty for fraction (to encourage homologous reactions)
                r += denom
            # penalty for large coefficient (multiplies uncertainties)
            r += abs(pair[1])
        rank.append(float(r))
    idx = np.argsort(rank)
    # do the cull
    used = set()
    ikeep = []
    print('>>> lowest ranks:', np.array(rank)[idx[:5]])
    for i in idx:
        educts = set([pair[0] for pair in rxns[i][1:]])
        if educts.intersection(used) == set():
            # keep this reaction
            ikeep.append(i)
            used = used.union(educts)
    return [rxns[i] for i in ikeep]

In [48]:
def Xtabulate_Benson_groups(target, Gdict, detail, commonG=False):
    # Return a DataFrame of Benson groups in all molecules that
    #   do not contain elements alien to the target molecule and
    #   (if 'commonG') that contain at least one B-group in the target
    # 'Gdict' is a dict of Geometry()
    # 'detail' specifies granularity
    tels = set(Gdict[target].stoichiometry(asdict=True).keys())
    tgrps = set(Gdict[target].Benson_groups(detail=detail, warn=False)[0])
    grps = []  # list of all groups
    compos = {}  # dict, key = molecule
    for mol, G in Gdict.items():
        els = set(G.stoichiometry(asdict=True).keys())
        if not els <= tels:
            # alien elements
            continue
        Bg = G.Benson_groups(asdict=True, detail=detail, warn=False)
        if commonG:
            if len(tgrps.intersection(set(Bg))) < 1:
                # no B-groups in common with the target
                continue
        compos[mol] = Bg
        for grp in Bg.keys():
            if grp not in grps:
                grps.append(grp)
    # build the DataFrame
    cols = ['molec'] + grps
    df = pd.DataFrame(columns=cols)
    row = [target] + [compos[target].get(grp, 0) for grp in grps]
    df.loc[0] = row  # put target at the top
    for mol, st in compos.items():
        if mol == target:
            # don't enter twice
            continue
        row = [mol] + [st.get(grp, 0) for grp in grps]
        df.loc[len(df)] = row
    return df

In [109]:
def tabulate_Benson_groups(target, Gdict, detail):
    # Return a DataFrame of Benson groups in all molecules that
    #   could plausibly form a reaction for the target
    # 'Gdict' is a dict of Geometry()
    # 'detail' specifies granularity
    tels = set(Gdict[target].stoichiometry(asdict=True).keys())
    tgrps = set(Gdict[target].Benson_groups(detail=detail, warn=False)[0])
    grps = set()  # set of all groups (for creating DataFrame)
    compos = {}  # dict, key = molecule
    for mol, G in Gdict.items():
        els = set(G.stoichiometry(asdict=True).keys())
        if not els <= tels:
            # alien elements; discard this molecule
            continue
        Bg = G.Benson_groups(asdict=True, detail=detail, warn=False)
        compos[mol] = Bg
        bset = set(Bg.keys())
        grps = grps.union(bset)       
    # build the DataFrame 
    cols = ['molec'] + sorted(grps)
    df = pd.DataFrame(columns=cols)
    row = [target] + [compos[target].get(grp, 0) for grp in cols[1:]]
    df.loc[0] = row  # put target at the top
    freq = {}  # frequency of each B-group
    '''
    # add any molecules whose B-groups are a subset of the target's
    for mol, comp in compos.items():
        if mol == target:
            continue
        gs = set(comp.keys())
        if (len(gs) > 0) and (gs <= tgrps):
            row = [mol] + [comp.get(grp, 0) for grp in cols[1:]]
            df.loc[len(df)] = row
        # how common is each group?
        for g, n in comp.items():
            freq[g] = freq.get(g, 0) + n
    # which target groups are not found in the DF?
    missing = []
    for g in tgrps:
        n = np.count_nonzero(df[g])
        if n < 2:
            # only in the target
            missing.append(g)
    '''
    # add molecules that contain target groups 
    gset = tgrps.copy()  # set of currently used groups
    for mol, comp in compos.items():
        if mol in df['molec'].values:
            continue
        gs = set(comp.keys())
        if gs.intersection(tgrps):
            row = [mol] + [comp.get(grp, 0) for grp in cols[1:]]
            df.loc[len(df)] = row
            gset = gset.union(gs)
    # add molecules that include only the expanded set of groups
    for mol, comp in compos.items():
        if mol in df['molec'].values:
            continue
        gs = set(comp.keys())
        if (len(gs) > 0) and (gs <= gset):
            row = [mol] + [comp.get(grp, 0) for grp in cols[1:]]
            df.loc[len(df)] = row

    # make list of B-groups by rarity
    Brare = sorted(freq.keys(), key=lambda k: freq[k])
    # add molecules that share the target's rarest group
    for g0 in Brare:
        if g0 in tgrps:
            # the rarest group
            break
        
    # remove columns with all zeros
    df = df.loc[:, (df != 0).any(axis=0)]
    return df

In [110]:
detail = 0
dfBen = tabulate_Benson_groups(target, Gdict, detail=detail)
dfBen

Unnamed: 0,molec,C-(C)(H)(O),C-(C)(H)(O)2,C-(C)(H)2,C-(C)(H)2(O),C-(C)(H)3,C-(C)(O)2,C-(C)2(H)(O),C-(C)2(H)2,C-(C)2(O),C-(H)(O)2,C-(H)2(O),C-(H)2(O)2,C-(H)3(O),C-(O)3,O-(C)(H),O-(C)(O),O-(C)2
0,dioxolane,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,2
1,butyrolactone,0,0,0,1,0,1,0,2,0,0,0,0,0,0,0,0,1
2,c2h5o,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
3,ch2ch2oh,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,ch2ohoh,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,iproh,0,0,0,0,2,0,1,0,0,0,0,0,0,0,1,0,0
58,MEK,0,0,0,0,2,0,0,1,1,0,0,0,0,0,0,0,0
59,meoome,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,2,0
60,nBu,0,0,1,0,1,0,0,2,0,0,0,0,0,0,0,0,0


In [111]:
import time
start = time.time()
rxns = mrx.solve_descriptor_reactions(dfBen, maxeduct=8, verbose=True)
end = time.time()
t = end - start
print('Time = {:.0f} s = {:.1f} min = {:.2f} hr'.format(t, t/60, t/3600))

target = dioxolane
There are 61 educts besides the target
There are 17 descriptors: ['C-(C)(H)(O)', 'C-(C)(H)(O)2', 'C-(C)(H)2', 'C-(C)(H)2(O)', 'C-(C)(H)3', 'C-(C)(O)2', 'C-(C)2(H)(O)', 'C-(C)2(H)2', 'C-(C)2(O)', 'C-(H)(O)2', 'C-(H)2(O)', 'C-(H)2(O)2', 'C-(H)3(O)', 'C-(O)3', 'O-(C)(H)', 'O-(C)(O)', 'O-(C)2']
Dimen = 17, maxeduct = 8
--- neduct = 1 ---
There are 61 educt tuples
--- neduct = 2 ---
There are 1830 educt tuples
--- neduct = 3 ---
There are 35990 educt tuples
--- neduct = 4 ---
There are 521855 educt tuples
--- neduct = 5 ---
There are 5949147 educt tuples


KeyboardInterrupt: 

In [None]:
detail = 0
print('detail =', detail)
dfBgrp = mrx.educts_Benson_groups(target, Gdict, detail=detail, warn=False)
display(dfBgrp)

In [None]:
Gdict['ch2ohoh'].Benson_groups()

In [None]:
rxns = mrx.solve_descriptor_reactions(dfBgrp, verbose=True)

In [None]:
rbal = mrx.discard_unbalanced_reactions(rxns, Gdict, verbose=True)
print_delta(rbal)

In [None]:
detail = 1
dfBbonds = mrx.Benson_bonds_table(target, Gdict, detail=detail, warn=False)
display(dfBbonds)

In [None]:
import time
start = time.time()
rxns = mrx.solve_descriptor_reactions(dfBbonds, verbose=True)
end = time.time()
t = end - start
print('Time = {:.0f} s = {:.1f} min = {:.2f} hr'.format(t, t/60, t/3600))

In [None]:
rbal = mrx.discard_unbalanced_reactions(rxns, Gdict, tol=1.e-3, verbose=True)

In [None]:
crx = cull_too_similar_reactions(target, rbal, moldata, Gdict, disjoint=True)
crx

In [None]:
print_delta(rbal)

In [None]:
def all_reactions_for_target(target, Gdict, verbose=False):
    # generate all reasonable reactions for the educt
    # 'Gdict' is a dict of Geometry() objects (dict key = name of molecule)
    # Return a list of reactions, where a reaction is a list of [educt, coeff] pairs
    st = Gdict[target].stoichiometry(asdict=True)
    tels = list(st.keys())  # chemical elements in the target molecule
    tset = set(tels)
    # build DataFrame of all molecules whose elements are a subset of those of target
    cols = ['molec'] + tels
    df = pd.DataFrame(columns=cols)
    df.loc[0] = [target] + [st[el] for el in tels]
    for molec, G in Gdict.items():
        if molec == target:
            continue
        st = G.stoichiometry(asdict=True)
        if set(st.keys()) <= tset:
            # include this molecule
            df.loc[len(df)] = [molec] + [st.get(el, 0) for el in tels]
    start = time.time()
    rxns = mrx.solve_descriptor_reactions(df, verbose=verbose)
    end = time.time()
    t = end - start
    print('Time = {:.0f} s = {:.1f} min = {:.2f} hr'.format(t, t/60, t/3600))    
    return rxns, df

In [None]:
# TAKES VERY LONG TIME
#rxns, dfelem = all_reactions_for_target(target, Gdict, verbose=True)

In [None]:
rbal = mrx.discard_unbalanced_reactions(rxns, Gdict, tol=1.e-3, verbose=True)

In [None]:
print('Found {:d} reactions'.format(len(rxns)))

In [None]:
'''
def build_reactions_DF(rxns, moldata, target, rho='rho_E'):
    # given a list of reactions, return a DataFrame
    # with computed and exptl thermo (T=0) for analysis
    
    # create the DataFrame
    exptl = mrx.select_expt(moldata, T=0)  # selected exptl data
    okrx = rxn_with_expt(rxns, target, exptl)   # useable reactions
    eq6sum, uexp = mrx.eq6_sums(okrx, target, exptl)  # exptl sums needed to compute EoF
    print('>>> eq5')
    calcH, calcS = mrx.eq5_sums(okrx, target, moldata)  # slow step
    print('>>> eof')
    eof = [s5 - s6 for s5, s6 in zip(calcH, eq6sum)]
    print('counts: eof = {:d}, calcH = {:d}, uexp = {:d}, okrx = {:d}'.format(len(eof),
                                    len(calcH), len(uexp), len(okrx)))
    dfrx = pd.DataFrame({f'EoF': eof, 'dH(rxn)': calcH, 'uexp': uexp,
                     'Reaction': [reaction_string(rxn) for rxn in okrx]})
    # get values of rho variants (list of tuples)
    print('>>> rho')
    rho_E, rho_T, rho_c = mrx.gather_rho(okrx, target, moldata) 
    dfrx['rho_E'] = rho_E
    dfrx['rho_T'] = rho_T
    dfrx['rho_c'] = rho_c
    #dfrx = dfrx.sort_values('EoF')
    #fmt = {col: '{:.1f}' for col in dfrx.columns}
    #fmt['Reaction'] = '{:s}'
    #display(dfrx.style.format(fmt))
    return dfrx
'''    
def process_reactionDF(target, rho, dfrx, verbose=False):
    # analyze the data in a DataFrame from build_reactions_DF()
    print(f'Target = {target}')
    print(f'Non-uniform weighting using {rho}:')
    print('>>> wmean')
    wmean, semw, rwmsx, a, b, change, niter = mrx.nonuniform_weighting(target, rho, dfrx,
                                                            verbose=verbose)
    # combine SEMw with u_exp
    print('\tSEMw = {:.1f}, uexp = {:.1f} kJ/mol'.format(semw, rwmsx))
    unc = np.sqrt(semw**2 + rwmsx**2)  # 'rwmsx' from eq. (13)
    print('\tEoF = ({:.1f} ± {:.1f}) kJ/mol (standard uncertainty)'.format(wmean, unc))
    return dfrx, wmean, unc

In [None]:
dfrx, wmean, unc = process_reactions(rbal, moldata, target, verbose=True)

In [None]:
# consider only a fraction of reactions with lowest rho
frac = 0.5
nf = int(frac * len(dfrx))
print(f'Keeping the {nf} reactions (of {len(dfrx)}) with lowest {rho}')
dfcore = dfrx.sort_values(rho).iloc[:nf].copy()

In [None]:
plt.scatter(dfcore.rho_E, dfcore.rho_c, facecolors='none', edgecolors='b', alpha=0.1)
#plt.scatter(dfrx.rho_E, dfrx.rho_c, facecolors='none', edgecolors='b', alpha=0.1)
plt.xlabel('rho_E')
plt.ylabel('rho_c')
#plt.xlim([0, 1000])
#plt.ylim([0, 500])
plt.show()

In [None]:
T = 0.  # temperature
exptl = mrx.select_expt(moldata, T)
okrx = rxn_with_expt(rxns, target, exptl)
eq6sum, uexp = mrx.eq6_sums(okrx, target, exptl)

In [None]:
# Make DataFrame of reactions
pd.set_option('display.max_rows', 500)
calcH, calcS = mrx.eq5_sums(okrx, target, moldata)
eof = [s5 - s6 for s5, s6 in zip(calcH, eq6sum)]
neduct = max([len(rx) for rx in okrx])
cols = ['Target', 'x0']
for n in range(neduct-1):
    cols.extend([f'Educt{n+1}', f'x{n+1}'])
cols.append('{:s}H'.format(chem.DELTA))
dfrx = pd.DataFrame(columns=cols)
for rx, H in zip(rxns, calcH):
    row = []
    for ed in rx:
        row.extend(ed)
    while len(row) < len(cols) - 1:
        row.extend(['', 0])
    row.append(np.round(H, 1))
    dfrx.loc[len(dfrx)] = row
dfrx['rho_E'] = abs(dfrx[cols[-1]])
dfrx['EoF'] = eof
dfrx.sort_values('rho_E')

In [None]:
hmin = dfrx[cols[-1]].min()
hmax = dfrx[cols[-1]].max()
print('Hmin, Hmax = {:.1f}, {:.1f}'.format(hmin, hmax))

In [None]:
plt.hist(dfrx[cols[-1]], bins=100)
plt.xlim([-5000, 5000])
plt.xlabel('kJ/mol')
plt.ylabel('counts')
plt.show()

In [None]:
plt.scatter(dfrx[cols[-1]], dfrx.EoF)
#plt.xlabel(r'$\rho_E$')
plt.xlabel(cols[-1])
plt.ylabel(r'$\Delta_fH$')
#plt.xlim([0, 1000])
#plt.ylim([-500, 500])
plt.show()

In [None]:
# plot EoF vs cutoff
rhocut = np.linspace(10, 2010)
eof = [dfrx[dfrx.rho_E < cut].EoF.mean() for cut in rhocut]
plt.plot(rhocut, eof)
plt.xlabel('rho_E cutoff')
plt.ylabel('EoF(target)')
plt.show()

In [None]:
rho = 'rho_E'
print(f'Target = {target}')
print(f'Non-uniform weighting using {rho}:')
wmean, semw, rwmsx, a, b, change, niter = mrx.nonuniform_weighting(target, rho, dfrx,
                                                        verbose=False)
# combine SEMw with u_exp
print('\tSEMw = {:.1f}, uexp = {:.1f} kJ/mol'.format(semw, rwmsx))
unc = np.sqrt(semw**2 + rwmsx**2)  # 'rwmsx' from eq. (13)
print('\tEoF({:.1f} K) = ({:.1f} ± {:.1f}) kJ/mol (standard uncertainty)'.format(T, wmean, unc))

In [None]:
eq6sum

In [None]:
i = 1
print(rxns[i])
print_delta(rxns[i])

In [None]:
def rxn_with_expt(rxin, target, exptl):
    '''
    Return a list of those reactions for which
      exptl thermo data are available.
    'rxns' is a list of reactions, where a reaction is
      a list of [educt, coeff] pairs.
    'target' is the name of the target molecule,
      not required to have exptl data
    'exptl' is a dict of exptl data for molecules
    '''
    rxns = []
    for rx in rxin:
        ok = True
        for pair in rx:
            molec = pair[0]
            try:
                eof = exptl[molec]['EoF']
                unc = exptl[molec]['unc']
            except KeyError:
                ok = False
        if ok:
            rxns.append(rx)
    return rxns