# Standard Thermodynamic Quantities

These values were taken from three sources:
- [CHNOSZ](http://chnosz.net/vignettes/obigt.html), a chemistry and materials science package for [R](https://www.r-project.org/about.html). The dataframes were exported to csv files as-is.
- [Principles of Modern Chemistry, 8th ed.](https://www.amazon.com/Principles-Modern-Chemistry-David-Oxtoby-ebook/dp/B00UGDPNFI) by Oxtoby et al.

In [3]:
import os                               
import re                   
import time                 # to stall requests (just in case)
import itertools
import sympy

import numpy as np
import pandas as pd 
# import pubchempy as pcp
# import chemdataextractor as cde     # chemistry parser

# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize, sent_tokenize

from tika import parser     # the specific parser method 

from chempy import balance_stoichiometry
from chempy import Substance
from chempy import Reaction
from chempy.util import periodic

pd.set_option('display.max_colwidth', 0)    # no max column width
pd.set_option('display.max_rows', 1000)

## import CHNOSZ thermodynamic quantities csv

In [28]:
chnosz = pd.read_csv('../data/external/thermo/chnosz_thermo.csv')

In [29]:
chnosz.columns

Index(['name', 'abbrv', 'formula', 'state', 'ref1', 'ref2', 'date', 'E_units',
       'G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c', 'a4.d', 'c1.e',
       'c2.f', 'omega.lambda', 'z.T'],
      dtype='object')

In [30]:
chnosz = chnosz[['name', 'abbrv', 'formula', 'E_units', 'state', 'G', 'H', 'S', 'Cp']]
chnosz.head()

Unnamed: 0,name,abbrv,formula,E_units,state,G,H,S,Cp
0,water,,H2O,cal,liq,,,,
1,e-,,(Z-1),cal,aq,0.0,0.0,15.6166,0.0
2,H+,H+,H+,cal,aq,0.0,0.0,0.0,0.0
3,Li+,Li+,Li+,cal,aq,-69933.0,-66552.0,2.7,14.2
4,Na+,Na+,Na+,cal,aq,-62591.0,-57433.0,13.96,9.06


In [31]:
chnosz.shape

(3372, 9)

In [32]:
chnosz = chnosz[chnosz['name'].str[0] != '[']
chnosz.shape

(3210, 9)

In [34]:
chnosz.dtypes

name       object 
abbrv      object 
formula    object 
E_units    object 
state      object 
G          float64
H          float64
S          float64
Cp         float64
dtype: object

In [33]:
chnosz['E_units'].value_counts()

cal    3172
J      38  
Name: E_units, dtype: int64

In [35]:
# convert to joules
energy_columns = ['G', 'H', 'S', 'Cp']

for col in energy_columns:
    chnosz.loc[chnosz['E_units'] == 'cal', col] *= 4.184
    
chnosz.drop(columns='E_units', inplace=True)
chnosz.head()

Unnamed: 0,name,abbrv,formula,state,G,H,S,Cp
0,water,,H2O,liq,,,,
1,e-,,(Z-1),aq,0.0,0.0,65.339854,0.0
2,H+,H+,H+,aq,0.0,0.0,0.0,0.0
3,Li+,Li+,Li+,aq,-292599.672,-278453.568,11.2968,59.4128
4,Na+,Na+,Na+,aq,-261880.744,-240299.672,58.40864,37.90704


In [36]:
chnosz.at[0, 'G'] = -237180
chnosz.at[0, 'H'] = -285830
chnosz.at[0, 'S'] = 69.91
chnosz.at[0, 'Cp'] = 75.29

just checking to make sure appropriate values are in joules

In [37]:
chnosz.loc[4, 'G']

-261880.744

In [38]:
chnosz.loc[779, 'G']

-713730.0

In [39]:
chnosz['state'].value_counts()

aq     1807
cr     601 
liq    488 
gas    285 
cr2    14  
cr3    9   
cr7    1   
cr4    1   
cr5    1   
cr8    1   
cr9    1   
cr6    1   
Name: state, dtype: int64

In [40]:
value_dict = {
    'aq': '(aq)',
    'cr': '(s)',
    'liq': '(l)',
    'gas': '(g)',
    'cr2': '(s, II)',
    'cr3': '(s, III)',
    'cr4': '(s, IV)',
    'cr5': '(s, V)',
    'cr6': '(s, VI)',
    'cr7': '(s, VII)',
    'cr8': '(s, VIII)',
    'cr9': '(s, IX)',
    'g': '(g)',
}

chnosz['state'].replace(value_dict, inplace=True)

In [41]:
chnosz['formula'] = chnosz['formula'].astype(str) + chnosz['state'].astype(str)
chnosz.drop(columns='state', inplace=True)
chnosz.head()

Unnamed: 0,name,abbrv,formula,G,H,S,Cp
0,water,,H2O(l),-237180.0,-285830.0,69.91,75.29
1,e-,,(Z-1)(aq),0.0,0.0,65.339854,0.0
2,H+,H+,H+(aq),0.0,0.0,0.0,0.0
3,Li+,Li+,Li+(aq),-292599.672,-278453.568,11.2968,59.4128
4,Na+,Na+,Na+(aq),-261880.744,-240299.672,58.40864,37.90704


In [42]:
chnosz.at[1, 'formula'] = 'e-(aq)'

In [46]:
chnosz.shape

(3210, 7)

In [47]:
chnosz.isna().sum()

name       0   
abbrv      2266
formula    0   
G          121 
H          151 
S          132 
Cp         245 
dtype: int64

In [48]:
chnosz = chnosz.dropna(subset=['G'])
chnosz.shape

(3089, 7)

## import thermodynamic quantities from other sources

## NOTE TO SELF: is there a way to import from common py file?

In [19]:
def get_text(file, sleep=0, counter=0):
    if counter == 2:        # so we stop the recursive function
        pass
    # grab the raw text using parser.from_file()
    raw = parser.from_file(file)
    status = raw['status']          # returns the status code from tika server
    # if things go well, return the raw text
    if status == 200:
        print(f"'{file}' successfully opened!")
        return raw['content']
    # if things don't go well, pause for five seconds and try again
    # we might not need this code, but it's useful for other server calls
    else:
        print(f'! ! ! ! error code {status} ! ! ! !')
        print(f'! ! ! ! trying again ! ! ! !')
        time.sleep(5)
        counter += 1
        # repeats grab_text up to twice
        return get_text(file, counter=counter)

In [20]:
oxtobya = get_text('../data/external/thermo/oxtoby8a.pdf')
oxtobyb = get_text('../data/external/thermo/oxtoby8b.pdf')

'../data/external/thermo/oxtoby8a.pdf' successfully opened!
'../data/external/thermo/oxtoby8b.pdf' successfully opened!


In [21]:
clean = re.sub('—', 'nan', oxtobya)
clean = re.sub('\\ue02c', 'l', clean)
clean = re.sub(r'\n\n[I]*\s*', r'RROOWW', clean)
clean = re.sub(r'([\d]+)\n([A-Z]+)', r'\1RROOWW\2', clean)
clean = re.sub(r'(nan)\s*\n([A-Z]+)', r'\1RROOWW\2', clean)
clean = re.sub('\\ue031', '+', clean)
clean = re.sub('([\d])\+\(', r'+\1(', clean)
clean = re.sub('\\ue032', '-', clean)
clean = re.sub('([\d])\-\(', r'-\1(', clean)
clean = re.sub('\(([aqslg]+)\,\s([\w]+)', r'(\1,\2', clean)
clean = re.sub('[\s]+', ' ', clean)
clean = re.sub(' mol-1', '', clean)
# clean = re.sub('Substance.*mol', 'HHEEAADDEERR', clean)
clean = re.split('RROOWW', clean)
clean = [re.split(' ', r) for r in clean]
clean = [r for r in clean if len(r) == 5]
oxtobya_clean = clean

In [22]:
clean = re.sub('—', 'nan', oxtobyb)
clean = re.sub('\\ue02c', 'l', clean)
clean = re.sub(r'\n\n[I]*\s*', r'RROOWW', clean)
clean = re.sub(r'([\d]+)\n([A-Z]+)', r'\1RROOWW\2', clean)
clean = re.sub(r'(nan)\s*\n([A-Z]+)', r'\1RROOWW\2', clean)
clean = re.sub('\\ue031', '+', clean)
clean = re.sub('([\d])\+\(', r'+\1(', clean)
clean = re.sub('\\ue032', '-', clean)
clean = re.sub('([\d])\-\(', r'-\1(', clean)
clean = re.sub('\(([aqslg]+)\,\s([\w]+)', r'(\1,\2', clean)
clean = re.sub('[\s]+', ' ', clean)
clean = re.sub(' mol-1', '', clean)
# clean = re.sub('Substance.*mol', 'HHEEAADDEERR', clean)
clean = re.split('RROOWW', clean)
clean = [re.split(' ', r) for r in clean]
clean = [r for r in clean if len(r) == 5]
oxtobyb_clean = clean

In [23]:
oxtoby = oxtobya_clean + oxtobyb_clean
oxtoby[:20]

[['H(g)', '217.96', '114.60', '203.26', '20.78'],
 ['H2(g)', '0', '130.57', '0', '28.82'],
 ['H+(aq)', '0', '0', '0', '0'],
 ['H3O+(aq)', '-285.83', '69.91', '-237.18', '75.29'],
 ['Li(s)', '0', '29.12', '0', '24.77'],
 ['Li(g)', '159.37', '138.66', '126.69', '20.79'],
 ['Li+(aq)', '-278.49', '13.4', '-293.31', '68.6'],
 ['LiH(s)', '-90.54', '20.01', '-68.37', '27.87'],
 ['Li2O(s)', '-597.94', '37.57', '-561.20', '54.10'],
 ['LiF(s)', '-615.97', '35.65', '-587.73', '41.59'],
 ['LiCl(s)', '-408.61', '59.33', '-384.39', '47.99'],
 ['LiBr(s)', '-351.21', '74.27', '-342.00', 'nan'],
 ['LiI(s)', '-270.41', '86.78', '-270.29', '51.04'],
 ['Na(s)', '0', '51.21', '0', '28.24'],
 ['Na(g)', '107.32', '153.60', '76.79', '20.79'],
 ['Na+(aq)', '-240.12', '59.0', '-261.90', '46.4'],
 ['Na2O(s)', '-414.22', '75.06', '-375.48', '69.12'],
 ['NaOH(s)', '-425.61', '64.46', '-379.53', '59.54'],
 ['NaF(s)', '-573.65', '51.46', '-543.51', '48.86'],
 ['NaCl(s)', '-411.15', '72.13', '-384.15', '50.50']]

In [24]:
oxtoby_df = pd.DataFrame(oxtoby, columns=['formula', 'H', 'S', 'G', 'Cp'])
oxtoby_df.head()

Unnamed: 0,formula,H,S,G,Cp
0,H(g),217.96,114.6,203.26,20.78
1,H2(g),0.0,130.57,0.0,28.82
2,H+(aq),0.0,0.0,0.0,0.0
3,H3O+(aq),-285.83,69.91,-237.18,75.29
4,Li(s),0.0,29.12,0.0,24.77


In [25]:
energy_columns = ['G', 'H', 'S', 'Cp']

for col in energy_columns:
    oxtoby_df[col] = oxtoby_df[col].astype(float)
    
oxtoby_df.head()

Unnamed: 0,formula,H,S,G,Cp
0,H(g),217.96,114.6,203.26,20.78
1,H2(g),0.0,130.57,0.0,28.82
2,H+(aq),0.0,0.0,0.0,0.0
3,H3O+(aq),-285.83,69.91,-237.18,75.29
4,Li(s),0.0,29.12,0.0,24.77


In [26]:
oxtoby_df.loc[:, 'H'] *= 1000
oxtoby_df.loc[:, 'G'] *= 1000
oxtoby_df.head()

Unnamed: 0,formula,H,S,G,Cp
0,H(g),217960.0,114.6,203260.0,20.78
1,H2(g),0.0,130.57,0.0,28.82
2,H+(aq),0.0,0.0,0.0,0.0
3,H3O+(aq),-285830.0,69.91,-237180.0,75.29
4,Li(s),0.0,29.12,0.0,24.77


In [27]:
oxtoby_df = oxtoby_df[['formula', 'G', 'H', 'S', 'Cp']]
oxtoby_df.head()

Unnamed: 0,formula,G,H,S,Cp
0,H(g),203260.0,217960.0,114.6,20.78
1,H2(g),0.0,0.0,130.57,28.82
2,H+(aq),0.0,0.0,0.0,0.0
3,H3O+(aq),-237180.0,-285830.0,69.91,75.29
4,Li(s),0.0,0.0,29.12,24.77


In [43]:
oxtoby_df.isna().sum()

formula    0 
G          2 
H          0 
S          2 
Cp         73
dtype: int64

In [49]:
oxtoby_df = oxtoby_df.dropna(subset=['G'])

In [50]:
thermo_df = pd.concat([chnosz, oxtoby_df], ignore_index=True).fillna(np.NaN)
thermo_df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,Cp,G,H,S,abbrv,formula,name
0,75.29,-237180.0,-285830.0,69.91,,H2O(l),water
1,0.0,0.0,0.0,65.339854,,e-(aq),e-
2,0.0,0.0,0.0,0.0,H+,H+(aq),H+
3,59.4128,-292599.672,-278453.568,11.2968,Li+,Li+(aq),Li+
4,37.90704,-261880.744,-240299.672,58.40864,Na+,Na+(aq),Na+


In [51]:
thermo_df = thermo_df[['formula', 'abbrv', 'name', 'G', 'H', 'S', 'Cp']]
thermo_df.tail()

Unnamed: 0,formula,abbrv,name,G,H,S,Cp
3444,IBr(g),,,3710.0,40840.0,258.66,36.44
3445,Ne(g),,,0.0,0.0,146.22,20.79
3446,Ar(g),,,0.0,0.0,154.73,20.79
3447,Kr(g),,,0.0,0.0,163.97,20.79
3448,Xe(g),,,0.0,0.0,169.57,20.79


In [52]:
thermo_df.shape

(3449, 7)

In [53]:
thermo_df.drop_duplicates(subset='formula', keep='last', inplace=True)
thermo_df.reset_index(drop=True, inplace=True)
thermo_df.shape

(2837, 7)

In [54]:
thermo_df[thermo_df['formula'].str.contains('\.')].index

Int64Index([1765, 1766, 1767, 1770, 1785, 1787, 2596], dtype='int64')

In [55]:
thermo_df = thermo_df.drop(index=thermo_df[thermo_df['formula'].str.contains('\.')].index)
thermo_df.reset_index(drop=True, inplace=True)
thermo_df.shape

(2830, 7)

In [56]:
thermo_df.tail()

Unnamed: 0,formula,abbrv,name,G,H,S,Cp
2825,IBr(g),,,3710.0,40840.0,258.66,36.44
2826,Ne(g),,,0.0,0.0,146.22,20.79
2827,Ar(g),,,0.0,0.0,154.73,20.79
2828,Kr(g),,,0.0,0.0,163.97,20.79
2829,Xe(g),,,0.0,0.0,169.57,20.79


We'll fill in some of these `abbrv` fields with just the formula.

In [111]:
def formula_state_separator(formula, keep_state=False):
    '''
    Separates the state from a formula string.
    
    --Parameters--
    formula:        str
        a string of a single chemical formula
    
    --Output--
    tuple (str)
        
    --Examples--
    >>> formula_state_separator('NaCl(aq)')
    ('NaCl', 'aq')
    
    >>> formula_state_separator('NaCl')
    'NaCl'
    '''
    try:
        regex = re.search('(?<=\()[aglsq]+', formula)
        formula = formula[:regex.start() - 1]
        if keep_state:
            state = regex.group(0)
            return formula, state
        else:
            return formula
    except:
        return formula

In [112]:
formulas = [formula_state_separator(f) for f in thermo_df['formula'].astype(str)]
formulas[:5]

['e-', 'HCO3-', 'NO3-', 'NO2-', 'NH4+']

In [59]:
new_list = []
for a, f in zip(list(thermo_df['abbrv']), formulas):
    if pd.isna(a):
        new_list.append(f)
    else:
        new_list.append(a)
new_list[:10]

['e-',
 'HCO3-',
 'NO3-',
 'NO2-',
 'NH4+',
 'H2PO4-',
 'HSO3-',
 'HSO4-',
 'S2O3-2',
 'S2O8-2']

In [60]:
thermo_df['abbrv'] = new_list

In [61]:
thermo_df.head()

Unnamed: 0,formula,abbrv,name,G,H,S,Cp
0,e-(aq),e-,e-,0.0,0.0,65.339854,0.0
1,HCO3-(aq),HCO3-,HCO3-,-586939.888,-689933.232,98.44952,-35.39664
2,NO3-(aq),NO3-,NO3-,-110905.288,-206810.936,146.94208,-68.6176
3,NO2-(aq),NO2-,NO2-,-32216.8,-104600.0,123.0096,-97.4872
4,NH4+(aq),NH4+,NH4+,-79454.16,-133260.4,111.16888,65.85616


In [62]:
thermo_df.tail()

Unnamed: 0,formula,abbrv,name,G,H,S,Cp
2825,IBr(g),IBr,,3710.0,40840.0,258.66,36.44
2826,Ne(g),Ne,,0.0,0.0,146.22,20.79
2827,Ar(g),Ar,,0.0,0.0,154.73,20.79
2828,Kr(g),Kr,,0.0,0.0,163.97,20.79
2829,Xe(g),Xe,,0.0,0.0,169.57,20.79


In [64]:
thermo_df.to_csv('../data/processed/thermo_df.csv', index=False)

## TO DO: maybe define a function to add missing substances

## replicate CHNOSZ stoichiometry csv

Originally, I had imported the `CHNOSZ` stoichiometry csv. However, I found the need to update it with whatever new entries were present in `thermo`.

The stoichiometry csv is formatted so that each formula is count-vectorized by element.

In [65]:
stoich_og = pd.read_csv('../data/external/thermo/chnosz_stoich.csv')
stoich_og.head()

Unnamed: 0.1,Unnamed: 0,Ag,Al,Ar,As,Au,B,Ba,Be,Bi,...,Tm,U,V,W,Xe,Y,Yb,Z,Zn,Zr
0,H2O,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,(Z-1),0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1,0,0
2,H+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Li+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Na+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


For my workflow, we will rename the columns by atomic number and have charge occupy column `0` to be in-line with how `chempy` works. We'll recreate this by scanning the formulas in `thermo` and then vectorizing the formulas.

In [67]:
thermo_df.loc[10, 'formula']

'ClO3-(aq)'

In [70]:
chlorate = Substance.from_formula(thermo_df.loc[10, 'formula'])
chlorate.composition

{17: 1, 8: 3, 0: -1}

In [71]:
for k, v in chlorate.composition.items():
    print (k, v)

17 1
8 3
0 -1


Using `chempy` it becomes quite simple to vectorize.

From the composition dictionary:

- key $\Rightarrow$ `stoich[key]`  
- value $\Rightarrow$ `stoich.at[index, key]`

In [74]:
formulas = set(thermo_df['formula'])

In [81]:
stoich_df = pd.DataFrame(formulas)
stoich_df.head()

Unnamed: 0,0
0,MnC6H8O4(aq)
1,FeF+(aq)
2,C18H30(g)
3,Al+3(aq)
4,DyF3(aq)


In [82]:
# naming the column -1 for now so we can sort them later

stoich_df.columns = [-1]
stoich_df.head()

Unnamed: 0,-1
0,MnC6H8O4(aq)
1,FeF+(aq)
2,C18H30(g)
3,Al+3(aq)
4,DyF3(aq)


In [83]:
for i, f in enumerate(stoich_df[-1]):
    try:
        sub = Substance.from_formula(f)
        for k, v in sub.composition.items():
            stoich_df.at[i, k] = v
    except:
        pass
        
stoich_df.head()

Unnamed: 0,-1,25,6,1,8,26,9,0,13,66,...,18,22,2,90,75,54,86,36,43,87
0,MnC6H8O4(aq),1.0,6.0,8.0,4.0,,,,,,...,,,,,,,,,,
1,FeF+(aq),,,,,1.0,1.0,1.0,,,...,,,,,,,,,,
2,C18H30(g),,18.0,30.0,,,,,,,...,,,,,,,,,,
3,Al+3(aq),,,,,,,3.0,1.0,,...,,,,,,,,,,
4,DyF3(aq),,,,,,3.0,,,1.0,...,,,,,,,,,,


In [84]:
stoich_df = stoich_df[sorted(stoich_df.columns)]
stoich_df.head()

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,...,80,81,82,83,86,87,88,90,91,92
0,MnC6H8O4(aq),,8.0,,,,,6.0,,4.0,...,,,,,,,,,,
1,FeF+(aq),1.0,,,,,,,,,...,,,,,,,,,,
2,C18H30(g),,30.0,,,,,18.0,,,...,,,,,,,,,,
3,Al+3(aq),3.0,,,,,,,,,...,,,,,,,,,,
4,DyF3(aq),,,,,,,,,,...,,,,,,,,,,


In [92]:
stoich_df.rename(columns={-1: 'formula'}, inplace=True)

In [93]:
stoich_df.fillna(0, inplace=True)
stoich_df.head()

Unnamed: 0,formula,0,1,2,3,4,5,6,7,8,...,80,81,82,83,86,87,88,90,91,92
0,MnC6H8O4(aq),0.0,8.0,0.0,0.0,0.0,0.0,6.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,FeF+(aq),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C18H30(g),0.0,30.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Al+3(aq),3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DyF3(aq),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
stoich_df.to_csv('../data/processed/stoich_df.csv', index=False)

## playground for writing functions using thermo tables

The cells below show my thought process exploring how to relate the `chempy` library with the thermodynamic tables. The goals are as follows:

- The most likely chemical reaction is highly correlated with the reaction that has the greatest loss of Gibbs free energy (given by parameter 'G' in `thermo`.
- We iterate through the different possible products so long as their combination allows for balanced stoichiometry (having the same number of each element on both sides of the equation)

We will attempt to predict the following reaction:

$$ 2 Na(s) + 2 H_{2}O(l) \longrightarrow 2 NaOH(aq) + H_{2}(g) $$

In [87]:
reactants = ['Na', 'H2O']

In [88]:
water = Substance.from_formula('H2O')
[*water.composition]

[1, 8]

When searching for possible products, we want to ignore all compounds that have elements outside of sodium, hydrogen, or oxygen.

In [89]:
z_ignore = ['formula']
for r in reactants:
    s = Substance.from_formula(r)
    z_ignore += [*s.composition]
z_ignore = set(z_ignore)
z_ignore

{1, 11, 8, 'formula'}

In [90]:
column_mask = [col for col in stoich.columns if col not in z_ignore]
print(column_mask)

[0, 2, 3, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 78, 79, 80, 81, 82, 83, 86, 87, 88, 90, 91, 92]


In [113]:
# https://stackoverflow.com/questions/22649693/

stoich_temp = stoich_df.copy()

for col in column_mask:
    stoich_temp = stoich_temp[stoich_temp[col] == 0]

stoich_temp = stoich_temp.loc[(stoich_temp.drop(columns=['formula'])!=0).any(axis=1)]

stoich_temp = stoich_temp[z_ignore]
stoich_temp

Unnamed: 0,8,1,11,formula
49,1.0,2.0,0.0,H2O(g)
61,1.0,0.0,2.0,Na2O(s)
172,0.0,0.0,1.0,Na(g)
289,1.0,2.0,0.0,"H2O(s, VIII)"
569,1.0,2.0,0.0,H2O(s)
576,2.0,0.0,0.0,O2(g)
604,1.0,0.0,0.0,O(g)
854,0.0,0.0,1.0,Na(s)
943,2.0,2.0,0.0,H2O2(aq)
1006,2.0,2.0,0.0,H2O2(l)


In [116]:
# chemical reactions don't have the same species on both sides of the arrow

candidates = set([formula_state_separator(f) for f in set(stoich_temp['formula'])\
              if formula_state_separator(f) not in map(formula_state_separator, reactants)])
candidates

{'H', 'H2', 'H2O2', 'Na2O', 'NaOH', 'O', 'O2', 'O3'}

In [117]:
# most chemical reactions don't form more than four different chemical species
# we'll play it safe and just make the maximum number 3 + num_reactants

combinations = []
max_length = min(len(reactants) + 4, 7)
for i in range(1, max_length):
    combinations += list(itertools.combinations(candidates, i))
combinations

[('H2',),
 ('O3',),
 ('H2O2',),
 ('O',),
 ('Na2O',),
 ('NaOH',),
 ('H',),
 ('O2',),
 ('H2', 'O3'),
 ('H2', 'H2O2'),
 ('H2', 'O'),
 ('H2', 'Na2O'),
 ('H2', 'NaOH'),
 ('H2', 'H'),
 ('H2', 'O2'),
 ('O3', 'H2O2'),
 ('O3', 'O'),
 ('O3', 'Na2O'),
 ('O3', 'NaOH'),
 ('O3', 'H'),
 ('O3', 'O2'),
 ('H2O2', 'O'),
 ('H2O2', 'Na2O'),
 ('H2O2', 'NaOH'),
 ('H2O2', 'H'),
 ('H2O2', 'O2'),
 ('O', 'Na2O'),
 ('O', 'NaOH'),
 ('O', 'H'),
 ('O', 'O2'),
 ('Na2O', 'NaOH'),
 ('Na2O', 'H'),
 ('Na2O', 'O2'),
 ('NaOH', 'H'),
 ('NaOH', 'O2'),
 ('H', 'O2'),
 ('H2', 'O3', 'H2O2'),
 ('H2', 'O3', 'O'),
 ('H2', 'O3', 'Na2O'),
 ('H2', 'O3', 'NaOH'),
 ('H2', 'O3', 'H'),
 ('H2', 'O3', 'O2'),
 ('H2', 'H2O2', 'O'),
 ('H2', 'H2O2', 'Na2O'),
 ('H2', 'H2O2', 'NaOH'),
 ('H2', 'H2O2', 'H'),
 ('H2', 'H2O2', 'O2'),
 ('H2', 'O', 'Na2O'),
 ('H2', 'O', 'NaOH'),
 ('H2', 'O', 'H'),
 ('H2', 'O', 'O2'),
 ('H2', 'Na2O', 'NaOH'),
 ('H2', 'Na2O', 'H'),
 ('H2', 'Na2O', 'O2'),
 ('H2', 'NaOH', 'H'),
 ('H2', 'NaOH', 'O2'),
 ('H2', 'H', 'O2'),
 ('

Let's see if any of these combinations result in a good balanced equation:

In [118]:
for comb in combinations:    
    try:
        print(balance_stoichiometry(reactants, comb))
    except:
        pass

(OrderedDict([('Na', 2), ('H2O', 1)]), OrderedDict([('H2', 1), ('Na2O', 1)]))
(OrderedDict([('Na', 2), ('H2O', 2)]), OrderedDict([('H2', 1), ('NaOH', 2)]))
(OrderedDict([('Na', 6), ('H2O', 3)]), OrderedDict([('O3', -1), ('NaOH', 6)]))
(OrderedDict([('Na', 2), ('H2O', -1)]), OrderedDict([('H2O2', -1), ('Na2O', 1)]))
(OrderedDict([('Na', 2), ('H2O', 1)]), OrderedDict([('O', -1), ('NaOH', 2)]))
(OrderedDict([('Na', 2), ('H2O', 1)]), OrderedDict([('Na2O', 1), ('H', 2)]))
(OrderedDict([('Na', 1), ('H2O', 1)]), OrderedDict([('NaOH', 1), ('H', 1)]))
(OrderedDict([('Na', -4), ('H2O', -2)]), OrderedDict([('NaOH', -4), ('O2', 1)]))
(OrderedDict([('Na', 2*x1), ('H2O', x1 + 3)]), OrderedDict([('H2', x1 + 3), ('O3', 1), ('Na2O', x1)]))
(OrderedDict([('Na', 2*x1), ('H2O', 2*x1 + 6)]), OrderedDict([('H2', x1 + 6), ('O3', 2), ('NaOH', 2*x1)]))
(OrderedDict([('Na', 2*x1), ('H2O', x1 + 2)]), OrderedDict([('H2', x1 + 1), ('H2O2', 1), ('Na2O', x1)]))
(OrderedDict([('Na', 2*x1), ('H2O', 2*x1 + 4)]), Ordere

(OrderedDict([('Na', 6*x1 + 12*x2), ('H2O', 3*x1)]), OrderedDict([('O3', -x1 - 2*x2 - 2), ('O', 6), ('Na2O', 6*x2), ('NaOH', 6*x1)]))
(OrderedDict([('Na', 12*x2), ('H2O', 3*x1)]), OrderedDict([('O3', x1 - 2*x2 - 2), ('O', 6), ('Na2O', 6*x2), ('H', 6*x1)]))
(OrderedDict([('Na', 6*x2), ('H2O', 3*x1 + 3*x2)]), OrderedDict([('O3', x1 - x2 - 2), ('O', 6), ('NaOH', 6*x2), ('H', 6*x1)]))
(OrderedDict([('Na', 6*x2), ('H2O', 3*x2)]), OrderedDict([('O3', -2*x1 - x2 - 1), ('O', 3), ('NaOH', 6*x2), ('O2', 3*x1)]))
(OrderedDict([('Na', 6*x2 + 12), ('H2O', 3*x1 + 3*x2)]), OrderedDict([('O3', x1 - x2 - 2), ('Na2O', 6), ('NaOH', 6*x2), ('H', 6*x1)]))
(OrderedDict([('Na', 6*x2 + 6), ('H2O', 3*x2)]), OrderedDict([('O3', -2*x1 - x2 - 1), ('Na2O', 3), ('NaOH', 6*x2), ('O2', 3*x1)]))
(OrderedDict([('Na', 6), ('H2O', 3*x2)]), OrderedDict([('O3', -2*x1 + x2 - 1), ('Na2O', 3), ('H', 6*x2), ('O2', 3*x1)]))
(OrderedDict([('Na', 3), ('H2O', 3*x2 + 3/2)]), OrderedDict([('O3', -2*x1 + x2 - 1/2), ('NaOH', 3), ('H',

(OrderedDict([('Na', 6*x3), ('H2O', 3*x2 + 3)]), OrderedDict([('O3', -2*x1 + x2 - x3 - 1), ('H2O2', 3), ('Na2O', 3*x3), ('H', 6*x2), ('O2', 3*x1)]))
(OrderedDict([('Na', 6*x3), ('H2O', 3*x2 + 3*x3 + 3)]), OrderedDict([('O3', -2*x1 + x2 - x3 - 1), ('H2O2', 3), ('NaOH', 6*x3), ('H', 6*x2), ('O2', 3*x1)]))
(OrderedDict([('Na', 6*x2 + 12*x3), ('H2O', 3*x1 + 3*x2)]), OrderedDict([('O3', x1 - x2 - 2*x3 - 2), ('O', 6), ('Na2O', 6*x3), ('NaOH', 6*x2), ('H', 6*x1)]))
(OrderedDict([('Na', 6*x2 + 6*x3), ('H2O', 3*x2)]), OrderedDict([('O3', -2*x1 - x2 - x3 - 1), ('O', 3), ('Na2O', 3*x3), ('NaOH', 6*x2), ('O2', 3*x1)]))
(OrderedDict([('Na', 6*x3), ('H2O', 3*x2)]), OrderedDict([('O3', -2*x1 + x2 - x3 - 1), ('O', 3), ('Na2O', 3*x3), ('H', 6*x2), ('O2', 3*x1)]))
(OrderedDict([('Na', 6*x3), ('H2O', 3*x2 + 3*x3)]), OrderedDict([('O3', -2*x1 + x2 - x3 - 1), ('O', 3), ('NaOH', 6*x3), ('H', 6*x2), ('O2', 3*x1)]))
(OrderedDict([('Na', 6*x3 + 6), ('H2O', 3*x2 + 3*x3)]), OrderedDict([('O3', -2*x1 + x2 - x3 - 

In order for an equation to be properly balanced, each coefficient (dictionary values here) must be a positive number, and oftentimes we balance so that every coefficient is a whole number. We have to filter out the balanced instances where we get negative coefficients (such as the equation with `H2O2` and `Na2O` as products).

Notice also that `sympy` has relative coefficients listed (meaning the equation would be balanced for any whole number `x1`, for example). We will want to filter these out too, but may consider including them for a reach goal.

In [119]:
np.array(list(balance_stoichiometry(reactants, ('H2O2', 'Na2O'))[0].values())) >= 1

array([ True, False])

In [120]:
# there might be a more elegant way of doing this. round down to zero if any instance is false.

np.floor((np.array(list(balance_stoichiometry(reactants, ('H2O2', 'Na2O'))[0].values())) >= 1).mean())

0.0

In [121]:
np.array(list(balance_stoichiometry(reactants, ('H2', 'O2', 'NaOH'))[0].values()))

array([2*x1, 2*x1 + 4], dtype=object)

In [122]:
np.array([isinstance(i, sympy.numbers.Number) for i in list(balance_stoichiometry(reactants, ('H2', 'O2', 'NaOH'))[0].values())])

array([False, False])

## functions

The above code has been condensed into several functions:

- `Z_unique`: returns a list of unique atomic numbers present in a list of substances
- `stoich_filter`: filters the `stoich` dataframe to return formulas that only have the elements described by `Z_unique`
- `check_coefficients`: checks if all coefficients are positive and non-relational once an equation has been balanced
- `formula_state_separator`: in the case that a formula is formatted with its corresponding state (eg: `NaCl(s)`), return a tuple of the formula and the state.
- `get_gibbs`: from the results of `formula_state_separator`, find the exact free energy value for the substance specified. If state is not specified, find the lowest free energy value for formulas that have multiple entries (since that is the most likely state under standard conditions.
- `possibility_reducer`: sometimes we get too many results from `stoich_filter`. In general, substances with lower $\Delta G$ values are more likely to be products. However, very large, complex molecules with low $\Delta G$ values are still not very likely, so one (imperfect) way to normalize for that is to divide by the mass of the compound. The jury is still out if this is a good way to filter.
- `standard_gibbs_free_energy`: calculates the overall $\Delta G$ change under standard conditions.
- `reaction_predictor`: takes a list of reactants, iterates through the different possibilities (using `stoich_filter`, takes valid combinations using `check_coefficients`, and calculates $\Delta G$ values using `thermo`. Returns the reaction with the lowest $\Delta G$ value.

In [123]:
def Z_unique(substances):
    '''
    Returns a set representing unique atomic numbers present within a list of
    chemical formulas.
    
    --Parameters--
    substances:     iterable (str)
        any iterable containing strings with valid chemical formulas
    
    --Output--
    set (int)
        atomic numbers of each unique element present in substances
        
    --Example--
    >>> Z_unique(['CH4', 'H2O'])
    {1, 6, 8}
    '''
    composition = []
    for s in substances:
        sub = Substance.from_formula(s)
        composition += [*sub.composition]
    return set(composition)

In [126]:
Z_unique(reactants)

{1, 8, 11}

In [184]:
def stoich_filter(substances, df=False, thorough=False):
    '''
    Returns a masked copy of the stoich dataframe containing elements that
    only contain the elements present in substances. 
    
    --Parameters--
    substances:     iterable (str)
        any iterable containing strings with valid chemical formulas
    
    --Output--
    DataFrame
    '''
    stoich_temp = stoich.copy()
    
    # mask to keep the charge and formula columns in final dataframe
    z_keep = [0, 'formula'] + list(Z_unique(substances))
    
    # get all other columns
    column_mask = [col for col in stoich.columns if col not in z_keep]
    for col in column_mask:
        # return the dataframe where these columns are all 0
        stoich_temp = stoich_temp[stoich_temp[col] == 0]
    
    # keep the columns where it's not all zero
    stoich_temp = stoich_temp.loc[(stoich_temp.drop(columns=['formula'])!=0).any(axis=1)]
    
    # return the dataframe with the columns we want to keep
    if df:
        return stoich_temp[z_keep]
    else:
        stoich_list = list(stoich_temp['formula'])
        if thorough:
            return set([f for f in stoich_list if f not in substances])
        else:
            stoich_list = [formula_state_separator(f) for f in stoich_list]
            substances = [formula_state_separator(s) for s in substances]
            return set([state_predictor(f) for f in stoich_list if f not in substances])
        
        

In [185]:
stoich_filter(reactants)

{'H(g)',
 'H+(aq)',
 'H2(g)',
 'H2O2(aq)',
 'H3O+(aq)',
 'HO2-(aq)',
 'Na+(aq)',
 'Na2O(s)',
 'NaOH(aq)',
 'O(g)',
 'O2(g)',
 'O3(g)',
 'OH-(aq)',
 'e-(aq)'}

In [148]:
def check_coefficients(reactants, products):
    '''
    Checks whether a possible reactant/product combination would result in a
    valid balanced chemical equation.
    
    --Parameters--
    reactants:      iterable (str)
    products:       iterable (str)
        any iterable containing strings with valid chemical formulas
    
    --Output--
    bool
        
    --Examples--
    >>> check_coefficients(['CH4', 'H2O'], ['CO', 'H2'])
    True
    
    >>> check_coefficients(['CH4', 'H2O'], ['CO2', 'H2O2'])
    False
    
    >>> check_coefficients(['CH4', 'H2O'], ['NaOH'])
    False
    '''
    try:
        balance = balance_stoichiometry(reactants, products)
        
        # list all the coefficients out
        reac_coef = list(balance[0].values()) + list(balance[1].values())
        
        # rounds to zero if any of the coefficients are less than 1
        is_positive = np.floor((np.array(reac_coef) >= 1).mean()).astype(bool)
        
        # rounds to zero if any of the coefficients are sympy relational class
        is_definite = np.floor(np.array([isinstance(i, sympy.numbers.Number) for i in reac_coef])\
                               .mean()).astype(bool)
        
        return is_positive and is_definite
    except:
        return False   

In [174]:
check_coefficients(reactants, ['Na2O', 'H2O2'])

False

In [159]:
def get_gibbs(formula, df=False):     
    '''
    Retrieves the free energy value, in J, of a single substance
    
    --Parameters--
    formula:        str
        a string of a single chemical formula
    
    --Output--
    list (float)    
        
    --Examples--
    >>> get_gibbs('NaCl(aq)')
    array([-388735.44])
    '''
    if (thermo_df['formula'] == formula).max():
        matches = thermo_df[thermo_df['formula'] == formula]
    else:
        matches = thermo_df[thermo_df['formula'].map(
            lambda x: x[:len(formula)] == formula)]
        matches = matches[matches['formula'].map(
            formula_state_separator) == formula]
        
    if df:
        return matches
    else:
        return list(matches['G'])

In [156]:
get_gibbs('HNO3', df=True)

Unnamed: 0,formula,abbrv,name,G,H,S,Cp
432,HNO3(aq),HNO3,HNO3,-103470.32,-189995.44,178.6568,75.312
2744,HNO3(l),HNO3,,-80760.0,-174100.0,155.49,109.87


In [152]:
def state_predictor(formula):
    df = get_gibbs(formula, df=True)
    return list(df.sort_values(by='G')['formula'])[0]

In [158]:
state_predictor('HNO3')

'HNO3(aq)'

In [181]:
# https://stackoverflow.com/questions/6618515/

def possibility_reducer(possibilities, length=12, offset=0):
    '''
    Limits the list of possible substances to a specified length based on 
    free energy 'density'
    
    --Parameters--
    possibilities:      iterable (str)
        any iterable containing strings with valid chemical formulas
    
    --Output--
    tuple (str)
        
    --Examples--

    '''
    # just in case. might be redundant
    possibilities = np.array(list(possibilities))
    energies = np.array([min(get_gibbs(s)) / Substance.from_formula(s).mass for s in possibilities])
    indices = energies.argsort()
    sorted_possibilities = possibilities[indices]
    
    max_length = min(len(sorted_possibilities), (length + offset))
    
    return sorted_possibilities[offset:(max_length)]

In [186]:
def standard_gibbs_free_energy(reactants, products, kJ=True):
    '''
    Returns the overall delG of a reaction under standard conditions. 
    
    --Parameters--
    reactants:      
    products:
    
    --Output--
    float
        
    --Examples--
    >>> equation = balance_stoichiometry(['Na', 'H2O'], ['NaH', 'O2'])
    >>> standard_gibbs_free_energy(equation)
    340.36
    '''
    products = [state_predictor(p) for p in products]
    reactants = [state_predictor(r) for r in reactants]
    equation = balance_stoichiometry(reactants, products)
    
    # each side is a formula, coefficient tuple
    prod = list(equation[1].items())
    reac = list(equation[0].items())
    
    delG = 0
        
    # s[0] is the formula, with or without state
    # s[1] is the coefficient
    
    def gibbs_sum(side):
        interim_delG = 0
        for s in side:
            interim_delG += min(get_gibbs(s[0])) * s[1]
        return interim_delG
    
    delG = gibbs_sum(prod) - gibbs_sum(reac)
    
    return delG / (1 + 999*kJ)

In [187]:
standard_gibbs_free_energy(['Na', 'H2O'], ['NaOH', 'H2'])

-361.603200000000

In [203]:
def reaction_predictor(reactants):
    '''
    Returns the balanced chemical equation of the predicted reaction based on
    minimizing overall delG values.
    
    --Parameters--
    reactants:      iterable(str)
        any iterable containing strings with valid chemical formulas
    
    --Output--
    chempy.chemistry.Reaction
        
    --Examples--
    >>> reaction_predictor(['Al', 'O2'])
    4 Al + 3 O2 → 2 Al2O3
    '''
    reactants = [state_predictor(r) for r in reactants]
    possibilities = stoich_filter(reactants)
    
    print('scoping possibilities...')
    if len(possibilities) > 12:
        possibilities = possibility_reducer(possibilities)
    
    print('  optimizing combinations...')
    combinations = []
    comb_length = min(6, len(possibilities))
    for i in range(1, comb_length):
        combinations += list(itertools.combinations(possibilities, i))
    combinations = [c for c in combinations if Z_unique(c) == Z_unique(reactants)]
    
    print('    deriving equations...')
    good_combinations = []
    for i, comb in enumerate(combinations):    
        if check_coefficients(reactants, comb):
            good_combinations.append(comb)
    
    print('      calculating energies...')
    energies = []
    for gc in good_combinations:
        energies.append(standard_gibbs_free_energy(reactants, gc))
    
    best_energy = min(energies)
    best_index = energies.index(best_energy)
    best_reaction = Reaction(*balance_stoichiometry(
        reactants, good_combinations[best_index]))
    
    print(best_reaction)
    print(f'delG = {best_energy:.4} kJ mol-1')
    
    return best_reaction

In [205]:
reaction_predictor(['Na', 'H2O'])

scoping possibilities...
  optimizing combinations...
    deriving equations...
      calculating energies...
2 Na(s) + 2 H2O(l) -> 2 NaOH(aq) + H2(g)
delG = -361.6 kJ mol-1


In [204]:
reaction_predictor(['Al', 'O2'])

scoping possibilities...
  optimizing combinations...
    deriving equations...
      calculating energies...
4 Al(s) + 3 O2(g) -> 2 Al2O3(s)
delG = -3165 kJ mol-1


In [192]:
reaction_predictor(['HCl(aq)', 'NaOH(s)'])

scoping possibilities...
  optimizing combinations...
    deriving equations...
      calculating energies...
HCl(aq) + NaOH(s) -> H2O(l) + NaCl(aq)
-119.145816000000


In [206]:
reaction_predictor(['H2CO3'])

scoping possibilities...
  optimizing combinations...
    deriving equations...
      calculating energies...
H2CO3(aq) -> H2O(l) + CO2(g)
delG = -8.460 kJ mol-1
