# Standard Thermodynamic Quantities

These values were taken from [CHNOSZ](http://chnosz.net/vignettes/obigt.html), a chemistry and materials science package for [R](https://www.r-project.org/about.html). The dataframes were exported to csv files as-is.

In [352]:
import os                               
import re                   
import time                 # to stall requests (just in case)
import itertools
import sympy

import numpy as np
import pandas as pd 
import chemdataextractor as cde     # chemistry parser

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

from tika import parser     # the specific parser method 

from chempy import balance_stoichiometry
from chempy import Substance
from chempy import Reaction
from chempy.util import periodic

pd.set_option('display.max_colwidth', 0)    # no max column width
pd.set_option('display.max_rows', 1000)

## import thermodynamic quantities csv

In [688]:
thermo = pd.read_csv('../data/external/thermo/chnosz_thermo.csv')

In [689]:
thermo.columns

Index(['name', 'abbrv', 'formula', 'state', 'ref1', 'ref2', 'date', 'E_units',
       'G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c', 'a4.d', 'c1.e',
       'c2.f', 'omega.lambda', 'z.T'],
      dtype='object')

In [690]:
thermo = thermo[['name', 'abbrv', 'formula', 'E_units', 'state', 'G', 'H', 'S', 'Cp']]
thermo.head()

Unnamed: 0,name,abbrv,formula,E_units,state,G,H,S,Cp
0,water,,H2O,cal,liq,,,,
1,e-,,(Z-1),cal,aq,0.0,0.0,15.6166,0.0
2,H+,H+,H+,cal,aq,0.0,0.0,0.0,0.0
3,Li+,Li+,Li+,cal,aq,-69933.0,-66552.0,2.7,14.2
4,Na+,Na+,Na+,cal,aq,-62591.0,-57433.0,13.96,9.06


In [691]:
thermo.shape

(3372, 9)

In [692]:
thermo = thermo[thermo['name'].str[0] != '[']
thermo.shape

(3210, 9)

In [693]:
# code for finding duplicates, not necessary after gibbs formula below
# pd.concat(g for _, g in thermo.groupby('formula') if len(g) > 1)

In [694]:
thermo.dtypes

name       object 
abbrv      object 
formula    object 
E_units    object 
state      object 
G          float64
H          float64
S          float64
Cp         float64
dtype: object

In [695]:
thermo['state'].value_counts()

aq     1807
cr     601 
liq    488 
gas    285 
cr2    14  
cr3    9   
cr9    1   
cr8    1   
cr6    1   
cr7    1   
cr4    1   
cr5    1   
Name: state, dtype: int64

In [696]:
thermo['E_units'].value_counts()

cal    3172
J      38  
Name: E_units, dtype: int64

In [697]:
# convert to joules
energy_columns = ['G', 'H', 'S', 'Cp']

for col in energy_columns:
    thermo.loc[thermo['E_units'] == 'cal', col] *= 4.184

In [698]:
thermo.at[0, 'G'] = -237180
thermo.at[0, 'H'] = -285830
thermo.at[0, 'S'] = -69.91
thermo.at[0, 'Cp'] = -75.29

In [699]:
thermo.loc[4, 'G']

-261880.744

In [700]:
thermo.loc[779, 'G']

-713730.0

## TO DO: maybe define a function to add missing substances

In [701]:
sodium = {
    'name': 'sodium',
    'abbrv': 'Na',
    'formula': 'Na',
    'state': 'cr',
    'G': 0,
    'H': 0,
    'S': 51.21,
    'Cp': 28.24
}

thermo = thermo.append(sodium, ignore_index=True)

In [702]:
oxygen = {
    'name': 'oxygen',
    'abbrv': 'O2',
    'formula': 'O2',
    'state': 'gas',
    'G': 0,
    'H': 0,
    'S': 205.03,
    'Cp': 29.36
}

thermo = thermo.append(oxygen, ignore_index=True)

In [703]:
nah = {
    'name': 'sodium hydride',
    'abbrv': 'NaH',
    'formula': 'NaH',
    'state': 'cr',
    'G': -33500,
    'H': -56300,
    'S': 40,
    'Cp': 36.4
}

thermo = thermo.append(nah, ignore_index=True)

In [704]:
naohs = {
    'name': 'sodium hydroxide',
    'abbrv': 'NaOH',
    'formula': 'NaOH',
    'state': 'cr',
    'G': -379530,
    'H': -425610,
    'S': 64.46,
    'Cp': 59.54
}

thermo = thermo.append(naohs, ignore_index=True)

In [705]:
Al = {
    'name': 'aluminum',
    'abbrv': 'Al',
    'formula': 'Al',
    'state': 'cr',
    'G': 0,
    'H': 0,
    'S': 28.33,
    'Cp': 24.35
}

thermo = thermo.append(Al, ignore_index=True)

In [716]:
H = {
    'name': 'monoatomic hydrogen',
    'abbrv': 'H',
    'formula': 'H',
    'state': 'g',
    'G': 203250,
    'H': 217970,
    'S': 114.71,
    'Cp': 20.78
}

thermo = thermo.append(H, ignore_index=True)

In [727]:
O = {
    'name': 'monoatomic oxygen',
    'abbrv': 'O',
    'formula': 'O',
    'state': 'g',
    'G': 231700,
    'H': 249200,
    'S': 161.1,
    'Cp': 21.9
}

thermo = thermo.append(O, ignore_index=True)

In [706]:
thermo.at[1, 'formula'] = 'e-'

In [707]:
thermo.head()

Unnamed: 0,name,abbrv,formula,E_units,state,G,H,S,Cp
0,water,,H2O,cal,liq,-237180.0,-285830.0,-69.91,-75.29
1,e-,,e-,cal,aq,0.0,0.0,65.339854,0.0
2,H+,H+,H+,cal,aq,0.0,0.0,0.0,0.0
3,Li+,Li+,Li+,cal,aq,-292599.672,-278453.568,11.2968,59.4128
4,Na+,Na+,Na+,cal,aq,-261880.744,-240299.672,58.40864,37.90704


In [708]:
thermo.tail(10)

Unnamed: 0,name,abbrv,formula,E_units,state,G,H,S,Cp
3205,4-iodophenol,,C6H5IO,cal,gas,25686.99856,-16709.000648,379.589216,122.750192
3206,2-iodobenzoic acid,,C7H5IO2,cal,gas,-139073.00108,-202799.998792,416.358208,128.461352
3207,3-iodobenzoic acid,,C7H5IO2,cal,gas,-160462.998168,-218999.999104,433.772016,122.101672
3208,4-iodobenzoic acid,,C7H5IO2,cal,gas,-157064.000248,-215600.001208,433.772016,122.101672
3209,methyl-2-iodobenzoate,,C8H7IO2,cal,gas,-79910.998408,-167660.001168,472.20624,168.07128
3210,sodium,Na,Na,,cr,0.0,0.0,51.21,28.24
3211,oxygen,O2,O2,,gas,0.0,0.0,205.03,29.36
3212,sodium hydride,NaH,NaH,,cr,-33500.0,-56300.0,40.0,36.4
3213,sodium hydroxide,NaOH,NaOH,,cr,-379530.0,-425610.0,64.46,59.54
3214,aluminum,Al,Al,,cr,0.0,0.0,28.33,24.35


### save to csv

In [717]:
# thermo.drop(columns='E_units', inplace=True)
thermo.to_csv('../data/processed/thermo.csv', index=False)

## import stoichiometry csv

In [241]:
stoich = pd.read_csv('../data/external/thermo/chnosz_stoich.csv')
stoich.head()

Unnamed: 0.1,Unnamed: 0,Ag,Al,Ar,As,Au,B,Ba,Be,Bi,...,Tm,U,V,W,Xe,Y,Yb,Z,Zn,Zr
0,H2O,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,(Z-1),0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1,0,0
2,H+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Li+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Na+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [242]:
# it's more convenient to look at if the formula column is first
# the column names will be renamed by atomic number

stoich.rename(columns={'Unnamed: 0': -1}, inplace=True)
stoich.head()

Unnamed: 0,-1,Ag,Al,Ar,As,Au,B,Ba,Be,Bi,...,Tm,U,V,W,Xe,Y,Yb,Z,Zn,Zr
0,H2O,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,(Z-1),0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1,0,0
2,H+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Li+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Na+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [246]:
# i'm sure there's a better way to do this
# getting symbols from periodic.py

symbols = ('e-',
    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al',
    'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe',
    'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr',
    'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn',
    'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm',
    'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W',
    'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn',
    'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf',
    'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds',
    'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og'
    )

def Z(symbol):
    return symbols.index(symbol)

In [247]:
stoich.rename(columns={'Z': 'e-'}, inplace=True)
stoich.head()

Unnamed: 0,-1,Ag,Al,Ar,As,Au,B,Ba,Be,Bi,...,Tm,U,V,W,Xe,Y,Yb,e-,Zn,Zr
0,H2O,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,(Z-1),0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1,0,0
2,H+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Li+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Na+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [248]:
stoich.columns = [Z(col) if col != -1 else col for col in stoich.columns]
stoich.head()

Unnamed: 0,-1,47,13,18,33,79,5,56,4,83,...,69,92,23,74,54,39,70,0,30,40
0,H2O,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,(Z-1),0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1,0,0
2,H+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Li+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Na+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [250]:
stoich = stoich[list(stoich.columns.sort_values())]
stoich.head()

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,...,80,81,82,83,86,87,88,90,91,92
0,H2O,0,2.0,0,0,0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,(Z-1),-1,0.0,0,0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,H+,1,1.0,0,0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Li+,1,0.0,0,1,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,Na+,1,0.0,0,0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [251]:
stoich.rename(columns={-1: 'formula'}, inplace=True)

In [252]:
stoich.at[1, 'formula'] = 'e-'
stoich.head()

Unnamed: 0,formula,0,1,2,3,4,5,6,7,8,...,80,81,82,83,86,87,88,90,91,92
0,H2O,0,2.0,0,0,0,0,0,0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,e-,-1,0.0,0,0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,H+,1,1.0,0,0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,Li+,1,0.0,0,1,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,Na+,1,0.0,0,0,0,0,0,0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [285]:
stoich.shape

(3372, 86)

In [286]:
stoich_og = stoich.copy()

In [287]:
stoich = stoich.drop_duplicates()

In [288]:
stoich.to_csv('../data/processed/stoich.csv', index=False)

In [289]:
stoich_og.to_csv('../data/processed/stoich_og.csv', index=False)

# playground for writing functions using thermo tables

In [292]:
equation = balance_stoichiometry(['Na', 'H2O'], ['NaOH', 'H2'])
equation

(OrderedDict([('Na', 2), ('H2O', 2)]), OrderedDict([('NaOH', 2), ('H2', 1)]))

In [261]:
reactants = ['Na', 'H2O']

In [262]:
water = Substance.from_formula('H2O')

In [263]:
[*water.composition]

[1, 8]

In [264]:
['hello'] + [*water.composition]

['hello', 1, 8]

In [275]:
z_ignore = ['formula']
for r in reactants:
    s = Substance.from_formula(r)
    z_ignore += [*s.composition]
z_ignore = set(z_ignore)
z_ignore

{1, 11, 8, 'formula'}

In [276]:
column_mask = [col for col in stoich.columns if col not in z_ignore]
print(column_mask)

[0, 2, 3, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 78, 79, 80, 81, 82, 83, 86, 87, 88, 90, 91, 92]


In [290]:
# https://stackoverflow.com/questions/22649693/

stoich_temp = stoich.copy()

for col in column_mask:
    stoich_temp = stoich_temp[stoich_temp[col] == 0]

stoich_temp = stoich_temp.loc[(stoich_temp.drop(columns='formula')!=0).any(axis=1)]

stoich_temp = stoich_temp[z_ignore]
stoich_temp

Unnamed: 0,8,1,11,formula
0,1.0,2.0,0,H2O
61,0.0,2.0,0,H2
62,2.0,0.0,0,O2
484,2.0,2.0,0,H2O2
572,1.0,1.0,1,NaOH
1767,0.0,1.0,0,H
1906,1.0,0.0,2,Na2O
2164,1.0,0.0,0,O


In [291]:
list(stoich_temp['formula'])

['H2O', 'H2', 'O2', 'H2O2', 'NaOH', 'H', 'Na2O', 'O']

In [293]:
candidates = [f for f in list(stoich_temp['formula']) if f not in reactants]
candidates

['H2', 'O2', 'H2O2', 'NaOH', 'H', 'Na2O', 'O']

In [304]:
combinations = []
for i in range(len(candidates)):
    combinations += list(itertools.combinations(candidates, i))
combinations

[(),
 ('H2',),
 ('O2',),
 ('H2O2',),
 ('NaOH',),
 ('H',),
 ('Na2O',),
 ('O',),
 ('H2', 'O2'),
 ('H2', 'H2O2'),
 ('H2', 'NaOH'),
 ('H2', 'H'),
 ('H2', 'Na2O'),
 ('H2', 'O'),
 ('O2', 'H2O2'),
 ('O2', 'NaOH'),
 ('O2', 'H'),
 ('O2', 'Na2O'),
 ('O2', 'O'),
 ('H2O2', 'NaOH'),
 ('H2O2', 'H'),
 ('H2O2', 'Na2O'),
 ('H2O2', 'O'),
 ('NaOH', 'H'),
 ('NaOH', 'Na2O'),
 ('NaOH', 'O'),
 ('H', 'Na2O'),
 ('H', 'O'),
 ('Na2O', 'O'),
 ('H2', 'O2', 'H2O2'),
 ('H2', 'O2', 'NaOH'),
 ('H2', 'O2', 'H'),
 ('H2', 'O2', 'Na2O'),
 ('H2', 'O2', 'O'),
 ('H2', 'H2O2', 'NaOH'),
 ('H2', 'H2O2', 'H'),
 ('H2', 'H2O2', 'Na2O'),
 ('H2', 'H2O2', 'O'),
 ('H2', 'NaOH', 'H'),
 ('H2', 'NaOH', 'Na2O'),
 ('H2', 'NaOH', 'O'),
 ('H2', 'H', 'Na2O'),
 ('H2', 'H', 'O'),
 ('H2', 'Na2O', 'O'),
 ('O2', 'H2O2', 'NaOH'),
 ('O2', 'H2O2', 'H'),
 ('O2', 'H2O2', 'Na2O'),
 ('O2', 'H2O2', 'O'),
 ('O2', 'NaOH', 'H'),
 ('O2', 'NaOH', 'Na2O'),
 ('O2', 'NaOH', 'O'),
 ('O2', 'H', 'Na2O'),
 ('O2', 'H', 'O'),
 ('O2', 'Na2O', 'O'),
 ('H2O2', 'NaOH', 'H'

In [313]:
# reach goal: see if we can figure out how to populate x1, x2 etc.

for comb in combinations:    
    try:
        print(balance_stoichiometry(reactants, comb))
    except:
        pass

(OrderedDict([('Na', 2), ('H2O', 2)]), OrderedDict([('H2', 1), ('NaOH', 2)]))
(OrderedDict([('Na', 2), ('H2O', 1)]), OrderedDict([('H2', 1), ('Na2O', 1)]))
(OrderedDict([('Na', 4), ('H2O', 2)]), OrderedDict([('O2', -1), ('NaOH', 4)]))
(OrderedDict([('Na', 2), ('H2O', -1)]), OrderedDict([('H2O2', -1), ('Na2O', 1)]))
(OrderedDict([('Na', 1), ('H2O', 1)]), OrderedDict([('NaOH', 1), ('H', 1)]))
(OrderedDict([('Na', -2), ('H2O', -1)]), OrderedDict([('NaOH', -2), ('O', 1)]))
(OrderedDict([('Na', 2), ('H2O', 1)]), OrderedDict([('H', 2), ('Na2O', 1)]))
(OrderedDict([('Na', 2*x1), ('H2O', 2*x1 + 4)]), OrderedDict([('H2', x1 + 4), ('O2', 2), ('NaOH', 2*x1)]))
(OrderedDict([('Na', 2*x1), ('H2O', x1 + 2)]), OrderedDict([('H2', x1 + 2), ('O2', 1), ('Na2O', x1)]))
(OrderedDict([('Na', 2*x1), ('H2O', 2*x1 + 4)]), OrderedDict([('H2', x1 + 2), ('H2O2', 2), ('NaOH', 2*x1)]))
(OrderedDict([('Na', 2*x1), ('H2O', x1 + 2)]), OrderedDict([('H2', x1 + 1), ('H2O2', 1), ('Na2O', x1)]))
(OrderedDict([('Na', 2), 

(OrderedDict([('Na', 4*x1 + 4*x3), ('H2O', 2*x2 + 2*x3 + 2)]), OrderedDict([('O2', -x1 + x2 - x3 - 1), ('H2O2', 2), ('NaOH', 4*x3), ('H', 4*x2), ('Na2O', 2*x1)]))
(OrderedDict([('Na', 4*x3), ('H2O', 2*x2 + 2*x3 + 2)]), OrderedDict([('O2', -x1 + x2 - x3 - 1), ('H2O2', 2), ('NaOH', 4*x3), ('H', 4*x2), ('O', 2*x1)]))
(OrderedDict([('Na', 4*x2 + 4*x3), ('H2O', 2*x3 + 2)]), OrderedDict([('O2', -x1 - x2 - x3 - 1), ('H2O2', 2), ('NaOH', 4*x3), ('Na2O', 2*x2), ('O', 2*x1)]))
(OrderedDict([('Na', 4*x2), ('H2O', 2*x3 + 2)]), OrderedDict([('O2', -x1 - x2 + x3 - 1), ('H2O2', 2), ('H', 4*x3), ('Na2O', 2*x2), ('O', 2*x1)]))
(OrderedDict([('Na', 4*x2 + 2), ('H2O', 2*x3 + 1)]), OrderedDict([('O2', -x1 - x2 + x3 - 1/2), ('NaOH', 2), ('H', 4*x3), ('Na2O', 2*x2), ('O', 2*x1)]))
(OrderedDict([('Na', 2*x2 + 1), ('H2O', -x1 - x2 + 2*x3)]), OrderedDict([('H2O2', -x1 - x2 + x3 - 1/2), ('NaOH', 1), ('H', 2*x3), ('Na2O', x2), ('O', x1)]))
(OrderedDict([('Na', 2*x1 + 2*x3), ('H2O', x1 + 2*x3 + 2*x4 + 2)]), Order

things to do:
- stretch: substitute sympy symbols with "1"?
- check gibbs for 

In [322]:
np.array(list(balance_stoichiometry(reactants, ('H2O2', 'Na2O'))[0].values()))

array([2, -1], dtype=object)

In [329]:
np.array(list(balance_stoichiometry(reactants, ('H2O2', 'Na2O'))[0].values())) >= 1

array([ True, False])

In [324]:
(np.array(list(balance_stoichiometry(reactants, ('H2O2', 'Na2O'))[0].values())) >= 1).mean().astype(int)

0

In [337]:
temp_list = list(balance_stoichiometry(reactants, ('H2', 'O2', 'NaOH'))[0].values())
temp_list

[2*x1, 2*x1 + 4]

In [384]:
isinstance(np.array(list(balance_stoichiometry(reactants, ('H2', 'O2', 'NaOH'))[0].values())), sympy.mul.Mul)

False

In [378]:
isinstance(list(balance_stoichiometry(reactants, ('H2', 'O2', 'NaOH'))[0].values())[0], sympy.mul.Mul)

True

In [387]:
1 - np.array([isinstance(i, sympy.mul.Mul) for i in list(balance_stoichiometry(reactants, ('H2O2', 'Na2O'))[0].values())])

array([1, 1])

In [364]:
type(temp_list[0]) == sympy.mul.Mul

True

In [395]:
reactants

['Na', 'H2O']

In [521]:
def check_coefficients(reactants, products):
    try:
        balance = balance_stoichiometry(reactants, products)
        reac_coef = list(balance[0].values()) + list(balance[1].values())
        is_positive = np.floor((np.array(reac_coef) >= 1).mean()).astype(bool)
        is_definite = np.floor(np.array([isinstance(i, sympy.numbers.Number) for i in reac_coef]).mean()).astype(bool)
        return is_positive and is_definite
    except:
        return False   

In [472]:
test_prod = ['H2', 'NaOH']

check_coefficients(reactants, test_prod)

True

In [449]:
def simp_comp(substances, thorough=False):
    if thorough:
        composition = [0]
    else:
        composition = []
    for s in substances:
        sub = Substance.from_formula(s)
        composition += [*sub.composition]
    return set(composition)

In [433]:
reactants

['Na', 'H2O']

In [440]:
simp_comp(reactants)

{0, 1, 8, 11}

In [522]:
def stoich_filter(substances):
    
    stoich_temp = stoich.copy()
    
    z_ignore = [0, 'formula'] + list(simple_composition(substances))

    column_mask = [col for col in stoich.columns if col not in z_ignore]

    for col in column_mask:
        stoich_temp = stoich_temp[stoich_temp[col] == 0]

    stoich_temp = stoich_temp.loc[(stoich_temp.drop(columns='formula')!=0).any(axis=1)]

    stoich_temp = stoich_temp[z_ignore]
    
    return stoich_temp

In [609]:
# https://stackoverflow.com/questions/6618515/

def possibility_reducer(possibilities, length=12, offset=0):
    
    energies = np.array([min(get_gibbs(s)) / Substance.from_formula(s).mass for s in possibilities])
    indices = energies.argsort()
    sorted_possibilities = possibilities[indices]
    
    max_length = min(len(sorted_possibilities), (length + offset))
    
    return sorted_possibilities[offset:(max_length)]

In [753]:
def reaction_predictor(reactants):

    stoich_temp = stoich_filter(reactants)
    
    print('scoping possibilities...')
    possibilities = np.array([f for f in list(stoich_temp['formula']) if f not in reactants if f in list(thermo['formula'])])
    if len(possibilities) > 25:
        possibilities = possibility_reducer(possibilities)
    print(possibilities)
    
    print('  optimizing combinations...')
    combinations = []
    comb_length = min(6, len(possibilities))
    for i in range(1, comb_length):
        combinations += list(itertools.combinations(possibilities, i))
    combinations = [c for c in combinations if simp_comp(c) == simp_comp(reactants)]
    
    print('    deriving equations...')
    equations = []
    for i, comb in enumerate(combinations):    
        if check_coefficients(reactants, comb):
            equations.append(balance_stoichiometry(reactants, comb))
    
    print('      calculating energies...')
    energies = []
    for eq in equations:
        energies.append(standard_gibbs_free_energy(eq))
        
    best_index = energies.index(min(energies))
    
    return Reaction(*equations[best_index])

In [723]:
reaction_predictor(['Na', 'H2O'])

scoping possibilities...
  optimizing combinations...
    calculating energies...


In [724]:
reaction_predictor(['Al', 'O2'])

scoping possibilities...
  optimizing combinations...
    calculating energies...


In [748]:
reaction_predictor(['HCl', 'NaOH'])

scoping possibilities...
  optimizing combinations...
    deriving equations...
      calculating energies...


In [758]:
reaction_predictor(['O2', 'O'])

scoping possibilities...
['e-']
  optimizing combinations...
    deriving equations...
      calculating energies...


ValueError: min() arg is an empty sequence

In [489]:
cho_temp = product_predictor(['CH4', 'O2'])

error: 426 possible products


In [560]:
list(thermo['formula'])[:20]

['H2O',
 'e-',
 'H+',
 'Li+',
 'Na+',
 'K+',
 'Rb+',
 'Cs+',
 'Mg+2',
 'Ca+2',
 'Sr+2',
 'Ba+2',
 'HCO3-',
 'CO3-2',
 'Pb+2',
 'NO3-',
 'NO2-',
 'NH4+',
 'H2PO4-',
 'HPO4-2']

In [572]:
Substance.from_formula('H2O').mass

18.015

['H2O',
 'H+',
 'HCO3-',
 'CO3-2',
 'OH-',
 'H2',
 'O2',
 'HO2-',
 'H2O2',
 'CH4',
 'C2H6',
 'C3H8',
 'C5H12',
 'C6H14',
 'C7H16',
 'C8H18',
 'C2H4',
 'C3H6',
 'C4H8',
 'C6H12']

[-55085.268942547875,
 0.0,
 -9619.355708923093,
 -8798.384967554037,
 -9248.686034940638,
 0.0,
 0.0,
 -2039.612205564454,
 -3939.9517845592995,
 -3161.511039082466,
 -1097.4129697372796,
 -558.4739097897816,
 -171.88086097212792,
 -82.58469679036413,
 -33.194750760940074,
 30.363961061698998,
 517.5190703642975,
 402.18340818896894,
 1312.8131460754262,
 320.35474442147284]

[-55085.26894255 -55085.26894255 -55085.26894255 -55085.26894255
 -55085.26894255 -55085.26894255 -55085.26894255 -55085.26894255
 -55085.26894255 -17023.65       -17023.65       -13970.88596554
 -10566.82712938  -9864.17264106  -9619.35570892  -9248.68603494
  -8971.2510898   -8960.86564112  -8960.86564112  -8798.38496755]
['H2O' 'H2O' 'H2O' 'H2O' 'H2O' 'H2O' 'H2O' 'H2O' 'H2O' 'H' 'H' 'C6H6O3'
 'C7H12O2' 'C7H11O2-' 'HCO3-' 'OH-' 'C6H6O2' 'CO2' 'CO2' 'CO3-2']


In [524]:
list(cho_temp['formula'])

['H2O',
 'e-',
 'H+',
 'HCO3-',
 'CO3-2',
 'OH-',
 'H2',
 'O2',
 'HO2-',
 'H2O2',
 'CH4',
 'C2H6',
 'C3H8',
 'C4H10',
 'C5H12',
 'C6H14',
 'C7H16',
 'C8H18',
 'C2H4',
 'C3H6',
 'C4H8',
 'C5H10',
 'C6H12',
 'C7H14',
 'C8H16',
 'C2H2',
 'C3H4',
 'C4H6',
 'C5H8',
 'C6H10',
 'C7H12',
 'C8H14',
 'C6H5CH3',
 'C6H5C2H5',
 'C6H5C3H7',
 'C6H5C4H9',
 'C6H5C5H11',
 'C6H5C6H13',
 'C6H5C7H15',
 'C6H5C8H17',
 'CH3OH',
 'C2H5OH',
 'C3H7OH',
 'C4H9OH',
 'C5H11OH',
 'C6H13OH',
 'C7H15OH',
 'C8H17OH',
 'C3H6O',
 'C4H8O',
 'C5H10O',
 'C6H12O',
 'C7H14O',
 'C8H16O',
 'CH3COOCH2CH3',
 'CO',
 'HCHO',
 'CH3CHO',
 'CH3CH2CHO',
 'CH3(CH2)2CHO',
 'CH3(CH2)3CHO',
 'CH3(CH2)4CHO',
 'CH3(CH2)5CHO',
 'CH3(CH2)6CHO',
 'CH3(CH2)7CHO',
 'CH3(CH2)8CHO',
 'H2CO2',
 'C2H4O2',
 'C3H6O2',
 'C4H8O2',
 'C5H10O2',
 'C6H12O2',
 'C7H14O2',
 'C8H16O2',
 'C9H18O2',
 'C10H20O2',
 'C11H22O2',
 'C12H24O2',
 'C7H6O2',
 'C8H8O2',
 'C2H2O4',
 'C3H4O4',
 'C4H6O4',
 'C5H8O4',
 'C6H10O4',
 'C7H12O4',
 'C8H14O4',
 'C9H16O4',
 'C10H18O4',
 

In [499]:
thermo[thermo['formula'] == 'CH3OH']

Unnamed: 0,name,abbrv,formula,state,G,H,S,Cp
870,methanol,,CH3OH,aq,-175811.68,-246353.92,133.0512,157.98784
2676,methanol,,CH3OH,liq,-166765.872,-239111.416,127.1936,81.1696
3223,methanol,,CH3OH,gas,-162878.936,-201589.304,239.82688,43.932


In [None]:
reactant_comp = set(1, 8, 11)
for c in candidates:
    s = Substance.from_formula(c)
    z_ignore += [*s.composition]
z_ignore = set(z_ignore)
z_ignore

## creating a function that selects the appropriate gibbs value(s)

In [232]:
equation = balance_stoichiometry(['Na', 'H2O'], ['NaOH', 'H2'])
equation

(OrderedDict([('Na', 2), ('H2O', 2)]), OrderedDict([('NaOH', 2), ('H2', 1)]))

In [233]:
list(equation[1].items())

[('NaOH', 2), ('H2', 1)]

In [190]:
thermo[thermo['formula'] == 'NaOH']

Unnamed: 0,name,abbrv,formula,state,G,H,S,Cp
572,NaOH,NaOH,NaOH,aq,-417981.6,-469863.2,44.7688,-13.3888
3375,sodium hydroxide,NaOH,NaOH,cr,-379530.0,-425610.0,64.46,59.54


In [208]:
re.search('(?<=\()[aglsq]+', 'Ca(NO3)2(g)').group(0)

'g'

In [235]:
re.search('(?<=\()[aglsq]+', 'Ca(NO3)2(g)').start()

9

In [210]:
'Ca(NO3)2(g)'[:9]

'Ca(NO3)2('

In [219]:
def formula_state_separator(formula):
    try:
        regex = re.search('(?<=\()[aglsq]+', formula)
        state = regex.group(0)
        formula = formula[:regex.start() - 1]
        if state == 'aq':
            return formula, 'aq'
        elif state == 's':
            return formula, 'cr'
        elif state == 'l':
            return formula, 'liq'
        elif state == 'g':
            return formula, 'gas'
    except:
        return formula, None

In [236]:
formula_state_separator('Al(OH)3(s)')

('Al(OH)3', 'cr')

In [227]:
formula_state_separator('Al(OH)3')

('Al(OH)3', None)

In [222]:
def get_gibbs(formula):        
    formula, state = formula_state_separator(formula)
    
    is_formula = thermo['formula'] == formula
    
    if state == None:
        matches = thermo[is_formula]
    else:
        is_state = thermo['state'] == state
        matches = thermo[is_formula & is_state]
        
    return matches['G'].values

In [223]:
get_gibbs('NaOH')

array([-417981.6, -379530. ])

In [542]:
min(get_gibbs('NaOH'))

-417981.60000000003

In [None]:
thermo[(thermo['state'] == 'cr') & (thermo['formula'] == 'NaOH')]

In [509]:
min([1])

1

In [722]:
def standard_gibbs_free_energy(equation, kJ=True):
    
    # each side is a formula, coefficient tuple
    prod = list(equation[1].items())
    reac = list(equation[0].items())
    
    delG = 0
        
    # p[0] is the formula, with or without state
    # p[1] is the coefficient
    
    def gibbs_sum(side):
        interim_delG = 0
        for s in side:
            interim_delG += min(get_gibbs(s[0])) * s[1]
        return interim_delG
    
    delG = gibbs_sum(prod) - gibbs_sum(reac)
    
    return delG / (1 + 999*kJ)

In [225]:
gibbs_free_energy(equation)

-369.999488000000

In [226]:
equation2 = balance_stoichiometry(['Na', 'H2O'], ['NaH', 'O2'])
gibbs_free_energy(equation2)

331.963712000000

In [155]:
equation3 = balance_stoichiometry(['Na', 'H2O'], ['NaH', 'H2O2'])
gibbs_free_energy(equation3)

264.950192000000

In [156]:
equation4 = balance_stoichiometry(['Na', 'H2O'], ['Na2O', 'H2'])
gibbs_free_energy(equation4)

-143.088616000000

In [157]:
equation5 = balance_stoichiometry(['Na', 'H2O'], ['NaH', 'O'])
gibbs_free_energy(equation5)

56.6079120000000

In [249]:
output = Reaction(*balance_stoichiometry(['Na', 'H2O'], ['NaH', 'H2O2']))
output

In [228]:
thermo[thermo['formula'] == 'Na']

Unnamed: 0,name,abbrv,formula,state,G,H,S,Cp
3372,sodium,Na,Na,cr,0.0,0.0,51.21,28.24


In [173]:
keys = output.keys()
substances = {k: Substance.from_formula(k) for k in keys}
output.unicode(substances)

'2 Na + H₂O → Na₂O + H₂'

In [160]:
Reaction(*balance_stoichiometry(['Na(s)', 'H2O(l)'], ['H+(aq)', 'Na2O(s)', 'e-']))

In [None]:
for c in candidates:
    products = []
    cand = thermo.loc[c, 'formula']
    if cand not in ['Na', 'H2O']:
        