In [1]:
import os                               
import re                   
import itertools
import pickle

import numpy as np
import pandas as pd 

from chempy import Substance

from tika import parser     # the specific parser method 

## Import thermodynamic values

These values were taken from the following sources:
- [CHNOSZ](http://chnosz.net/vignettes/obigt.html), a chemistry and materials science package for [R](https://www.r-project.org/about.html). The dataframes were exported to csv files as-is.
- [Principles of Modern Chemistry, 8th ed.](https://www.amazon.com/Principles-Modern-Chemistry-David-Oxtoby-ebook/dp/B00UGDPNFI) by Oxtoby et al.

### CHNOSZ

In [2]:
chnosz = pd.read_csv('../data/external/thermo/chnosz_thermo.csv')

In [3]:
chnosz.columns

Index(['name', 'abbrv', 'formula', 'state', 'ref1', 'ref2', 'date', 'E_units',
       'G', 'H', 'S', 'Cp', 'V', 'a1.a', 'a2.b', 'a3.c', 'a4.d', 'c1.e',
       'c2.f', 'omega.lambda', 'z.T'],
      dtype='object')

In [4]:
chnosz = chnosz[['name', 'abbrv', 'formula', 'E_units', 'state', 'G', 'H', 'S', 'Cp']]
chnosz.head()

Unnamed: 0,name,abbrv,formula,E_units,state,G,H,S,Cp
0,water,,H2O,cal,liq,,,,
1,e-,,(Z-1),cal,aq,0.0,0.0,15.6166,0.0
2,H+,H+,H+,cal,aq,0.0,0.0,0.0,0.0
3,Li+,Li+,Li+,cal,aq,-69933.0,-66552.0,2.7,14.2
4,Na+,Na+,Na+,cal,aq,-62591.0,-57433.0,13.96,9.06


In [5]:
chnosz.shape

(3372, 9)

In [6]:
chnosz = chnosz[chnosz['name'].str[0] != '[']
chnosz.shape

(3210, 9)

In [7]:
chnosz['E_units'].value_counts()

cal    3172
J        38
Name: E_units, dtype: int64

In [8]:
# convert to joules
energy_columns = ['G', 'H', 'S', 'Cp']

for col in energy_columns:
    chnosz.loc[chnosz['E_units'] == 'cal', col] *= 4.184
    
chnosz.drop(columns='E_units', inplace=True)
chnosz.head()

Unnamed: 0,name,abbrv,formula,state,G,H,S,Cp
0,water,,H2O,liq,,,,
1,e-,,(Z-1),aq,0.0,0.0,65.339854,0.0
2,H+,H+,H+,aq,0.0,0.0,0.0,0.0
3,Li+,Li+,Li+,aq,-292599.672,-278453.568,11.2968,59.4128
4,Na+,Na+,Na+,aq,-261880.744,-240299.672,58.40864,37.90704


In [9]:
chnosz.at[0, 'G'] = -237180
chnosz.at[0, 'H'] = -285830
chnosz.at[0, 'S'] = 69.91
chnosz.at[0, 'Cp'] = 75.29

In [10]:
# just checking to make sure appropriate values are in joules

print(chnosz.loc[4, 'G'])
print(chnosz.loc[779, 'G'])

-261880.744
-713730.0


In [11]:
chnosz['state'].value_counts()

aq     1807
cr      601
liq     488
gas     285
cr2      14
cr3       9
cr8       1
cr4       1
cr9       1
cr5       1
cr6       1
cr7       1
Name: state, dtype: int64

In [12]:
value_dict = {
    'aq': '(aq)',
    'cr': '(s)',
    'liq': '(l)',
    'gas': '(g)',
    'cr2': '(s, II)',
    'cr3': '(s, III)',
    'cr4': '(s, IV)',
    'cr5': '(s, V)',
    'cr6': '(s, VI)',
    'cr7': '(s, VII)',
    'cr8': '(s, VIII)',
    'cr9': '(s, IX)',
    'g': '(g)',
}

chnosz['state'].replace(value_dict, inplace=True)

In [13]:
chnosz['formula'] = chnosz['formula'].astype(str) + chnosz['state'].astype(str)
chnosz.drop(columns='state', inplace=True)
chnosz.head()

Unnamed: 0,name,abbrv,formula,G,H,S,Cp
0,water,,H2O(l),-237180.0,-285830.0,69.91,75.29
1,e-,,(Z-1)(aq),0.0,0.0,65.339854,0.0
2,H+,H+,H+(aq),0.0,0.0,0.0,0.0
3,Li+,Li+,Li+(aq),-292599.672,-278453.568,11.2968,59.4128
4,Na+,Na+,Na+(aq),-261880.744,-240299.672,58.40864,37.90704


In [14]:
chnosz.at[1, 'formula'] = 'e-(aq)'

In [15]:
chnosz.isna().sum()

name          0
abbrv      2266
formula       0
G           121
H           151
S           132
Cp          245
dtype: int64

In [16]:
chnosz = chnosz.dropna(subset=['G'])
chnosz.shape

(3089, 7)

### Oxtoby

In [17]:
def get_text(file, sleep=0, counter=0):
    if counter == 2:        # so we stop the recursive function
        pass
    # grab the raw text using parser.from_file()
    raw = parser.from_file(file)
    status = raw['status']          # returns the status code from tika server
    # if things go well, return the raw text
    if status == 200:
        print(f"'{file}' successfully opened!")
        return raw['content']
    # if things don't go well, pause for five seconds and try again
    # we might not need this code, but it's useful for other server calls
    else:
        print(f'! ! ! ! error code {status} ! ! ! !')
        print(f'! ! ! ! trying again ! ! ! !')
        time.sleep(5)
        counter += 1
        # repeats grab_text up to twice
        return get_text(file, counter=counter)

In [18]:
oxtobya = get_text('../data/external/thermo/oxtoby8a.pdf')
oxtobyb = get_text('../data/external/thermo/oxtoby8b.pdf')

'../data/external/thermo/oxtoby8a.pdf' successfully opened!
'../data/external/thermo/oxtoby8b.pdf' successfully opened!


In [19]:
clean = re.sub('—', 'nan', oxtobya)
clean = re.sub('\\ue02c', 'l', clean)
clean = re.sub(r'\n\n[I]*\s*', r'RROOWW', clean)
clean = re.sub(r'([\d]+)\n([A-Z]+)', r'\1RROOWW\2', clean)
clean = re.sub(r'(nan)\s*\n([A-Z]+)', r'\1RROOWW\2', clean)
clean = re.sub('\\ue031', '+', clean)
clean = re.sub('([\d])\+\(', r'+\1(', clean)
clean = re.sub('\\ue032', '-', clean)
clean = re.sub('([\d])\-\(', r'-\1(', clean)
clean = re.sub('\(([aqslg]+)\,\s([\w]+)', r'(\1,\2', clean)
clean = re.sub('[\s]+', ' ', clean)
clean = re.sub(' mol-1', '', clean)
# clean = re.sub('Substance.*mol', 'HHEEAADDEERR', clean)
clean = re.split('RROOWW', clean)
clean = [re.split(' ', r) for r in clean]
clean = [r for r in clean if len(r) == 5]
oxtobya_clean = clean

In [20]:
clean = re.sub('—', 'nan', oxtobyb)
clean = re.sub('\\ue02c', 'l', clean)
clean = re.sub(r'\n\n[I]*\s*', r'RROOWW', clean)
clean = re.sub(r'([\d]+)\n([A-Z]+)', r'\1RROOWW\2', clean)
clean = re.sub(r'(nan)\s*\n([A-Z]+)', r'\1RROOWW\2', clean)
clean = re.sub('\\ue031', '+', clean)
clean = re.sub('([\d])\+\(', r'+\1(', clean)
clean = re.sub('\\ue032', '-', clean)
clean = re.sub('([\d])\-\(', r'-\1(', clean)
clean = re.sub('\(([aqslg]+)\,\s([\w]+)', r'(\1,\2', clean)
clean = re.sub('[\s]+', ' ', clean)
clean = re.sub(' mol-1', '', clean)
# clean = re.sub('Substance.*mol', 'HHEEAADDEERR', clean)
clean = re.split('RROOWW', clean)
clean = [re.split(' ', r) for r in clean]
clean = [r for r in clean if len(r) == 5]
oxtobyb_clean = clean

In [21]:
oxtoby = oxtobya_clean + oxtobyb_clean
oxtoby[:20]

[['H(g)', '217.96', '114.60', '203.26', '20.78'],
 ['H2(g)', '0', '130.57', '0', '28.82'],
 ['H+(aq)', '0', '0', '0', '0'],
 ['H3O+(aq)', '-285.83', '69.91', '-237.18', '75.29'],
 ['Li(s)', '0', '29.12', '0', '24.77'],
 ['Li(g)', '159.37', '138.66', '126.69', '20.79'],
 ['Li+(aq)', '-278.49', '13.4', '-293.31', '68.6'],
 ['LiH(s)', '-90.54', '20.01', '-68.37', '27.87'],
 ['Li2O(s)', '-597.94', '37.57', '-561.20', '54.10'],
 ['LiF(s)', '-615.97', '35.65', '-587.73', '41.59'],
 ['LiCl(s)', '-408.61', '59.33', '-384.39', '47.99'],
 ['LiBr(s)', '-351.21', '74.27', '-342.00', 'nan'],
 ['LiI(s)', '-270.41', '86.78', '-270.29', '51.04'],
 ['Na(s)', '0', '51.21', '0', '28.24'],
 ['Na(g)', '107.32', '153.60', '76.79', '20.79'],
 ['Na+(aq)', '-240.12', '59.0', '-261.90', '46.4'],
 ['Na2O(s)', '-414.22', '75.06', '-375.48', '69.12'],
 ['NaOH(s)', '-425.61', '64.46', '-379.53', '59.54'],
 ['NaF(s)', '-573.65', '51.46', '-543.51', '48.86'],
 ['NaCl(s)', '-411.15', '72.13', '-384.15', '50.50']]

In [22]:
oxtoby[-20:]

[['ClF3(g)', '-163.2', '281.50', '-123.0', '63.85'],
 ['Br2(l)', '0', '152.23', '0', '75.69'],
 ['Br2(g)', '30.91', '245.35', '3.14', '36.02'],
 ['Br2(aq)', '-2.59', '130.5', '3.93', 'nan'],
 ['Br(g)', '111.88', '174.91', '82.41', '20.79'],
 ['HBr(g)', '-36.40', '198.59', '-53.43', '29.14'],
 ['BrO-3(aq)', '-67.07', '161.71', '18.60', 'nan'],
 ['I2(s)', '0', '116.14', '0', '54.44'],
 ['I2(g)', '62.44', '260.58', '19.36', '36.90'],
 ['I2(aq)', '22.6', '137.2', '16.40', 'nan'],
 ['I(g)', '106.84', '180.68', '70.28', '20.79'],
 ['I-(aq)', '-55.19', '111.3', '-51.57', '-142.3'],
 ['I-3(aq)', '-51.5', '239.3', '-51.4', 'nan'],
 ['HI(g)', '26.48', '206.48', '1.72', '29.16'],
 ['ICl(g)', '17.78', '247.44', '-5.44', '35.56'],
 ['IBr(g)', '40.84', '258.66', '3.71', '36.44'],
 ['Ne(g)', '0', '146.22', '0', '20.79'],
 ['Ar(g)', '0', '154.73', '0', '20.79'],
 ['Kr(g)', '0', '163.97', '0', '20.79'],
 ['Xe(g)', '0', '169.57', '0', '20.79']]

In [23]:
oxtoby_df = pd.DataFrame(oxtoby, columns=['formula', 'H', 'S', 'G', 'Cp'])
oxtoby_df.head()

Unnamed: 0,formula,H,S,G,Cp
0,H(g),217.96,114.6,203.26,20.78
1,H2(g),0.0,130.57,0.0,28.82
2,H+(aq),0.0,0.0,0.0,0.0
3,H3O+(aq),-285.83,69.91,-237.18,75.29
4,Li(s),0.0,29.12,0.0,24.77


In [24]:
oxtoby_df.shape

(362, 5)

In [25]:
energy_columns = ['G', 'H', 'S', 'Cp']

for col in energy_columns:
    oxtoby_df[col] = oxtoby_df[col].astype(float)
    
oxtoby_df.head()

Unnamed: 0,formula,H,S,G,Cp
0,H(g),217.96,114.6,203.26,20.78
1,H2(g),0.0,130.57,0.0,28.82
2,H+(aq),0.0,0.0,0.0,0.0
3,H3O+(aq),-285.83,69.91,-237.18,75.29
4,Li(s),0.0,29.12,0.0,24.77


In [26]:
oxtoby_df.loc[:, 'H'] *= 1000
oxtoby_df.loc[:, 'G'] *= 1000
oxtoby_df.head()

Unnamed: 0,formula,H,S,G,Cp
0,H(g),217960.0,114.6,203260.0,20.78
1,H2(g),0.0,130.57,0.0,28.82
2,H+(aq),0.0,0.0,0.0,0.0
3,H3O+(aq),-285830.0,69.91,-237180.0,75.29
4,Li(s),0.0,29.12,0.0,24.77


In [27]:
oxtoby_df = oxtoby_df[['formula', 'G', 'H', 'S', 'Cp']]
oxtoby_df.head()

Unnamed: 0,formula,G,H,S,Cp
0,H(g),203260.0,217960.0,114.6,20.78
1,H2(g),0.0,0.0,130.57,28.82
2,H+(aq),0.0,0.0,0.0,0.0
3,H3O+(aq),-237180.0,-285830.0,69.91,75.29
4,Li(s),0.0,0.0,29.12,24.77


In [28]:
oxtoby_df.isna().sum()

formula     0
G           2
H           0
S           2
Cp         73
dtype: int64

In [29]:
oxtoby_df = oxtoby_df.dropna(subset=['G'])
oxtoby_df.shape

(360, 5)

## Combining Tables

In [30]:
thermo_df = pd.concat([chnosz, oxtoby_df], ignore_index=True).fillna(np.NaN)
thermo_df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,Cp,G,H,S,abbrv,formula,name
0,75.29,-237180.0,-285830.0,69.91,,H2O(l),water
1,0.0,0.0,0.0,65.339854,,e-(aq),e-
2,0.0,0.0,0.0,0.0,H+,H+(aq),H+
3,59.4128,-292599.672,-278453.568,11.2968,Li+,Li+(aq),Li+
4,37.90704,-261880.744,-240299.672,58.40864,Na+,Na+(aq),Na+


In [31]:
thermo_df = thermo_df[['formula', 'abbrv', 'name', 'G', 'H', 'S', 'Cp']]
thermo_df.tail()

Unnamed: 0,formula,abbrv,name,G,H,S,Cp
3444,IBr(g),,,3710.0,40840.0,258.66,36.44
3445,Ne(g),,,0.0,0.0,146.22,20.79
3446,Ar(g),,,0.0,0.0,154.73,20.79
3447,Kr(g),,,0.0,0.0,163.97,20.79
3448,Xe(g),,,0.0,0.0,169.57,20.79


In [32]:
thermo_df.shape

(3449, 7)

In [33]:
thermo_df.drop_duplicates(subset='formula', keep='last', inplace=True)
thermo_df.reset_index(drop=True, inplace=True)
thermo_df.shape

(2837, 7)

In [34]:
thermo_df[thermo_df['formula'].str.contains('\.')].index

Int64Index([1765, 1766, 1767, 1770, 1785, 1787, 2596], dtype='int64')

In [35]:
thermo_df = thermo_df.drop(index=thermo_df[thermo_df['formula'].str.contains('\.')].index)
thermo_df.reset_index(drop=True, inplace=True)
thermo_df.shape

(2830, 7)

In [36]:
def formula_state_separator(formula, keep_state=False):
    '''
    Separates the state from a formula string.
    
    --Parameters--
    formula:        str
        a string of a single chemical formula
    
    --Output--
    tuple (str)
        
    --Examples--
    >>> formula_state_separator('NaCl(aq)')
    ('NaCl', 'aq')
    
    >>> formula_state_separator('NaCl')
    'NaCl'
    '''
    try:
        regex = re.search('(?<=\()[aglsq]+', formula)
        formula = formula[:regex.start() - 1]
        if keep_state:
            state = regex.group(0)
            return formula, state
        else:
            return formula
    except:
        return formula_state_separator

In [37]:
formulas = [formula_state_separator(f) for f in thermo_df['formula'].astype(str)]
formulas[:5]

['e-', 'HCO3-', 'NO3-', 'NO2-', 'NH4+']

In [38]:
new_list = []
for a, f in zip(list(thermo_df['abbrv']), formulas):
    if pd.isna(a):
        new_list.append(f)
    else:
        new_list.append(a)
new_list[:10]

['e-',
 'HCO3-',
 'NO3-',
 'NO2-',
 'NH4+',
 'H2PO4-',
 'HSO3-',
 'HSO4-',
 'S2O3-2',
 'S2O8-2']

In [39]:
thermo_df['abbrv'] = new_list

In [40]:
thermo_df.head()

Unnamed: 0,formula,abbrv,name,G,H,S,Cp
0,e-(aq),e-,e-,0.0,0.0,65.339854,0.0
1,HCO3-(aq),HCO3-,HCO3-,-586939.888,-689933.232,98.44952,-35.39664
2,NO3-(aq),NO3-,NO3-,-110905.288,-206810.936,146.94208,-68.6176
3,NO2-(aq),NO2-,NO2-,-32216.8,-104600.0,123.0096,-97.4872
4,NH4+(aq),NH4+,NH4+,-79454.16,-133260.4,111.16888,65.85616


In [41]:
thermo_df.tail()

Unnamed: 0,formula,abbrv,name,G,H,S,Cp
2825,IBr(g),IBr,,3710.0,40840.0,258.66,36.44
2826,Ne(g),Ne,,0.0,0.0,146.22,20.79
2827,Ar(g),Ar,,0.0,0.0,154.73,20.79
2828,Kr(g),Kr,,0.0,0.0,163.97,20.79
2829,Xe(g),Xe,,0.0,0.0,169.57,20.79


In [42]:
masses = [Substance.from_formula(f).mass for f in thermo_df['formula'].astype(str)]
masses[:5]

[0.0005489, 61.0165489, 62.004548899999996, 46.0055489, 18.038451100000003]

In [43]:
thermo_df['mass'] = masses

In [44]:
pickle.dump(thermo_df, open('../data/processed/thermo_df.p', 'wb'))

## replicate CHNOSZ stoichiometry csv

Originally, I had imported the `CHNOSZ` stoichiometry csv. However, I found the need to update it with whatever new entries were present in `thermo`.

The stoichiometry csv is formatted so that each formula is count-vectorized by element.

In [45]:
stoich_og = pd.read_csv('../data/external/thermo/chnosz_stoich.csv')
stoich_og.head()

Unnamed: 0.1,Unnamed: 0,Ag,Al,Ar,As,Au,B,Ba,Be,Bi,...,Tm,U,V,W,Xe,Y,Yb,Z,Zn,Zr
0,H2O,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,(Z-1),0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1,0,0
2,H+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Li+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Na+,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


For my workflow, we will rename the columns by atomic number and have charge occupy column `0` to be in-line with how `chempy` works. We'll recreate this by scanning the formulas in `thermo` and then vectorizing the formulas.

In [46]:
thermo_df.loc[10, 'formula']

'ClO3-(aq)'

In [49]:
chlorate = Substance.from_formula(thermo_df.loc[10, 'formula'])
chlorate.composition

{17: 1, 8: 3, 0: -1}

In [50]:
for k, v in chlorate.composition.items():
    print (k, v)

17 1
8 3
0 -1


Using `chempy` it becomes quite simple to vectorize.

From the composition dictionary:

- key $\Rightarrow$ `stoich[key]`  
- value $\Rightarrow$ `stoich.at[index, key]`

In [52]:
formulas = set(thermo_df['formula'])

In [53]:
stoich_df = pd.DataFrame(formulas)
stoich_df.head()

Unnamed: 0,0
0,C4H9SH(aq)
1,C10H14N5O10P2-(aq)
2,C5H12S2(l)
3,C7H12(aq)
4,C6H10N3O2+(aq)


In [54]:
# naming the column -1 for now so we can sort them later

stoich_df.columns = [-1]
stoich_df.head()

Unnamed: 0,-1
0,C4H9SH(aq)
1,C10H14N5O10P2-(aq)
2,C5H12S2(l)
3,C7H12(aq)
4,C6H10N3O2+(aq)


In [55]:
for i, f in enumerate(stoich_df[-1]):
    try:
        sub = Substance.from_formula(f)
        for k, v in sub.composition.items():
            stoich_df.at[i, k] = v
    except:
        pass
        
stoich_df.head()

Unnamed: 0,-1,6,1,16,7,8,15,0,63,71,...,10,41,86,18,51,91,75,32,43,87
0,C4H9SH(aq),4.0,10.0,1.0,,,,,,,...,,,,,,,,,,
1,C10H14N5O10P2-(aq),10.0,14.0,,5.0,10.0,2.0,-1.0,,,...,,,,,,,,,,
2,C5H12S2(l),5.0,12.0,2.0,,,,,,,...,,,,,,,,,,
3,C7H12(aq),7.0,12.0,,,,,,,,...,,,,,,,,,,
4,C6H10N3O2+(aq),6.0,10.0,,3.0,2.0,,1.0,,,...,,,,,,,,,,


In [56]:
stoich_df = stoich_df[sorted(stoich_df.columns)]
stoich_df.head()

Unnamed: 0,-1,0,1,2,3,4,5,6,7,8,...,80,81,82,83,86,87,88,90,91,92
0,C4H9SH(aq),,10.0,,,,,4.0,,,...,,,,,,,,,,
1,C10H14N5O10P2-(aq),-1.0,14.0,,,,,10.0,5.0,10.0,...,,,,,,,,,,
2,C5H12S2(l),,12.0,,,,,5.0,,,...,,,,,,,,,,
3,C7H12(aq),,12.0,,,,,7.0,,,...,,,,,,,,,,
4,C6H10N3O2+(aq),1.0,10.0,,,,,6.0,3.0,2.0,...,,,,,,,,,,


In [57]:
stoich_df.rename(columns={-1: 'formula'}, inplace=True)
stoich_df.fillna(0, inplace=True)
stoich_df.head()

Unnamed: 0,formula,0,1,2,3,4,5,6,7,8,...,80,81,82,83,86,87,88,90,91,92
0,C4H9SH(aq),0.0,10.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,C10H14N5O10P2-(aq),-1.0,14.0,0.0,0.0,0.0,0.0,10.0,5.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,C5H12S2(l),0.0,12.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C7H12(aq),0.0,12.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,C6H10N3O2+(aq),1.0,10.0,0.0,0.0,0.0,0.0,6.0,3.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
pickle.dump(stoich_df, open('../data/processed/stoich_df.p', 'wb'))