In [21]:
import re
import pickle
import pubchempy as pcp
import pandas as pd
import numpy as np
import itertools
import operator

from chempy import balance_stoichiometry
from chempy import Substance
from chempy import Reaction
from chempy.util import periodic

In [80]:
stoich_df = pickle.load(open('../data/processed/stoich_df.p', 'rb'))
thermo_df = pickle.load(open('../data/processed/thermo_df.p', 'rb'))

## pubchempy

In [81]:
def get_gibbs(formula, df=False):     
    '''
    Retrieves the free energy value, in J, of a single substance
    
    --Parameters--
    formula:        str
        a string of a single chemical formula
    
    --Output--
    list (float)    
        
    --Examples--
    >>> get_gibbs('NaCl(aq)')
    array([-388735.44])
    '''
    if (thermo_df['formula'] == formula).max():
        matches = thermo_df[thermo_df['formula'] == formula]
    else:
        matches = thermo_df[thermo_df['formula'].map(
            lambda x: x[:len(formula)] == formula)]
        matches = matches[matches['formula'].map(
            formula_state_separator) == formula]
        
    if df:
        return matches
    else:
        return list(matches['G'])

In [82]:
def state_predictor(formula):
    df = get_gibbs(formula, df=True)
    return list(df.sort_values(by='G')['formula'])[0]

In [83]:
def formula_state_separator(formula, keep_state=False):
    '''
    Separates the state from a formula string.
    
    --Parameters--
    formula:        str
        a string of a single chemical formula
    
    --Output--
    tuple (str)
        
    --Examples--
    >>> formula_state_separator('NaCl(aq)')
    ('NaCl', 'aq')
    
    >>> formula_state_separator('NaCl')
    'NaCl'
    '''
    try:
        regex = re.search('(?<=\()[aglsq]+', formula)
        formula = formula[:regex.start() - 1]
        if keep_state:
            state = regex.group(0)
            return formula, state
        else:
            return formula
    except:
        return formula

In [84]:
def Z_unique(substances):
    '''
    Returns a set representing unique atomic numbers present within a list of
    chemical formulas.
    
    --Parameters--
    substances:     iterable (str)
        any iterable containing strings with valid chemical formulas
    
    --Output--
    set (int)
        atomic numbers of each unique element present in substances
        
    --Example--
    >>> Z_unique(['CH4', 'H2O'])
    {1, 6, 8}
    '''
    if type(substances) == str:
        substances = [substances]
    
    composition = []
    for s in substances:
        sub = Substance.from_formula(s)
        composition += [*sub.composition]
    return set(composition)

In [85]:
list(Substance.from_formula('NaCl(aq)').composition.keys())

[11, 17]

In [86]:
def stoich_filter(substances, df=False, thorough=False, exact=False):
    '''
    Returns a masked copy of the stoich dataframe containing elements that
    only contain the elements present in substances. 
    
    --Parameters--
    substances:     iterable (str)
        any iterable containing strings with valid chemical formulas
    
    --Output--
    DataFrame
    '''
    if type(substances) == str:
        substances = [substances]
    
    stoich_temp = stoich_df.copy()
    
    # mask to keep the charge and formula columns in final dataframe
    z_keep = [0, 'formula'] + list(Z_unique(substances))
    
    # get all other columns
    column_mask = [col for col in stoich_temp.columns if col not in z_keep]
    for col in column_mask:
        # return the dataframe where these columns are all 0
        stoich_temp = stoich_temp[stoich_temp[col] == 0]
    
    # keep the columns where it's not all zero
    stoich_temp = stoich_temp.loc[(stoich_temp.drop(columns=['formula'])!=0).any(axis=1)]
    
    if exact:
        thorough = True
        substance = Substance.from_formula(substances[0])
        composition = substance.composition
        for z in list(composition.keys()):
            stoich_temp = stoich_temp[stoich_temp[z] == composition[z]]
    
    # return the dataframe with the columns we want to keep
    if df:
        return stoich_temp[z_keep]
    else:
        stoich_list = list(stoich_temp['formula'])
        if thorough:
            return [f for f in stoich_list]
        else:
            stoich_list = [formula_state_separator(f) for f in stoich_list]
            substances = [formula_state_separator(s) for s in substances]
            return [state_predictor(f) for f in stoich_list if f not in substances]

In [87]:
stoich_filter('CO2(g)', exact=True)

['CO2(aq)', 'CO2(g)']

In [88]:
stoich_filter('ClNa', exact=True)

['NaCl(s)', 'NaCl(aq)']

In [89]:
# https://stackoverflow.com/questions/1518522/

def most_common(L):
  # get an iterable of (item, iterable) pairs
  SL = sorted((x, i) for i, x in enumerate(L))
  # print 'SL:', SL
  groups = itertools.groupby(SL, key=operator.itemgetter(0))
  # auxiliary function to get "quality" for an item
  def _auxfun(g):
    item, iterable = g
    count = 0
    min_index = len(L)
    for _, where in iterable:
      count += 1
      min_index = min(min_index, where)
    # print 'item %r, count %r, minind %r' % (item, count, min_index)
    return count, -min_index
  # pick the highest-count/earliest item
  return max(groups, key=_auxfun)[0]

In [90]:
stoich_df[stoich_df['formula'] == 'CH3OCH3(g)']

Unnamed: 0,formula,0,1,2,3,4,5,6,7,8,...,80,81,82,83,86,87,88,90,91,92
1574,CH3OCH3(g),0.0,6.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
# there has to be a better way
# https://stackoverflow.com/questions/1518522/

def formula_rearranger(formula):
    if formula in list(thermo_df['formula']):
        return formula
    elif formula in list(thermo_df['abbrv']):
        return formula
    elif formula in list(thermo_df['formula'].apply(formula_state_separator)):
        return formula
    else:
        formulas = stoich_filter(formula, exact=True)
        formulas = [formula_state_separator(f) for f in formulas]
        return most_common(formulas)

In [92]:
formula_rearranger('BaO4S')

'BaSO4'

In [93]:
def formula_from_name(name):
    names = list(thermo_df['name'])
    nicknames = list(thermo_df['abbrv'])
    
    target = []
    for n in names:
        try:
            if name in n:
                target.append(n)
        except:
            for nn in nicknames:
                try:
                    if name in nn:
                        target.append
                except:
                    pass
    
    formulas = pcp.get_compounds(name, 'name', listkey_count=1)
    formula = formulas[0].molecular_formula
    formula = formula_rearranger(formula)
    return formula

In [94]:
formula_from_name('dimethyl ether')

'C2H5OH'

## going from sentence -> formula

In [95]:
import chemdataextractor as cde

In [96]:
sentence = 'ethane reacts with oxygen to make carbon dioxide and water'

In [97]:
processed = cde.doc.Paragraph(sentence)

In [104]:
if 'water' in processed.raw_tokens[0]:
    print(True)

True


In [107]:
[formula_from_name(cem.text) for cem in processed.cems]

['C2H6', 'O2', 'CO2']

In [108]:
formula_from_name('water')

'H2O'

## attempts at decoding names

In [63]:
def anion_namer(anion):
    anion = anion.lower()
    if anion[-3:] == 'ide':
        stem = anion[:-3]
        for element in periodic.lower_names:
            if stem in element:
                return element
    else:
        return anion

In [64]:
def unprefixer(name):
    
    prefixes = {
        'mono': 1,
        'di': 2,
        'tri': 3,
        'tetra': 4,
        'penta': 5,
        'hexa': 6,
        'hepta': 7,
        'octa': 8,
        'nona': 9,
        'deca': 10
    }
    
    num = 1
    name = name
    
    for prefix in prefixes.keys():
        p = len(prefix)
        o = p - 1
        if prefix == name[:p]:
            num = prefixes[prefix]
            name = name[p:]
        elif prefix[:-1] == name[:o]:
            num = prefixes[prefix]
            name = name[o:]
        
#     return str(num), str(element_symbolizer(anion_formulizer(name)))
    return anion_namer(name), num

In [65]:
unprefixer('dichloride')

('chlorine', 2)

In [66]:
def atomic_indexer(element, Z=False):
    if len(element) > 2:
        an = periodic.lower_names.index(element.lower()) + int(Z)
    else:
        try:
            an = periodic.symbols.index(element) + int(Z)
        except:
            an = np.nan
    return an

In [67]:
atomic_indexer('Chlorine', Z=True)

17

In [68]:
periodic.symbols.index('Cl')

16

In [69]:
# https://www.meta-synthesis.com/webbook/37_ak/triangles.html

def is_ionic(an1, an2, m=0.4):
    
    en1 = electronegativities[an1 - 1]
    en2 = electronegativities[an2 - 1]
    
    en_mean = np.mean((en1, en2))
    en_diff = (en1 - en2)
    
    if (en_diff >= (4.4 - 2*en_mean)) and (en_diff >= (-3.2 + 2*en_mean)):
        return True
    elif (en_diff >= (4.6 - m - 2*en_mean)) and (en_diff >= (-3 - m + 2*en_mean)):
        return 'Intermediate'
    else:
        return False

In [70]:
def binary_formulizer(n):
    names = n.split()
    comp = {}
    for n in names:
        name, num = unprefixer(n)
        Z = atomic_indexer(name, Z=True)
        comp[Z] = num
    formula = ''
    for z in [*comp]:
        formula += periodic.symbols[z - 1]
        if comp[z] != 1:
            formula += str(comp[z])
            
    return Substance.from_formula(formula)

In [71]:
electronegativities = (np.nan,
    2.2, np.nan, 0.98, 1.57, 2.04, 2.55, 3.04, 3.44, 3.98, np.nan, 0.93, 1.31,
    1.61,1.9, 2.19, 2.58, 3.16, np.nan, 0.82, 1.0, 1.36, 1.54, 1.63, 1.66,
    1.55, 1.83,1.88, 1.91, 1.9, 1.65, 1.81, 2.01, 2.18, 2.55, 2.96, 3.0, 0.82,
    0.95, 1.22, 1.33, 1.6, 2.16, 1.9, 2.2, 2.28, 2.2, 1.93, 1.69, 1.78, 1.96,
    2.05, 2.1, 2.66, 2.6, 0.79, 0.89, 1.1, 1.12, 1.13, 1.14, 1.13, 1.17, 1.2,
    1.2, 1.2, 1.22, 1.23, 1.24, 1.25, 1.1, 1.27, 1.3, 1.5, 2.36, 1.9, 2.2, 2.2,
    2.28, 2.54, 2.0, 1.62, 1.87, 2.02, 2.0, 2.2, 2.2, 0.7, 0.9, 1.1, 1.3, 1.5,
    1.38, 1.36, 1.28, 1.13, 1.28, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, np.nan,
    np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
    np.nan, np.nan, np.nan, np.nan, np.nan
)

In [72]:
groups_cas = ((np.nan, ''),
    (1,'A'), (8,'A'), (1,'A'), (2,'A'), (3,'A'), (4,'A'), (5,'A'), (6,'A'),
    (7,'A'), (8,'A'), (1,'A'), (2,'A'), (3,'A'), (4,'A'), (5,'A'), (6,'A'),
    (7,'A'), (8,'A'), (1,'A'), (2,'A'), (3,'B'), (4,'B'), (5,'B'), (6,'B'),
    (7,'B'), (8,'B'), (8,'B'), (8,'B'), (1,'B'), (2,'B'), (3,'A'), (4,'A'),
    (5,'A'), (6,'A'), (7,'A'), (8,'A'), (1,'A'), (2,'A'), (3,'B'), (4,'B'),
    (5,'B'), (6,'B'), (7,'B'), (8,'B'), (8,'B'), (8,'B'), (1,'B'), (2,'B'),
    (3,'A'), (4,'A'), (5,'A'), (6,'A'), (7,'A'), (8,'A'), (1,'A'), (2,'A'),
    (3,'B'), np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
    np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, (4,'B'), (5,'B'), (6,'B'),
    (7,'B'), (8,'B'), (8,'B'), (8,'B'), (1,'B'), (2,'B'), (3,'A'), (4,'A'),
    (5,'A'), (6,'A'), (7,'A'), (8,'A'), (1,'A'), (2,'A'), (3,'B'), np.nan,
    np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
    np.nan, np.nan, np.nan, np.nan, (4,'B'), (5,'B'), (6,'B'), (7,'B'),
    (8,'B'), (8,'B'), (8,'B'), (1,'B'), (2,'B'), (3,'A'), (4,'A'), (5,'A'),
    (6,'A'), (7,'A'), (8,'A')
)

In [55]:
    #     # list the electronegativities for each element
    #     ens = [electronegativities[z] for z in elems]

    #     # ens.index(max(ens)) gets the index of the element with the max EN
    #     zmax = elems[ens.index(max(ens))]

    #     num_zmax = substance.composition[zmax]

    #     # works for main-group elements. fix later
    #     onmax = groups_cas[zmax][0] - 8

    #     # assign value in dictionary
    #     ons[zmax] = onmax

    #     zmin = elems[ens.index(min(ens))]
    #     num_zmin = substance.composition[zmin]
    #     onmin = groups_cas[zmin][0]
    #     ons[zmin] = onmin

    # pos = num_zmin * onmin
    # neg = num_zmax * onmax

In [74]:
def oxidation_numberer(substance):
    # in the future, implement SMILES to calculate ON most accurately

    # list of all elements present
    elems, num_elems = zip(*substance.composition.items())
    elems = [e for e in elems if e > 0]
    
    try:
        # 0 is placeholder for charge in substance dictionary
        charge = substance.composition[0]
    except:
        charge = 0
    
    # eventually return a dictionary that lists the oxidation numbers
    ons = {}
    

    # ON is 0 if there is only one type of element
    if len(elems) == 1:
        return {elems[0]: charge / num_elems[0]}
    
    # else:
    #     if 9 in elems:
    #         ons[9] = -1
        


    while charge != pos + neg:
        print('will finish later')
        break
    return ons

In [75]:
tri53 = Substance.from_formula('I3-')

In [76]:
oxidation_numberer(Substance.from_formula('H2CO3'))

NameError: name 'pos' is not defined

In [52]:
XeF4 = binary_formulizer('xenon tetrafluoride')
XeF4

In [46]:
print(list(XeF4.composition.keys()))
print([*XeF4.composition])

[54, 9]
[54, 9]


In [47]:
XeF4.mass

207.286612652