In [1]:
import re
import pubchempy as pcp
import pandas as pd
import numpy as np

from chempy import balance_stoichiometry
from chempy import Substance
from chempy import Reaction
from chempy.util import periodic

## straightforward equation balancer

In [2]:
reactants = 'CH4 + O2'
products = 'CO2 + H2O'

In [3]:
def split(side):
    """Split one side of a chemical equation into its species.

    Parameters
    ----------
    side : str
        Species separated by ``+`` and/or ``,``, e.g. ``'CH4 + O2'``.

    Returns
    -------
    list of str
        The species with surrounding whitespace removed. Empty fragments
        (from a leading, trailing or doubled separator) are dropped.

    Notes
    -----
    Every ``+`` is treated as a separator, so a charge written with ``+``
    (``'Na+'``) is split as well.
    """
    # Raw string: '\+' and '\,' in a plain literal are invalid escape
    # sequences and produce a warning on recent Python versions. Inside a
    # character class neither character needs escaping.
    fragments = re.split(r'[+,]+', side)
    return [frag.strip() for frag in fragments if frag.strip()]

In [4]:
split(reactants)

['CH4', 'O2']

In [5]:
output = Reaction(*balance_stoichiometry(split(reactants), split(products)))

In [6]:
keys = output.keys()
keys

{'CH4', 'CO2', 'H2O', 'O2'}

In [7]:
substances = {k: Substance.from_formula(k) for k in keys}
substances

{'H2O': <Substance(name=H2O, ...)>,
 'CH4': <Substance(name=CH4, ...)>,
 'CO2': <Substance(name=CO2, ...)>,
 'O2': <Substance(name=O2, ...)>}

In [8]:
output.unicode(substances)

'CH₄ + 2 O₂ → CO₂ + 2 H₂O'

## pubchempy

In [9]:
sulfate = Substance.from_formula('SO4-2')

In [10]:
sulfate.composition

{16: 1, 8: 4, 0: -2}

In [11]:
[*sulfate.composition]

[16, 8, 0]

In [12]:
test_dict = {'sulfate': 'SO4-2'}

In [88]:
cop = 'copper(I) sulfate'

In [89]:
salt = pcp.get_compounds(cop, 'name', list_return='flat')
salt[0].molecular_formula

'Cu2O4S'

In [15]:
Substance.from_formula(salt[0].molecular_formula)

## going from sentence -> formula

In [16]:
import chemdataextractor as cde

In [90]:
sentence = 'ethane reacts with oxygen to make carbon dioxide and water'

In [91]:
processed = cde.doc.Paragraph(sentence)

In [92]:
processed.raw_tokens

[['ethane',
  'reacts',
  'with',
  'oxygen',
  'to',
  'make',
  'carbon',
  'dioxide',
  'and',
  'water']]

In [93]:
processed.pos_tags

[['NN', 'VBZ', 'IN', 'NN', 'TO', 'VB', 'NN', 'NN', 'CC', 'NN']]

In [94]:
processed.pos_tagged_tokens

[[('ethane', 'NN'),
  ('reacts', 'VBZ'),
  ('with', 'IN'),
  ('oxygen', 'NN'),
  ('to', 'TO'),
  ('make', 'VB'),
  ('carbon', 'NN'),
  ('dioxide', 'NN'),
  ('and', 'CC'),
  ('water', 'NN')]]

In [95]:
processed.cems

[Span('ethane', 0, 6), Span('oxygen', 19, 25), Span('carbon dioxide', 34, 48)]

In [96]:
processed.cems[0].text

'ethane'

In [99]:
def pcp_formula_namer(name):
    """Return the molecular formula of the first PubChem hit for ``name``.

    The search is done with ``pcp.get_compounds`` in the ``'name'`` namespace
    and ``listkey_count=1``. Only the first returned compound is used;
    indexing fails if the search returns nothing.
    """
    matches = pcp.get_compounds(name, 'name', listkey_count=1)
    first_match = matches[0]
    return first_match.molecular_formula

In [101]:
pcp_formula_namer('potassium permanganate')

'KMnO4'

## attempts at decoding names

In [24]:
def polyatomic_formulizer(ion):
    """Placeholder for converting a polyatomic ion name to its formula.

    The cell previously ended right after the ``def`` line, which is a
    SyntaxError. Until the conversion is written, calling this function
    raises ``NotImplementedError`` so that the cell itself can be executed.

    Parameters
    ----------
    ion : str
        Name of the polyatomic ion (currently unused).

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError('polyatomic_formulizer is not implemented yet')

In [None]:
def anion_namer(anion):
    """Map an '-ide' anion name to the element name it is derived from.

    Parameters
    ----------
    anion : str
        A name such as ``'chloride'`` (case-insensitive).

    Returns
    -------
    str
        For a name ending in ``'ide'``: the first entry of
        ``periodic.lower_names`` that contains the stem (the name without
        ``'ide'``). Matching is by substring, not by prefix. In every other
        case, including when no element matches, the lower-cased input is
        returned unchanged.
    """
    anion = anion.lower()
    if not anion.endswith('ide'):
        return anion

    stem = anion[:-3]
    # An empty stem is a substring of every name and would "match" the very
    # first element, so it is skipped.
    if stem:
        for element in periodic.lower_names:
            if stem in element:
                return element

    # No element found: fall back to the name itself instead of an implicit
    # None, matching what the non-'ide' branch returns.
    return anion

In [27]:
def unprefixer(name):
    """Strip a leading numerical prefix from a name and return its count.

    Parameters
    ----------
    name : str
        One word of a compound name, e.g. ``'dichloride'`` or ``'tetroxide'``
        (case-insensitive).

    Returns
    -------
    tuple
        ``(anion_namer(rest), num)`` where ``rest`` is the name without its
        prefix and ``num`` is the multiplier the prefix stands for (1 when no
        prefix is found).

    Notes
    -----
    A prefix ending in 'a' or 'o' may lose that vowel in front of a vowel
    ('tetr-oxide', 'dec-oxide'). Prefixes ending in 'i' are never shortened;
    shortening 'di' to 'd' would treat every name starting with 'd' as a
    "di-" compound. The full prefix is tried before the shortened one, so
    'monoxide' still leaves 'xide' for ``anion_namer`` to resolve.
    """
    prefixes = {
        'mono': 1,
        'di': 2,
        'tri': 3,
        'tetra': 4,
        'penta': 5,
        'hexa': 6,
        'hepta': 7,
        'octa': 8,
        'nona': 9,
        'deca': 10
    }
    vowels = {'a', 'e', 'i', 'o', 'u'}

    num = 1
    name = name.lower()

    for prefix, count in prefixes.items():
        if name.startswith(prefix):
            num = count
            name = name[len(prefix):]
            break  # a name carries at most one leading prefix

        if prefix[-1] in ('a', 'o'):
            elided = prefix[:-1]
            rest = name[len(elided):]
            # rest[:1] is '' for an exhausted name, which is not in the set
            if name.startswith(elided) and rest[:1] in vowels:
                num = count
                name = rest
                break

    return anion_namer(name), num

In [31]:
unprefixer('dichloride')

('chlorine', 2)

In [32]:
def atomic_indexer(element, Z=False):
    """Look up the position of an element in chempy's periodic tables.

    Parameters
    ----------
    element : str
        A full element name (more than two characters, case-insensitive) or
        an element symbol (one or two characters, matched exactly).
    Z : bool, optional
        If true, add 1 to the 0-based table index so the result is the
        atomic number.

    Returns
    -------
    int or float
        The index (or atomic number). ``np.nan`` when a one- or two-character
        symbol is not found.

    Raises
    ------
    ValueError
        When a name longer than two characters is not in
        ``periodic.lower_names``.
    """
    offset = int(Z)
    if len(element) > 2:
        return periodic.lower_names.index(element.lower()) + offset
    try:
        return periodic.symbols.index(element) + offset
    except ValueError:
        # tuple/list .index() signals "not found" with ValueError; anything
        # else is a real error and should surface.
        return np.nan

In [33]:
atomic_indexer('Chlorine', Z=True)

17

In [34]:
periodic.symbols.index('Cl')

16

In [35]:
# https://www.meta-synthesis.com/webbook/37_ak/triangles.html

def is_ionic(an1, an2, m=0.4):
    """Classify the bond between two elements from their electronegativities.

    Parameters
    ----------
    an1, an2 : int
        Atomic numbers of the two elements, in either order.
    m : float, optional
        Width of the band that is reported as ``'Intermediate'``.

    Returns
    -------
    bool or str
        ``True`` when the pair lies inside the ionic region,
        ``'Intermediate'`` when it lies within ``m`` of that region, and
        ``False`` otherwise (including when an electronegativity is NaN).
    """
    # The table carries a placeholder at index 0, so it is indexed by atomic
    # number directly (no "- 1").
    en1 = electronegativities[an1]
    en2 = electronegativities[an2]

    en_mean = np.mean((en1, en2))
    # The difference is a magnitude; without abs() the result would depend
    # on which element is passed first.
    en_diff = abs(en1 - en2)

    if (en_diff >= (4.4 - 2*en_mean)) and (en_diff >= (-3.2 + 2*en_mean)):
        return True
    elif (en_diff >= (4.6 - m - 2*en_mean)) and (en_diff >= (-3 - m + 2*en_mean)):
        return 'Intermediate'
    else:
        return False

In [36]:
def binary_formulizer(n):
    """Build a chempy ``Substance`` from a space-separated compound name.

    Each word of ``n`` goes through ``unprefixer`` to get an element name and
    a count, and the element name is turned into an atomic number with
    ``atomic_indexer``. The formula lists the elements in the order their
    words first appear; a count of 1 is left implicit. A repeated element
    keeps its first position and its last count.
    """
    counts_by_z = {}
    for word in n.split():
        element_name, count = unprefixer(word)
        atomic_number = atomic_indexer(element_name, Z=True)
        counts_by_z[atomic_number] = count

    pieces = []
    for atomic_number, count in counts_by_z.items():
        symbol = periodic.symbols[atomic_number - 1]
        pieces.append(symbol if count == 1 else symbol + str(count))

    return Substance.from_formula(''.join(pieces))

In [37]:
# Electronegativity lookup table indexed by atomic number: the leading np.nan
# is a placeholder for index 0, so electronegativities[Z] belongs to the
# element with atomic number Z (index 1 is hydrogen, 2.2). np.nan marks
# elements without a value.
# NOTE(review): the values look like the Pauling scale -- confirm the source.
electronegativities = (np.nan,
    2.2, np.nan, 0.98, 1.57, 2.04, 2.55, 3.04, 3.44, 3.98, np.nan, 0.93, 1.31,
    1.61,1.9, 2.19, 2.58, 3.16, np.nan, 0.82, 1.0, 1.36, 1.54, 1.63, 1.66,
    1.55, 1.83,1.88, 1.91, 1.9, 1.65, 1.81, 2.01, 2.18, 2.55, 2.96, 3.0, 0.82,
    0.95, 1.22, 1.33, 1.6, 2.16, 1.9, 2.2, 2.28, 2.2, 1.93, 1.69, 1.78, 1.96,
    2.05, 2.1, 2.66, 2.6, 0.79, 0.89, 1.1, 1.12, 1.13, 1.14, 1.13, 1.17, 1.2,
    1.2, 1.2, 1.22, 1.23, 1.24, 1.25, 1.1, 1.27, 1.3, 1.5, 2.36, 1.9, 2.2, 2.2,
    2.28, 2.54, 2.0, 1.62, 1.87, 2.02, 2.0, 2.2, 2.2, 0.7, 0.9, 1.1, 1.3, 1.5,
    1.38, 1.36, 1.28, 1.13, 1.28, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, 1.3, np.nan,
    np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
    np.nan, np.nan, np.nan, np.nan, np.nan
)

In [38]:
# CAS-style group labels indexed by atomic number: groups_cas[Z] is a
# (group number, 'A' or 'B') pair, and index 0 is a placeholder.
#
# Every entry is a 2-tuple, including the elements that have no group
# assigned here (the 14 after each (3, 'B') in the last two periods). They
# share the (np.nan, '') placeholder so that groups_cas[Z][0] never fails
# with "float is not subscriptable".
_NO_GROUP = (np.nan, '')
_GROUPS_1A_2A = ((1, 'A'), (2, 'A'))
_GROUPS_3A_8A = ((3, 'A'), (4, 'A'), (5, 'A'), (6, 'A'), (7, 'A'), (8, 'A'))
_GROUPS_4B_2B = (
    (4, 'B'), (5, 'B'), (6, 'B'), (7, 'B'),
    (8, 'B'), (8, 'B'), (8, 'B'),
    (1, 'B'), (2, 'B'),
)

# 18-element period: 1A-2A, 3B, 4B-2B, 3A-8A
_PERIOD_18 = _GROUPS_1A_2A + ((3, 'B'),) + _GROUPS_4B_2B + _GROUPS_3A_8A
# 32-element period: same, with 14 unassigned elements after 3B
_PERIOD_32 = (
    _GROUPS_1A_2A + ((3, 'B'),) + (_NO_GROUP,) * 14
    + _GROUPS_4B_2B + _GROUPS_3A_8A
)

groups_cas = (
    (_NO_GROUP,)                                # index 0 placeholder
    + ((1, 'A'), (8, 'A'))                      # Z = 1-2
    + 2 * (_GROUPS_1A_2A + _GROUPS_3A_8A)       # Z = 3-18
    + 2 * _PERIOD_18                            # Z = 19-54
    + 2 * _PERIOD_32                            # Z = 55-118
)

In [39]:
# One electronegativity per key of the composition dict, in dict order.
# The charge entry (key 0) picks up the np.nan placeholder stored at index 0
# of the table.
ens = [electronegativities[z] for z in sulfate.composition.keys()]

# ens.index(min(ens)) is the position of the lowest electronegativity, so this
# selects the least electronegative element (max(ens) would select the most
# electronegative one)
[*sulfate.composition][ens.index(min(ens))]

16

In [40]:
# CAS group number of the most electronegative element, minus 8.
# NOTE(review): presumably meant as that element's oxidation number; this
# only makes sense for main-group elements -- confirm.
groups_cas[[*sulfate.composition][ens.index(max(ens))]][0] - 8

-2

In [41]:
sulfate.composition[8]

4

In [55]:
    #     # list the electronegativities for each element
    #     ens = [electronegativities[z] for z in elems]

    #     # ens.index(max(ens)) gets the index of the element with the max EN
    #     zmax = elems[ens.index(max(ens))]

    #     num_zmax = substance.composition[zmax]

    #     # works for main-group elements. fix later
    #     onmax = groups_cas[zmax][0] - 8

    #     # assign value in dictionary
    #     ons[zmax] = onmax

    #     zmin = elems[ens.index(min(ens))]
    #     num_zmin = substance.composition[zmin]
    #     onmin = groups_cas[zmin][0]
    #     ons[zmin] = onmin

    # pos = num_zmin * onmin
    # neg = num_zmax * onmax

In [85]:
def oxidation_numberer(substance):
    """Return the oxidation number of each element in ``substance``.

    Only substances made of a single element are handled so far: the net
    charge is shared equally between the atoms.

    Parameters
    ----------
    substance : object
        Anything with a ``composition`` mapping of atomic number -> count,
        where key 0 (if present) holds the net charge.

    Returns
    -------
    dict
        ``{atomic number: oxidation number}``. Empty when the composition
        holds no element at all.

    Raises
    ------
    NotImplementedError
        When more than one element is present.
    """
    # in the future, implement SMILES to calculate ON most accurately

    composition = substance.composition

    # 0 is placeholder for charge in substance dictionary
    charge = composition.get(0, 0)

    # Real elements only. Counts are looked up by key rather than by
    # position, so the result does not depend on where the charge entry sits
    # in the dictionary.
    atom_counts = {z: count for z, count in composition.items() if z > 0}

    if not atom_counts:
        return {}

    # With one type of element every atom carries an equal share of the
    # charge (0 for a neutral substance).
    if len(atom_counts) == 1:
        (elem, count), = atom_counts.items()
        return {elem: charge / count}

    # The multi-element case used to reference undefined names and fail with
    # a NameError; say what is missing instead.
    raise NotImplementedError(
        'oxidation numbers for substances with more than one element '
        'are not implemented yet'
    )

In [86]:
tri53 = Substance.from_formula('I3-')

In [87]:
oxidation_numberer(tri53)

{53: -0.3333333333333333}

In [52]:
XeF4 = binary_formulizer('xenon tetrafluoride')
XeF4

In [46]:
print(list(XeF4.composition.keys()))
print([*XeF4.composition])

[54, 9]
[54, 9]


In [47]:
XeF4.mass

207.286612652