In [1]:
import chemdataextractor as cde
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pickle
import numpy as np
import pandas as pd

## Types of balancing questions:

- Reaction without product. Problem statement.  
    Sodium metal reacts with chlorine gas in a combination reaction. Write a balanced equation to describe this reaction.
- Reaction with product. Problem statement.  
    Sodium metal reacts with chlorine gas to form sodium chloride.  
    Sodium metal and chlorine gas react to form sodium chloride.
- Problem statement: equation.  
    Balance the following equation: Na(s) + Cl2(g) -> NaCl(s)
- Problem statement, reaction.  
    Write a balanced chemical equation to describe the reaction between sodium metal and chlorine gas.  
    Complete and balance the equations for each of the following reactions.

In [2]:
reaction_phrases = [
    'methane reacts with oxygen to form carbon dioxide and water vapor.',
    'CH4 reacts with O2 to form CO2 and H2O.',
    'oxygen and methane react to form carbon dioxide and water.',
    'methane and oxygen react in a combustion reaction.',
    'methane combusts in the presence of oxygen.',
    'write a balanced chemical equation to describe the combustion of methane and oxygen.',
    'carbon dioxide is formed when methane combusts in oxygen.'
]

for r in reaction_phrases:
    r1 = cde.doc.Paragraph(r)
    print(r1.pos_tagged_tokens)
    print(r1.cems)
    print('===========')
    print()

[[('methane', 'NN'), ('reacts', 'VBZ'), ('with', 'IN'), ('oxygen', 'NN'), ('to', 'TO'), ('form', 'VB'), ('carbon', 'NN'), ('dioxide', 'NN'), ('and', 'CC'), ('water', 'NN'), ('vapor', 'NN'), ('.', '.')]]
[Span('methane', 0, 7), Span('oxygen', 20, 26), Span('carbon dioxide', 35, 49)]

[[('CH4', 'NN'), ('reacts', 'VBZ'), ('with', 'IN'), ('O2', 'NN'), ('to', 'TO'), ('form', 'VB'), ('CO2', 'NN'), ('and', 'CC'), ('H2O', 'NN'), ('.', '.')]]
[Span('CH4', 0, 3), Span('O2', 16, 18), Span('CO2', 27, 30), Span('H2O', 35, 38)]

[[('oxygen', 'NN'), ('and', 'CC'), ('methane', 'NN'), ('react', 'VBP'), ('to', 'TO'), ('form', 'VB'), ('carbon', 'NN'), ('dioxide', 'NN'), ('and', 'CC'), ('water', 'NN'), ('.', '.')]]
[Span('oxygen', 0, 6), Span('methane', 11, 18), Span('carbon dioxide', 33, 47)]

[[('methane', 'NN'), ('and', 'CC'), ('oxygen', 'NN'), ('react', 'NN'), ('in', 'IN'), ('a', 'DT'), ('combustion', 'NN'), ('reaction', 'NN'), ('.', '.')]]
[Span('methane', 0, 7), Span('oxygen', 12, 18)]

[[('methane'

In [170]:
r2 = cde.doc.Paragraph('''What is the electron configuration of radon?''')
r2.pos_tagged_tokens

[[('What', 'WP'),
  ('is', 'VBZ'),
  ('the', 'DT'),
  ('electron', 'NN'),
  ('configuration', 'NN'),
  ('of', 'IN'),
  ('radon', 'NN'),
  ('?', '.')]]

In [171]:
r2.cems

[Span('radon', 38, 43)]

In [99]:
z = Problem(sample)

In [106]:
z.question

In [108]:
print(z.inference)

None


In [None]:
def remove_stops(doc, i=0, progress=None):
    if progress:
        progress(i)
    doc = word_tokenize(doc)  # Split into words.
    doc = [w.lower() for w in doc if not w in stop_words]  # Remove stopwords.
    doc = [w for w in doc if w.isalpha()]  # Remove numbers and punctuation.
    return doc

In [64]:
balance_phrases = [
    'balance the following equations',
    'balance the following chemical equation',
    'complete and balance the equation',
    'use coefficients to balance each equation',
    'write a balanced chemical equation',
    'write a balanced molecular equation',
    'what is the balanced chemical equation',
    'when the equation above is balanced with lowest whole-number coefficients, the coefficient for is'
]

balanced = [remove_stops(sent) for sent in balance_phrases]
balanced

[['balance', 'following', 'equations'],
 ['balance', 'following', 'chemical', 'equation'],
 ['complete', 'balance', 'equation'],
 ['use', 'coefficients', 'balance', 'equation'],
 ['write', 'balanced', 'chemical', 'equation'],
 ['write', 'balanced', 'molecular', 'equation'],
 ['balanced', 'chemical', 'equation'],
 ['equation', 'balanced', 'lowest', 'coefficients', 'coefficient']]

In [63]:
model = pickle.load(open('../model/model.p', 'rb'))
w2v = pickle.load(open('../model/w2v.p', 'rb'))

word_vectors = model.wv

In [62]:
probs = pd.read_csv('../data/textbook-problems.csv')

In [67]:
probs.head()

Unnamed: 0,filepath,number,text,txt,balancing,e_config,distance,distlist
0,bauer-5,1,Consider the following molecular-level diagram...,"['consider', 'following', 'diagrams', 'chemica...",0,0,4.080827,
1,bauer-5,2,Consider the following molecular-level diagram...,"['consider', 'following', 'diagrams', 'chemica...",0,0,4.080827,
2,bauer-5,3,Balance the following chemical equations. Clas...,"['balance', 'following', 'chemical', 'equation...",1,0,3.988288,
3,bauer-5,4,Balance the following equations and classify t...,"['balance', 'following', 'equations', 'classif...",1,0,3.983582,
4,bauer-5,5,"When heated, nickel(II) carbonate undergoes a ...","['when', 'heated', 'nickel', 'ii', 'carbonate'...",1,0,4.066166,


In [75]:
for i in probs.index:
    distlist = [word_vectors.wmdistance(probs.loc[i, 'txt'], ' '.join(j)) for j in balanced]
    probs.loc[i, 'distance'] = np.mean(distlist)
probs.sort_values(by='distance')[:50]

Unnamed: 0,filepath,number,text,txt,balancing,e_config,distance,distlist,dist-list
52,bauer-5,53,Add the physical state for each substance in t...,"['add', 'physical', 'state', 'substance', 'fol...",1,0,0.664386,,
12,bauer-5,13,Complete and balance the equations for each of...,"['complete', 'balance', 'equations', 'followin...",1,0,0.678309,,
33,bauer-5,34,Write a balanced equation to describe any acid...,"['write', 'balanced', 'equation', 'describe', ...",1,0,0.678775,,
229,tro-7,34,Consider the unbalanced chemical equation. Al(...,"['consider', 'unbalanced', 'chemical', 'equati...",1,0,0.679457,,
32,bauer-5,33,Write a balanced equation to describe any acid...,"['write', 'balanced', 'equation', 'describe', ...",1,0,0.688059,,
10,bauer-5,11,Complete and balance the equation for each of ...,"['complete', 'balance', 'equation', 'following...",1,0,0.691208,,
9,bauer-5,10,Sodium metal reacts with chlorine gas in a com...,"['sodium', 'metal', 'reacts', 'chlorine', 'gas...",1,0,0.692503,,
11,bauer-5,12,Complete and balance the equation for each of ...,"['complete', 'balance', 'equation', 'following...",1,0,0.696141,,
275,tro-7,80,A beaker of nitric acid is neutralized with ca...,"['a', 'beaker', 'nitric', 'acid', 'neutralized...",1,0,0.701652,,
29,bauer-5,30,Aqueous ammonium chromate reacts with aqueous ...,"['aqueous', 'ammonium', 'chromate', 'reacts', ...",1,0,0.705137,,
