In [1]:
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
import multiprocessing

# https://stackoverflow.com/questions/33073972/
import os
import time
# os.environ['TIKA_SERVER_JAR'] = 'https://repo1.maven.org/maven2/org/apache/tika/tika-server/1.19/tika-server-1.19.jar'
import tika
from tika import parser

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

In [50]:
from sklearn.model_selection import train_test_split

In [2]:
stoichpaths = []

for file in os.listdir('../data/external/stoichiometry/'):
    if 'pdf' in file:
        stoichpaths.append('../data/external/stoichiometry/'+file)

In [3]:
quanpaths = []

for file in os.listdir('../data/external/quantum/'):
    if 'pdf' in file:
        quanpaths.append('../data/external/quantum/'+file)

In [4]:
stop_words = stopwords.words('english')
stop_words += ['copyright',
               'cengage',
               'pearson',
               'learning',
               'may',
               'copied',
               'scanned',
               'duplicated',
               'chapter',
               'practice',
               'problem',
               'exercise',
               'review',
               'question',
               'figure',
               'follow']
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [5]:
def remove_stops(doc, i=0, progress=None):
    if progress:
        progress(i)
    doc = word_tokenize(doc)  # Split into words.
    doc = [w.lower() for w in doc if not w in stop_words]  # Remove stopwords.
    doc = [w for w in doc if w.isalpha()]  # Remove numbers and punctuation.
    return doc

In [7]:
def progress(doc, i):
    if (i+1) % 10000 == 0:
        print(f'{i + 1} out of {len(doc)}')

In [9]:
def grab_text(file, sleep=0, counter=0):
    
    if counter == 2:
        pass

    raw = parser.from_file(file)
    status = raw['status']
    
    # if things go well
    if status == 200:
        print(f"'{file}' successfully opened!")
        return raw['content']
    
    # if things don't go well
    else:
        print(f'! ! ! ! error code {status} ! ! ! !')
        print(f'! ! ! ! trying again ! ! ! !')
        
        # we can usually get around other errors by waiting
        time.sleep(5)
        counter += 1
        return grab_text(file, counter=counter)

In [16]:
stoichtext = []
for f in stoichpaths:
    print(f'opening {f}')
    text = grab_text(f)
    stoichtext.append(text)

opening ../data/external/stoichiometry/_burdge-ch8.pdf
'../data/external/stoichiometry/_burdge-ch8.pdf' successfully opened!
opening ../data/external/stoichiometry/_zumdahl-ch5.pdf
'../data/external/stoichiometry/_zumdahl-ch5.pdf' successfully opened!
opening ../data/external/stoichiometry/_mcmurry-ch6.pdf
'../data/external/stoichiometry/_mcmurry-ch6.pdf' successfully opened!
opening ../data/external/stoichiometry/_gilbert-ch7.pdf
'../data/external/stoichiometry/_gilbert-ch7.pdf' successfully opened!


In [23]:
quantext = []
for f in quanpaths:
    print(f'opening {f}')
    text = grab_text(f)
    quantext.append(text)

opening ../data/external/quantum/_zumdahl-ch2.pdf
'../data/external/quantum/_zumdahl-ch2.pdf' successfully opened!
opening ../data/external/quantum/_burdge-ch2.pdf
'../data/external/quantum/_burdge-ch2.pdf' successfully opened!
opening ../data/external/quantum/_mcmurry-ch2.pdf
'../data/external/quantum/_mcmurry-ch2.pdf' successfully opened!
opening ../data/external/quantum/_gilbert-ch3.pdf
'../data/external/quantum/_gilbert-ch3.pdf' successfully opened!


In [17]:
stoichtext[0]

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nC H A P T E R\n\nChemical Reactions\n\n8\n\n8.1 Chemical Equations\n• Interpreting and Writing Chemical Equations  \n• Balancing Chemical Equations • Patterns of \nChemical Reactivity\n\n8.2 Combustion Analysis \n• Determination of Empirical Formula\n\n8.3 Calculations with Balanced Chemical \nEquations \n• Moles of Reactants and Products • Mass of \nReactants and Products\n\n8.4 Limiting Reactants\n• Determining the Limiting Reactant • Reaction \nYield\n\n8.5 Periodic Trends in Reactivity of the Main \nGroup Elements\n• General Trends in Reactivity • Reactions of  \nthe Active Metals • Reactions of Other Main \nGroup Elements • Comparison of Group 1A  \nand Group 1B Elements\n\nIn marine coastal environments, harmful algal blooms known as “red tides” \n(shown here) result when microscopic algal species grow uncontrolled. These \nevents can potentially imperil populations of fish, seabirds, and marine \nmammals.

In [20]:
len('bur11161_ 08_266-305.indd   273 11/9/10   3:21 PM')

49

In [42]:
clean = re.sub('\n\n', r'PPAARRAAGGRRAAPPHH', stoichtext[0])
clean = re.sub('\n', r'', clean)
# #     clean = re.sub('(\d+)\n\n', r'\1PPAARRAAGGRRAAPPHHJJOOIINN', clean)
# clean = re.sub('([A-Za-z]+)\-\n\n', 'PPAARRAAGGRRAAPPHH', clean)
# clean = re.sub('\s\n\n', 'PPAARRAAGGRRAAPPHH', clean)
# clean = re.sub('\n\n\n\n', 'PPAARRAAGGRRAAPPHH', clean)
# clean = re.sub('\n\n', ' ', clean)
# clean = re.sub('\-\n', '', clean)
# clean = re.sub('\n', ' ', clean)
# clean = re.sub('\t', ' ', clean)
# clean = re.sub('\s\s', ' ', clean)
# clean = re.sub('\-([a-zA-Z]+)', r'\1', clean)
# clean = re.sub('\ue060', 'INFINITY', clean)
# #     clean = re.sub('([A-Za-z]+)JJOOIINNPPAARRAAGGRRAAPPHH(.+)PPAARRAAGGRRAAPPHHJJOOIINN([a-z]+)', r'\1\3', clean)    
clean = re.split('PPAARRAAGGRRAAPPHH', clean)
clean = [c for c in clean if len(c) > 49]
clean

stoichiometry = [TaggedDocument(wordpunct_tokenize(c), [1]) for c in clean]

In [51]:
strain, stest = train_test_split(stoichiometry, test_size=0.25)

In [43]:
clean = re.sub('\n\n', r'PPAARRAAGGRRAAPPHH', quantext[0])
clean = re.sub('\n', r'', clean)
# #     clean = re.sub('(\d+)\n\n', r'\1PPAARRAAGGRRAAPPHHJJOOIINN', clean)
# clean = re.sub('([A-Za-z]+)\-\n\n', 'PPAARRAAGGRRAAPPHH', clean)
# clean = re.sub('\s\n\n', 'PPAARRAAGGRRAAPPHH', clean)
# clean = re.sub('\n\n\n\n', 'PPAARRAAGGRRAAPPHH', clean)
# clean = re.sub('\n\n', ' ', clean)
# clean = re.sub('\-\n', '', clean)
# clean = re.sub('\n', ' ', clean)
# clean = re.sub('\t', ' ', clean)
# clean = re.sub('\s\s', ' ', clean)
# clean = re.sub('\-([a-zA-Z]+)', r'\1', clean)
# clean = re.sub('\ue060', 'INFINITY', clean)
# #     clean = re.sub('([A-Za-z]+)JJOOIINNPPAARRAAGGRRAAPPHH(.+)PPAARRAAGGRRAAPPHHJJOOIINN([a-z]+)', r'\1\3', clean)    
clean = re.split('PPAARRAAGGRRAAPPHH', clean)
clean = [c for c in clean if len(c) > 49]
clean

quantum = [TaggedDocument(wordpunct_tokenize(c), [0]) for c in clean]

In [52]:
qtrain, qtest = train_test_split(quantum, test_size=0.25)

In [44]:
quantum[:5]

[TaggedDocument(words=['The', 'line', 'spectra', 'of', 'a', 'number', 'of', 'gases', '.', 'Each', 'spectrum', 'is', 'unique', 'and', 'allows', 'the', 'identification', 'of', 'the', 'elements', '.', '(', 'Ted', 'Kinsman', '/', 'Science', 'Source', ')'], tags=[0]),
 TaggedDocument(words=['2', '-', '1', 'Electromagnetic', 'Radiation2', '-', '2', 'The', 'Nature', 'of', 'Matter'], tags=[0]),
 TaggedDocument(words=['2', '-', '3', 'The', 'Atomic', 'Spectrum', 'of', 'Hydrogen2', '-', '4', 'The', 'Bohr', 'Model2', '-', '5', 'The', 'Quantum', 'Mechanical', 'Model', 'of', 'the', 'Atom'], tags=[0]),
 TaggedDocument(words=['2', '-', '6', 'Quantum', 'Numbers2', '-', '7', 'Orbital', 'Shapes', 'and', 'Energies2', '-', '8', 'Electron', 'Spin', 'and', 'the', 'Pauli', 'Principle'], tags=[0]),
 TaggedDocument(words=['2', '-', '9', 'Polyelectronic', 'Atoms2', '-', '10', 'The', 'History', 'of', 'the', 'Periodic', 'Table2', '-', '11', 'The', 'Aufbau', 'Principle', 'and', 'the', 'Periodic', 'Table2', '-', '12

In [53]:
train = strain + qtrain
test = stest + qtest

In [49]:
model = Doc2Vec(vector_size=100, window=3, min_count=2, epochs=40)

In [54]:
model.build_vocab(train)

In [55]:
model.train(train, total_examples=model.corpus_count, epochs=40)

In [61]:
model.wv.n_similarity(word_tokenize('balance the following chemical equation'), 
                             word_tokenize('balance equation'))

0.8905861

In [65]:
model.wv.n_similarity(word_tokenize('ground state orbital'), 
                      word_tokenize('unbalanced chemical equation'))

0.47139195

In [66]:
filepaths = []

for file in os.listdir('../data/external/texts/'):
    if 'pdf' in file:
        filepaths.append('../data/external/texts/'+file)
filepaths

['../data/external/texts/mcquarrie4.pdf',
 '../data/external/texts/silberberg8.pdf',
 '../data/external/texts/oxtoby7.pdf',
 '../data/external/texts/petrucci11.pdf',
 '../data/external/texts/_atkins6.pdf',
 '../data/external/texts/brown14.pdf',
 '../data/external/texts/chang12.pdf',
 '../data/external/texts/_gilbert2.pdf',
 '../data/external/texts/_mcmurry2.pdf',
 '../data/external/texts/zumdahl10.pdf',
 '../data/external/texts/_zumdahl2.pdf',
 '../data/external/texts/_burdge1.pdf',
 '../data/external/texts/tro4.pdf',
 '../data/external/texts/mcmurry7.pdf',
 '../data/external/texts/oxtoby8.pdf']

In [67]:
corpus = []
for f in filepaths:
    print(f'opening {f}')
    text = grab_text(f)
    corpus.append(text)

opening ../data/external/texts/mcquarrie4.pdf
'../data/external/texts/mcquarrie4.pdf' successfully opened!
opening ../data/external/texts/silberberg8.pdf
'../data/external/texts/silberberg8.pdf' successfully opened!
opening ../data/external/texts/oxtoby7.pdf
'../data/external/texts/oxtoby7.pdf' successfully opened!
opening ../data/external/texts/petrucci11.pdf
'../data/external/texts/petrucci11.pdf' successfully opened!
opening ../data/external/texts/_atkins6.pdf
'../data/external/texts/_atkins6.pdf' successfully opened!
opening ../data/external/texts/brown14.pdf
'../data/external/texts/brown14.pdf' successfully opened!
opening ../data/external/texts/chang12.pdf
'../data/external/texts/chang12.pdf' successfully opened!
opening ../data/external/texts/_gilbert2.pdf
'../data/external/texts/_gilbert2.pdf' successfully opened!
opening ../data/external/texts/_mcmurry2.pdf
'../data/external/texts/_mcmurry2.pdf' successfully opened!
opening ../data/external/texts/zumdahl10.pdf
'../data/externa

In [68]:
corpus = '\n\n\n\n'.join(corpus)

In [69]:
len(corpus)

60915170

In [102]:
pickle.dump(corpus, open('../data/processed/raw.p', 'wb'))

In [104]:
corpus[:50]

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGeneral Chemist'

In [107]:
def clean_textbook(text):
    clean = re.sub('\n\n', r'PPAARRAAGGRRAAPPHH', corpus)
    clean = re.sub('\n', r'', clean)
    clean = re.split('PPAARRAAGGRRAAPPHH', clean)
    clean = [TaggedDocument(wordpunct_tokenize(c), [i]) for i, c in enumerate(clean)\
             if len(c) > 49]
    return clean

In [70]:
clean = re.sub('\n\n', r'PPAARRAAGGRRAAPPHH', corpus)
clean = re.sub('\n', r'', clean)
# #     clean = re.sub('(\d+)\n\n', r'\1PPAARRAAGGRRAAPPHHJJOOIINN', clean)
# clean = re.sub('([A-Za-z]+)\-\n\n', 'PPAARRAAGGRRAAPPHH', clean)
# clean = re.sub('\s\n\n', 'PPAARRAAGGRRAAPPHH', clean)
# clean = re.sub('\n\n\n\n', 'PPAARRAAGGRRAAPPHH', clean)
# clean = re.sub('\n\n', ' ', clean)
# clean = re.sub('\-\n', '', clean)
# clean = re.sub('\n', ' ', clean)
# clean = re.sub('\t', ' ', clean)
# clean = re.sub('\s\s', ' ', clean)
# clean = re.sub('\-([a-zA-Z]+)', r'\1', clean)
# clean = re.sub('\ue060', 'INFINITY', clean)
# #     clean = re.sub('([A-Za-z]+)JJOOIINNPPAARRAAGGRRAAPPHH(.+)PPAARRAAGGRRAAPPHHJJOOIINN([a-z]+)', r'\1\3', clean)    
clean = re.split('PPAARRAAGGRRAAPPHH', clean)
clean = [TaggedDocument(wordpunct_tokenize(c), [i]) for i, c in enumerate(clean) if len(c) > 49]
clean

[TaggedDocument(words=['Donald', 'A', '.', 'McQuarrieUniversity', 'of', 'California', ',', 'Davis'], tags=[18]),
 TaggedDocument(words=['University', 'Science', 'Books20', 'Edgehill', 'RoadMill', 'Valley', ',', 'CA', '94941www', '.', 'uscibooks', '.', 'com'], tags=[24]),
 TaggedDocument(words=['Produced', 'by', 'Wilsted', '&', 'Taylor', 'Publishing', 'ServicesProject', 'Manager', ':', 'Jennifer', 'UhlichDevelopmental', 'Editor', ':', 'John', 'MurdzekCopy', 'Editor', ':', 'Jennifer', 'McClainEditorial', 'Assistance', ':', 'Nancy', 'Evans', ',', 'Andrew', 'Joron', ',', 'Antonia', 'AngressIllustrations', ':', 'George', 'Kelvin', 'and', 'Laurel', 'MullerBook', 'and', 'Cover', 'Design', ':', 'Yvonne', 'TsangComposition', ':', 'Yvonne', 'Tsang', ',', 'Laurel', 'Muller', ',', 'Hassan', 'Herz', ',', 'Lucy', 'Brank', ',', 'Jody', 'HansonProofreading', ':', 'Mervin', 'HansonPrinting', 'and', 'Binding', ':', 'Transcontinental'], tags=[25]),
 TaggedDocument(words=['ISBN', '978', '-', '1', '-', '89

In [71]:
pickle.dump(clean, open('../data/processed/tagged.p', 'wb'))

In [105]:
len(clean)

248197

In [73]:
model_1324 = Doc2Vec(vector_size=100, window=3, min_count=2, epochs=40)
model_1324.build_vocab(clean)
model_1324.train(clean, total_examples=model.corpus_count, epochs=model.epochs)

In [74]:
# pickle.dump(model_1324, open('../data/processed/model_1324.p', 'wb'))

In [5]:
model = pickle.load(open('../data/processed/model_1324.p', 'rb'))

In [6]:
model.wv.n_similarity(word_tokenize('balance the following chemical equation'), 
                             word_tokenize('balance equation'))

0.80299705

In [76]:
model_1324.wv.n_similarity(word_tokenize('balance the following chemical equation'), 
                      word_tokenize('electron configuration'))

0.42034274

In [77]:
model_1324.wv.n_similarity(word_tokenize('ground state electron'), 
                             word_tokenize('balance equation'))

0.44838914

In [78]:
model_1324.wv.n_similarity(word_tokenize('ground state electron'), 
                             word_tokenize('electron configuration'))

0.88627183

In [81]:
stoich_sentence = 'methane and oxygen react to form carbon dioxide and water. write the balanced chemical equation for this reaction.'

model_1324.wv.n_similarity(word_tokenize(stoich_sentence), 
                             word_tokenize('balance equation'))

0.66748023

In [82]:
model_1324.wv.n_similarity(word_tokenize(stoich_sentence), 
                             word_tokenize('electron configuration'))

0.43156946

In [84]:
quantum_sentence = "why is argon atomic radius smaller than that of krypton?"
model_1324.wv.n_similarity(word_tokenize(quantum_sentence), 
                             word_tokenize('balance equation'))

0.41321865

In [93]:
sentence = "redox reaction"
model_1324.wv.n_similarity(word_tokenize(sentence), 
                             word_tokenize('electron configuration'))

0.3454761

In [94]:
sentence = "redox reaction"
model_1324.wv.n_similarity(word_tokenize(sentence), 
                             word_tokenize('balanced equation'))

0.68762016

In [3]:
sentence = 'balance the following reaction'
model_1324.wv.n_similarity(word_tokenize(sentence), 
                             word_tokenize('stoichiometry balanced equation'))

NameError: name 'model_1324' is not defined

In [None]:
model_b = Doc2Vec(vector_size=300, window=3, min_count=2, epochs=15)
model_b.build_vocab(clean)
model_b.train(clean, total_examples=model.corpus_count, epochs=model.epochs)

## using only one textbook and increasing training epochs

In [106]:
zumdahl = grab_text('../data/external/texts/_zumdahl2.pdf')

'../data/external/texts/_zumdahl2.pdf' successfully opened!


In [108]:
zumdahl = clean_textbook(zumdahl)

In [110]:
model_z = Doc2Vec(vector_size=300, window=3, min_count=2, epochs=100)
model_z.build_vocab(zumdahl)
model_z.train(zumdahl, total_examples=model.corpus_count, epochs=model.epochs)

In [38]:
model_z.wv.n_similarity(word_tokenize('balance the following chemical equation'), 
                             word_tokenize('balance equation'))

NameError: name 'model_z' is not defined

In [112]:
model_z.wv.n_similarity(word_tokenize('balance the following chemical equation'), 
                      word_tokenize('electron configuration'))

0.41309083

In [37]:
stoich_sentence = 'methane and oxygen react to form carbon dioxide and water. write the balanced chemical equation for this reaction.'

model.wv.n_similarity(word_tokenize(stoich_sentence), 
                             word_tokenize('electron configuration'))

0.43156946

## classifying comparison

In [7]:
textbook = pd.read_csv('../data/processed/textbook-problems.csv')

In [23]:
balancing = list(textbook[textbook['balancing'] == 1]['text'])

In [13]:
balancing[:5]

['Balance the following chemical equations. Classify the reactions as decomposition, combination, single displacement, double-displacement, or combustion.  (a) CaCl2(aq ) + Na2SO4(aq )  CaSO4(s) + NaCl(aq ) (b) Ba(s) + HCl(aq ) BaCl2(aq ) + H2(g) (c) N2(g) + H2(g) NH3(g) (d) FeO(s) + CO(g) Fe(s) + CO2(g) (e) CaO(s) + H2O(l) Ca(OH)2(aq ) (f ) Na2CrO4(aq ) + Pb(NO3)2(aq )  PbCrO4(s) + NaNO3(aq ) (g) KI(aq ) + Cl2(g) KCl(aq ) + I2(aq ) (h) NaHCO3(s) Na2CO3(s) + CO2(g) + H2O(g)',
 'Balance the following equations and classify the reactions as decomposition, combination, single-displacement, double-displacement, or combustion.  (a) GaH3 + N(CH3)3 (CH3)3NGaH3 (b) Ca(s) + H2O(l) Ca(OH)2(aq ) + H2(g) (c) N2(g) + CaC2(s) C(s) + CaNCN(s) (d) N2(g) + Mg(s) Mg3N2(s) (e) NH4Cl(s) NH3(g) + HCl(g) (f) CaO(s) + SO3(g) CaSO4(s) (g) PCl5(g) PCl3(g) + Cl2(g) (h) Ca3N2(s) + H2O(l) Ca(OH)2(aq ) + NH3(g)',
 'When heated, nickel(II) carbonate undergoes a  decomposition reaction. Write a balanced equation to 

In [14]:
stoich_sentence = 'methane and oxygen react to form carbon dioxide and water. write the balanced chemical equation for this reaction.'

In [35]:
balscores = []
for b in balancing:
    balscores.append(model.wv.n_similarity(word_tokenize(stoich_sentence), wordpunct_tokenize(b.split('.')[0])))
np.mean(balscores)

KeyError: "word 'benches' not in vocabulary"

In [31]:
econfig = list(textbook[textbook['e_config'] == 1]['text'])

In [33]:
econfig

['Write the electron configuration for each of the following elements.  (a) silicon (b) lithium (c) magnesium',
 'Write the electron configuration for each of the  following elements.  (a) calcium (b) neon (c) vanadium',
 'Write the complete electron configurations for atoms of  the following elements. (a) Na  (b) Mn  (c) Se',
 'Write the complete electron configurations for atoms of  the following elements. (a) Sc  (b) As   (c) Ba',
 'What is wrong with the following electron configuration  for bromine, Br? 1s22s22p63s23p64s24d104p6',
 'What is wrong with the following electron configuration for rubidium, Rb? 1s22s22p63s23p64s23d103p64s2',
 'Identify the elements that have the following abbreviated electron configurations.  (a) [Ne] 3s23p 5  (b) [Ar] 4s23d7  (c) [Xe] 6s1',
 'Identify the elements that have the following abbreviated electron configurations.  (a) [Ar] 4s1  (b) [Xe] 6s2 4f 145d10  (c) [Kr] 5s24d105p5',
 'Write the abbreviated electron configuration for each of the follow

In [32]:
escores = []
for e in econfig:
    balscores.append(model.wv.n_similarity(word_tokenize(stoich_sentence), word_tokenize(e)))
np.mean(escores)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


nan