https://blog.cambridgespark.com/tutorial-build-your-own-embedding-and-use-it-in-a-neural-network-e9cde4a81296
https://github.com/materialsintelligence/mat2vec

install `pyemd`

# Building our Own Skip-gram Embeddings

## Import Libraries

In [4]:
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec, Doc2Vec
import multiprocessing

# https://stackoverflow.com/questions/33073972/
import os
import time
# os.environ['TIKA_SERVER_JAR'] = 'https://repo1.maven.org/maven2/org/apache/tika/tika-server/1.19/tika-server-1.19.jar'
import tika
from tika import parser

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

In [2]:
pd.set_option("display.max_colwidth", 100)

## Open Files

In [None]:
# zumdahl10 : Copyright 2018 Cengage Learning. All Rights Reserved. May not be copied, scanned, or duplicated, in whole or in part.


In [3]:
filepaths = []

for file in os.listdir('../data/external/texts/'):
    if 'pdf' in file:
        filepaths.append('../data/external/texts/'+file)

## Use Paragraphs as Documents

In [4]:
stop_words = stopwords.words('english')
stop_words += ['copyright',
               'cengage',
               'pearson',
               'learning',
               'may',
               'copied',
               'scanned',
               'duplicated',
               'chapter',
               'practice',
               'problem',
               'exercise',
               'review',
               'question',
               'figure',
               'follow']
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [5]:
def remove_stops(doc, i=0, progress=None):
    if progress:
        progress(i)
    doc = word_tokenize(doc)  # Split into words.
    doc = [w.lower() for w in doc if not w in stop_words]  # Remove stopwords.
    doc = [w for w in doc if w.isalpha()]  # Remove numbers and punctuation.
    return doc

In [6]:
def grab_text(file, sleep=0, counter=0):
    
    if counter == 2:
        pass

    raw = parser.from_file(file)
    status = raw['status']
    
    # if things go well
    if status == 200:
        print(f"'{file}' successfully opened!")
        return raw['content']
    
    # if things don't go well
    else:
        print(f'! ! ! ! error code {status} ! ! ! !')
        print(f'! ! ! ! trying again ! ! ! !')
        
        # we can usually get around other errors by waiting
        time.sleep(5)
        counter += 1
        return grab_text(file, counter=counter)

In [7]:
# https://stackoverflow.com/questions/44333462/

def make_paragraphs(document, sleep=0):
    
    clean = re.sub('([\.\?\!])\n\n([A-Z])', r'\1PPAARRAAGGRRAAPPHH\2', document)
    clean = re.sub('\n\n\n\n([a-z]+)', r'PPAARRAAGGRRAAPPHH', clean)
#     clean = re.sub('(\d+)\n\n', r'\1PPAARRAAGGRRAAPPHHJJOOIINN', clean)
    clean = re.sub('([A-Za-z]+)\-\n\n', 'PPAARRAAGGRRAAPPHH', clean)
    clean = re.sub('\s\n\n', 'PPAARRAAGGRRAAPPHH', clean)
    clean = re.sub('\n\n\n\n', 'PPAARRAAGGRRAAPPHH', clean)
    clean = re.sub('\n\n', ' ', clean)
    clean = re.sub('\-\n', '', clean)
    clean = re.sub('\n', ' ', clean)
    clean = re.sub('\t', ' ', clean)
    clean = re.sub('\s\s', ' ', clean)
    clean = re.sub('\-([a-zA-Z]+)', r'\1', clean)
    clean = re.sub('\ue060', 'INFINITY', clean)
#     clean = re.sub('([A-Za-z]+)JJOOIINNPPAARRAAGGRRAAPPHH(.+)PPAARRAAGGRRAAPPHHJJOOIINN([a-z]+)', r'\1\3', clean)    
    clean = re.split('PPAARRAAGGRRAAPPHH', clean)
    time.sleep(sleep)
    return clean

In [8]:
def progress(i):
    if (i+1) % 10000 == 0:
        print(f'{i + 1} out of {len(CLEANED)}')

In [9]:
RAW = []
CLEANED = []

for file in filepaths:
    print(f"'attempting to open '{file}'.")    
    text = grab_text(file)
    clean = make_paragraphs(text)
    RAW += [text]
    CLEANED += clean
    print(f"'{file}' complete!")
    print()

'attempting to open '../data/external/texts/mcquarrie4.pdf'.
'../data/external/texts/mcquarrie4.pdf' successfully opened!
'../data/external/texts/mcquarrie4.pdf' complete!

'attempting to open '../data/external/texts/silberberg8.pdf'.
'../data/external/texts/silberberg8.pdf' successfully opened!
'../data/external/texts/silberberg8.pdf' complete!

'attempting to open '../data/external/texts/oxtoby7.pdf'.
'../data/external/texts/oxtoby7.pdf' successfully opened!
'../data/external/texts/oxtoby7.pdf' complete!

'attempting to open '../data/external/texts/petrucci11.pdf'.
'../data/external/texts/petrucci11.pdf' successfully opened!
'../data/external/texts/petrucci11.pdf' complete!

'attempting to open '../data/external/texts/_atkins6.pdf'.
'../data/external/texts/_atkins6.pdf' successfully opened!
'../data/external/texts/_atkins6.pdf' complete!

'attempting to open '../data/external/texts/brown14.pdf'.
'../data/external/texts/brown14.pdf' successfully opened!
'../data/external/texts/brown14

## SAVE PICKLE

In [10]:
# pickle.dump(RAW, open('../data/raw_2.p', 'wb'))
# pickle.dump(CLEANED, open('../data/cleaned_2.p', 'wb'))

## OPEN PICKLE

In [3]:
RAW = pickle.load(open('../data/raw.p', 'rb'))
CLEANED = pickle.load(open('../data/cleaned.p', 'rb'))

In [11]:
CLEANER = [remove_stops(p, i, progress) for i, p in enumerate(CLEANED)]
pickle.dump(CLEANER, open('../data/cleaner_2.p', 'wb'))

# CLEANER = pickle.load(open('../data/cleaner.p', 'rb'))

10000 out of 127601
20000 out of 127601
30000 out of 127601
40000 out of 127601
50000 out of 127601
60000 out of 127601
70000 out of 127601
80000 out of 127601
90000 out of 127601
100000 out of 127601
110000 out of 127601
120000 out of 127601


## Make Skip-gram Embedding

In [6]:
RAW[0]

"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nGeneral Chemistry\nfourth edition\n\nDonald A. McQuarrie\nUniversity of California, Davis\n\nPeter A. Rock\nUniversity of California, Davis\n\nEthan B. Gallogly\nSanta Monica College\n\nIllustrations by\nGeorge Kelvin and Laurel Muller\n\nUniversity Science Books\nwww.uscibooks.com\n\n\n\nUniversity Science Books\n20 Edgehill Road\nMill Valley, CA 94941\nwww.uscibooks.com\n\nProduced by Wilsted & Taylor Publishing Services\nProject Manager: Jennifer Uhlich\nDevelopmental Editor: John Murdzek\nCopy Editor: Jennifer McClain\nEditorial Assistance: Nancy Evans, Andrew Joron, Antonia Angress\nIllustrations: George Kelvin and Laurel Muller\nBook and Cover Design: Yvonne Tsang\nComposition: Yvonne Tsang, Laurel Muller, Hassan Herz, Lucy Brank, Jody Hanson\nProofreading: Mervin Hanson\nPrinting and Binding: Transcontinental\n\nThis book is printed on acid-free paper.\n\nCopyright © 2011 by University Science Books\n\nISBN 9

In [245]:
EMB_DIM = 100
model = Word2Vec(CLEANER, size=EMB_DIM, window=5, min_count=5, negative=5, iter=20, sg=1)


In [259]:
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

In [246]:
words = list(model.wv.vocab)
len(words)

18484

In [45]:
# model = pickle.dump(model, open('../model/model.p', 'wb+'))
# w2v = pickle.dump(w2v, open('../model/w2v.p', 'wb+'))

model = pickle.load(open('../models/model.p', 'rb'))

In [46]:
word_vectors = model.wv
result = word_vectors.most_similar(
    positive=['ionic'], 
    negative=[], topn=10)
result

[('covalent', 0.6917945742607117),
 ('naf', 0.6290937066078186),
 ('cations', 0.6068678498268127),
 ('lif', 0.60612952709198),
 ('binary', 0.5888495445251465),
 ('anions', 0.5879063010215759),
 ('ions', 0.5794575214385986),
 ('ionion', 0.5753878355026245),
 ('net', 0.5718735456466675),
 ('licl', 0.5708036422729492)]

## Quick Assessment of W2V Performance

In [16]:
probs = pd.read_csv('../data/processed/textbook-problems.csv')

In [17]:
for i in probs.index:
    probs.loc[i, 'distance'] = word_vectors.wmdistance(probs.loc[i, 'txt'], 'balanced equations')
probs.sort_values(by='distance')[:50]

Unnamed: 0,filepath,number,text,txt,balancing,e_config,distance
227,tro-7,32,"For each chemical equation (which may or may not be balanced), list the number of each type of ...","['for', 'chemical', 'equation', 'balanced', 'list', 'number', 'type', 'atom', 'side', 'equation'...",1,0,0.548691
226,tro-7,31,"For each chemical equation (which may or may not be balanced), list the number of each type of ...","['for', 'chemical', 'equation', 'balanced', 'list', 'number', 'type', 'atom', 'side', 'equation'...",1,0,0.550729
201,tro-7,6,"To balance a chemical equation, adjust the ______ as nec essary to make the numbers of each type...","['to', 'balance', 'chemical', 'equation', 'adjust', 'nec', 'essary', 'make', 'numbers', 'type', ...",0,0,0.606747
229,tro-7,34,Consider the unbalanced chemical equation. Al(s) + Cl2( g) ¡ AlCl3(s) A student tries to balanc...,"['consider', 'unbalanced', 'chemical', 'equation', 'al', 'g', 'a', 'student', 'tries', 'balance'...",1,0,0.638875
279,tro-7,84,Complete and balance each gas evolution reaction. (a) HClO4(aq) + K2CO3(aq) ¡ (b) HC2H3O2(aq) + ...,"['complete', 'balance', 'gas', 'evolution', 'reaction', 'aq', 'aq', 'b', 'aq', 'aq', 'c', 'aq', ...",1,0,0.666013
12,bauer-5,13,Complete and balance the equations for each of the following single-displacement reactions. (a)...,"['complete', 'balance', 'equations', 'following', 'reactions', 'zn', 'aq', 'b', 'na']",1,0,0.680883
22,bauer-5,23,Write a balanced equation to describe any precipitation reaction that should occur when the fol...,"['write', 'balanced', 'equation', 'describe', 'precipitation', 'reaction', 'occur', 'following',...",1,0,0.68117
28,bauer-5,29,Aqueous calcium chloride reacts with aqueous potassium carbonate in a double-displacement reacti...,"['aqueous', 'calcium', 'chloride', 'reacts', 'aqueous', 'potassium', 'carbonate', 'reaction', 'w...",1,0,0.685718
24,bauer-5,25,Complete and balance the equation for each of the following double-displacement reactions. (a) ...,"['complete', 'balance', 'equation', 'following', 'reactions', 'aq', 'b', 'aq', 'aq']",1,0,0.69071
23,bauer-5,24,Write a balanced equation to describe any precipitation reaction that should occur when the fol...,"['write', 'balanced', 'equation', 'describe', 'precipitation', 'reaction', 'occur', 'following',...",1,0,0.69656


In [57]:
test_balance = 'methane reacts with oxygen to form carbon dioxide and water vapor. write a balanced chemical equation for this reaction.'
test_electron = 'write the full electron configuration for krypton.'

In [62]:
test_balance = sent_tokenize(test_balance)

In [66]:
balancing_distances = []
for b in test_balance:
    for t in list(probs[probs['balancing'] == 1]['text']):
        balancing_distances.append(word_vectors.wmdistance(t, b))
    print(np.mean(balancing_distances))

0.9121886052672199
0.7948174112719066


In [67]:
balancing_distances = []
for b in test_balance:
    for t in list(probs[probs['e_config'] == 1]['text']):
        balancing_distances.append(word_vectors.wmdistance(t, b))
    print(np.mean(balancing_distances))

0.7745844907933134
0.7226932281553443


In [59]:
balancing_distances = []
for t in list(probs[probs['e_config'] == 1]['text']):
    balancing_distances.append(word_vectors.wmdistance(t, test_balance))
np.mean(balancing_distances)

0.6455036256908827

In [58]:
balancing_distances = []
for t in list(probs[probs['balancing'] == 1]['text']):
    balancing_distances.append(word_vectors.wmdistance(t, test_electron))
np.mean(balancing_distances)

1.1028356801775105

In [None]:
balancing_distances = []
for t in list(probs[probs['e_config'] == 1]['text']):
    balancing_distances.append(word_vectors.wmdistance(t, test_balance))
np.mean(balancing_distances)

In [23]:
probs.sort_values(by='distance', ascending=False)[:50]

Unnamed: 0,filepath,number,text,txt,balancing,e_config,distance
178,bauer-7,102,Why are krypton atoms larger than argon atoms?,"['why', 'krypton', 'atoms', 'larger', 'argon', 'atoms']",0,0,1.692802
309,tro-9,16,Bohr orbits have fixed ______ and fixed ______.,"['bohr', 'orbits', 'fixed', 'fixed']",0,0,1.681234
399,zumdahl-11,17,Which orbital is the first to be filled in any atom? Why?,"['which', 'orbital', 'first', 'filled', 'atom', 'why']",0,0,1.559453
179,bauer-7,103,"For each pair, identify the larger atom or ion. (a) Mg or Mg2+ (b) P or P3–","['for', 'pair', 'identify', 'larger', 'atom', 'ion', 'mg', 'b', 'p']",0,0,1.531878
208,tro-7,13,"What are the solubility rules, and how are they useful?","['what', 'solubility', 'rules', 'useful']",0,0,1.526772
181,bauer-7,105,"Which is the larger ion, K+ or Ca2+? Explain.","['which', 'larger', 'ion', 'explain']",0,0,1.520293
170,bauer-7,94,"Which ionization energy (IE1, IE2, or IE3) corresponds to the following process? Al2+(g) Al3+(g)...","['which', 'ionization', 'energy', 'corresponds', 'following', 'process', 'g', 'g']",0,0,1.496312
209,tro-7,14,What is a precipitation reaction? Provide an example and identify the precipitate.,"['what', 'precipitation', 'reaction', 'provide', 'example', 'identify', 'precipitate']",0,0,1.480344
180,bauer-7,104,"For each pair, identify the larger atom or ion. (a) F or F – (b) Sr or Sr2+","['for', 'pair', 'identify', 'larger', 'atom', 'ion', 'f', 'f', 'b', 'sr']",0,0,1.46224
50,bauer-5,51,What are spectator ions?,"['what', 'spectator', 'ions']",0,0,1.444406


In [24]:
for i in probs.index:
    probs.loc[i, 'distance'] = word_vectors.wmdistance(probs.loc[i, 'txt'], 'write the electron configuration')
probs.sort_values(by='distance')[:50]

Unnamed: 0,filepath,number,text,txt,balancing,e_config,distance
363,tro-9,70,Write the electron configuration for each ion. What do all of the electron configurations have i...,"['write', 'electron', 'configuration', 'ion', 'what', 'electron', 'configurations', 'common', 'b...",0,1,0.462223
367,tro-9,74,Identify what is wrong with each electron configuration and write the correct ground state (or l...,"['identify', 'wrong', 'electron', 'configuration', 'write', 'correct', 'ground', 'state', 'lowes...",0,1,0.465198
364,tro-9,71,Write the electron configuration for each ion. What do all of the electron configurations have i...,"['write', 'electron', 'configuration', 'ion', 'what', 'electron', 'configurations', 'common', 'f...",0,1,0.483336
325,tro-9,32,Write electron configurations for each transition metal. (a) Zn (b) Cu (c) Zr (d) Fe,"['write', 'electron', 'configurations', 'transition', 'metal', 'zn', 'b', 'cu', 'c', 'zr', 'fe']",0,1,0.499207
368,tro-9,75,Identify what is wrong with each electron configuration and write the correct ground state (or l...,"['identify', 'wrong', 'electron', 'configuration', 'write', 'correct', 'ground', 'state', 'lowes...",0,1,0.505725
153,bauer-7,77,How does the electron configuration for a cation differ from that of its neutral atom? How is i...,"['how', 'electron', 'configuration', 'cation', 'differ', 'neutral', 'atom', 'how', 'similar']",0,0,0.524373
145,bauer-7,69,How do you identify the valence electrons from an electron configuration?,"['how', 'identify', 'valence', 'electrons', 'electron', 'configuration']",0,1,0.527925
185,bauer-7,109,Write the abbreviated electron configuration for each of the following elements. (a) Bi (b) Rn...,"['write', 'abbreviated', 'electron', 'configuration', 'following', 'elements', 'bi', 'b', 'rn', ...",0,1,0.549422
459,zumdahl-11,77,A student writes the electron configuration of carbon (Z ! 6) as 1s32s3. Explain to him what is ...,"['a', 'student', 'writes', 'electron', 'configuration', 'carbon', 'z', 'explain', 'wrong', 'conf...",0,0,0.559342
320,tro-9,27,Write full electron configurations for each element. (a) N (b) Mg (c) Ar (d) Se ELECTRON CONFIGU...,"['write', 'full', 'electron', 'configurations', 'element', 'n', 'b', 'mg', 'c', 'ar', 'se', 'ele...",0,1,0.56293


In [25]:
probs.sort_values(by='distance', ascending=False)[:50]

Unnamed: 0,filepath,number,text,txt,balancing,e_config,distance
246,tro-7,51,Balance each chemical equation. (a) BaO2(s) + H2SO4(aq) ¡ BaSO4(s) + H2O2(aq) (b) Co(NO3)3(aq) +...,"['balance', 'chemical', 'equation', 'aq', 'aq', 'b', 'co', 'aq', 'aq', 'aq', 'c', 'l', 'lioh', '...",1,0,1.880992
518,zumdahl-6,45,Balance each of the following chemical equations. Cl2(g) ! KBr(aq) S Br2(l) ! KCl(aq) Cr(s) ! O2...,"['balance', 'following', 'chemical', 'equations', 'g', 'kbr', 'aq', 's', 'l', 'kcl', 'aq', 'cr',...",1,0,1.850931
486,zumdahl-6,13,Balance each of the following chemical equations. Fe3O4(s) ! H2(g) S Fe(l) ! H2O(g) K2SO4(aq) ! ...,"['balance', 'following', 'chemical', 'equations', 'g', 's', 'fe', 'l', 'g', 'aq', 'aq', 's', 'kc...",1,0,1.784565
244,tro-7,49,Balance each chemical equation. (a) Na2S(aq) + Cu(NO3)2(aq) ¡ NaNO3(aq) + CuS(s) (b) HCl(aq) + O...,"['balance', 'chemical', 'equation', 'aq', 'cu', 'aq', 'aq', 'cus', 'b', 'hcl', 'aq', 'g', 'l', '...",1,0,1.774573
484,zumdahl-6,11,Balance each of the following chemical equations. K2SO4(aq) ! BaCl2(aq) S BaSO4(s) ! KCl(aq) Fe(...,"['balance', 'following', 'chemical', 'equations', 'aq', 'aq', 's', 'kcl', 'aq', 'fe', 'g', 's', ...",1,0,1.715804
489,zumdahl-6,16,Balance each of the following chemical equations. Ba(NO3)2(aq) ! Na2CrO4(aq) S BaCrO4(s) ! NaNO3...,"['balance', 'following', 'chemical', 'equations', 'ba', 'aq', 'aq', 's', 'aq', 'aq', 'aq', 's', ...",1,0,1.714423
488,zumdahl-6,15,Balance each of the following chemical equations. KO2(s) ! H2O(l) S KOH(aq) ! O2(g) ! H2O2(aq) F...,"['balance', 'following', 'chemical', 'equations', 'l', 's', 'koh', 'aq', 'g', 'aq', 'aq', 's', '...",1,0,1.67671
482,zumdahl-6,9,Balance each of the following chemical equations. FeCl3(aq) ! KOH(aq) S Fe(OH)3(s) ! KCl(aq) Pb(...,"['balance', 'following', 'chemical', 'equations', 'aq', 'koh', 'aq', 's', 'fe', 'oh', 'kcl', 'aq...",1,0,1.65533
519,zumdahl-6,46,Balance each of the following chemical equations. Cl2(g) ! KI(aq) S KCl(aq) ! I2(s) CaC2(s) ! H2...,"['balance', 'following', 'chemical', 'equations', 'g', 'ki', 'aq', 's', 'kcl', 'aq', 'l', 's', '...",1,0,1.648158
247,tro-7,52,Balance each chemical equation. (a) MnO2(s) + HCl(aq) ¡ Cl2( g) + MnCl2(aq) + H2O(l ) (b) CO2(g)...,"['balance', 'chemical', 'equation', 'hcl', 'aq', 'g', 'aq', 'l', 'b', 'g', 'l', 'ca', 'aq', 'c',...",1,0,1.643856


In [31]:
wordnet = WordNetLemmatizer()

In [21]:
def problem_comparison(problem):
    distance_b = word_vectors.wmdistance(problem, 'balanced equation')
    distance_e = word_vectors.wmdistance(problem, 'electron configuration')
    print(distance_b)
    print(distance_e)
    if distance_b < distance_e:
        return 'balanced equation'
    else:
        return 'electron configuration'

In [37]:

problem_comparison(problem)

0.8441398172023028
0.960134549432326


'balanced equation'

In [32]:
word_tokenize(problem)

['balance', 'equation']

In [34]:
wordnet.lemmatize('following')

'following'

0.7953042384676635
1.0937212322601568


'balanced equation'

In [44]:
problem = 'complete and balance each acid-base reaction'
new_problem = ' '.join([wordnet.lemmatize(w, pos='v') for w in word_tokenize(problem) if w.isalpha()])
problem_comparison(new_problem)

0.7790875674722784
1.165993943178571


'balanced equation'