## Test STEM Question Classification

Load test data

In [1]:
# Make the project's `src` directory importable so that `helper` can be found.
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
# NOTE(review): this is an absolute, machine-specific path; `import helper`
# will fail on any other machine unless it is adjusted — consider deriving it
# relative to the repository instead.
sys.path.insert(1, '/Users/jahan/workspace/stem_parser/src')

import helper

In [44]:
import random
import fasttext
from sklearn.metrics import confusion_matrix

In [4]:
# Held-out test files, keyed by the subject label attached to their examples.
filenames = {
    'math': '../data/math_test.txt',
    'phys': '../data/phys_test.txt',
    'chem': '../data/chem_test.txt',
}

# Keep the per-subject path variables available as well.
math_fn, phys_fn, chem_fn = (filenames[subject] for subject in ('math', 'phys', 'chem'))

In [5]:
# Load files
raw_data_test = []
for label, fn in filenames.items():
    data = helper.load_data(fn, label)
    raw_data_test += data

In [6]:
# Sanity check: total number of test examples loaded across all files.
len(raw_data_test)

63

In [7]:
# Peek at one example; per the recorded output each item is a (text, label) tuple.
raw_data_test[0]

('6.334 * 104 = 0.0006334 0.06334 6334 63340 633400', 'math')

In [9]:
# Shuffle the test examples (they were loaded file by file, i.e. grouped by
# label). A dedicated, seeded generator keeps the resulting order — and every
# sample displayed further down — reproducible across kernel restarts, without
# altering the global `random` state.
random.Random(42).shuffle(raw_data_test)

## Train model

In [10]:
def load_train_data(filenames):
    """Load labelled examples from several files into one flat list.

    Args:
        filenames: mapping of label -> path of the file holding that label's
            examples.

    Returns:
        A list containing the items returned by ``helper.load_data(path, label)``
        for every entry of ``filenames``, concatenated in the mapping's
        iteration order.
    """
    return [
        example
        for label, path in filenames.items()
        for example in helper.load_data(path, label)
    ]

In [14]:
# Build the fastText training file from the STEM examples plus an equally
# sized sample of generic ("other") questions.
#
# All shuffling goes through one seeded generator so that the sampled "other"
# subset and the example order — and therefore the trained model and every
# result below — are reproducible on Restart & Run All.
SEED = 42
rng = random.Random(SEED)

math_fn = '../data/math.txt'
phys_fn = '../data/phys.txt'
chem_fn = '../data/chem.txt'
filenames = {'math': math_fn, 'phys': phys_fn, 'chem': chem_fn}
raw_data = load_train_data(filenames)
print(raw_data[0])
print(raw_data[1])
print(raw_data[-1])
print("Len of stem data: {}".format(len(raw_data)))
rng.shuffle(raw_data)
other_data = helper.load_quora_data('../data/others.txt')
print(other_data[0])
print(other_data[-1])
print(len(other_data))
# Shuffle before slicing so the "other" subset is a random sample.
rng.shuffle(other_data)
# Add as many "other" examples as there are STEM examples in total.
raw_data += other_data[:len(raw_data)]
# Mix the STEM and "other" examples together.
rng.shuffle(raw_data)
print(len(raw_data))
# Create FT dataset
train_fn = '../data/stem.train'
helper.create_ft_data(raw_data, train_fn)

('1. An instrument store gives a 10% discount to all students off the original cost of an instrument. During a back to school sale an additional 15% is taken off the discounted price. Julie, a student at the local high school, purchases a flute for $306. How much did it originally cost? 1. 325 2.375 3. 400 4. 408 5. 425', 'math')
('If y(x-1)=z then x= 1. y-z 2. z/y + 1 3. y(z-1) 4. z(y-1) 5. 1-zy', 'math')
('What is the molarity of a sodium hydroxide solution if 50.0 mL of the solution requires 38.6 mL of 0.0976 M HCl for titration?', 'chem')
Len of stem data: 121
('What is the step by step guide to invest in share market in india?', 'other')
('What makes a good programmer great?', 'other')
1004
242


In [18]:
# Hyper-parameters for the supervised fastText classifier, gathered in one
# place so they are easy to spot and tune.
ft_params = dict(lr=1.0, epoch=25, wordNgrams=2)
model = fasttext.train_supervised(input=train_fn, **ft_params)

# Persist the trained classifier to disk.
model.save_model("../models/model_stem.bin")

Read 0M words
Number of words:  1865
Number of labels: 4
Progress: 100.0% words/sec/thread:  449658 lr:  0.000000 avg.loss:  0.299530 ETA:   0h 0m 0s


## Test model

In [19]:
# Smoke test with an everyday, non-STEM question; per the recorded output,
# predict() returns a (labels, probabilities) pair.
model.predict("Which baking dish is best to bake a banana bread ?")

(('__label__other',), array([0.9270876]))

In [22]:
# Geometry question; k=3 requests the three highest-scoring labels.
# NOTE: in the recorded output 'math' is not among the top three labels.
model.predict("The circumference of a circle is 30. What is its area? 15pi 225pi 400pi 900pi 3000pi", k=3)

(('__label__phys', '__label__chem', '__label__other'),
 array([0.5274325 , 0.41930121, 0.04156993]))

In [23]:
# Multiple-choice question about sound intensity (decibels).
test = """An intensity of 60 decibels is ___ times as intense as an intensity of 30 decibels. A. 2 B. 30 C. 60 D. 90 E. 1000"""
model.predict(test)

(('__label__phys',), array([0.77218282]))


In [24]:
# Multiple-choice question about a flame test for copper.
test = "In a flame test, the presence of copper in a solution is evident by what color flame? Is the flame w) red x) orange y) indigo z) blue-green"
model.predict(test)

(('__label__chem',), array([0.74388403]))


In [25]:
# Polynomial-root question (exponents appear as plain digits in the text).
test = "Compute the largest root of x4 − x3 − 5x2 + 2x + 6."
model.predict(test)

(('__label__math',), array([0.99200779]))


## Test with more test examples

In [46]:
# Run the classifier over every held-out example, recording the gold label,
# the top predicted label and the probability of that prediction.
true_labels, preds, probs = [], [], []

for txt, label in raw_data_test:
    top_labels, top_probs = model.predict(txt)
    true_labels.append(label)
    # Predicted labels carry a '__label__' prefix; drop it so they are
    # comparable with the gold labels.
    preds.append(top_labels[0].replace('__label__', ''))
    probs.append(top_probs[0])

In [47]:
# First five gold labels.
true_labels[:5]

['phys', 'math', 'math', 'chem', 'phys']

In [48]:
# First five predicted labels, aligned index-by-index with true_labels.
preds[:5]

['phys', 'phys', 'chem', 'chem', 'phys']

In [49]:
# Probability of the top prediction for each of the first five examples.
probs[:5]

[0.8561019897460938,
 0.8574915528297424,
 0.5558924078941345,
 0.34417781233787537,
 0.7424158453941345]

In [50]:
# Last five gold labels.
true_labels[-5:]

['math', 'math', 'chem', 'phys', 'phys']

In [51]:
# Last five predicted labels, aligned index-by-index with true_labels.
preds[-5:]

['chem', 'phys', 'chem', 'phys', 'phys']

## Confusion matrix

`C`: ndarray of shape (n_classes, n_classes).
Confusion matrix whose i-th row and j-th column entry indicates the number of samples with true label being i-th class and predicted label being j-th class.

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [52]:
# Rows are true classes and columns are predicted classes, both in this order.
class_order = ["math", "phys", "chem", "other"]
confusion_matrix(true_labels, preds, labels=class_order)

array([[ 3,  6,  6,  1],
       [ 0, 33,  0,  1],
       [ 0,  1, 11,  1],
       [ 0,  0,  0,  0]])

## Math tests

https://www.maths.ox.ac.uk/study-here/undergraduate-study/practice-problems

In [57]:
# Problem text with hard line breaks; the newlines are replaced by spaces so
# the model receives a single line of text.
math_test = """Find the radius and centre of the circle described by the equation
x
2 + y
2 − 2x − 4y + 1 = 0
by writing it in the form (x − a)
2 + (y − b)
2 = c
2
for suitable a, b and c.""".replace('\n', ' ')

model.predict(math_test)


(('__label__math',), array([0.91819733]))

In [58]:
# Calculus question (second derivative).
math_test = "If y = x2 ln x, find d2y/dx2."
model.predict(math_test)

(('__label__math',), array([0.7456615]))

In [59]:
# Proof question about a trigonometric identity.
math_test = "Given that eiθ = cos θ + i sin θ, prove that cos(A + B) = cos A cos B − sin A sin B."

model.predict(math_test)

(('__label__math',), array([0.84309477]))

## Physics tests

https://www.physics.harvard.edu/undergrad/problems

In [62]:
# Optics problem; newlines are replaced by spaces so the model receives a
# single line of text.
phys_test = """Assuming that the index of refraction of water is 4/3 and that raindrops are spherical, show that the location of a rainbow is approximately 42◦ above the line from
the sun to you. If you see a double rainbow, what is the angle of the second one?
Even triple rainbows are possible, although they are difficult to see; where is the
third one?""".replace('\n', ' ')

# k=2 requests the two highest-scoring labels.
# NOTE: in the recorded output 'chem' ranks above 'phys' for this question.
model.predict(phys_test, k=2)

(('__label__chem', '__label__phys'), array([0.51262999, 0.41493207]))

In [63]:
# Kinematics problem; newlines are replaced by spaces so the model receives a
# single line of text.
phys_test = """A rubber band with initial length L has one end tied to a wall. At t = 0, the other
end is pulled away from the wall at speed V (assume that the rubber band stretches
uniformly). At the same time, an ant located at the end not attached to the wall
begins to crawl toward the wall, with speed u relative to the band. Will the ant
reach the wall? If so, how much time will it take?""".replace('\n', ' ')

# k=2 requests the two highest-scoring labels.
model.predict(phys_test, k=2)

(('__label__phys', '__label__chem'), array([0.80879223, 0.1318285 ]))

In [64]:
# Mechanics problem.
# NOTE(review): the backtick after "length" looks like a symbol that was lost
# when the text was copied (possibly the ladder's length variable) — confirm
# against the original problem statement.
phys_test = """A ladder of length ` and uniform mass density stands on a frictionless floor and
leans against a frictionless wall. It is initially held motionless, with its bottom end
an infinitesimal distance from the wall. It is then released, whereupon the bottom
end slides away from the wall, and the top end slides down the wall. When it loses
contact with the wall, what is the horizontal component of the velocity of the center
of mass?"""

# Newlines are replaced by spaces at prediction time; k=2 requests the two
# highest-scoring labels.
model.predict(phys_test.replace('\n', ' '), k=2)

(('__label__phys', '__label__chem'), array([0.68361628, 0.28976056]))

In [65]:
# Mechanics problem (two stacked balls dropped together).
phys_test = """A tennis ball with (small) mass m2 sits on top of a basketball with (large)
mass m1. The bottom of the basketball is a height h above the ground, and
the bottom of the tennis ball is a height h + d above the ground. The balls
are dropped. To what height does the tennis ball bounce?"""

# Newlines are replaced by spaces at prediction time; k=2 requests the two
# highest-scoring labels.
model.predict(phys_test.replace('\n', ' '), k=2)

(('__label__phys', '__label__chem'), array([0.69385701, 0.28722027]))

## Chem problems

https://en.wikipedia.org/wiki/List_of_unsolved_problems_in_chemistry

In [67]:
# Chemistry questions to classify.
chems = [
    "Can the transition temperature of high-temperature superconductors be brought up to room temperature?",
    "What is the origin of homochirality in biomolecules?",
    "Why are accelerated kinetics observed for some organic reactions at the water-organic interface?",
    "Do replacement reactions of aryl diazonium salts (dediazotizations) predominantly undergo SN1 or a radical mechanism?",
    "Is it possible to predict the secondary, tertiary and quaternary structure of a polypeptide sequence based solely on the sequence and environmental information? Inverse protein-folding problem: Is it possible to design a polypeptide sequence which will adopt a given structure under certain environmental conditions?[1][5] This has been achieved for several small globular proteins in recent years.[6] In 2020, it was announced that Google's AlphaFold, a neural network based on DeepMind artificial intelligence, is capable of predicting a protein's final shape based solely on its amino-acid chain with an accuracy of around 90% on a test sample of proteins used by the team.",
]

# Print the two highest-scoring labels for each question, one line per question.
for question in chems:
    top_two = model.predict(question.replace('\n', ' '), k=2)
    print(top_two)

(('__label__other', '__label__chem'), array([0.76867008, 0.18265003]))
(('__label__other', '__label__chem'), array([0.60215354, 0.39508656]))
(('__label__other', '__label__chem'), array([0.83355796, 0.11246318]))
(('__label__other', '__label__chem'), array([0.82129347, 0.13756821]))
(('__label__phys', '__label__other'), array([0.48167232, 0.24768554]))
