# MathQA Operator Classifier

#### Imports

In [1]:
from enum import Enum
import os
import pandas as pd
import math
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForSequenceClassification
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset
import warnings
import pickle

#### Constants

In [14]:
# --- Run mode ---------------------------------------------------------------
# True  -> fine-tune the pretrained checkpoint and save it.
# False -> load the previously saved model and only run prediction/evaluation.
TRAINING_MODE = False

# --- Model / tokenizer ------------------------------------------------------
# Author's note: a more optimized version of roberta obtaining 95% of its performance.
MODEL_TYPE = "distilroberta-base"
MAX_LENGTH = 392  # passed to the tokenizer as max_length (padding + truncation)
NUM_MASK = "<num>"

# --- Paths ------------------------------------------------------------------
DATA_PATH = "./dataset/"  # directory holding <split>.csv for every name in SET_NAMES
SET_NAMES = ["train", "validation", "test"]
WORKING_DIR = "TEMP/"  # prefix of the Trainer output directory
FINAL_DIR = "pickle/"  # where the pickled predictions are written

class Op(Enum):
    """Arithmetic operators the classifier can predict; each value is the operator symbol."""

    ADD = "+"
    SUB = "-"
    MULT = "*"
    DIV = "/"
    POW = "^"
    
class Const(Enum):
    """Named numeric constants; each value is the constant's string token.

    Declaration order matters: the lookup tables built from this enum pair
    members with numbers and ids by position.
    """

    CONST_NEG_1 = "const_neg_1"  # added by the notebook author
    CONST_0_25 = "const_0_25"
    CONST_0_2778 = "const_0_2778"
    CONST_0_33 = "const_0_33"
    CONST_0_3937 = "const_0_3937"
    CONST_1 = "const_1"
    CONST_1_6 = "const_1_6"
    CONST_2 = "const_2"
    CONST_3 = "const_3"
    CONST_PI = "const_pi"
    CONST_3_6 = "const_3_6"
    CONST_4 = "const_4"
    CONST_5 = "const_5"
    CONST_6 = "const_6"
    CONST_10 = "const_10"
    CONST_12 = "const_12"
    CONST_26 = "const_26"
    CONST_52 = "const_52"
    CONST_60 = "const_60"
    CONST_100 = "const_100"
    CONST_180 = "const_180"
    CONST_360 = "const_360"
    CONST_1000 = "const_1000"
    CONST_3600 = "const_3600"

# Numeric value of every Const member, listed in the same order as the enum declaration.
values = [-1, 0.25, 0.2778, 0.33, 0.3937, 1, 1.6, 2, 3, math.pi, 3.6, 4, 5, 6, 10, 12, 26, 52, 60, 100, 180, 360, 1000, 3600]

# zip() silently stops at the shorter input, so a constant added to only one of
# the two places would shift or drop entries without any error. Fail loudly instead.
assert len(values) == len(Const), 'values must list exactly one number per Const member'

# Constant token (e.g. 'const_pi') -> numeric value.
const2val = {member.value: number for member, number in zip(Const, values)}

# Operator symbol / constant token -> class index, in declaration order.
op2id = {member.value: idx for idx, member in enumerate(Op)}
const2id = {member.value: idx for idx, member in enumerate(Const)}

## Loading the data

In [3]:
# Read one CSV per split into a {split name: DataFrame} mapping.
data = {
    split: pd.read_csv(f'{DATA_PATH}{split}.csv')
    for split in SET_NAMES
}

This function converts the set of operators used in a problem into a single multi-hot encoded vector (one slot per operator, several of which may be 1), used as the target for multi-label classification

In [4]:
def onehot_op(data, op_map=None):
    """Encode each problem's operators as a multi-hot label vector.

    Parameters
    ----------
    data : object with an ``ops`` attribute
        ``data.ops`` is iterated; each item is either the string form of a
        Python literal collection of operator symbols (e.g. ``"['+', '*']"``)
        or an already-parsed collection.
    op_map : dict, optional
        Operator symbol -> column index. Defaults to the module-level ``op2id``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data.ops), len(op_map))`` holding 1.0 in the
        columns of the operators used by each problem and 0.0 elsewhere.

    Raises
    ------
    KeyError
        If a problem references an operator missing from ``op_map``.
    ValueError, SyntaxError
        If a string entry is not a plain Python literal.
    """
    import ast

    if op_map is None:
        op_map = op2id

    op_sets = list(data.ops)
    # Pre-allocating keeps the result 2-D even when there are no rows.
    labels = np.zeros((len(op_sets), len(op_map)))
    for row, op_set in enumerate(op_sets):
        # literal_eval only parses literals, so a crafted cell in the CSV
        # cannot execute arbitrary code the way eval() would.
        ops = ast.literal_eval(op_set) if isinstance(op_set, str) else op_set
        columns = np.asarray([op_map[op] for op in ops], dtype=int)
        labels[row, columns] = 1
    return labels

Getting only the necessary columns from the data (text/label)

In [5]:
# Keep only the problem text and its operator label vector for every split.
# NOTE: this rebinds `data` from DataFrames to Datasets, so the cell cannot be
# re-run without first re-running the CSV loading cell.
data = {
    split: Dataset.from_dict({
        'text': data[split]['problem'],
        'labels': onehot_op(data[split]),
    })
    for split in SET_NAMES
}

# Have every split return torch tensors.
for split_name in ('train', 'validation', 'test'):
    data[split_name].set_format('torch')

## Tokenization

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE)


def tokenization(items):
    """Tokenize a batch's 'text' entries, padded/truncated to MAX_LENGTH tokens."""
    return tokenizer(
        items['text'],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )


# Tokenize every split in batches and drop the raw text column afterwards.
encoded = {
    split: dataset.map(tokenization, batched=True, remove_columns=['text'])
    for split, dataset in data.items()
}

Map:   0%|          | 0/18215 [00:00<?, ? examples/s]

Map:   0%|          | 0/2710 [00:00<?, ? examples/s]

Map:   0%|          | 0/1798 [00:00<?, ? examples/s]

In [8]:
batch = 8
model_path = f'models/{MODEL_TYPE}-op_classifier-mathqa'

# Training starts from the pretrained checkpoint; evaluation reloads the
# fine-tuned weights saved under model_path. Everything else is identical.
model_source = MODEL_TYPE if TRAINING_MODE else model_path
model = AutoModelForSequenceClassification.from_pretrained(
    model_source,
    problem_type='multi_label_classification',
    num_labels=len(op2id),
)
    
def get_metrics(y_true, y_pred):
    """Return micro/macro/weighted F1 and accuracy for the given label arrays.

    The result is a dict keyed by 'micro-f1', 'macro-f1', 'weighted-f1' and
    'accuracy', in that order.
    """
    scores = {
        f'{average}-f1': f1_score(y_true=y_true, y_pred=y_pred, average=average)
        for average in ('micro', 'macro', 'weighted')
    }
    scores['accuracy'] = accuracy_score(y_true=y_true, y_pred=y_pred)
    return scores

def compute_metrics_helper(p, label, thresh=0.5):
    """Binarise raw model outputs and score them against the true labels.

    p      -- raw (pre-sigmoid) model outputs
    label  -- true multi-hot labels, forwarded unchanged to get_metrics
    thresh -- a label counts as predicted when its sigmoid probability is >= thresh

    Returns the metrics dict produced by get_metrics.
    """
    # Sigmoid rather than softmax: this is multi-label classification, so the
    # per-label probabilities are independent and may sum to more than 1.
    prob = torch.sigmoid(torch.Tensor(p))

    # 1.0 where the probability reaches the threshold, 0.0 elsewhere.
    y_pred = (prob >= thresh).numpy().astype(float)

    return get_metrics(label, y_pred)

def compute_metrics(p):
    """Trainer callback: score `p.predictions` against `p.label_ids` at the default threshold."""
    raw_outputs, true_labels = p.predictions, p.label_ids
    return compute_metrics_helper(raw_outputs, true_labels)

# Training configuration: evaluate after every epoch, same batch size for
# training and evaluation.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(
    output_dir=f'{WORKING_DIR}{model_path}',
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    num_train_epochs=5,
)

# The validation split is used for the per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded['train'],
    eval_dataset=encoded['validation'],
    compute_metrics=compute_metrics,
)

In [9]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    if TRAINING_MODE:
        # Fine-tune, then persist the model so evaluation-only runs can reload it.
        trainer.train()
        trainer.save_model(model_path)

    # Predict in both modes: the threshold analysis and the result-storing
    # cells below read `predictions` unconditionally, so skipping this step
    # after training made them fail with a NameError.
    predictions = {name: trainer.predict(encoded[name]) for name in SET_NAMES}

## Analysis and Training

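As we can see, on the validation and test sets the F1 scores are above 90% and the exact-match accuracy lies in the mid 80s (the training set scores higher on every metric).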

In [10]:
# Print the metrics of every split (evaluation mode only).
if not TRAINING_MODE:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for split in SET_NAMES:
            split_output = predictions[split]
            print(f"{split.upper()}: ")
            print(compute_metrics_helper(split_output.predictions, split_output.label_ids))
            print()

TRAIN: 
{'micro-f1': 0.9878143473894543, 'macro-f1': 0.9834399883952448, 'weighted-f1': 0.9878178142459981, 'accuracy': 0.9497666758166347}

VALIDATION: 
{'micro-f1': 0.946394786630229, 'macro-f1': 0.9329214387304215, 'weighted-f1': 0.9464146740484812, 'accuracy': 0.8361623616236162}

TEST: 
{'micro-f1': 0.9498525073746312, 'macro-f1': 0.9313226983129687, 'weighted-f1': 0.9498498023679985, 'accuracy': 0.842602892102336}



As with the constants, we are much more concerned with recall than with precision, so we test that below

In [11]:
def percent_without_missed_const(p, label, thresh=0.5):
    """Measure label coverage of thresholded predictions.

    p      -- raw (pre-sigmoid) model outputs, one row per problem
    label  -- true multi-hot labels, one row per problem
    thresh -- a label counts as predicted when its sigmoid probability is >= thresh

    Returns a tuple ``(average number of predicted labels per problem,
    fraction of problems whose true labels are all among the predicted ones)``.
    """
    # Sigmoid rather than softmax: this is multi-label classification, so the
    # per-label probabilities are independent and may sum to more than 1.
    prob = torch.sigmoid(torch.Tensor(p))

    # 1.0 where the probability reaches the threshold, 0.0 elsewhere.
    y_pred = (prob >= thresh).numpy().astype(float)

    avg_predicted = np.mean(np.sum(y_pred, axis=1))

    # A problem is fully covered when its true label indices form a subset of
    # the predicted label indices, i.e. nothing it needs was missed.
    fully_covered = []
    for true_row, pred_row in zip(label, y_pred):
        needed = set(np.where(true_row == 1)[0])
        offered = set(np.where(pred_row == 1)[0])
        fully_covered.append(needed <= offered)

    return avg_predicted, np.sum(fully_covered) / len(label)

Shown below is the percentage of problems without any missed operators, along with the average number of operators predicted per problem, for different thresholds. Using a threshold of 0.003 on the validation set still gives over 98% coverage, while reducing the average number of predicted operators from 5 (every operator) to about 3.4.

In [12]:
# Sweep the decision threshold on the validation split: a lower threshold
# predicts more operators per problem but misses fewer of the required ones.
thresholds = [0.5, 0.05, 0.003, 0.001, 0]
validation_output = predictions['validation']
for thresh in thresholds:
    print(f'THRESH: {thresh}')
    avg_per_prob, percent_no_missed = percent_without_missed_const(
        validation_output.predictions,
        validation_output.label_ids,
        thresh,
    )
    print(f"Percent without any missed: {percent_no_missed}")
    print(f"Average number of operators per problem: {avg_per_prob}")
    print()

THRESH: 0.5
Percent without any missed: 0.9062730627306274
Average number of constants per problem: 2.6538745387453875

THRESH: 0.05
Percent without any missed: 0.9450184501845018
Average number of constants per problem: 2.802952029520295

THRESH: 0.003
Percent without any missed: 0.9826568265682657
Average number of constants per problem: 3.3808118081180814

THRESH: 0.001
Percent without any missed: 0.992619926199262
Average number of constants per problem: 4.352767527675277

THRESH: 0
Percent without any missed: 1.0
Average number of constants per problem: 5.0



Showing that this generalizes to the test set

In [13]:
# Re-check the threshold chosen on the validation split against the test split.
thresholds = [.003]
for thresh in thresholds:
    print(f'THRESH: {thresh}')
    avg_per_prob, percent_no_missed = percent_without_missed_const(predictions['test'].predictions, predictions['test'].label_ids, thresh)
    print(f"Percent without any missed: {percent_no_missed}")
    # The value is the number of predicted operators; the label previously said
    # "constants", unlike the matching validation cell.
    print(f"Average number of operators per problem: {avg_per_prob}")
    print()

THRESH: 0.003
Percent without any missed: 0.9816462736373749
Average number of constants per problem: 3.360956618464961



## Storing Results

In [15]:
def get_pred(p, thresh=0.005):
    """Turn raw model outputs into a 0/1 prediction array.

    p      -- raw (pre-sigmoid) model outputs
    thresh -- a label counts as predicted when its sigmoid probability is >= thresh

    Returns a float numpy array shaped like `p`, 1.0 for predicted labels and
    0.0 otherwise.

    NOTE(review): the default of 0.005 differs from the 0.003 threshold examined
    in the analysis cells -- confirm which one is intended.
    """
    # Sigmoid rather than softmax: this is multi-label classification, so the
    # per-label probabilities are independent and may sum to more than 1.
    prob = torch.sigmoid(torch.Tensor(p))

    # 1.0 where the probability reaches the threshold, 0.0 elsewhere.
    return (prob >= thresh).numpy().astype(float)

# Binarise the raw outputs of every split using get_pred's default threshold.
y_pred = dict(
    (split, get_pred(predictions[split].predictions))
    for split in SET_NAMES
)

In [16]:
# Bundle the per-split predictions with the operator -> column index mapping
# needed to interpret their columns.
results = dict(pred=y_pred, map=op2id)

In [17]:
# exist_ok=True creates the directory only when needed, without the
# check-then-create race of testing os.path.exists() first.
os.makedirs(FINAL_DIR, exist_ok=True)

# os.path.join keeps the path correct whether or not FINAL_DIR ends with a separator.
with open(os.path.join(FINAL_DIR, 'ops.pickle'), 'wb') as f:
    pickle.dump(results, f)