# MathQA Training

#### Imports

In [2]:
from enum import Enum
import os
import anytree
import pandas as pd
from itertools import permutations
import seaborn as sns
import math
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from sklearn.utils.class_weight import compute_class_weight
import re

#### Constants

In [123]:
DATA_PATH = './dataset/'
SET_NAMES = ['train', 'validation', 'test']
MODEL = 'roberta-base'
MAX_TOKENS = 128
DEVICE = 'cuda:0'

class Op(Enum):
    ADD = '+'
    SUB = '-'
    MULT = '*'
    DIV = '/'
    POW = '^'
    
class Const(Enum):
    CONST_PI = 'const_pi'
    CONST_NEG_1 = 'const_neg_1' # I added this
    CONST_DEG_TO_RAD = 'const_deg_to_rad' # pi / 180 (There is only one example of this and its actually used incorrectly)
    CONST_1 = 'const_1'
    CONST_2 = 'const_2'
    CONST_3 = 'const_3'
    CONST_4 = 'const_4'
    CONST_5 = 'const_5'
    CONST_6 = 'const_6'
    CONST_10 = 'const_10'
    CONST_12 = 'const_12'
    CONST_26 = 'const_26'
    CONST_52 = 'const_52'
    CONST_60 = 'const_60'
    CONST_100 = 'const_100'
    CONST_180 = 'const_180'
    CONST_360 = 'const_360'
    CONST_1000 = 'const_1000'
    CONST_3600 = 'const_3600' 
    CONST_0_25 = 'const_0_25'
    CONST_0_2778 = 'const_0_2778'
    CONST_0_33 = 'const_0_33'
    CONST_0_3937 = 'const_0_3937'
    CONST_0_4535 = 'const_0_4535'
    CONST_0_6 = 'const_0_6'
    CONST_1_6 = 'const_1_6'
    CONST_2_2046 = 'const_2_2046'
    CONST_2_54 = 'const_2_54'
    CONST_3_6 = 'const_3_6' 
    CONST_0dot25 = 'const_0.25'
    CONST_0dot5 = 'const_0.5' 
    CONST_2dot0 = 'const_2.0'
    CONST_3dot0 = 'const_3.0'
    CONST_4dot0 = 'const_4.0'
    CONST_60dot0 = 'const_60.0'
    CONST_100dot0 = 'const_100.0'

values = [math.pi, -1, math.pi/180, 1, 2, 3, 4, 5, 6, 10, 12, 26, 52, 60, 100, 180, 360, 1000, 3600,
          0.25, 0.2778, 1/3, 0.3937, 0.4535, 0.6, 1.6, 2.2046, 2.54, 3.6, 0.25, 0.5, 2.0, 3.0, 4.0, 60.0, 100.0]
const2val = {k:v for k,v in zip(Const._value2member_map_.keys(), values)}    
const2val['const_0_5'] = 0.5

op2id = {k:v for k,v in zip(Op._value2member_map_.keys(), range(len(Op._value2member_map_)))}

## Loading the data

Reading csv into a dictionary of dataframes

In [4]:
data = {name:pd.read_csv(f'{DATA_PATH}{name}.csv') for name in SET_NAMES}

Converts operations for each problem into a multi label onehot encoded setup

In [4]:
def onehot_ops(data):
    labels = []
    for op_set in data.ops:
        op_set = eval(op_set)
        idx = [op2id[op] for op in op_set]
        onehot = np.zeros(len(op2id))
        onehot[idx] = 1
        labels.append(onehot)
    return np.array(labels)
        
#onehot_ops(data['train'])

Sort nums for each each problem in increasing order

In [9]:
def max_num(nums):
    get_float = lambda x: float(const2val[x]) if x in const2val else float(x)
    return max(map(get_float, nums))

def remove_const(data):
    nums = []
    for num_list in data.nums:
        nums.append(set([float(x) for x in eval(num_list) if x not in const2val]))
    return nums

def get_nums_from_problem(data, convert_to_float=True):
    nums = []
    for problem in data.problem:
        num = re.compile('([+-]?((\d+(\.\d*)?)|(\.\d+)))')
        big = re.compile(r'(-?\d{1,3}(,\d{3})+(\.\d*)?)')
        
        big_results = re.findall(big, problem)
        problem = re.sub(big, '', problem)        
        num_results = re.findall(num, problem)

        if convert_to_float:
            s1 = set([float(x[0].replace(',','')) for x in big_results])
            s2 = set([float(x[0]) for x in num_results])
        else:
            s1 = set([x[0] for x in big_results])
            s2 = set([x[0] for x in num_results])
        
        nums.append(s1.union(s2))
    return nums

def sort_nums(data):
    nums_sorted = []
    nums_no_const_sorted = []
    for nums in data.nums_no_const:
        nums_no_const_sorted.append(sorted(list(eval(nums)), key=lambda x: float(x)))
    for nums in data.nums:
        num_list = list(eval(nums))
        maximum = max_num(num_list)
        get_float = lambda x: float(const2val[x])+maximum if x in const2val else float(x)
        nums_sorted.append(sorted(num_list, key=get_float))
    return nums_sorted, nums_no_const_sorted

#sort_nums(data['train'])

Here I do some testing to see if the numbers from the equation can be found in the problem description using simple regexes. This actually works extremely well, having no examples where the expected numbers is not a subset of the obtained numbers. This does not include constants. Constants are values which should not occur in the problem description (like pi or the 2 in r^2 for example)

In [10]:
expected = remove_const(data['train'])
obtained = get_nums_from_problem(data['train'])

idx = 0
for x, y in zip(expected, obtained):
    if not (x <= y):
        print('------------------')
        print(data['train']['problem'][idx])
        print(f'Expected: {x}')
        print(f'Obtained: {y}')
        print('------------------')
    idx += 1

In [7]:
data['train']['category'].value_counts()

category
general        7231
physics        4908
gain           3544
geometry       1422
other          1071
probability     145
Name: count, dtype: int64

## Encoder

In this step, we use Roberta to get contextualized embeddings for each math problem

In [75]:
lengths = []
for x in data['train']['problem']:
    lengths.append(len(x.split()))
lengths = np.array(lengths)
print(f'Num Greater than 128: {np.sum(lengths>128)}')

Num Greater than 128: 22


In [131]:
encoder = AutoModel.from_pretrained(MODEL, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenize_data(data):
    tokenization = lambda x: tokenizer(x, padding='max_length', max_length=MAX_TOKENS, truncation=True)
    
    tokenized = data['problem'].map(tokenization).tolist()
    input_ids = torch.stack([torch.tensor(x['input_ids']) for x in tokenized])
    attention_mask = torch.stack([torch.tensor(x['attention_mask']) for x in tokenized])
    
    return {'input_ids':input_ids.long(), 'attention_mask':attention_mask.int()}

tokenized = {name:tokenize_data(data[name]) for name in SET_NAMES}

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The following code gets the number of problems in the training set that exceed 128 tokens. Based on the low number of problems that exceed this, 128 is a good number of maximum tokens.

In [84]:
print(f"Number of problems that exceed 128 tokens: {np.sum(np.array(tokenized['train']['input_ids'][:,-1]!=1))}")

Number of problems that exceed 128 tokens: 41


Getting the contextualized embeddings

In [129]:
encoder.to(DEVICE)
with torch.no_grad():
    tokenized['test']['input_ids'].to(DEVICE)
    tokenized['test']['attention_mask'].to(DEVICE)
    output = encoder(**tokenized['test'])
torch.cuda.empty_cache()
output

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.32 GiB. GPU 0 has a total capacty of 15.61 GiB of which 1.28 GiB is free. Including non-PyTorch memory, this process has 14.33 GiB memory in use. Of the allocated memory 13.55 GiB is allocated by PyTorch, and 82.58 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [116]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0110,  0.0662, -0.0402,  ..., -0.0765, -0.0393,  0.0051],
         [ 0.0246, -0.1859, -0.0735,  ..., -0.3455,  0.1115,  0.1199],
         [ 0.0491,  0.0213,  0.0921,  ..., -0.0817,  0.0128, -0.0482],
         ...,
         [ 0.0790,  0.0204,  0.0805,  ...,  0.0735,  0.0150,  0.0238],
         [ 0.0790,  0.0204,  0.0805,  ...,  0.0735,  0.0150,  0.0238],
         [ 0.0790,  0.0204,  0.0805,  ...,  0.0735,  0.0150,  0.0238]],

        [[-0.0115,  0.0960, -0.0436,  ..., -0.0642, -0.0510, -0.0248],
         [ 0.1513, -0.4553, -0.0328,  ..., -0.1751,  0.3167, -0.2626],
         [ 0.2684, -0.0636,  0.1094,  ..., -0.0759,  0.2083, -0.1840],
         ...,
         [ 0.1074,  0.0680,  0.0232,  ...,  0.1364, -0.0319, -0.0557],
         [ 0.1074,  0.0680,  0.0232,  ...,  0.1364, -0.0319, -0.0557],
         [ 0.1074,  0.0680,  0.0232,  ...,  0.1364, -0.0319, -0.0557]],

        [[-0.0231,  0.0543, -0.0471,  ..., -0.1168, -