# MathQA Training

#### Imports

In [19]:
from enum import Enum
import os
import anytree
import pandas as pd
from itertools import permutations
import seaborn as sns
import math
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from sklearn.utils.class_weight import compute_class_weight
import re
from sentence_transformers import SentenceTransformer, models # https://arxiv.org/abs/1908.10084

#### Constants

In [224]:
# Directory that holds one '<split>.csv' file per dataset split.
DATA_PATH = './dataset/'
# Names of the dataset splits; each one is also the stem of its csv file.
SET_NAMES = ['train', 'validation', 'test']
# Hugging Face checkpoint used as the encoder.
ENCODER_MODEL = 'roberta-base'
# Every tokenized problem is padded/truncated to exactly this many tokens.
MAX_TOKENS = 392
# Run on the first GPU when one is present, otherwise fall back to the CPU so
# the notebook still runs (slowly) on machines without CUDA.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Placeholder token that replaces every number found in a problem text.
NUM_MASK = '<NUM>'

class Op(Enum):
    """Arithmetic operators that can appear in a problem's equation.

    Each member's value is the operator symbol. The declaration order matters:
    `op2id` assigns class ids by enumerating the members in this order.
    """
    ADD = '+'
    SUB = '-'
    MULT = '*'
    DIV = '/'
    POW = '^'
    
class Const(Enum):
    """Named constants that an equation may use without them appearing in the text.

    Each member's value is the constant's string name as used in the dataset.
    The declaration order matters: `const2val` pairs these members positionally
    with the `values` list, and `const2id` numbers them in this order, so a new
    member must be inserted at the matching position in `values` as well.
    """
    CONST_NEG_1 = 'const_neg_1' # added by the notebook author (per the original "I added this" note)
    CONST_0_25 = 'const_0_25'
    CONST_0_2778 = 'const_0_2778'
    CONST_0_33 = 'const_0_33'
    CONST_0_3937 = 'const_0_3937'
    CONST_1 = 'const_1'
    CONST_1_6 = 'const_1_6'
    CONST_2 = 'const_2'
    CONST_3 = 'const_3'
    CONST_PI = 'const_pi'
    CONST_3_6 = 'const_3_6'
    CONST_4 = 'const_4'
    CONST_5 = 'const_5'
    CONST_6 = 'const_6'
    CONST_10 = 'const_10'
    CONST_12 = 'const_12'
    CONST_26 = 'const_26'
    CONST_52 = 'const_52'
    CONST_60 = 'const_60'
    CONST_100 = 'const_100'
    CONST_180 = 'const_180'
    CONST_360 = 'const_360'
    CONST_1000 = 'const_1000'
    CONST_3600 = 'const_3600'

# Numeric value of every Const member, listed in the same order as the members
# are declared in Const (the pairing below is positional).
values = [-1, 0.25, 0.2778, 0.33, 0.3937, 1, 1.6, 2, 3, math.pi, 3.6, 4, 5, 6, 10, 12, 26, 52, 60, 100, 180, 360, 1000, 3600]

# Iterating an Enum yields its members in declaration order, which is the same
# order the private `_value2member_map_` exposed, without relying on internals.
_const_names = [const.value for const in Const]
if len(values) != len(_const_names):
    # zip() would silently drop the unmatched tail and shift nothing into view,
    # so fail loudly when the enum and the value table drift apart.
    raise ValueError(
        f'Const declares {len(_const_names)} members but values has {len(values)} entries'
    )

# constant name -> numeric value
const2val = dict(zip(_const_names, values))

# operator symbol -> class id, constant name -> class id (both in declaration order)
op2id = {op.value: idx for idx, op in enumerate(Op)}
const2id = {name: idx for idx, name in enumerate(_const_names)}

## Loading the data

Read each split's CSV file into a dictionary of DataFrames, keyed by split name.

In [206]:
# One DataFrame per split, read from '<DATA_PATH><split>.csv'.
data = {
    split: pd.read_csv(f'{DATA_PATH}{split}.csv')
    for split in SET_NAMES
}

Convert the set of operations used by each problem into a multi-label one-hot encoding (one column per operator).

In [207]:
def onehot_ops(data):
    """Encode each problem's operators as a multi-label one-hot row.

    Args:
        data: object with an `ops` attribute that iterates over strings, each
            being the Python literal of a collection of operator symbols
            (a DataFrame split in this notebook).

    Returns:
        A numpy array with one row per problem and one column per entry of
        `op2id`; a cell is 1 when that operator is used by the problem.
    """
    n_ops = len(op2id)
    rows = []
    for raw_ops in data.ops:
        # NOTE(review): eval() executes arbitrary code from the csv; acceptable
        # only while the dataset is trusted. Consider ast.literal_eval.
        parsed_ops = eval(raw_ops)
        row = np.zeros(n_ops)
        row[[op2id[symbol] for symbol in parsed_ops]] = 1
        rows.append(row)
    return np.array(rows)
        
#onehot_ops(data['train'])

Sort the numbers of each problem in increasing order.

In [208]:
def max_num(nums):
    """Return the largest numeric value in `nums`.

    Entries that are keys of `const2val` are replaced by their constant value;
    every other entry is converted with float().
    """
    def as_float(token):
        if token in const2val:
            return float(const2val[token])
        return float(token)

    return max(as_float(token) for token in nums)

def remove_const(data):
    """Return, per problem, the set of float values of its non-constant numbers.

    Args:
        data: object with a `nums` attribute that iterates over strings, each
            being the Python literal of a collection of numbers/constant names.

    Returns:
        A list with one set of floats per problem; entries that are keys of
        `const2val` are left out.
    """
    per_problem = []
    for raw_nums in data.nums:
        # NOTE(review): eval() on csv content; safe only for a trusted dataset.
        parsed = eval(raw_nums)
        per_problem.append({float(tok) for tok in parsed if tok not in const2val})
    return per_problem

# Gets the numbers listed in a problem
# Once found, numbers are masked using a number mask
def get_nums_from_problem(data, convert_to_float=False):
    """Extract the numbers of every problem text and mask them with NUM_MASK.

    Comma-grouped numbers (e.g. ``1,250,000``) are recognised first; plain
    integers/decimals are then searched in the text that lies between them, so
    a comma-grouped number is never split into several plain numbers.

    Args:
        data: object with a `problem` attribute that iterates over the problem
            strings (a DataFrame split in this notebook).
        convert_to_float: when True the numbers are returned as floats (commas
            removed); otherwise as the exact substrings found in the text.

    Returns:
        A tuple ``(nums, problems)``: `nums` holds, per problem, the numbers in
        their order of occurrence in the original text; `problems` holds the
        texts with every number replaced by NUM_MASK.
    """
    # Compiled once instead of once per problem; raw strings avoid the
    # invalid-escape warning that '\d' triggers in a normal string literal.
    num = re.compile(r'([+-]?((\d+(\.\d*)?)|(\.\d+)))')
    big = re.compile(r'(-?\d{1,3}(,\d{3})+(\.\d*)?)')

    nums = []
    problems = []
    for problem in data.problem:
        found = []
        pieces = []
        cursor = 0

        # Walk the text left to right, alternating "gap before a comma-grouped
        # number" and the comma-grouped number itself. Collecting matches in
        # this single pass keeps them in true order of occurrence; the previous
        # implementation sorted positions taken from two different strings (the
        # original and the partially masked one), which could reorder numbers
        # that follow a long comma-grouped number.
        for big_match in big.finditer(problem):
            gap = problem[cursor:big_match.start()]
            found.extend(match.group(0) for match in num.finditer(gap))
            pieces.append(num.sub(NUM_MASK, gap))

            found.append(big_match.group(0))
            pieces.append(NUM_MASK)
            cursor = big_match.end()

        # Text after the last comma-grouped number (or the whole text if none).
        tail = problem[cursor:]
        found.extend(match.group(0) for match in num.finditer(tail))
        pieces.append(num.sub(NUM_MASK, tail))

        if convert_to_float:
            found = [float(text.replace(',', '')) for text in found]

        nums.append(found)
        problems.append(''.join(pieces))
    return nums, problems

def sort_nums(data):
    """Sort the numbers of every problem in increasing order.

    Args:
        data: object with `nums_no_const` and `nums` attributes, each iterating
            over strings that are Python literals of number collections;
            `nums` may also contain constant names (keys of `const2val`).

    Returns:
        A tuple ``(nums_sorted, nums_no_const_sorted)`` of lists with one
        sorted list per problem. In `nums_sorted` the literal numbers come
        first (ascending), followed by the constants (ascending by value).
    """
    def sort_key(token):
        # The leading flag places every constant after every literal number.
        # The previous key (constant value + largest number) was meant to do
        # the same but failed for const_neg_1, lost precision for very large
        # maxima, and needed max_num(), which raises on an empty list.
        if token in const2val:
            return (1, float(const2val[token]))
        return (0, float(token))

    nums_no_const_sorted = []
    for nums in data.nums_no_const:
        # NOTE(review): eval() on csv content; safe only for a trusted dataset.
        nums_no_const_sorted.append(sorted(eval(nums), key=float))

    nums_sorted = []
    for nums in data.nums:
        nums_sorted.append(sorted(eval(nums), key=sort_key))
    return nums_sorted, nums_no_const_sorted

#sort_nums(data['train'])

Here I test whether the numbers used in each equation can be found in the problem description using simple regexes. This works extremely well: there are no examples where the expected numbers are not a subset of the obtained numbers. Constants are excluded from this check, because they are values that should not occur in the problem description (like pi, or the 2 in r^2).

In [209]:
# Numbers the equations need (constants removed) vs. numbers the regexes find.
expected = remove_const(data['train'])
obtained, _ = get_nums_from_problem(data['train'], convert_to_float=True)
obtained = [set(x) for x in obtained]

# Report every problem whose expected numbers are not all found in its text.
for idx, (x, y) in enumerate(zip(expected, obtained)):
    if not (x <= y):
        print('------------------')
        # idx is a position, so look the problem up positionally rather than
        # by index label.
        print(data['train']['problem'].iloc[idx])
        print(f'Expected: {x}')
        print(f'Obtained: {y}')
        print('------------------')

In [210]:
# Number of training problems per category (displayed as the cell output).
data['train']['category'].value_counts()

category
general        7231
physics        4908
gain           3544
geometry       1422
other          1071
probability     145
Name: count, dtype: int64

## Encoder

In this step, we use RoBERTa to get contextualized embeddings for each math problem.

In [294]:
# Load the pretrained encoder (configured to also return the hidden states of
# every layer) and its matching tokenizer.
encoder = AutoModel.from_pretrained(ENCODER_MODEL, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(ENCODER_MODEL)

# Register the number mask as a special token, then resize the embedding matrix
# so the new token id has an embedding row. The token must be added before the
# resize, because the resize uses the updated len(tokenizer).
tokenizer.add_special_tokens({'additional_special_tokens':[NUM_MASK]})
encoder.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

def tokenize_data(data):
    """Mask the numbers of every problem and tokenize the masked texts.

    Args:
        data: object with a `problem` attribute that iterates over the problem
            strings (a DataFrame split in this notebook).

    Returns:
        A dict with `input_ids` (long tensor) and `attention_mask` (int
        tensor), one row per problem, each padded/truncated to MAX_TOKENS.
    """
    _, problems = get_nums_from_problem(data)

    # One batched call replaces tokenizing each problem separately and stacking
    # the per-problem lists by hand; padding='max_length' makes every row
    # exactly MAX_TOKENS long, so the result is already rectangular.
    encoded = tokenizer(
        problems,
        padding='max_length',
        max_length=MAX_TOKENS,
        truncation=True,
        return_tensors='pt',
    )

    return {
        'input_ids': encoded['input_ids'].long(),
        'attention_mask': encoded['attention_mask'].int(),
    }

# Tokenized tensors for every split, keyed by split name.
tokenized = {
    split: tokenize_data(data[split])
    for split in SET_NAMES
}

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [295]:
# A row whose last position is not padding fills the whole MAX_TOKENS window,
# i.e. the problem may have been truncated. The pad id is read from the
# tokenizer instead of hard-coding 1, so the check stays valid if ENCODER_MODEL
# changes.
last_token_ids = tokenized['train']['input_ids'][:, -1]
num_full_rows = (last_token_ids != tokenizer.pad_token_id).sum().item()
print(f"Number of problems that exceed {MAX_TOKENS} tokens: {num_full_rows}")

Number of problems that exceed 392 tokens: 0


This function returns the (problem, token) positions of every masked-number token in the tokenized problems.

In [291]:
def get_masked_idx(tokenized):
    """Locate every NUM_MASK token in a tokenized split.

    Args:
        tokenized: dict holding the token ids under the key 'input_ids'.

    Returns:
        The result of np.where on the "is mask token" comparison: a tuple of
        index arrays, one per dimension of `input_ids`.
    """
    # NUM_MASK is encoded without special tokens; its first id is the mask id.
    mask_token_ids = tokenizer.encode(NUM_MASK, add_special_tokens=False)
    is_mask = tokenized['input_ids'] == mask_token_ids[0]
    return np.where(is_mask)

# Mask positions for every split. Iterate the dict that is actually being read
# (tokenized) rather than the keys of a different dict (data).
masked_idx = {
    split: get_masked_idx(split_tokens)
    for split, split_tokens in tokenized.items()
}

In [329]:
# Putting model on gpu
# Putting model on the target device
encoder.to(DEVICE)

# Splitting ids and masks into batches. The count below is the NUMBER OF
# BATCHES, not the number of problems per batch: it was passed as the
# "sections" argument of np.array_split. torch.tensor_split divides the rows
# the same way (the first `len % n` batches get one extra row) and is the
# native operation for tensors.
num_batches = 100
num_per_batch = num_batches  # legacy (misleading) name, kept for any later cells
batched_ids = torch.tensor_split(tokenized['test']['input_ids'], num_batches)
batched_masks = torch.tensor_split(tokenized['test']['attention_mask'], num_batches)

# Putting first batch on the target device
ids = batched_ids[0].to(DEVICE)
mask = batched_masks[0].to(DEVICE)

# Getting the raw hidden output; no gradients are needed for feature extraction
with torch.no_grad():
    output = encoder(input_ids=ids, attention_mask=mask)

# Cleaning up for the next batch
del ids
del mask
torch.cuda.empty_cache()

output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0234,  0.0500, -0.0306,  ..., -0.0659, -0.0383,  0.0204],
         [ 0.0053, -0.1929, -0.0862,  ..., -0.3437,  0.1006,  0.1238],
         [ 0.0485,  0.1048,  0.0571,  ..., -0.0692,  0.0082, -0.0438],
         ...,
         [ 0.0737, -0.0216,  0.0868,  ...,  0.1614,  0.0419,  0.0368],
         [ 0.0737, -0.0216,  0.0868,  ...,  0.1614,  0.0419,  0.0368],
         [ 0.0737, -0.0216,  0.0868,  ...,  0.1614,  0.0419,  0.0368]],

        [[-0.0206,  0.0630, -0.0431,  ..., -0.0734, -0.0581, -0.0069],
         [ 0.1169, -0.5166,  0.0120,  ..., -0.2104,  0.2985, -0.1666],
         [ 0.2478, -0.0571,  0.1196,  ..., -0.0772,  0.1470, -0.1475],
         ...,
         [ 0.0589,  0.0078,  0.0283,  ...,  0.1506, -0.0401, -0.0129],
         [ 0.0589,  0.0078,  0.0283,  ...,  0.1506, -0.0401, -0.0129],
         [ 0.0589,  0.0078,  0.0283,  ...,  0.1506, -0.0401, -0.0129]],

        [[-0.0340,  0.0524, -0.0443,  ..., -0.1116, -

In [330]:
# Stack the per-layer hidden states into one tensor and show its shape. The
# named field is used instead of output[2], whose position depends on which
# optional outputs the model happens to return.
torch.stack(output.hidden_states, dim=0).shape

torch.Size([13, 19, 392, 768])

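- dim1: hidden-state layers (13 = the embedding output plus the 12 RoBERTa encoder layers)
- dim2: problems in the batch
- dim3: tokens (padded to `MAX_TOKENS`)
- dim4: embedding size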

In [311]:
# Check that per-batch results can be concatenated along the problem axis
# (dim=1): joining the stacked hidden states with themselves doubles that axis.
# NOTE(review): the recorded output below presumably comes from an earlier run
# with a different batch (this cell's execution count precedes the encoding cell's).
stacked_hidden = torch.stack(output.hidden_states, dim=0)
torch.cat((stacked_hidden, stacked_hidden), dim=1).shape

torch.Size([13, 20, 392, 768])

#### Plan so far
- ✓ Obtain all of the numbers from the problem text using regular expressions
- ✓ Mask these numbers using \<NUM\> to ensure that they have no effect on the next step (you should be able to generate the formulas to a math problem without necessarily needing the numbers themselves)
- Get the relation embeddings for each masked number in the problem text using an encoder only model (roberta in this case). Map these to their appropriate value and store for later. (https://medium.com/mlearning-ai/getting-contextualized-word-embeddings-with-bert-20798d8b43a4)
- Embeddings will also need to be created for the constants, which are not included in the problem description. My current idea is to average the embeddings of all problems that use a given constant and use that average as the constant's embedding. These should also be added to the values for use later.
- Create a multi-class classifier that returns the X most likely constants given a problem description. This would hopefully cut down the number of possible permutations of numbers.
- Given a list of numbers and a list of operators, format this as a stepwise relation extraction problem? (WIP)

NOTE TO SELF: 
- For constants maybe get representation by doing some sort of average of all of the problems it is useful in (embedding of their CLS tokens?)
- For actual values, get embedding for num mask at its position.
- Create a model to find the top x most likely constants given a problem description (choose x in such a way that it is nearly 100% accurate) (probably multiclass classification)
- Create a multiclass classification model to find x most likely operators (not as useful, but maybe worth a try)