# MathQA Training

#### Imports

In [269]:
from enum import Enum
import os
import anytree
import pandas as pd
from itertools import permutations
import seaborn as sns
import math
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from sklearn.utils.class_weight import compute_class_weight
import re
import pickle

#### Constants

In [270]:
# Folder containing the '<split>.csv' files that are loaded into `data`.
DATA_PATH = './dataset/'
# Dataset splits; each name is both a CSV file stem and a key of the per-split dictionaries.
SET_NAMES = ['train', 'validation', 'test']
ENCODER_MODEL = 'distilroberta-base' # A more optimized version of roberta obtaining 95% of its performance
# Length every tokenized problem is padded/truncated to.
MAX_TOKENS = 392
# Device the encoder and every batch are moved to.
DEVICE = 'cuda:0'
# Placeholder that replaces each number in a problem text; also registered as a special token.
NUM_MASK = '<num>'
# Output directory of the (currently commented-out) fine-tuning run.
WORKING_DIR = 'TEMP/'
# Directory the embeddings pickle is written to.
FINAL_DIR = 'embeddings/'

class Op(Enum):
    """Arithmetic operators; each member's value is the operator's symbol string.

    The definition order fixes the ids assigned in `op2id`.
    """
    ADD = '+'
    SUB = '-'
    MULT = '*'
    DIV = '/'
    POW = '^'
    
class Const(Enum):
    """Named constants; each member's value is the constant's string name.

    The numeric values live in the module-level `values` list and are paired
    with these members by position, so the definition order here must stay in
    sync with that list. The same order fixes the ids assigned in `const2id`.
    """
    CONST_NEG_1 = 'const_neg_1' # I added this
    CONST_0_25 = 'const_0_25'
    CONST_0_2778 = 'const_0_2778'
    CONST_0_33 = 'const_0_33'
    CONST_0_3937 = 'const_0_3937'
    CONST_1 = 'const_1'
    CONST_1_6 = 'const_1_6'
    CONST_2 = 'const_2'
    CONST_3 = 'const_3'
    CONST_PI = 'const_pi'
    CONST_3_6 = 'const_3_6'
    CONST_4 = 'const_4'
    CONST_5 = 'const_5'
    CONST_6 = 'const_6'
    CONST_10 = 'const_10'
    CONST_12 = 'const_12'
    CONST_26 = 'const_26'
    CONST_52 = 'const_52'
    CONST_60 = 'const_60'
    CONST_100 = 'const_100'
    CONST_180 = 'const_180'
    CONST_360 = 'const_360'
    CONST_1000 = 'const_1000'
    CONST_3600 = 'const_3600'

# Numeric value of every Const member, listed in the members' definition order.
values = [-1, 0.25, 0.2778, 0.33, 0.3937, 1, 1.6, 2, 3, math.pi, 3.6, 4, 5, 6, 10, 12, 26, 52, 60, 100, 180, 360, 1000, 3600]
# Constant name -> numeric value (pairs names and `values` purely by position).
const2val = dict(zip(Const._value2member_map_, values))

# Operator symbol / constant name -> integer id, numbered in definition order.
op2id = {symbol: op_id for op_id, symbol in enumerate(Op._value2member_map_)}
const2id = {const_name: const_id for const_id, const_name in enumerate(Const._value2member_map_)}

## Loading the data

Reading csv into a dictionary of dataframes

In [3]:
# One DataFrame per split, keyed by split name.
data = {
    split: pd.read_csv(f'{DATA_PATH}{split}.csv')
    for split in SET_NAMES
}

Converts operations for each problem into a multi label onehot encoded setup

In [4]:
def onehot_ops(data):
    """Build a multi-label one-hot matrix of the operators used by each problem.

    Args:
        data: object whose `ops` attribute yields, per problem, the string
            form of a collection of operator symbols (keys of `op2id`).

    Returns:
        A NumPy array with one row per problem and one column per operator
        id; a cell is 1 when the problem uses that operator, else 0.
    """
    num_ops = len(op2id)
    rows = []
    for raw_ops in data.ops:
        # NOTE(review): eval() executes whatever text is stored in the CSV;
        # only safe while the dataset files are trusted.
        ops = eval(raw_ops)
        row = np.zeros(num_ops)
        row[[op2id[op] for op in ops]] = 1
        rows.append(row)
    return np.array(rows)
        
#onehot_ops(data['train'])

Sort the nums for each problem in increasing order

In [22]:
def max_num(nums):
    """Return the largest numeric value in `nums` as a float.

    Entries that are keys of `const2val` contribute their mapped value; every
    other entry is converted with float().
    """
    as_floats = (float(const2val[x]) if x in const2val else float(x) for x in nums)
    return max(as_floats)

def remove_const(data):
    """Return, per problem, the set of its numbers as floats with constants dropped.

    Args:
        data: object whose `nums` attribute yields, per problem, the string
            form of a collection of number/constant strings.

    Returns:
        A list with one set of floats per problem; entries that are keys of
        `const2val` are left out.
    """
    nums = []
    for num_list in data.nums:
        # NOTE(review): eval() executes whatever text is stored in the CSV;
        # only safe while the dataset files are trusted.
        entries = eval(num_list)
        nums.append({float(x) for x in entries if x not in const2val})
    return nums

def _extract_and_mask(problem, big, num, num_mask):
    """Return (number strings in order of occurrence, masked text) for one problem."""
    found = []
    pieces = []
    cursor = 0
    for big_match in big.finditer(problem):
        # Plain numbers in the text between the previous and this big number.
        gap = problem[cursor:big_match.start()]
        found.extend(m.group(0) for m in num.finditer(gap))
        pieces.append(num.sub(lambda _m: num_mask, gap))
        # The comma-grouped number itself.
        found.append(big_match.group(0))
        pieces.append(num_mask)
        cursor = big_match.end()
    # Text after the last comma-grouped number.
    tail = problem[cursor:]
    found.extend(m.group(0) for m in num.finditer(tail))
    pieces.append(num.sub(lambda _m: num_mask, tail))
    return found, ''.join(pieces)


# Gets the numbers listed in a problem
# Once found, numbers are masked using a number mask
def get_nums_from_problem(data, convert_to_float=False, num_mask=None):
    """Extract the numbers from every problem text and replace them with a mask.

    Comma-grouped numbers (e.g. ``1,000``) are recognised first, then plain
    integers/decimals in the remaining text. The numbers of a problem are
    returned in their true order of occurrence, which is also the order of
    the masks in the masked text. (Previously the two kinds of matches were
    sorted by offsets taken from two different strings, so a plain number
    could be listed before a comma-grouped number that precedes it.)

    Args:
        data: object whose `problem` attribute yields the problem strings.
        convert_to_float: when True, return floats (thousands separators
            removed) instead of the matched strings.
        num_mask: replacement text for each number; defaults to `NUM_MASK`.
            It is inserted literally and must not itself contain digits.

    Returns:
        ``(nums, problems)``: a list with one list of numbers per problem and
        the list of masked problem strings.
    """
    if num_mask is None:
        num_mask = NUM_MASK

    # Compiled once for the whole dataset rather than once per problem.
    num = re.compile(r'([+-]?((\d+(\.\d*)?)|(\.\d+)))')
    big = re.compile(r'(-?\d{1,3}(,\d{3})+(\.\d*)?)')

    nums = []
    problems = []
    for problem in data.problem:
        found, masked = _extract_and_mask(problem, big, num, num_mask)
        if convert_to_float:
            found = [float(text.replace(',', '')) for text in found]
        nums.append(found)
        problems.append(masked)
    return nums, problems

def sort_nums(data):
    """Sort each problem's numbers in increasing order.

    Args:
        data: object with `nums_no_const` and `nums` attributes, each
            yielding per problem the string form of a collection of number
            strings (`nums` may also contain constant names).

    Returns:
        ``(nums_sorted, nums_no_const_sorted)``: two lists with one sorted
        list per problem.
    """
    # NOTE(review): eval() executes whatever text is stored in the CSV; only
    # safe while the dataset files are trusted.
    nums_no_const_sorted = []
    for raw in data.nums_no_const:
        nums_no_const_sorted.append(sorted(eval(raw), key=float))

    nums_sorted = []
    for raw in data.nums:
        num_list = list(eval(raw))
        maximum = max_num(num_list)

        # Constants are keyed by their value shifted by the list's maximum.
        # NOTE(review): `maximum` already includes constant values, and a
        # constant with a negative value (const_neg_1 -> -1) gets a key below
        # `maximum`, so it is not guaranteed to sort after every plain
        # number - confirm this is intended.
        def sort_key(x, offset=maximum):
            if x in const2val:
                return float(const2val[x]) + offset
            return float(x)

        nums_sorted.append(sorted(num_list, key=sort_key))
    return nums_sorted, nums_no_const_sorted

#sort_nums(data['train'])

Here I do some testing to see if the numbers from the equation can be found in the problem description using simple regexes. This actually works extremely well: there are no examples where the expected numbers are not a subset of the obtained numbers. This does not include constants. Constants are values which should not occur in the problem description (like pi or the 2 in r^2, for example).

In [6]:
# Numbers each equation needs (constants excluded) vs. numbers the regexes find in the text.
expected = remove_const(data['train'])
obtained, _ = get_nums_from_problem(data['train'], convert_to_float=True)
obtained = [set(found) for found in obtained]

# Report every problem whose needed numbers are not all present in its text.
for idx, (x, y) in enumerate(zip(expected, obtained)):
    if x <= y:
        continue
    print('------------------')
    print(data['train']['problem'][idx])
    print(f'Expected: {x}')
    print(f'Obtained: {y}')
    print('------------------')

In [7]:
# Number of training problems in each category.
data['train']['category'].value_counts()

category
general        7231
physics        4908
gain           3544
geometry       1422
other          1071
probability     145
Name: count, dtype: int64

## Encoder

In this step, we use Roberta to get contextualized embeddings for each math problem

First, the problem texts must be tokenized into input ids. A number mask token is used for each number in the problem, as they should not affect the problem itself.

In [9]:
# Name of the local checkpoint of the encoder fine-tuned on MathQA
# (the path the commented-out `trainer.save_model` call writes to).
model_path = f'{ENCODER_MODEL}-encoder-mathqa'
#model = AutoModelForMaskedLM.from_pretrained(ENCODER_MODEL) # Used for fine tuning only
# output_hidden_states=True makes the forward pass also return the per-layer hidden states.
model = AutoModel.from_pretrained(model_path, output_hidden_states=True) # Fine tuned model used for getting the contextualized embeddings

Some weights of RobertaModel were not initialized from the model checkpoint at distilroberta-base-encoder-mathqa and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Tokenizer of the base encoder checkpoint.
tokenizer = AutoTokenizer.from_pretrained(ENCODER_MODEL)

# Adding a new token to the model, for masking out numbers.
tokenizer.add_special_tokens({'additional_special_tokens':[NUM_MASK]})
# Resize the embedding matrix to the tokenizer's new vocabulary size.
model.resize_token_embeddings(len(tokenizer))

def tokenize_data(data):
    """Tokenize the number-masked problem texts of one split.

    Args:
        data: object accepted by `get_nums_from_problem`.

    Returns:
        A dict with 'input_ids' (long tensor) and 'attention_mask' (int
        tensor), one row per problem, each padded/truncated to MAX_TOKENS.
    """
    _, masked_problems = get_nums_from_problem(data)

    encodings = [
        tokenizer(text, padding='max_length', max_length=MAX_TOKENS, truncation=True)
        for text in masked_problems
    ]
    ids = torch.stack([torch.tensor(enc['input_ids']) for enc in encodings])
    attention = torch.stack([torch.tensor(enc['attention_mask']) for enc in encodings])

    return {'input_ids': ids.long(), 'attention_mask': attention.int()}

# Tokenized inputs for every split, keyed by split name.
tokenized = {
    split: tokenize_data(data[split])
    for split in SET_NAMES
}

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50266. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


In [11]:
# A row whose final token is not padding (id 1) fills the whole MAX_TOKENS window.
last_token_ids = tokenized['train']['input_ids'][:, -1]
num_full_rows = np.sum(np.array(last_token_ids != 1))
print(f"Number of problems that exceed {MAX_TOKENS} tokens: {num_full_rows}") # 1 is the padding token

Number of problems that exceed 392 tokens: 0


Next, the encoder model is fine-tuned on MathQA using masked language modeling, similar to how BERT is trained. This allows the model to create better contextualized representations for each math problem. Hyperparameters courtesy of https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling.ipynb#scrollTo=QRTpmyCc3l_T

In [None]:
# args = TrainingArguments(
#     f'{WORKING_DIR}{model_path}',
#     evaluation_strategy='epoch',
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     per_device_train_batch_size = 8,
#     per_device_eval_batch_size = 8,
# )

# train = Dataset.from_dict(tokenized['train'])
# val = Dataset.from_dict(tokenized['validation'])
# train.set_format('torch')
# val.set_format('torch')

# trainer = Trainer(
#     model=model,
#     args=args,
#     train_dataset=train,
#     eval_dataset=val,
#     data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) # using the masked probability from BERT
# )

In [None]:
#trainer.train()

In [None]:
#trainer.save_model(model_path)

This function gets the token indices of each masked number token in the tokenized problems

In [12]:
def get_masked_idx(tokenized):
    """Locate the number-mask tokens in every tokenized problem.

    Args:
        tokenized: dict whose 'input_ids' entry holds one row of token ids
            per problem.

    Returns:
        A list with, per problem, an array of the positions at which the
        NUM_MASK token id occurs.
    """
    mask_id = tokenizer.encode(NUM_MASK, add_special_tokens=False)[0]

    positions = []
    for row in tokenized['input_ids']:
        positions.append(np.where(row == mask_id)[0])
    return positions

# Mask-token positions for every split, keyed by split name.
masked_idx = {
    split: get_masked_idx(tokenized[split])
    for split in data
}

This function gets the problem indices that each constant is used in

In [42]:
def get_const_problems(data):
    """Map every constant name to the indices of the problems that use it.

    Args:
        data: object whose `nums` attribute yields, per problem, the string
            form of a collection of number/constant strings.

    Returns:
        A dict with one entry per Const value; each entry is a NumPy array
        of problem indices (empty when the constant is never used).
    """
    const2idx = {const: [] for const in Const._value2member_map_}
    for idx, num_list in enumerate(data.nums):
        # NOTE(review): eval() executes whatever text is stored in the CSV;
        # only safe while the dataset files are trusted.
        used_consts = [x for x in eval(num_list) if x in const2val]
        for const in used_consts:
            const2idx[const].append(idx)
    return {const: np.array(rows) for const, rows in const2idx.items()}

# Constant -> problem indices, computed separately for every split.
const2idx = {
    split: get_const_problems(data[split])
    for split in SET_NAMES
}

Here the contextualized embeddings are obtained using the fine-tuned RoBERTa model for the problem, problem numbers, and constants. The contextualized embeddings are just the sum of the last four hidden layers output by the model.

In [147]:
def non_homogeneous_split(arr, num_per_batch):
    """Cut `arr` into consecutive slices of at most `num_per_batch` items.

    Works on any sliceable sequence, including one whose elements have
    different lengths. The last slice may be shorter than the others.
    """
    batches = []
    for start in range(0, len(arr), num_per_batch):
        batches.append(arr[start:start + num_per_batch])
    return batches

# Batches the const2idx dictionary
def batch_const2idx(const2idx, name, num_per_batch=100):
    """Split a split's constant -> problem-index mapping into per-batch mappings.

    A problem with global index i belongs to batch ``i // num_per_batch`` at
    offset ``i % num_per_batch``. (The previous version read an undefined
    `num_splits` and divided by a hard-coded 96.)

    Args:
        const2idx: dict of split name -> {constant name: array of problem indices}.
        name: split whose mapping is batched; `data[name]` supplies the
            number of problems.
        num_per_batch: problems per batch. Defaults to 100, the batch size
            used by `get_embeddings`.

    Returns:
        A list with one dict per batch; each dict maps every constant name to
        a NumPy array of within-batch offsets.

    Raises:
        ValueError: if `num_per_batch` is smaller than 1.
    """
    if num_per_batch < 1:
        raise ValueError('num_per_batch must be at least 1')

    num_splits = math.ceil(len(data[name]) / num_per_batch)
    batched_const2idx = [
        {const: [] for const in Const._value2member_map_}
        for _ in range(num_splits)
    ]
    for const, problem_ids in const2idx[name].items():
        # dtype=int: the array of a never-used constant is an empty float array.
        batch_nums, offsets = divmod(np.asarray(problem_ids, dtype=int), num_per_batch)
        for batch_num, batch_idx in zip(batch_nums, offsets):
            batched_const2idx[batch_num][const].append(batch_idx)
    return [{k: np.array(v) for k, v in x.items()} for x in batched_const2idx]

# Move the encoder to DEVICE; get_embeddings sends every batch to the same device.
model.to(DEVICE)

def get_embeddings(name):
    """Compute contextualized embeddings for every problem of one split.

    The tokenized problems are pushed through the encoder in batches. The
    embedding of a token is the sum of its vectors in the last four entries
    of the model's hidden-state output.

    Args:
        name: split name; a key of `tokenized` and `masked_idx`.

    Returns:
        A dict with:
          'problem': tensor with one row per problem, the embedding of the
              first token (<s>); None when the split produced no batches.
          'num': list with one tensor per problem holding the embeddings at
              that problem's masked-number positions.
          'const': None (assigned by the caller afterwards).
    """
    # Batching ids, masks and mask positions identically.
    num_per_batch = 100
    batched_ids = torch.split(tokenized[name]['input_ids'], num_per_batch)
    batched_masks = torch.split(tokenized[name]['attention_mask'], num_per_batch)
    batched_idx = non_homogeneous_split(masked_idx[name], num_per_batch)

    problem_chunks = []
    num_embeddings = []
    for ids, mask, idx in zip(batched_ids, batched_masks, batched_idx):
        ids = ids.to(DEVICE)
        mask = mask.to(DEVICE)

        # Getting the raw hidden layer output
        with torch.no_grad():
            output = model(ids, mask)

        # output[2] is the tuple of hidden states (the model was loaded with
        # output_hidden_states=True). Stacked and permuted to
        # [batch, tokens, num_hidden_states, hidden_size].
        output = torch.stack(output[2], dim=0).permute(1, 2, 0, 3)

        # Summing the last 4 hidden states to get the contextualized embeddings
        output = torch.sum(output[:, :, -4:, :], dim=2)

        # Embeddings at the positions of each problem's masked numbers
        num_embeddings.extend(
            output[row][positions].to('cpu') for row, positions in enumerate(idx)
        )

        # Problem embedding: the first token (<s> for roberta) represents the whole text
        problem_chunks.append(output[:, 0, :].to('cpu'))

        # Free the batch's device memory before the next batch
        del ids, mask, output
        torch.cuda.empty_cache()

    # Concatenate once at the end instead of re-copying the growing tensor every batch.
    problem_embeddings = torch.cat(problem_chunks, dim=0) if problem_chunks else None
    return {'problem': problem_embeddings, 'num': num_embeddings, 'const': None}

In [151]:
# Embeddings for every split, keyed by split name.
embeddings = {
    split: get_embeddings(split)
    for split in SET_NAMES
}

To get the constant embeddings, we take the average of all of the problem embeddings that the constant was used in. This should hopefully give the constants some more context during downstream training. The training data is only used for the constant embeddings, as you would not know what constants belong to the problem in the test/validation

In [195]:
def get_const_embeddings():
    """Return one embedding per constant, built from the training split only.

    A constant's embedding is the mean of the problem embeddings of all
    training problems that use it. Rows follow the key order of
    `const2idx['train']`.
    """
    train_problems = embeddings['train']['problem']
    per_const = []
    for problem_ids in const2idx['train'].values():
        per_const.append(torch.mean(train_problems[problem_ids], dim=0))
    return torch.stack(tuple(per_const))

# Every split gets its own copy of the constant embeddings (always computed from the training split).
for split in SET_NAMES:
    embeddings[split]['const'] = get_const_embeddings()

This function creates a problem index so that the numbers and embeddings for each number can be flattened

In [232]:
def get_nums_mapping(name):
    """Flatten the numbers of all problems of a split, remembering their owners.

    Args:
        name: split name; a key of `data`.

    Returns:
        ``(problem_idx, problem_nums)``: two flat arrays of equal length;
        `problem_nums[i]` is a number string found in a problem text and
        `problem_idx[i]` is the index of the problem it came from.
    """
    nums, _ = get_nums_from_problem(data[name])
    owner_chunks = [
        np.full(len(found), problem_no)
        for problem_no, found in enumerate(nums)
    ]
    problem_idx = np.concatenate(tuple(owner_chunks))
    problem_nums = np.concatenate(tuple(nums))
    return problem_idx, problem_nums

Flattening the num embeddings

In [247]:
# Join each split's per-problem number embeddings into a single tensor, in problem order.
for split in SET_NAMES:
    embeddings[split]['num'] = torch.cat(tuple(embeddings[split]['num']), dim=0)

Adding the num mapping to the dictionary

In [259]:
# (problem index, number string) arrays for every split.
for split in SET_NAMES:
    embeddings[split]['num_mapping'] = get_nums_mapping(split)

Adding a mapping for the constants to the dictionary

In [264]:
# The constant name -> id table is the same object for every split.
for split in SET_NAMES:
    embeddings[split]['const_mapping'] = const2id

Lastly, the embeddings dictionary is stored to disk to be used in future ipynb files

To load this object simply do:

```
with open(f'{FINAL_DIR}embeddings.pickle', 'rb') as f:
    embeddings = pickle.load(f)
```

In [271]:
# Create the output directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(FINAL_DIR, exist_ok=True)

# Serialize the whole embeddings dictionary for use by later notebooks.
with open(f'{FINAL_DIR}embeddings.pickle', 'wb') as f:
    pickle.dump(embeddings, f)