# MathQA Final Model

#### Imports

In [1]:
from enum import Enum
import math
import os
import re
from copy import deepcopy
import random
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModel, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer, 
    AutoConfig,
    BitsAndBytesConfig
)
import pickle
from scipy.optimize import linear_sum_assignment
import unittest
import torch
import evaluate
import anytree
from anytree.importer import DictImporter

False

The following directories listed in your path were found to be non-existent: {WindowsPath('D')}
The following directories listed in your path were found to be non-existent: {WindowsPath('AQAAANCMnd8BFdERjHoAwE/Cl+sBAAAAan+7O/HXGUqbHMC20qgFNAQAAAACAAAAAAAQZgAAAAEAACAAAAC8w/AUVoGAsqbSQ4BP00dpBuGDWNmlRDCh1ZtQ7JQdAQAAAAAOgAAAAAIAACAAAAAvgHApRUBuDQJb8EDv4lNKq7azDvkAMeQH0Lhlr+X0eWAAAABwNY/l5gPC9bcRZJSjpAala2n1ymOhAo8/TXNDiROtOe7at4ABGEtaXXnF7hDoxWJlGXMxJTKukH9ihVP+QY8gNobGVSScwqd1bkNzcz5x5Wb4qizsH517NFzu0P086yVAAAAA4u8Y1cDAwvbg0oghzLLIAsLGq12LQ4Gp4pAyNRX4Hu7CZH4H8ZJ9L9QhMg1bd71RNypAmMtw1NLXb0bDqRD5uw==')}
The following directories listed in your path were found to be non-existent: {WindowsPath('/matplotlib_inline.backend_inline'), WindowsPath('module')}
The following directories listed in your path were found to be non-existent: {WindowsPath('/usr/local/cuda/lib64')}
DEBUG: Possible options found for libcudart.so: set()
CUDA SETUP: PyTorch settings found: CUDA_VERSION=121, Highest Com


python -m bitsandbytes


  warn(msg)
  warn(msg)


RuntimeError: Failed to import transformers.trainer_seq2seq because of the following error (look up to see its traceback):

        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues

#### Constants

In [None]:
K = 6
MAX_LAYERS = 8
MAX_TOKENS = 392
EMBEDDING_SIZE = 768

DATA_PATH = './dataset/'
SET_NAMES = ['train', 'validation', 'test']
MODEL = 'google/flan-t5-xxl'
MODEL_PATH = f'models/{MODEL.split("/")[-1]}-MathQA'
DEVICE = 'cuda:0'
NUM_MASK = '<num>'
WORKING_DIR = 'TEMP/'

OBJ_DIR = 'pickle/'


class Op(Enum):
    ADD = '+'
    SUB = '-'
    MULT = '*'
    DIV = '/'
    POW = '^'
    
class Const(Enum):
    CONST_NEG_1 = 'const_neg_1' # I added this
    CONST_0_25 = 'const_0_25'
    CONST_0_2778 = 'const_0_2778'
    CONST_0_33 = 'const_0_33'
    CONST_0_3937 = 'const_0_3937'
    CONST_1 = 'const_1'
    CONST_1_6 = 'const_1_6'
    CONST_2 = 'const_2'
    CONST_3 = 'const_3'
    CONST_PI = 'const_pi'
    CONST_3_6 = 'const_3_6'
    CONST_4 = 'const_4'
    CONST_5 = 'const_5'
    CONST_6 = 'const_6'
    CONST_10 = 'const_10'
    CONST_12 = 'const_12'
    CONST_26 = 'const_26'
    CONST_52 = 'const_52'
    CONST_60 = 'const_60'
    CONST_100 = 'const_100'
    CONST_180 = 'const_180'
    CONST_360 = 'const_360'
    CONST_1000 = 'const_1000'
    CONST_3600 = 'const_3600'

values = [-1, 0.25, 0.2778, 0.33, 0.3937, 1, 1.6, 2, 3, 3.1416, 3.6, 4, 5, 6, 10, 12, 26, 52, 60, 100, 180, 360, 1000, 3600]
const2val = {k:v for k,v in zip(Const._value2member_map_.keys(), values)}    

op2id = {k:v for k,v in zip(Op._value2member_map_.keys(), range(len(Op._value2member_map_)))}
op2id['None'] = 5
id2op = np.array(list(op2id.keys()))
const2id = {k:v for k,v in zip(Const._value2member_map_.keys(), range(len(Const._value2member_map_)))}
id2const = np.array(list(const2id.keys()))

torch.set_printoptions(sci_mode=False)

class Util():
    def load_obj(self, path):
        with open(path, 'rb') as f:
            o = pickle.load(f)
        return o
    
    def save_obj(self, path, o):
        with open(path, 'wb') as f:
            pickle.dump(o, f)
            
    def load_data(self):
        return {name:pd.read_csv(f'{DATA_PATH}{name}.csv') for name in SET_NAMES}
    
    def set_seed(self, seed):
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
util = Util()

## Preprocessing

In [41]:
def flatten(arr):
    idx = np.concatenate([[i]*len(x) for i,x in enumerate(arr)])
    flattened = np.concatenate(arr) 
    return idx, flattened

def get_nums_and_mask(data):
    nums = {name:[] for name in SET_NAMES}
    problems = {name:[] for name in SET_NAMES}
    num_idx = {name:[] for name in SET_NAMES}
    for name in SET_NAMES:
        for i,problem in enumerate(data[name].problem):
            num = re.compile('([+-]?((\d+(\.\d*)?)|(\.\d+)))') # normal num
            big = re.compile(r'(-?\d{1,3}(,\d{3})+(\.\d*)?)') # num with comma

            big_results = re.finditer(big, problem)
            problem = re.sub(big, NUM_MASK, problem)        
            num_results = re.finditer(num, problem)
            problem = re.sub(num, NUM_MASK, problem)

            # Getting the combined numbers in order of occurence
            combined = [x for x in num_results]
            combined.extend([x for x in big_results])
            combined = sorted(combined, key=lambda x: x.start(0))

            combined = [float(x.group(0).replace(',','')) for x in combined]

            nums[name].append(np.array(combined))
            problems[name].append(problem)
        num_idx[name], nums[name] = flatten(np.array(nums[name], dtype=object))
        problems[name] = np.array(problems[name])
    return {name:{'idx':torch.tensor(num_idx[name]), 'literals':nums[name]} for name in SET_NAMES}, problems

In [42]:
data = util.load_data()

In [43]:
op = util.load_obj(f'{OBJ_DIR}ops.pickle')
const = util.load_obj(f'{OBJ_DIR}constants.pickle')
subexp = util.load_obj(f'{OBJ_DIR}subexp.pickle')
nums,_ = get_nums_and_mask(data)

In [44]:
# text preprocessing
def determine_if_int(num):
    if float(num).is_integer():
        return int(float(num))
    else:
        return float(num)

def str_numpy(arr, t=None):
    if t == 'num':
        convert = lambda x: str(determine_if_int(x))
    elif t == 'eq':
        split = lambda x: x.split()
        convert_split = lambda x: f'{x[1]} {process_num(x[0])} {process_num(x[2])}'
        convert = lambda x: convert_split(split(x))        
    else:
        convert = lambda x: str(x)
        
    output = '{'
    if len(arr) > 0:
        output += f'{convert(arr[0])}'
        for x in arr[1:]:
            output += f', {convert(x)}'
    output += '}'
    return output   

def process_num(num):
    if num in const2val:
        return str(determine_if_int(const2val[num]))
    else:
        return str(determine_if_int(num))
    
# label preprocessing
importer =  DictImporter()
def process_item(item):
    if item in id2op:
        return str(item)
    elif item in const2val:
        return str(determine_if_int(const2val[item]))
    else:
        return str(determine_if_int(item))

def convert_to_preorder(name):
    labels = []
    for tree in data[name]['tree']:
        root = importer.import_(eval(tree))
        output = ''
        for node in anytree.PreOrderIter(root):
            output += process_item(node.name) + ' '
        labels.append(output[0:-1])
    return labels
     
# main function
def prompt_engineering(name):
    eq, eq_idx = subexp[name]
    eq = np.array(eq)
    eq_idx = np.array(eq_idx)
    idx = nums[name]['idx']
    literals = nums[name]['literals']
    engineered = []
    
    for i in range(len(data[name])):
        ops = str_numpy(id2op[0:-1][op['pred'][name][i].astype(bool)])
        constants = id2const[const['pred'][name][i].astype(bool)]
        constants = str_numpy([const2val[x] for x in constants], t='num')
        numbers = str_numpy(literals[idx==i], t='num')
        equations = str_numpy(eq[eq_idx==i], t='eq')
        problem = data[name]['problem'][i]
        prompt = f'Find the mathematical formula given: numbers: {numbers}, constants: {constants}, operations: {ops}, and potential subexpressions: {equations} for problem: "{problem}"'
        engineered.append(prompt)
        
    return engineered

#labels = {name:data[name]['formula_no_const'].str.replace(r'(\d)\.0(\s|\))', r'\1\2', regex=True).str.replace(' ','') for name in SET_NAMES}
engineered = {name:Dataset.from_dict({'text':prompt_engineering(name), 'label':convert_to_preorder(name)}) for name in SET_NAMES}
engineered = DatasetDict(engineered)

In [45]:
engineered['train'][0]

{'text': 'Find the mathematical formula given: numbers: {3, 10, 36}, constants: {1, 2, 100}, operations: {*, /}, and potential subexpressions: {* 3 10, * 36 100} for problem: "the banker \' s gain of a certain sum due 3 years hence at 10 % per annum is rs . 36 . what is the present worth ?"',
 'label': '/ * 100 / * 36 100 * 3 10 * 3 10'}

In [46]:
engineered

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 18215
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2710
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1798
    })
})

## Tokenization

In [48]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
data_collator = DataCollatorForSeq2Seq(tokenizer)
max_source = 512
max_target = 186 # The amount needed to emcompass all equations

Downloading tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [49]:
def tokenization(batch):
    tokenized = tokenizer(text = batch['text'], max_length = max_source, truncation = True)
    labels = tokenizer(text_target = batch['label'], max_length = max_source, truncation = True)
    tokenized['labels'] = labels['input_ids']   
    return tokenized

tokenized = engineered.map(tokenization, batched=True, remove_columns=['text','label'])

Map:   0%|          | 0/18215 [00:00<?, ? examples/s]

Map:   0%|          | 0/2710 [00:00<?, ? examples/s]

Map:   0%|          | 0/1798 [00:00<?, ? examples/s]

In [50]:
tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 18215
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2710
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1798
    })
})

## Quantization

In [52]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compuite_dtype=getattr(torch, "float16"),
    bnb_4bit_use_double_quant=False,
)

In [54]:
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL,
    quantization_config=bnb_config,
    device_map='auto',
)

Downloading config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

ImportError: Using `load_in_8bit=True` requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes `pip install -i https://test.pypi.org/simple/ bitsandbytes` or pip install bitsandbytes` 

## Training

In [36]:
batch_size = 1
grad_acc = 4
args = Seq2SeqTrainingArguments(
    WORKING_DIR,
    evaluation_strategy='steps',
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=200,
    save_strategy="steps",
    save_steps=200,
    learning_rate=5e-5, # learning rates around this worked well for other models trained with this dataset
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=grad_acc,
    weight_decay=.01, # to maintain consistency with pytorch AdamW optimizer
    save_total_limit=3, # ensures dont run out of disk space
    num_train_epochs=5,
    predict_with_generate=True, # allows the use of rouge and bleu metrics
    #generation_max_length=50, 
    load_best_model_at_end=True,
    metric_for_best_model="exact_match",
)

In [13]:
rouge_metric = evaluate.load('rouge')
bleu_metric = evaluate.load('bleu')
def compute_metrics(p):
    predictions = p[0]
    labels = p[1]
    pred_decode = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # the data collator will pad labels with -100 (to signal that they should not be used in loss calculation)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    label_decode = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # rouge (recall)
    scores = rouge_metric.compute(predictions=pred_decode, references=label_decode, use_stemmer=False)
    
    # exact match
    exact = (np.array(pred_decode)==np.array(label_decode)).sum()/len(pred_decode)
    scores['exact_match'] = exact
    
    # getting scores
    return scores

In [14]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [15]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Exact Match
200,1.3899,1.139394,0.727257,0.511561,0.649339,0.649453,0.089299


KeyboardInterrupt: 

In [None]:
trainer.save_model(MODEL_PATH)

NameError: name 'tokenized' is not defined