# MathQA Preprocessing2

Here some adjustments are made to the original preprocessed data in a separate file, so the first file does not need to be rerun

#### Imports

In [175]:
from enum import Enum
import os
import anytree
from anytree import RenderTree
from anytree.importer import DictImporter
import pandas as pd
from itertools import permutations
import seaborn as sns
import math
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from sklearn.utils.class_weight import compute_class_weight
import re
import pickle
from copy import deepcopy
import warnings

#### Constants

In [171]:
K = 6
MAX_LAYERS = 8

DATA_PATH = './dataset/'
SET_NAMES = ['train', 'validation', 'test']
ENCODER_MODEL = 'distilroberta-base' # A more optimized version of roberta obtaining 95% of its performance
MAX_TOKENS = 392
DEVICE = 'cuda:0'
NUM_MASK = '<num>'
WORKING_DIR = 'TEMP/'

OBJ_DIR = 'pickle/'


class Op(Enum):
    ADD = '+'
    SUB = '-'
    MULT = '*'
    DIV = '/'
    POW = '^'
    
class Const(Enum):
    CONST_NEG_1 = 'const_neg_1' # I added this
    CONST_0_25 = 'const_0_25'
    CONST_0_2778 = 'const_0_2778'
    CONST_0_33 = 'const_0_33'
    CONST_0_3937 = 'const_0_3937'
    CONST_1 = 'const_1'
    CONST_1_6 = 'const_1_6'
    CONST_2 = 'const_2'
    CONST_3 = 'const_3'
    CONST_PI = 'const_pi'
    CONST_3_6 = 'const_3_6'
    CONST_4 = 'const_4'
    CONST_5 = 'const_5'
    CONST_6 = 'const_6'
    CONST_10 = 'const_10'
    CONST_12 = 'const_12'
    CONST_26 = 'const_26'
    CONST_52 = 'const_52'
    CONST_60 = 'const_60'
    CONST_100 = 'const_100'
    CONST_180 = 'const_180'
    CONST_360 = 'const_360'
    CONST_1000 = 'const_1000'
    CONST_3600 = 'const_3600'

values = [-1, 0.25, 0.2778, 0.33, 0.3937, 1, 1.6, 2, 3, math.pi, 3.6, 4, 5, 6, 10, 12, 26, 52, 60, 100, 180, 360, 1000, 3600]
const2val = {k:v for k,v in zip(Const._value2member_map_.keys(), values)}    

op2id = {k:v for k,v in zip(Op._value2member_map_.keys(), range(len(Op._value2member_map_)))}
const2id = {k:v for k,v in zip(Const._value2member_map_.keys(), range(len(Const._value2member_map_)))}

## Loading the data

In [168]:
data = {name:pd.read_csv(f'{DATA_PATH}{name}.csv') for name in SET_NAMES}

## Preprocessing

This function reads the stored expression tree dictionary representations back into anytree obects. Print tree is for debugging.

In [169]:
# Outputs a tree given its root in human readable format
def print_tree(tree):
    for pre, fill, node in RenderTree(tree):
        print("%s%s" % (pre, node.name)) 

# Imports all trees from the data
def import_trees(name):
    importer = DictImporter()
    return np.array([importer.import_(eval(tree)) for tree in data[name]['tree']])

trees = {name:import_trees(name) for name in SET_NAMES}

In [170]:
data['train'] = data['train'].drop(columns=['incremental_no_const'])
data['test'] = data['test'].drop(columns=['incremental_no_const'])
data['validation'] = data['validation'].drop(columns=['incremental_no_const'])

Here the labels are created for training. Labels are created in a step wise approach, where each step includes all independent expressions.

In [172]:
def list_str(arr):
    result = f"['{arr[0]}'"
    for x in arr[1:]:
        if x:
            result += f", '{x}'"
        else:
            result += f', {x}'
    result += ']'
    return result

def create_label(root):
    # making sure source tree is not overwritten
    root = deepcopy(root)
    
    results = []
    problem_idx = []
    
    idx = 1
    
    while root and root.children:
        #print_tree(root)
        ret = create_label_helper(root, idx)
        
        # if result is none, K was exceeded
        if ret is None:
            return None
        else:
            idx, result = ret

        idx = idx
        results.append(list_str(result))

    if len(results) > MAX_LAYERS:
        return None
    else:
        return ' ; '.join(results)

def create_label_helper(root, idx):    
    if not root:
        return None
    
    # final equation results
    results = {}
    
    # stack for traversing tree
    stack = []
    stack.append(root) 
    
    while len(stack) > 0:
        curr = stack.pop()
        
        # found an equation pair
        if len(curr.children) == 2 and not curr.children[0].children and not curr.children[1].children:
            eq = f'{curr.children[0].name} {curr.name} {curr.children[1].name}'
            reverse = f'{curr.children[1].name} {curr.name} {curr.children[0].name}'
            
            # adding equation (if addition or multiplication, order does not matter)
            if curr.name in [Op.ADD.value, Op.MULT.value]:
                if eq not in results and reverse not in results:
                    results[eq] = idx
                    idx += 1                    
            elif eq not in results:
                results[eq] = idx
                idx += 1

            # marking as a variable
            if eq in results:
                curr.name = f'x{results[eq]}'
            else:
                curr.name = f'x{results[reverse]}'
            curr.children = ()       
            
        if curr.children:
            stack.append(curr.children[0])
            stack.append(curr.children[1])
    
    results = list(results.keys())
    
    # If the number of independent formulas is greater than K, we have to throw out the label
    if len(results) > K:
        return None
    
    # Padding to length K
    results.extend([None]*(K-len(results)))
    
    return idx, np.array(results)

def create_all_labels(name):
    labels = []
    problem_idx = []
    idx = 0
    for tree in trees[name]:
        label = create_label(tree)
        if label is not None:
            problem_idx.append(idx)
            labels.append(label)
        idx += 1
    
    return labels, problem_idx

In [176]:
# A problem idx is returned so we known if/what problems had to be dropped
def update_date(name):
    labels, problem_idx = create_all_labels(name)
    data[name]['incremental'] = pd.NA
    data[name]['incremental'].iloc[problem_idx] = labels
with warnings.catch_warnings():
    update_date('train')
    update_date('test')
    update_date('validation')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[name]['incremental'].iloc[problem_idx] = labels
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[name]['incremental'].iloc[problem_idx] = labels
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[name]['incremental'].iloc[problem_idx] = labels


In [182]:
data['train'] = data['train'].dropna()
data['test'] = data['test'].dropna()
data['validation'] = data['validation'].dropna()

## Saving new results

In [183]:
for name in SET_NAMES:
    data[name].to_csv(DATA_PATH+f'{name}.csv', index=False)