# MathQA Expression AutoEncoder

#### Imports

In [1]:
from enum import Enum
import os
import anytree
from anytree import RenderTree
from anytree.importer import DictImporter
import pandas as pd
from itertools import permutations
import seaborn as sns
import math
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from sklearn.utils.class_weight import compute_class_weight
import re
import pickle
from copy import deepcopy

#### Constants

In [3]:
# Model / size hyper-parameters.
# NOTE(review): the exact role of K and MAX_LAYERS is not visible in this
# part of the notebook - confirm against the cells that use them.
K = 6
MAX_LAYERS = 8
MAX_TOKENS = 392
# presumably the encoder's hidden size - TODO confirm against ENCODER_MODEL
EMBEDDING_SIZE = 768

# Directory containing one '<name>.csv' file per entry of SET_NAMES.
DATA_PATH = './dataset/'
SET_NAMES = ['train', 'validation', 'test']
# A more optimized version of RoBERTa obtaining 95% of its performance
# (author's note).
ENCODER_MODEL = 'distilroberta-base'
# NOTE(review): hard-coded to the first CUDA device; no CPU fallback here.
DEVICE = 'cuda:0'
# Token substituted for every number found in a problem text.
NUM_MASK = '<num>'
WORKING_DIR = 'TEMP/'

OBJ_DIR = 'pickle/'


class Op(Enum):
    """Binary operators; each member's value is the operator symbol."""
    ADD = '+'
    SUB = '-'
    MULT = '*'
    DIV = '/'
    POW = '^'


class Const(Enum):
    """Named numeric constants; each member's value is the constant's name.

    The declaration order matters: ``values`` below lists the numeric value
    of each constant in exactly this order.
    """
    CONST_NEG_1 = 'const_neg_1'  # added by the notebook author
    CONST_0_25 = 'const_0_25'
    CONST_0_2778 = 'const_0_2778'
    CONST_0_33 = 'const_0_33'
    CONST_0_3937 = 'const_0_3937'
    CONST_1 = 'const_1'
    CONST_1_6 = 'const_1_6'
    CONST_2 = 'const_2'
    CONST_3 = 'const_3'
    CONST_PI = 'const_pi'
    CONST_3_6 = 'const_3_6'
    CONST_4 = 'const_4'
    CONST_5 = 'const_5'
    CONST_6 = 'const_6'
    CONST_10 = 'const_10'
    CONST_12 = 'const_12'
    CONST_26 = 'const_26'
    CONST_52 = 'const_52'
    CONST_60 = 'const_60'
    CONST_100 = 'const_100'
    CONST_180 = 'const_180'
    CONST_360 = 'const_360'
    CONST_1000 = 'const_1000'
    CONST_3600 = 'const_3600'


# Numeric value of every constant, in the declaration order of ``Const``.
values = [-1, 0.25, 0.2778, 0.33, 0.3937, 1, 1.6, 2, 3, math.pi, 3.6, 4, 5, 6, 10, 12, 26, 52, 60, 100, 180, 360, 1000, 3600]

# Iterate the enums through their public API instead of the private
# ``_value2member_map_`` attribute; the resulting order is the same.
_const_names = [member.value for member in Const]

# zip() would silently drop entries if the two lists ever drifted apart.
if len(values) != len(_const_names):
    raise ValueError(
        f'values has {len(values)} entries but Const has {len(_const_names)} members'
    )

# Constant name -> numeric value.
const2val = dict(zip(_const_names, values))

# Operator symbol -> integer id; 'None' takes the first id after the real operators.
op2id = {member.value: index for index, member in enumerate(Op)}
op2id['None'] = len(Op)

# Constant name -> integer id (declaration order).
const2id = {name: index for index, name in enumerate(_const_names)}

## Loading the data

In [4]:
# Read each split's CSV from DATA_PATH into a DataFrame, keyed by split name.
data = dict(
    (split, pd.read_csv(f'{DATA_PATH}{split}.csv'))
    for split in SET_NAMES
)

## Getting Labels

In [111]:
def get_nums_from_problem(data, convert_to_float=False, mask=None):
    """Extract the numbers of every problem text and replace them with a mask.

    Each problem is processed in two passes: comma-grouped numbers such as
    ``1,250.50`` are found and masked first, then plain integers/decimals
    (with an optional sign) are found in the already-masked text.

    Args:
        data: object whose ``problem`` attribute iterates over the problem
            strings (for example a DataFrame with a ``problem`` column).
        convert_to_float: when true, the numbers are returned as floats with
            the thousands separators removed; otherwise the matched
            substrings are returned unchanged.
        mask: token written in place of each number. Defaults to the
            module-level ``NUM_MASK``. It must not contain digits, otherwise
            the second pass would match inside it.

    Returns:
        A ``(nums, problems)`` pair of lists with one entry per problem:
        the numbers in order of occurrence, and the masked problem text.
    """
    if mask is None:
        mask = NUM_MASK

    # Compiled once, outside the loop; raw strings avoid invalid-escape warnings.
    num = re.compile(r'([+-]?((\d+(\.\d*)?)|(\.\d+)))')
    big = re.compile(r'(-?\d{1,3}(,\d{3})+(\.\d*)?)')

    def insert_mask(_match):
        # A callable replacement inserts the mask literally (no escape handling).
        return mask

    nums = []
    problems = []
    for problem in data.problem:
        # (position, text) pairs. Every position is expressed in the
        # coordinates of the text *after* the comma-grouped numbers were
        # masked, so that both passes can be sorted together. Mixing offsets
        # of the original and of the masked text misorders the numbers
        # whenever masking changes the text length.
        found = []
        shift = 0
        for match in big.finditer(problem):
            found.append((match.start(0) - shift, match.group(0)))
            shift += len(match.group(0)) - len(mask)
        problem = big.sub(insert_mask, problem)

        found.extend((match.start(0), match.group(0)) for match in num.finditer(problem))
        problem = num.sub(insert_mask, problem)

        # Getting the combined numbers in order of occurrence
        found.sort(key=lambda item: item[0])

        if convert_to_float:
            combined = [float(text.replace(',', '')) for _, text in found]
        else:
            combined = [text for _, text in found]

        nums.append(combined)
        problems.append(problem)
    return nums, problems
# Numbers (as floats) found in each training problem; the masked texts are discarded.
obtained, _ = get_nums_from_problem(data=data['train'], convert_to_float=True)

In [119]:
def non_homogeneous_split(arr, num_per_batch):
    """Cut ``arr`` into consecutive slices of at most ``num_per_batch`` items.

    The elements of ``arr`` may have different lengths; only the outer
    sequence is sliced. The last batch holds the remainder and may be shorter.
    """
    batches = []
    for start in range(0, len(arr), num_per_batch):
        stop = start + num_per_batch
        batches.append(arr[start:stop])
    return batches

# Display the extracted training numbers grouped into batches of 8 problems.
non_homogeneous_split(obtained, num_per_batch=8)

[[[3.0, 10.0, 36.0],
  [2.0, 3.0, 90.0],
  [120.0, 50.0],
  [10.0, 20.0],
  [218.0, 12.0, 16.0, 30.0],
  [6.0, 8.0, 3.0, 4.0],
  [25.0, 20.0],
  [4.0, 5.0, 4.0, 5.0, 36.0]],
 [[2.0, 30.0, 6.0, 40.0, 8.0, 10.0],
  [324.0, 20.0, 25.0, 70.0, 20.0],
  [6.0, 5.0, 2.0, 19.0],
  [6.0, 8.25, 0.1],
  [57.0, 43.0],
  [2008.0, 8.0, 2009.0, 20.0, 15.0, 2009.0, 2008.0],
  [2000.0, 54.0, 15.0, 20.0],
  [30.0, 10.0]],
 [[263.0, 935.0, 1383.0, 7.0],
  [900.0, 63.0, 3.0],
  [6.0, 38.0, 4.0],
  [96.0],
  [27.0, 75.0, 14.0],
  [150.0, 45.0, 10.0, 25.0],
  [4.0, 8.0, 306.0],
  [50.0, 144.0]],
 [[40.0, 30.0, 3.0],
  [65.0, 6.0],
  [13.0, 7.0, 8.0],
  [4.0, 5.0, 11.0, 7.0, 2.0, 2.0],
  [40.0, 14.0, 3.0, 4.0],
  [5.0, 8.0],
  [1.0, 2.0, 3.0, 6000.0],
  [3.0, 5.0, 50.0]],
 [[60.0, 30.0, 20.0, 40.0],
  [20.0, 30.0],
  [10.0, 2.0, 5.0, 10.0],
  [0.0, 1.0, 20.0, 3.0, 10.0],
  [1992.0, 20.0, 1991.0, 1993.0, 5.0, 1992.0, 1993.0, 1991.0],
  [10.0, 20.0],
  [2.0, 12.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 36.0],
  [10.0,

In [53]:
def get_exp(name):
    """Collect the reordered token triples of one data split.

    Every cell of ``data[name]['incremental']`` is split on ``' ; '``; each
    piece is evaluated as a Python expression and iterated. Every item that
    is not ``None`` is split on whitespace and stored as
    ``(second token, first token, third token)``.

    Args:
        name: key of the split in the module-level ``data`` mapping.

    Returns:
        The per-row lists of triples joined with ``np.concatenate``.
    """
    def parse_cell(cell):
        triples = []
        for piece in cell.split(' ; '):
            # NOTE(review): eval() executes whatever the CSV contains. Only
            # run this on trusted data, or switch to ast.literal_eval once it
            # is confirmed that the cells are plain literals.
            for step in eval(piece):
                if step is None:
                    continue
                tokens = tuple(step.split())
                triples.append((tokens[1], tokens[0], tokens[2]))
        return triples

    per_row = data[name]['incremental'].map(parse_cell)
    return np.concatenate(per_row)

# Stack the triples of every split into a single array.
exp = np.concatenate(list(map(get_exp, SET_NAMES)))

In [55]:
# Display the shape of the array combined from all splits.
exp.shape

(97064, 3)

In [108]:
def get_nums(name):
    """Return the distinct entries of ``data[name]['nums']`` that are not constant names."""
    found = set()
    for cell in data[name]['nums']:
        # NOTE(review): eval() executes whatever the CSV contains. Only run
        # this on trusted data.
        for item in eval(cell):
            if item not in const2val:
                found.add(item)
    return found


# Union of the non-constant entries over every split.
all_nums = set.union(*map(get_nums, SET_NAMES))
# View over the constant names (keys of const2val).
all_const = const2val.keys()

In [109]:
# Display the constant names (the keys of const2val).
all_const

dict_keys(['const_neg_1', 'const_0_25', 'const_0_2778', 'const_0_33', 'const_0_3937', 'const_1', 'const_1_6', 'const_2', 'const_3', 'const_pi', 'const_3_6', 'const_4', 'const_5', 'const_6', 'const_10', 'const_12', 'const_26', 'const_52', 'const_60', 'const_100', 'const_180', 'const_360', 'const_1000', 'const_3600'])

In [100]:
# Count, over columns 1 and 2 of exp taken together, how many entries are
# constant names and how many contain the substring 'x15'.
num_const = num_nested = 0
for num in np.concatenate([exp[:, 1], exp[:, 2]]):
    # A bool adds as 0 or 1, so each test increments its counter on a hit.
    num_const += num in const2val
    num_nested += 'x15' in num

In [101]:
# Display how many entries of columns 1 and 2 of exp contain the substring 'x15'.
num_nested

14

In [80]:
# Fraction of entries (two counted columns per row of exp) that are constant names.
num_const / (2 * len(exp))

0.19562350614027857

In [85]:
# Fraction of entries (two counted columns per row of exp) counted in num_nested.
# NOTE(review): the recorded output (0.41...) cannot come from num_nested == 14
# with 97064 rows; this cell was probably executed with a different filter
# than the 'x15' test shown in the counting cell. Re-run the cells in order
# to confirm.
num_nested / (2 * len(exp))

0.41089899447787026

- About 20% of the time an operand is a constant
- About 40% of the time an operand is the result of a previous expression
- About 40% of the time an operand is a number from the problem text