# MathQA Expression AutoEncoder

#### Imports

In [1]:
from enum import Enum
import os
import anytree
from anytree import RenderTree
from anytree.importer import DictImporter
import pandas as pd
from itertools import permutations
import seaborn as sns
import math
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForMaskedLM, DataCollatorForLanguageModeling
from sklearn.metrics import f1_score, accuracy_score
from datasets import Dataset
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
from sklearn.utils.class_weight import compute_class_weight
import re
import pickle
from copy import deepcopy

#### Constants

In [2]:
# Notebook-wide configuration.
# NOTE(review): K, MAX_LAYERS and MAX_TOKENS are not used in the cells visible
# here; their exact meaning should be confirmed where they are consumed.
K = 6
MAX_LAYERS = 8
MAX_TOKENS = 392
# 768 matches the last dimension of embeddings['nums'] printed further down.
EMBEDDING_SIZE = 768

# Folder holding '<split>.csv' for every entry of SET_NAMES.
DATA_PATH = './dataset/'
SET_NAMES = ['train', 'validation', 'test']
ENCODER_MODEL = 'distilroberta-base' # A more optimized version of roberta obtaining 95% of its performance
DEVICE = 'cuda:0'
# presumably the placeholder token substituted for numbers in the text — TODO confirm
NUM_MASK = '<num>'
WORKING_DIR = 'TEMP/'

# Root folder of the pickled objects (the embedding batches are read from here).
OBJ_DIR = 'pickle/'


class Op(Enum):
    """Binary operators of an expression step; the value is the operator symbol."""
    ADD = '+'
    SUB = '-'
    MULT = '*'
    DIV = '/'
    POW = '^'


class Const(Enum):
    """Named constant tokens; the value is the token string (e.g. 'const_100')."""
    CONST_NEG_1 = 'const_neg_1' # I added this
    CONST_0_25 = 'const_0_25'
    CONST_0_2778 = 'const_0_2778'
    CONST_0_33 = 'const_0_33'
    CONST_0_3937 = 'const_0_3937'
    CONST_1 = 'const_1'
    CONST_1_6 = 'const_1_6'
    CONST_2 = 'const_2'
    CONST_3 = 'const_3'
    CONST_PI = 'const_pi'
    CONST_3_6 = 'const_3_6'
    CONST_4 = 'const_4'
    CONST_5 = 'const_5'
    CONST_6 = 'const_6'
    CONST_10 = 'const_10'
    CONST_12 = 'const_12'
    CONST_26 = 'const_26'
    CONST_52 = 'const_52'
    CONST_60 = 'const_60'
    CONST_100 = 'const_100'
    CONST_180 = 'const_180'
    CONST_360 = 'const_360'
    CONST_1000 = 'const_1000'
    CONST_3600 = 'const_3600'


# Numeric value of every Const member, in the same order as the members are
# declared in the class body.
values = [-1, 0.25, 0.2778, 0.33, 0.3937, 1, 1.6, 2, 3, math.pi, 3.6, 4, 5, 6, 10, 12, 26, 52, 60, 100, 180, 360, 1000, 3600]
if len(values) != len(Const):
    # zip() would silently drop the unmatched tail and leave a short table.
    raise ValueError(
        f'values has {len(values)} entries but Const declares {len(Const)} members'
    )

# Constant token string -> numeric value. Iterating an Enum yields its members
# in definition order, so no private Enum attribute is needed.
const2val = {member.value: number for member, number in zip(Const, values)}

# Operator symbol -> integer id (definition order). The extra 'None' label
# takes the first id after the real operators.
op2id = {member.value: op_id for op_id, member in enumerate(Op)}
op2id['None'] = len(Op)

# Constant token string -> integer id (definition order).
const2id = {member.value: const_id for const_id, member in enumerate(Const)}

## Loading the data

In [3]:
# Read '<DATA_PATH><split>.csv' for every split in SET_NAMES; the resulting
# DataFrames are keyed by split name.
data = {name:pd.read_csv(f'{DATA_PATH}{name}.csv') for name in SET_NAMES}

## Getting Labels

In [135]:
# Load the first pickled batch of training embeddings. Later cells read the
# keys 'num_literals', 'num_idx' and 'nums' from it.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# use it on files produced locally by a trusted run.
with open(f'{OBJ_DIR}embeddings/train/batch0.pickle', 'rb') as f:
    embeddings = pickle.load(f)

In [54]:
def get_exp(name):
    """Parse the 'incremental' column of split *name* into expression steps.

    Every cell is a string of Python list literals joined by ' ; '. The
    position of a list inside the cell is attached to each of its non-None
    entries. An entry is a whitespace-separated triple whose first two tokens
    are swapped so the operator comes first: 'a - b' -> ('-', 'a', 'b').

    Returns a Series with one element per row, each of the form
    ``[[(op, num1, num2), group_index], ...]``.
    """
    def parse_cell(cell):
        steps = []
        for group_index, group_src in enumerate(cell.split(' ; ')):
            # NOTE(review): eval() executes whatever text the CSV contains;
            # acceptable only while the dataset files are trusted.
            for entry in eval(group_src):
                if entry is None:
                    continue
                tokens = entry.split()
                steps.append([(tokens[1], tokens[0], tokens[2]), group_index])
        return steps

    column = data[name]['incremental']
    return column.map(parse_cell)
#     return np.concatenate(data[name]['incremental'].map(convert_to_arr))

#exp = np.concatenate([get_exp(name) for name in SET_NAMES])
# Only the training split is parsed: exp[i] is the list of
# [(op, num1, num2), group_index] steps of row i.
# NOTE(review): cells further down use exp[:, 1] and exp.shape, which this
# per-row form does not support — presumably they were run against the
# concatenated variant kept in the comment above; confirm before re-running.
exp = get_exp('train')

In [173]:
# Inverse of const2id: integer id -> constant token string.
id2const = {v:k for k, v in const2id.items()}

def process_num(num, literals, idx, prob, prev):
    """Convert one operand token of an expression step into its label.

    Three kinds of token are handled:

    * a key of ``const2val``: the label is the position of that token among
      the literals of problem ``prob`` (``literals[idx == prob]``);
    * a token containing 'x' (a back-reference such as 'x2'): the label is
      the pair ``(prev[0][k], prev[1][k])`` with ``k = int(num[1:]) - 1``;
    * anything else: the position of ``str(float(num))`` among the same
      literals.

    Only the first matching position is returned; IndexError is raised when
    the token does not occur among the problem's literals.
    """
    is_const = num in const2val

    # Back-references never touch the literal table, so handle them first.
    if not is_const and 'x' in num:
        step = int(num[1:]) - 1
        return (prev[0][step], prev[1][step])

    # Constants are looked up verbatim, plain numbers in their float spelling.
    wanted = num if is_const else str(float(num))
    problem_literals = literals[idx == prob]
    hits = np.where(problem_literals == wanted)[0]
    return hits[0]

# Build operand / operator labels for the first training problem
# (problem index 0, steps taken from exp[0]).
num_literals = embeddings['num_literals']
num_idx = np.array(embeddings['num_idx'])
layer_idx = []
num1_labels = []
num2_labels = []
op_labels = []
for (op, num1, num2), layer in exp[0]:
    # The label lists are passed as `prev` while they are still being filled,
    # so an 'x<k>' operand resolves to the labels already stored for step k.
    # num1's label is appended before num2 is processed; keep this order.
    num1_labels.append(process_num(num1, num_literals, num_idx, 0, (num1_labels, num2_labels)))
    num2_labels.append(process_num(num2, num_literals, num_idx, 0, (num1_labels, num2_labels)))
    op_labels.append(op2id[op])
    layer_idx.append(layer)
    # Debug trace of the lists after every step.
    print(num1_labels)
    print(num2_labels)
    print(op_labels)
    print()

[0]
[1]
[2]

[0, 2]
[1, 5]
[2, 2]

[0, 2, (2, 5)]
[1, 5, (0, 1)]
[2, 2, 3]

[0, 2, (2, 5), 5]
[1, 5, (0, 1), ((2, 5), (0, 1))]
[2, 2, 3, 2]

[0, 2, (2, 5), 5, (5, ((2, 5), (0, 1)))]
[1, 5, (0, 1), ((2, 5), (0, 1)), (0, 1)]
[2, 2, 3, 2, 3]



In [172]:
num_literals[idx==0]

array(['3.0', '10.0', '36.0', 'const_1', 'const_2', 'const_100'],
      dtype='<U32')

In [166]:
literals[idx==0]

array(['3.0', '10.0', '36.0', 'const_1', 'const_1', 'const_1'],
      dtype='<U32')

In [137]:
embeddings

{'problem': tensor([[[ 1.5479e-01,  2.2711e-01,  1.1456e-03,  ..., -1.3655e-01,
            6.2029e-02,  1.7538e-01],
          [ 7.7430e-02, -3.4275e-01,  5.2688e-01,  ..., -3.3167e+00,
            3.7329e-01,  1.8739e+00],
          [ 4.1332e-01,  3.5344e+00, -5.5767e-01,  ..., -4.5617e+00,
           -5.1729e-01,  1.5321e+00],
          ...,
          [ 4.0086e-01, -1.4287e+00,  1.9589e-01,  ...,  5.7325e-01,
            5.6364e-01, -8.0716e-01],
          [ 4.0086e-01, -1.4287e+00,  1.9589e-01,  ...,  5.7325e-01,
            5.6364e-01, -8.0716e-01],
          [ 4.0086e-01, -1.4287e+00,  1.9589e-01,  ...,  5.7325e-01,
            5.6364e-01, -8.0716e-01]],
 
         [[ 1.0875e-01,  2.0369e-01,  1.7802e-02,  ..., -2.2420e-01,
            1.3723e-01,  3.5693e-02],
          [ 1.4829e+00,  5.5949e-01,  4.7207e-01,  ..., -1.8843e+00,
            3.7704e-01,  4.7303e-01],
          [ 1.1472e+00,  4.3673e+00,  1.0028e-01,  ..., -4.0872e+00,
           -2.9929e-01,  3.5677e-01],
        

In [138]:
embeddings['nums'].shape

torch.Size([56, 768])

In [49]:
exp[0]

[[('-', 'const_100', '5'), 0],
 [('+', 'const_100', '31.1'), 0],
 [('*', 'x2', 'const_100'), 1],
 [('/', 'x3', 'x1'), 2],
 [('-', 'x4', 'const_100'), 3]]

In [108]:
def get_nums(name):
    """Return the distinct entries of the 'nums' column of split *name* that are not keys of const2val."""
    found = set()
    for cell in data[name]['nums']:
        # NOTE(review): eval() executes whatever text the CSV contains;
        # acceptable only while the dataset files are trusted.
        for token in eval(cell):
            if token not in const2val:
                found.add(token)
    return found


# Union of the non-constant entries over all splits, plus the constant tokens.
all_nums = set.union(*map(get_nums, SET_NAMES))
all_const = const2val.keys()

In [109]:
all_const

dict_keys(['const_neg_1', 'const_0_25', 'const_0_2778', 'const_0_33', 'const_0_3937', 'const_1', 'const_1_6', 'const_2', 'const_3', 'const_pi', 'const_3_6', 'const_4', 'const_5', 'const_6', 'const_10', 'const_12', 'const_26', 'const_52', 'const_60', 'const_100', 'const_180', 'const_360', 'const_1000', 'const_3600'])

In [100]:
# Count, over both operand columns, the tokens that are named constants and
# the tokens that contain the substring 'x15'.
# NOTE(review): exp[:, 1] / exp[:, 2] require a 2-D array, whereas
# `exp = get_exp('train')` yields one list per row — presumably this cell was
# run against an earlier, concatenated form of exp; confirm before re-running.
# NOTE(review): 'x15' only matches references containing that exact text
# (14 hits in the recorded output); the ~0.41 ratio recorded below looks like
# it came from a broader test such as 'x' — confirm which one is intended.
num_const = 0
num_nested = 0
for num in np.concatenate((exp[:,1], exp[:,2])):
    if num in const2val:
        num_const+=1
    if 'x15' in num:
        num_nested+=1

In [101]:
num_nested

14

In [80]:
num_const/(exp.shape[0]*2)

0.19562350614027857

In [85]:
num_nested/(exp.shape[0]*2)

0.41089899447787026

- 20% of the time it's a constant
- 40% of the time it's an expression
- 40% of the time it's a number