In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
import torch
import torch.optim as optim
import os
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torchvision.transforms as transforms
import torch.nn.functional as F
import torchtext
import json
from transformers import BertTokenizer, BertModel
# --- Run configuration -------------------------------------------------------
# Use the GPU whenever torch can see one.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# A top-level /kaggle directory selects the Kaggle input/output locations;
# otherwise everything is resolved relative to the working directory.
if os.path.exists('/kaggle'):
    dir, out_dir = '/kaggle/input/dataset/data/', '/kaggle/working/'
else:
    dir, out_dir = './data/', './'
local_dir = './'  # where previously saved checkpoints are looked up

batch_size, num_epochs = 32, 60

# Index of the model to train/evaluate (position in the `models` list built later).
model_to_train = 0
# False: train, then evaluate. True: evaluate the model restored from a checkpoint.
load = False

In [None]:
# Load data from test.json, train.json, dev.json using DataLoader

class seqDataset(Dataset):
    """Word-problem dataset read from ``<path><mode>.json``.

    The JSON file must hold a list of records with the keys ``Problem``,
    ``linear_formula`` and ``answer``. After loading:

    * ``Problem[i]`` is the whitespace-tokenised problem text followed by
      ``'<eos>'``, with every token that parses as a float replaced by
      ``n0, n1, ...`` in order of appearance;
    * ``linear_formula[i]`` is the formula split on spaces, parentheses,
      commas and ``|``, followed by ``'<eos>'``;
    * ``answer[i]`` is the raw ``answer`` field.

    Args:
        path: directory prefix, concatenated as-is with ``mode`` (so it must
            end with a path separator).
        mode: split name, e.g. ``'train'``, ``'dev'`` or ``'test'``.
        in_vocab: accepted for interface compatibility; not used.
        out_vocab: accepted for interface compatibility; not used.
    """

    # Every formula delimiter is mapped to a space so a single split() suffices.
    _FORMULA_SEPARATORS = str.maketrans('(),|', '    ')

    def __init__(self, path, mode, in_vocab = None, out_vocab = None):
        self.path = path
        self.mode = mode
        self.Problem = []
        self.linear_formula = []
        self.answer = []
        self.load_data()
        self.transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

    def load_data(self):
        """Read the JSON file and fill ``Problem``, ``linear_formula`` and ``answer``."""
        with open(self.path + self.mode + '.json', 'r') as f:
            data = json.load(f)

        for record in data:
            tokens = (record['Problem'] + ' <eos>').split()
            n = 0
            for j, token in enumerate(tokens):
                try:
                    float(token)
                except ValueError:
                    # Not a number: leave the word untouched.
                    continue
                # NOTE(review): float() also accepts 'nan', 'inf' and
                # 'infinity', so those words are numbered too - confirm.
                tokens[j] = 'n' + str(n)
                n += 1
            self.Problem.append(tokens)

            formula = record['linear_formula'] + '|<eos>'
            self.linear_formula.append(formula.translate(self._FORMULA_SEPARATORS).split())
            self.answer.append(record['answer'])

    def __len__(self):
        return len(self.Problem)

    def __getitem__(self, idx):
        return (self.Problem[idx]), (self.linear_formula[idx]), self.answer[idx]

    def print(self, idx):
        """Print problem, formula tokens (joined by ``|``) and answer of sample ``idx``."""
        print(' '.join(self.Problem[idx]), '|'.join(self.linear_formula[idx]), self.answer[idx], sep='\n')

def collate_fn(batch):
    """Collate ``(problem, formula, answer)`` samples into one padded batch.

    Problems and formulas are index tensors of varying length; each group is
    right-padded with 0 to the longest member and stacked batch-first. The
    answers are returned untouched as a tuple.
    """
    def _pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    problems, formulas, answers = zip(*batch)
    return _pad(problems), _pad(formulas), answers

# Build one dataset per split; each constructor reads `<dir><split>.json` right away.
train_dataset = seqDataset(dir, 'train')
test_dataset = seqDataset(dir, 'test')
dev_dataset = seqDataset(dir, 'dev')

In [None]:
# Tokenizer loaded from the 'bert-base-uncased' checkpoint; bert_collate uses it to encode problems.
bert_vocab = BertTokenizer.from_pretrained('bert-base-uncased')
# print(bert_vocab.vocab['[PAD]'])
# something = [' '.join(sentence[:-1]) for sentence in train_dataset.Problem[:32]]
# something_else = bert_vocab(something, padding=True, return_tensors='pt')
# print(something)
# print(something_else['input_ids'])
# print(something_else['attention_mask'])

def bert_collate(batch):
    """Collate samples for the BERT-based models.

    Problems arrive as token lists. The last token of each (the ``'<eos>'``
    marker appended when the dataset was loaded) is dropped, the rest is
    re-joined with spaces and the whole batch is encoded by ``bert_vocab``
    with padding, returning PyTorch tensors. Formulas are right-padded with
    0, batch-first. Answers are passed through as a tuple.
    """
    problems, formulas, answers = zip(*batch)

    texts = []
    for tokens in problems:
        texts.append(' '.join(tokens[:-1]))

    encoded = bert_vocab(texts, padding=True, return_tensors='pt')
    padded_formulas = pad_sequence(formulas, batch_first=True, padding_value=0)
    return encoded, padded_formulas, answers

In [None]:
# Width of the token embeddings (GloVe input vectors and decoder embeddings).
embedding_dim = 300
# Hidden-state size shared by the encoder and decoder modules.
hidden_dim = 256

In [None]:
# Pre-trained GloVe vectors (the '6B' set) at `embedding_dim` dimensions.
GloVe = torchtext.vocab.GloVe(name='6B', dim=embedding_dim)

In [None]:
def make_glove_embedding(model, model_vocab, train_sentences):
    """Build the input vocabulary and a matching embedding matrix.

    Args:
        model: object exposing ``vectors``, a 2-D tensor of pre-trained
            word vectors (one row per word).
        model_vocab: mapping from word to its row index in ``model.vectors``.
        train_sentences: iterable of token sequences used to collect the
            vocabulary.

    Returns:
        ``(embedding, vocab)`` where ``vocab`` maps each token to an id
        (ids 0-3 are ``<pad>``, ``<start>``, ``<eos>``, ``<unknown>``; other
        tokens are numbered in order of first appearance) and ``embedding``
        is a float tensor of shape ``(len(vocab), vector_width)``. Rows of
        tokens found in ``model_vocab`` hold the pre-trained vector; all
        other rows are drawn uniformly from ``[-0.25, 0.25)``.
    """
    vocab = {'<pad>': 0, '<start>': 1, '<eos>': 2, '<unknown>': 3}
    for sentence in train_sentences:
        for word in sentence:
            # len(vocab) is evaluated before insertion, so it is the next free id.
            vocab.setdefault(word, len(vocab))

    # Take the width from the vectors themselves instead of a module-level
    # constant, so the function works for any pre-trained vector size.
    vector_width = model.vectors.size(1)
    embedding = torch.empty(len(vocab), vector_width, dtype=torch.float32).uniform_(-0.25, 0.25)

    for word, idx in vocab.items():
        if word in model_vocab:
            embedding[idx] = model.vectors[model_vocab[word]]

    return embedding, vocab

In [None]:
def make_output_vocab(train_out_sentences):
    """Collect the output (formula) vocabulary.

    Returns ``(vocab, rev_vocab)``: token -> id and id -> token. Ids 0-3 are
    reserved for ``<pad>``, ``<start>``, ``<eos>`` and ``<unknown>``; every
    other token gets the next free id in order of first appearance.
    """
    rev_vocab = {0: '<pad>', 1: '<start>', 2: '<eos>', 3: '<unknown>'}
    vocab = {token: idx for idx, token in rev_vocab.items()}

    for sentence in train_out_sentences:
        for word in sentence:
            if word in vocab:
                continue
            next_id = len(vocab)
            vocab[word] = next_id
            rev_vocab[next_id] = word

    return vocab, rev_vocab

In [None]:
def modify_dataset(vocab, data):
    """Replace every token sequence in ``data`` by a tensor of vocabulary ids.

    The list is rewritten in place and also returned. Tokens missing from
    ``vocab`` map to ``vocab['<unknown>']``.

    Entries that are already tensors are left untouched. Without this guard
    a second call (e.g. re-running the notebook cell) would iterate over
    0-d tensors, none of which match a string key, and silently turn the
    whole dataset into ``<unknown>`` ids.
    """
    for i, sentence in enumerate(data):
        if isinstance(sentence, torch.Tensor):
            continue
        ids = [vocab[word] if word in vocab else vocab['<unknown>'] for word in sentence]
        # Explicit dtype keeps an empty sequence an integer tensor too.
        data[i] = torch.tensor(ids, dtype=torch.long)
    return data

In [None]:
# The input vocabulary and embedding matrix are built from the training problems only,
# and the output vocabulary from the training formulas only.
embedding, vocab = make_glove_embedding(GloVe, GloVe.stoi, train_dataset.Problem)
out_vocab, rev_out_vocab = make_output_vocab(train_dataset.linear_formula)

In [None]:
# Default batching function; replaced by bert_collate when a BERT model (2 or 3) is selected.
collator = collate_fn

In [None]:
# Models 0 and 1: convert both the problems and the formulas of every split
# into index tensors. modify_dataset rewrites the lists in place.
if(model_to_train == 0 or model_to_train == 1):
    train_dataset.Problem = modify_dataset(vocab, train_dataset.Problem)
    test_dataset.Problem = modify_dataset(vocab, test_dataset.Problem)
    dev_dataset.Problem = modify_dataset(vocab, dev_dataset.Problem)

    train_dataset.linear_formula = modify_dataset(out_vocab, train_dataset.linear_formula)
    test_dataset.linear_formula = modify_dataset(out_vocab, test_dataset.linear_formula)
    dev_dataset.linear_formula = modify_dataset(out_vocab, dev_dataset.linear_formula)

In [None]:
# Models 2 and 3: problems stay as token lists (bert_collate tokenises them per batch);
# only the formulas are converted to index tensors.
# bert_vocab = BertTokenizer.from_pretrained('bert-base-uncased')
if(model_to_train == 2 or model_to_train==3):
    print('Loading BERT dataset')
    # train_dataset.Problem = [torch.tensor(bert_vocab.encode(' '.join(sentence))) for sentence in train_dataset.Problem]
    # test_dataset.Problem = [torch.tensor(bert_vocab.encode(' '.join(sentence))) for sentence in test_dataset.Problem]
    # dev_dataset.Problem = [torch.tensor(bert_vocab.encode(' '.join(sentence))) for sentence in dev_dataset.Problem]

    train_dataset.linear_formula = modify_dataset(out_vocab, train_dataset.linear_formula)
    test_dataset.linear_formula = modify_dataset(out_vocab, test_dataset.linear_formula)
    dev_dataset.linear_formula = modify_dataset(out_vocab, dev_dataset.linear_formula)

    # From here on `vocab` is the BERT vocabulary, not the GloVe-derived one.
    vocab = bert_vocab.vocab
    collator = bert_collate

In [None]:
# Shuffled mini-batch loaders (batch_size samples per batch).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collator)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, collate_fn=collator)

# Single-sample loaders in dataset order: used for per-epoch loss tracking and
# for beam-search evaluation, which aligns predictions with the JSON records by position.
train_acc_loader = DataLoader(train_dataset, batch_size=1, shuffle=False, collate_fn=collator)
test_acc_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collator)
dev_acc_loader = DataLoader(dev_dataset, batch_size=1, shuffle=False, collate_fn=collator)

In [None]:
class beamdata:
    """One beam-search hypothesis.

    Attributes are stored exactly as given: the accumulated ``score``, the
    ``sequence`` of tokens chosen so far, and the decoder ``hidden`` / ``cell``
    state to resume decoding from.
    """

    def __init__(self, score, sequence, hidden, cell):
        self.score, self.sequence = score, sequence
        self.hidden, self.cell = hidden, cell

In [None]:
# MODEL 1
# seq2seq

class Encoder(nn.Module):
    """Bidirectional LSTM encoder over pre-trained (frozen by default) embeddings.

    ``forward`` returns the per-token LSTM outputs together with an initial
    decoder state: the final forward and backward states are concatenated
    and projected back to ``hidden_dim`` through a tanh-activated linear
    layer (one for the hidden state, one for the cell state).
    """

    def __init__(self, embedding = embedding, hidden_dim = hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding, padding_idx=0)
        self.embedding_dim = self.embedding.embedding_dim
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(
            self.embedding_dim,
            self.hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.linear_h = nn.Linear(2 * self.hidden_dim, self.hidden_dim)
        self.linear_c = nn.Linear(2 * self.hidden_dim, self.hidden_dim)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        outputs, (h_n, c_n) = self.lstm(embedded)
        batch = outputs.size(0)
        # (directions, batch, hidden) -> (batch, directions * hidden)
        h_flat = h_n.transpose(0, 1).reshape(batch, -1)
        c_flat = c_n.transpose(0, 1).reshape(batch, -1)
        h = torch.tanh(self.linear_h(h_flat))
        c = torch.tanh(self.linear_c(c_flat))
        return outputs, h, c


class Decoder(nn.Module):
    """Single-step LSTM decoder without attention.

    ``forward(x, h, c)`` consumes one token id per batch element plus the
    previous ``(batch, hidden)`` state and returns the vocabulary logits for
    the next token together with the updated state, again ``(batch, hidden)``.
    """

    def __init__(self, out_vocab = out_vocab, hidden_dim = hidden_dim):
        super().__init__()
        vocab_size = len(out_vocab)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding_dim = self.embedding.embedding_dim
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, batch_first=True)
        self.linear = nn.Linear(self.hidden_dim, vocab_size)

    def forward(self, x, h, c):
        # Embed, then add a length-1 time axis for the batch-first LSTM.
        embedded = self.dropout(self.embedding(x)).unsqueeze(1)
        # nn.LSTM wants states shaped (num_layers, batch, hidden).
        state = (h.unsqueeze(0), c.unsqueeze(0))
        step_out, (h_next, c_next) = self.lstm(embedded, state)
        logits = self.linear(step_out.squeeze(1))
        return logits, h_next.squeeze(0), c_next.squeeze(0)

class seq2seq(nn.Module):
    """Plain encoder-decoder model (model 0).

    In training mode ``forward`` returns per-step logits of shape
    ``(batch, y.size(1), len(out_vocab))``. In eval mode it returns the token
    ids of the best beam-search hypothesis, shape ``(batch, y.size(1))``.

    Args:
        teacher_forcing_ratio: probability, drawn once per decoding step, of
            feeding the gold previous token rather than the model's argmax.
        beam_size: number of hypotheses kept after each beam-search step.
        out_vocab: output token -> id mapping; must contain ``'<start>'``.
    """

    def __init__(self, teacher_forcing_ratio = 0.6, beam_size = 10, out_vocab = out_vocab):
        super(seq2seq, self).__init__()
        self.encoder = Encoder()
        # Size the decoder from the vocabulary given to this instance, so a
        # custom out_vocab is honoured everywhere instead of only being stored.
        self.decoder = Decoder(out_vocab = out_vocab)
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.beam_size = beam_size
        self.out_vocab = out_vocab

    def forward(self, x, y):
        if(self.training):
            return self.train_forward(x, y)
        else:
            return self.test_forward_beam(x, y)

    def train_forward(self, x, y):
        """Decode ``y.size(1)`` steps with teacher forcing and return the logits."""
        _, h, c = self.encoder(x)
        out = torch.full((x.size(0),), self.out_vocab['<start>'], dtype=torch.long, device=x.device)
        outputs = torch.zeros(y.size(0), y.size(1), len(self.out_vocab), device=x.device)
        for i in range(y.size(1)):
            # One draw per step, step 0 included, although step 0 always feeds <start>.
            use_gold = np.random.random() < self.teacher_forcing_ratio
            if i != 0:
                out = y[:, i-1] if use_gold else torch.argmax(out, dim=1)
            out, h, c = self.decoder(out, h, c)
            outputs[:, i] = out
        return outputs

    def test_forward_beam(self, x, y):
        """Beam-search decode for ``y.size(1)`` steps.

        Only the length of ``y`` is used. The returned sequence starts with
        the ``<start>`` token, so it holds the first ``y.size(1) - 1``
        predicted tokens. Hypotheses are ranked by their score summed over
        the batch, so this is a true per-sample beam search only for a batch
        of one.
        """
        # Decoding needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            _, h, c = self.encoder(x)
            start = torch.full((x.size(0),), self.out_vocab['<start>'], dtype=torch.long, device=x.device)
            beam = [beamdata(0, [start], h, c)]
            for _ in range(y.size(1)):
                candidates = []
                for b in beam:
                    logits, h, c = self.decoder(b.sequence[-1], b.hidden, b.cell)
                    log_probs = F.log_softmax(logits, dim=1)
                    # topk raises if k exceeds the vocabulary size.
                    k = min(self.beam_size, log_probs.size(1))
                    top_scores, top_ids = torch.topk(log_probs, k, dim=1)
                    for j in range(k):
                        candidates.append(beamdata(b.score + top_scores[:, j], b.sequence + [top_ids[:, j]], h, c))

                beam = sorted(candidates, key=lambda cand: cand.score.sum().item(), reverse=True)[:self.beam_size]

            best = beam[0].sequence
            sequence = torch.zeros(y.size(0), y.size(1), dtype=torch.long, device=x.device)
            for i in range(y.size(1)):
                sequence[:, i] = best[i]

        return sequence

In [None]:
# MODEL 2
# AttnSeq2seq

class Attention(nn.Module):
    """Additive attention over encoder outputs of width ``2 * hidden_dim``.

    ``forward(encoder_outputs, hidden)`` takes ``(batch, src_len, 2*hidden)``
    encoder outputs and a ``(batch, hidden)`` decoder state and returns
    ``(batch, src_len)`` weights that sum to 1 over ``src_len``.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(3 * self.hidden_dim, self.hidden_dim)
        self.v = nn.Linear(self.hidden_dim, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        src_len = encoder_outputs.size(1)
        # Pair the decoder state with every source position.
        tiled = hidden.unsqueeze(1).repeat(1, src_len, 1)
        features = torch.cat((tiled, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(features))
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim=1)


class AttnDecoder(nn.Module):
    """Single-step LSTM decoder with additive attention.

    At each step the embedded input token is concatenated with an
    attention-weighted sum of the encoder outputs (width ``2 * hidden_dim``)
    before entering the LSTM. Returns the vocabulary logits and the updated
    ``(batch, hidden)`` hidden and cell states.
    """

    def __init__(self, out_vocab = out_vocab, hidden_dim = hidden_dim):
        super().__init__()
        vocab_size = len(out_vocab)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding_dim = self.embedding.embedding_dim
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(self.embedding_dim + 2*self.hidden_dim, self.hidden_dim, batch_first=True)
        self.linear = nn.Linear(self.hidden_dim, vocab_size)
        self.attention = Attention(self.hidden_dim)

    def forward(self, x, h, c, encoder_outputs):
        embedded = self.dropout(self.embedding(x.unsqueeze(1)))
        # (batch, 1, src_len) @ (batch, src_len, enc_dim) -> (batch, 1, enc_dim)
        weights = self.attention(encoder_outputs, h).unsqueeze(1)
        context = torch.bmm(weights, encoder_outputs)
        lstm_in = torch.cat((embedded, context), dim=2)
        step_out, (h_next, c_next) = self.lstm(lstm_in, (h.unsqueeze(0), c.unsqueeze(0)))
        logits = self.linear(step_out.squeeze(1))
        return logits, h_next.squeeze(0), c_next.squeeze(0)
    

class AttnSeq2seq(nn.Module):
    """Encoder-decoder model with additive attention (model 1).

    In training mode ``forward`` returns per-step logits of shape
    ``(batch, y.size(1), len(out_vocab))``. In eval mode it returns the token
    ids of the best beam-search hypothesis, shape ``(batch, y.size(1))``.

    Args:
        teacher_forcing_ratio: probability, drawn once per decoding step, of
            feeding the gold previous token rather than the model's argmax.
        beam_size: number of hypotheses kept after each beam-search step.
        out_vocab: output token -> id mapping; must contain ``'<start>'``.
    """

    def __init__(self, teacher_forcing_ratio = 0.6, beam_size = 10, out_vocab = out_vocab):
        super(AttnSeq2seq, self).__init__()
        self.encoder = Encoder()
        # Size the decoder from the vocabulary given to this instance, so a
        # custom out_vocab is honoured everywhere instead of only being stored.
        self.decoder = AttnDecoder(out_vocab = out_vocab)
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.beam_size = beam_size
        self.out_vocab = out_vocab

    def forward(self, x, y):
        if(self.training):
            return self.train_forward(x, y)
        else:
            return self.test_forward_beam(x, y)

    def train_forward(self, x, y):
        """Decode ``y.size(1)`` steps with teacher forcing and return the logits."""
        encoder_outputs, h, c = self.encoder(x)
        out = torch.full((x.size(0),), self.out_vocab['<start>'], dtype=torch.long, device=x.device)
        outputs = torch.zeros(y.size(0), y.size(1), len(self.out_vocab), device=x.device)
        for i in range(y.size(1)):
            # One draw per step, step 0 included, although step 0 always feeds <start>.
            use_gold = np.random.random() < self.teacher_forcing_ratio
            if i != 0:
                out = y[:, i-1] if use_gold else torch.argmax(out, dim=1)
            out, h, c = self.decoder(out, h, c, encoder_outputs)
            outputs[:, i] = out
        return outputs

    def test_forward_beam(self, x, y):
        """Beam-search decode for ``y.size(1)`` steps.

        Only the length of ``y`` is used. The returned sequence starts with
        the ``<start>`` token, so it holds the first ``y.size(1) - 1``
        predicted tokens. Hypotheses are ranked by their score summed over
        the batch, so this is a true per-sample beam search only for a batch
        of one.
        """
        # Decoding needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            encoder_outputs, h, c = self.encoder(x)
            start = torch.full((x.size(0),), self.out_vocab['<start>'], dtype=torch.long, device=x.device)
            beam = [beamdata(0, [start], h, c)]
            for _ in range(y.size(1)):
                candidates = []
                for b in beam:
                    logits, h, c = self.decoder(b.sequence[-1], b.hidden, b.cell, encoder_outputs)
                    log_probs = F.log_softmax(logits, dim=1)
                    # topk raises if k exceeds the vocabulary size.
                    k = min(self.beam_size, log_probs.size(1))
                    top_scores, top_ids = torch.topk(log_probs, k, dim=1)
                    for j in range(k):
                        candidates.append(beamdata(b.score + top_scores[:, j], b.sequence + [top_ids[:, j]], h, c))

                beam = sorted(candidates, key=lambda cand: cand.score.sum().item(), reverse=True)[:self.beam_size]

            best = beam[0].sequence
            sequence = torch.zeros(y.size(0), y.size(1), dtype=torch.long, device=x.device)
            for i in range(y.size(1)):
                sequence[:, i] = best[i]

        return sequence

In [None]:
# Two separately loaded copies of the pre-trained encoder: one is frozen entirely,
# the other has only its last encoder layer left trainable.
BERT_frozen = BertModel.from_pretrained('bert-base-uncased')
BERT_fine_tuned = BertModel.from_pretrained('bert-base-uncased')

for param in BERT_frozen.parameters():
    param.requires_grad = False

# Freeze everything first ...
for param in BERT_fine_tuned.parameters():
    param.requires_grad = False

# ... then re-enable gradients for the final encoder layer only.
for param in BERT_fine_tuned.encoder.layer[-1].parameters():
    param.requires_grad = True

In [None]:
# Models 3 and 4
# BertSeq2seq

class BertAttention(nn.Module):
    """Additive attention over encoder outputs of width ``3 * hidden_dim``.

    The scoring layer expects ``hidden_dim + 3 * hidden_dim`` input features,
    so the encoder width it is paired with must equal ``3 * hidden_dim``.
    ``forward`` returns ``(batch, src_len)`` weights that sum to 1 over
    ``src_len``.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(4 * self.hidden_dim, self.hidden_dim)
        self.v = nn.Linear(self.hidden_dim, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        src_len = encoder_outputs.size(1)
        # Pair the decoder state with every source position.
        tiled = hidden.unsqueeze(1).repeat(1, src_len, 1)
        features = torch.cat((tiled, encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(features))
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim=1)

class BertEncoder(nn.Module):
    """Encoder that wraps a BERT model.

    ``forward`` runs BERT on a tokenizer output (``input_ids`` and
    ``attention_mask``) and returns its last hidden states plus an initial
    decoder state: the vector at sequence position 0 is projected from 768
    to ``hidden_dim`` features through two tanh-activated linear layers (one
    for the hidden state, one for the cell state).
    """

    def __init__(self, BERT_model, hidden_dim = hidden_dim):
        super().__init__()
        self.bert = BERT_model
        self.hidden_dim = hidden_dim
        self.linear_h = nn.Linear(768, self.hidden_dim)
        self.linear_c = nn.Linear(768, self.hidden_dim)

    def forward(self, x):
        bert_out = self.bert(x['input_ids'], attention_mask=x['attention_mask'])
        states = bert_out.last_hidden_state
        first_token = states[:, 0]
        h = torch.tanh(self.linear_h(first_token))
        c = torch.tanh(self.linear_c(first_token))
        return states, h, c
    
class BertDecoder(nn.Module):
    """Single-step LSTM decoder with attention over BERT encoder outputs.

    The LSTM input is the embedded token concatenated with a context vector
    of width ``3 * hidden_dim``, so the encoder outputs must have exactly
    that width. Returns the vocabulary logits and the updated
    ``(batch, hidden)`` hidden and cell states.
    """

    def __init__(self, out_vocab = out_vocab, hidden_dim = hidden_dim):
        super().__init__()
        vocab_size = len(out_vocab)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding_dim = self.embedding.embedding_dim
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(self.embedding_dim + 3*self.hidden_dim, self.hidden_dim, batch_first=True)
        self.linear = nn.Linear(self.hidden_dim, vocab_size)
        self.attention = BertAttention(self.hidden_dim)

    def forward(self, x, h, c, encoder_outputs):
        embedded = self.dropout(self.embedding(x.unsqueeze(1)))
        # (batch, 1, src_len) @ (batch, src_len, enc_dim) -> (batch, 1, enc_dim)
        weights = self.attention(encoder_outputs, h).unsqueeze(1)
        context = torch.bmm(weights, encoder_outputs)
        lstm_in = torch.cat((embedded, context), dim=2)
        step_out, (h_next, c_next) = self.lstm(lstm_in, (h.unsqueeze(0), c.unsqueeze(0)))
        logits = self.linear(step_out.squeeze(1))
        return logits, h_next.squeeze(0), c_next.squeeze(0)
    
class BertSeq2seq(nn.Module):
    """BERT encoder with an attention LSTM decoder (models 2 and 3).

    In training mode ``forward`` returns per-step logits of shape
    ``(batch, y.size(1), len(out_vocab))``. In eval mode it returns the token
    ids of the best beam-search hypothesis, shape ``(batch, y.size(1))``.

    Args:
        BERT_model: the BERT module wrapped by the encoder.
        teacher_forcing_ratio: probability, drawn once per decoding step, of
            feeding the gold previous token rather than the model's argmax.
        beam_size: number of hypotheses kept after each beam-search step.
        out_vocab: output token -> id mapping; must contain ``'<start>'``.
    """

    def __init__(self, BERT_model, teacher_forcing_ratio = 0.6, beam_size = 10, out_vocab = out_vocab):
        super(BertSeq2seq, self).__init__()
        self.encoder = BertEncoder(BERT_model)
        # Size the decoder from the vocabulary given to this instance, so a
        # custom out_vocab is honoured everywhere instead of only being stored.
        self.decoder = BertDecoder(out_vocab = out_vocab)
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.beam_size = beam_size
        self.out_vocab = out_vocab

    def forward(self, x, y):
        if(self.training):
            return self.train_forward(x, y)
        else:
            return self.test_forward_beam(x, y)

    def train_forward(self, x, y):
        """Decode ``y.size(1)`` steps with teacher forcing and return the logits."""
        encoder_outputs, h, c = self.encoder(x)
        # x is a tokenizer output rather than a tensor, so batch size and device come from y.
        out = torch.full((y.size(0),), self.out_vocab['<start>'], dtype=torch.long, device=y.device)
        outputs = torch.zeros(y.size(0), y.size(1), len(self.out_vocab), device=y.device)
        for i in range(y.size(1)):
            # One draw per step, step 0 included, although step 0 always feeds <start>.
            use_gold = np.random.random() < self.teacher_forcing_ratio
            if i != 0:
                out = y[:, i-1] if use_gold else torch.argmax(out, dim=1)
            out, h, c = self.decoder(out, h, c, encoder_outputs)
            outputs[:, i] = out
        return outputs

    def test_forward_beam(self, x, y):
        """Beam-search decode for ``y.size(1)`` steps.

        Only the length, batch size and device of ``y`` are used. The
        returned sequence starts with the ``<start>`` token, so it holds the
        first ``y.size(1) - 1`` predicted tokens. Hypotheses are ranked by
        their score summed over the batch, so this is a true per-sample beam
        search only for a batch of one.
        """
        # Decoding needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            encoder_outputs, h, c = self.encoder(x)
            start = torch.full((y.size(0),), self.out_vocab['<start>'], dtype=torch.long, device=y.device)
            beam = [beamdata(0, [start], h, c)]
            for _ in range(y.size(1)):
                candidates = []
                for b in beam:
                    logits, h, c = self.decoder(b.sequence[-1], b.hidden, b.cell, encoder_outputs)
                    log_probs = F.log_softmax(logits, dim=1)
                    # topk raises if k exceeds the vocabulary size.
                    k = min(self.beam_size, log_probs.size(1))
                    top_scores, top_ids = torch.topk(log_probs, k, dim=1)
                    for j in range(k):
                        candidates.append(beamdata(b.score + top_scores[:, j], b.sequence + [top_ids[:, j]], h, c))

                beam = sorted(candidates, key=lambda cand: cand.score.sum().item(), reverse=True)[:self.beam_size]

            best = beam[0].sequence
            sequence = torch.zeros(y.size(0), y.size(1), dtype=torch.long, device=y.device)
            for i in range(y.size(1)):
                sequence[:, i] = best[i]

        return sequence

In [None]:
def _mean_loss(S, criterion, loader):
    """Return the loss of ``S`` averaged over the batches of ``loader`` (no updates)."""
    total = 0.0
    for p, l, a in loader:
        p = p.to(device)
        l = l.to(device)
        output = S(p, l)
        total += criterion(output.view(-1, len(out_vocab)), l.view(-1)).item()
    # Divide by the loader that was actually iterated; guard the empty case.
    return total / max(len(loader), 1)


def train(S, optimizer, criterion, num_epochs = 60, model_no  = '_-_'):
    """Train ``S`` on ``train_loader`` and log per-epoch losses.

    After every epoch the mean loss over ``train_acc_loader`` and
    ``dev_acc_loader`` is printed and appended to
    ``<out_dir><model_no>_train_statistics.csv`` and
    ``<out_dir><model_no>_validation_statistics.csv``, and the model weights
    are saved to ``<out_dir><model_no>_seq2seq.pth``.

    Args:
        S: model called as ``S(problem_batch, formula_batch)``, returning logits.
        optimizer: optimizer stepping the parameters of ``S``.
        criterion: loss over ``(N, len(out_vocab))`` logits and ``(N,)`` targets.
        num_epochs: number of passes over ``train_loader``.
        model_no: prefix for every file written.
    """
    WHILE_TRAINING_STATISTICS = pd.DataFrame(columns = ['Epoch', 'Loss'])
    WHILE_TRAINING_VALIDATION_STATISTICS = pd.DataFrame(columns = ['Epoch', 'Loss'])

    for n in range(num_epochs):
        S.train()
        for i, (p, l, a) in enumerate(train_loader):
            p = p.to(device)
            l = l.to(device)
            optimizer.zero_grad()
            output = S(p, l)
            loss = criterion(output.view(-1, len(out_vocab)), l.view(-1))
            loss.backward()
            optimizer.step()
            if(i%100 == 0):
                print('Epoch:', n, 'Batch:', i, 'Loss:', loss.item())

        # NOTE: the model is deliberately left in training mode here, because
        # its forward pass only returns logits in that mode. Dropout and
        # teacher-forcing sampling are therefore still active for these losses.
        with torch.no_grad():
            train_loss = _mean_loss(S, criterion, train_acc_loader)
            val_loss = _mean_loss(S, criterion, dev_acc_loader)

        print('*Epoch:', n, 'Train Loss:', train_loss)
        print('*Epoch:', n, 'Validation Loss:', val_loss)

        # Log the epoch's mean training loss, not the loss of whichever batch ran last.
        WHILE_TRAINING_STATISTICS.loc[len(WHILE_TRAINING_STATISTICS)] = [n, train_loss]
        WHILE_TRAINING_VALIDATION_STATISTICS.loc[len(WHILE_TRAINING_VALIDATION_STATISTICS)] = [n, val_loss]

        # Rewritten every epoch so progress survives an interrupted run.
        WHILE_TRAINING_STATISTICS.to_csv(out_dir + model_no + '_train_statistics.csv', index=False)
        WHILE_TRAINING_VALIDATION_STATISTICS.to_csv(out_dir + model_no + '_validation_statistics.csv', index=False)

        torch.save(S.state_dict(), out_dir + model_no + '_seq2seq.pth')

In [None]:
def _formula_from_tokens(tokens):
    """Rebuild an ``op(a,b)|op(c)|`` formula string from decoded tokens.

    ``tokens[0]`` is skipped (the start marker) and decoding stops at the
    first ``'<eos>'``. A token ending in a digit is treated as an argument;
    any other token opens a new operation.
    """
    ans = ","
    for token in tokens[1:]:
        if token == '<eos>':
            break
        if token[-1].isdigit():
            ans += token + ','
        else:
            # Close the previous operation (dropping its trailing ',') and open the next.
            ans = ans[:-1] + ')|' + token + '('
    # Strip the leading ')|' produced by the first operation and the trailing separator.
    return ans[2:-1] + ')|'


def beam_search_prediction(mode, loader, S, model_no  = '_-_'):
    """Decode every sample of ``loader`` with ``S`` and score the predictions.

    Predictions are written to ``<out_dir><model_no>_<mode>_predicted.json``
    next to the original ``Problem``, ``answer`` and ``linear_formula``
    fields. ``<dir>evaluator.py`` is then run on that file; it is expected to
    add a ``predicted_answer`` field to each record in place (TODO confirm
    against the evaluator script).

    Args:
        mode: split name; ``<dir><mode>.json`` supplies the reference records.
        loader: loader over the same split, one sample per batch and in file
            order (only ``output[0]`` of every batch is decoded).
        S: model; switched to eval mode here.
        model_no: prefix for the prediction file.

    Returns:
        ``(exact_match, exec_acc, total)`` as raw counts.
    """
    # Function-scope import: subprocess is not imported at the top of the notebook.
    import subprocess

    with open(dir + mode + '.json', 'r') as f:
        records = json.load(f)

    df_copy = pd.DataFrame(records)
    answers = []

    S.eval()
    # Inference only: avoid building autograd graphs for every sample.
    with torch.no_grad():
        for p, l, a in loader:
            p = p.to(device)
            l = l.to(device)
            output = S(p, l)
            tokens = [rev_out_vocab[idx] for idx in output[0].tolist()]
            answers.append(_formula_from_tokens(tokens))

    df_copy['predicted'] = answers

    df_copy = df_copy[['Problem', 'answer', 'predicted', 'linear_formula']]
    parsed_data = json.loads(df_copy.to_json(orient='records'))
    script = dir + 'evaluator.py'
    pred_file = out_dir + model_no + '_' + mode + '_predicted.json'
    with open(pred_file, 'w') as f:
        json.dump(parsed_data, f)

    # An argument list avoids shell interpretation of the paths.
    result = subprocess.run(['python3', script, pred_file])
    if result.returncode != 0:
        print('Warning: evaluator exited with status', result.returncode)

    exact_match = 0
    exec_acc = 0
    total = 0
    with open(pred_file, 'r') as f:
        final_json = json.load(f)

    for record in final_json:
        gold = record['linear_formula']
        predicted = record['predicted']
        # Also accept a prediction that differs only by one trailing character.
        if gold == predicted or (len(predicted) > 0 and gold == predicted[:-1]):
            exact_match += 1
        predicted_answer = record['predicted_answer']
        # Execution counts as correct within 2% of the reference answer.
        if predicted_answer is not None and abs(record['answer'] - predicted_answer) <= abs(record['answer'])*0.02:
            exec_acc += 1
        total += 1

    print('Exact Match:', 100*exact_match/total)
    print('Execution Accuracy:', 100*exec_acc/total)

    return exact_match, exec_acc, total

def accuracy(S, model_no  = '_-_'):
    """Evaluate ``S`` on the test and dev splits.

    Writes exact-match and execution accuracy (both in percent) to
    ``<out_dir><model_no>_accuracy.csv``.
    """
    ACC_DF = pd.DataFrame(columns = ['mode', 'exact_match', 'exec_acc'])

    # Explicit mapping instead of eval() on a constructed variable name.
    loaders = {'test': test_acc_loader, 'dev': dev_acc_loader}
    for mode, loader in loaders.items():
        exact_match, exec_acc, total = beam_search_prediction(mode, loader, S, model_no)
        ACC_DF.loc[len(ACC_DF)] = [mode, (100 * exact_match)/total, (100 * exec_acc)/total]

    ACC_DF.to_csv(out_dir + model_no + '_accuracy.csv', index=False)

In [None]:
# Build all four models, each moved to `device`, wrapped in DataParallel and
# given its own Adam optimizer. `criterion` is re-created identically after each
# model; padding targets are excluded from the loss.
lr = 0.001
# Model 0: plain seq2seq.
S = seq2seq()
S.to(device)
S = nn.DataParallel(S)
optimizer_S = optim.Adam(S.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=out_vocab['<pad>'])

# Model 1: seq2seq with attention.
Att_S = AttnSeq2seq()
Att_S.to(device)
Att_S = nn.DataParallel(Att_S)
optimizer_Att_S = optim.Adam(Att_S.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=out_vocab['<pad>'])

# Model 2: fully frozen BERT encoder.
Bert_S_frozen = BertSeq2seq(BERT_frozen)
Bert_S_frozen.to(device)
Bert_S_frozen = nn.DataParallel(Bert_S_frozen)
optimizer_Bert_S_frozen = optim.Adam(Bert_S_frozen.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=out_vocab['<pad>'])

# Model 3: BERT encoder with its last layer trainable.
Bert_S_fine_tuned = BertSeq2seq(BERT_fine_tuned)
Bert_S_fine_tuned.to(device)
Bert_S_fine_tuned = nn.DataParallel(Bert_S_fine_tuned)
optimizer_Bert_S_fine_tuned = optim.Adam(Bert_S_fine_tuned.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=out_vocab['<pad>'])

In [None]:
# Fresh, unwrapped model instances that the following cell fills from saved checkpoints.
# NOTE(review): the two BERT models below receive the very same BERT_frozen /
# BERT_fine_tuned module objects as the models built for training, so loading a
# checkpoint into them also overwrites the BERT weights those models see.
# Confirm this sharing is intended.
S_load = seq2seq()
Att_S_load = AttnSeq2seq()
Bert_S_frozen_load = BertSeq2seq(BERT_frozen)
Bert_S_fine_tuned_load = BertSeq2seq(BERT_fine_tuned)

In [None]:
def _load_checkpoint(model, path):
    """Restore ``model`` from ``path`` when that file exists.

    Checkpoints are saved from DataParallel-wrapped models, so their keys
    carry a ``'module.'`` prefix; it is stripped before loading. On success
    the model is moved to ``device`` and returned wrapped in
    ``nn.DataParallel``. Without a checkpoint file ``model`` is returned
    unchanged.
    """
    if not os.path.exists(path):
        return model

    # Unwrap first so re-running this cell on an already restored model works.
    if isinstance(model, nn.DataParallel):
        model = model.module

    prefix = 'module.'
    state_dict = torch.load(path, map_location=torch.device('cpu'))
    stripped = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in state_dict.items()
    }

    model.load_state_dict(stripped)
    model.to(device)
    return nn.DataParallel(model)


S_load = _load_checkpoint(S_load, local_dir + '0_seq2seq.pth')
Att_S_load = _load_checkpoint(Att_S_load, local_dir + '1_seq2seq.pth')
Bert_S_frozen_load = _load_checkpoint(Bert_S_frozen_load, local_dir + '2_seq2seq.pth')
Bert_S_fine_tuned_load = _load_checkpoint(Bert_S_fine_tuned_load, local_dir + '3_seq2seq.pth')

In [None]:
# Parallel lists indexed by `model_to_train`:
# 0 = seq2seq, 1 = attention seq2seq, 2 = frozen BERT, 3 = fine-tuned BERT.
models = [S, Att_S, Bert_S_frozen, Bert_S_fine_tuned]
optimizers = [optimizer_S, optimizer_Att_S, optimizer_Bert_S_frozen, optimizer_Bert_S_fine_tuned]
load_models = [S_load, Att_S_load, Bert_S_frozen_load, Bert_S_fine_tuned_load]

In [None]:
# Main entry point: either train the selected model and evaluate it, or
# evaluate the corresponding model restored from a checkpoint.
# beam_sizes = [1, 10, 20]
# tf_ratios = [0.3, 0.9]
if not load:
    # print(models[model_to_train].module.teacher_forcing_ratio)
    train(models[model_to_train], optimizers[model_to_train], criterion, num_epochs, str(model_to_train))
    # for bm_sz in beam_sizes:
    #     models[model_to_train].module.beam_size = bm_sz
    #     accuracy(models[model_to_train], str(model_to_train) + '_beam_' + str(bm_sz))
    accuracy(models[model_to_train], str(model_to_train))

else:
    accuracy(load_models[model_to_train], str(model_to_train))

In [None]:
# Smoke test: decode the first test sample with the selected model and print the raw token ids.
# NOTE(review): this always uses `models[...]`, even when `load` is True and the
# evaluated model was `load_models[...]` - confirm which one is meant.
models[model_to_train].eval()
for i, (p, l, a) in enumerate(test_acc_loader):
    p = p.to(device)
    l = l.to(device)
    # a = a.to(device)
    models[model_to_train].eval()
    output = models[model_to_train](p, l)
    print(output)
    break

In [None]:
# Manual check: translate a hard-coded list of output token ids back into formula symbols.
[rev_out_vocab[x] for x in [ 1, 18,  7,  5,  4,  7,  8,  4,  5, 16, 18, 11, 12,  9, 13, 10]]

In [None]:
# Save the vocabularies and the embedding matrix so they can be reloaded for inference.
var_dir = out_dir + 'var/'
# The target directory is not created anywhere else; open() would fail without it.
os.makedirs(var_dir, exist_ok=True)

with open(var_dir + 'glove_vocab.json', 'w') as f:
    json.dump(vocab, f)

with open(var_dir + 'glove_out_vocab.json', 'w') as f:
    json.dump(out_vocab, f)

# JSON object keys are strings, so the integer ids become strings on disk.
with open(var_dir + 'glove_rev_out_vocab.json', 'w') as f:
    json.dump(rev_out_vocab, f)

torch.save(embedding, var_dir + 'glove_embedding.pth')

In [None]:
# Reload the reverse output vocabulary. JSON stores the ids as strings, so
# convert the keys back to int. The with-block closes the file handle.
with open(out_dir + 'var/glove_rev_out_vocab.json', 'r') as f:
    r_o_v = {int(k): v for k, v in json.load(f).items()}