## Preparing Data

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import datasets
import tqdm

In [2]:
seed = 1234

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

### Dataset

In [3]:
import sys

sys.path.insert(1, "minbpe-master")

In [4]:
import pandas as pd

df1 = pd.read_csv("pairs_dataset.csv")
df2 = pd.read_csv("generated_pairs.csv")
df = pd.concat([df1, df2])

In [5]:
df

Unnamed: 0,equation,answer
0,"y\prime= \frac{3}{x^3+x}, \;\;\;\; y(1)=0",y = 3 \ln x -\frac{3}{2} \ln {(x^2+1)} + \frac...
1,y\prime=3xy,y = C e^{\frac{3}{2} x^2}
2,\frac{dy}{dx}= xy^2 + 4x + 2y^2 + 8,y =2 \tan{(x^2 +4x +2C)}
3,"\frac{dy}{dx}= e^{x+2y}, y(0)=1",y = -\frac{1}{2} \ln{(-2 e^x + 2+e^{-2})}
4,y\prime = x e^{2x+y},y = - \ln {(-\frac{1}{2} x e^{2x} + \frac{1}{4...
...,...,...
9530,y^{\prime\prime\prime}-23y^{\prime\prime}+166y...,C_{1}e^{10x}+C_{2}e^{9x}+C_{3}e^{4x}
9531,y^{\prime\prime\prime}-24y^{\prime\prime}+185y...,C_{1}e^{10x}+C_{2}e^{9x}+C_{3}e^{5x}
9532,y^{\prime\prime\prime}-25y^{\prime\prime}+204y...,C_{1}e^{10x}+C_{2}e^{9x}+C_{3}e^{6x}
9533,y^{\prime\prime\prime}-26y^{\prime\prime}+223y...,C_{1}e^{10x}+C_{2}e^{9x}+C_{3}e^{7x}


In [6]:
# # выкинули строки с короткими ответами
# for i, row in df.iterrows():
#     if len(row['answer']) < 5:
#         df = df.drop([i])
# df

In [7]:
# add special tokens
lines = "<sos>" + df["equation"] +' ' + df["answer"] + "<eos>"
str_for_vocab_training = lines.str.cat(sep=" ")

In [8]:
from minbpe import BasicTokenizer

VOCAB_SIZE = 4096

tokenizer = BasicTokenizer()
# doesn't allow adding special tokens; make it learn <pad>
str_for_vocab_training += "<pad>" * 10
tokenizer.train(str_for_vocab_training, vocab_size=VOCAB_SIZE)

In [9]:
eq_lines = "<sos>" + df["equation"] + "<eos>"
ans_lines = "<sos>" + df["answer"] + "<eos>"
eqs_tokenized = np.array(eq_lines.apply(lambda line: list(tokenizer.encode(line))))
ans_tokenized = np.array(ans_lines.apply(lambda line: list(tokenizer.encode(line))))

In [10]:
seq_length = max(len(max(eqs_tokenized, key=len)), len(max(ans_tokenized, key=len)) ) # max seq len

def pad(lines_tokenized_basic):
    for i in range(len(lines_tokenized_basic)):
        while len(lines_tokenized_basic[i]) < seq_length:
            lines_tokenized_basic[i].append(tokenizer.encode("<pad>")[0])
        lines_tokenized_basic[i] = np.array(lines_tokenized_basic[i])
    lines_tokenized_basic = np.array(lines_tokenized_basic)
    return lines_tokenized_basic

eqs_tokenized = pad(eqs_tokenized)
ans_tokenized = pad(ans_tokenized)

In [11]:
len(eqs_tokenized)

10670

In [12]:
from datasets import Dataset, DatasetDict

dataset_dict = {"eqs": eqs_tokenized, "ans": ans_tokenized}
dataset = Dataset.from_dict(dataset_dict)

In [13]:
train_testvalid = dataset.train_test_split(test_size=0.2)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
train_test_valid_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

In [14]:
pad_index = tokenizer.encode("<pad>")[0]

## Data Loaders

In [15]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_eqs = [example["eqs"] for example in batch]
        batch_ans = [example["ans"] for example in batch]
        batch_eqs = nn.utils.rnn.pad_sequence(batch_eqs, padding_value=pad_index)
        batch_ans = nn.utils.rnn.pad_sequence(batch_ans, padding_value=pad_index)
        batch = {
            "eqs": batch_eqs,
            "ans": batch_ans,
        }
        return batch

    return collate_fn

In [16]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [17]:
data_type = "torch"
format_columns = ['eqs', 'ans']

train_data = train_test_valid_dataset['train'].with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = train_test_valid_dataset['valid'].with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

test_data = train_test_valid_dataset['test'].with_format(
    type=data_type,
    columns=format_columns,
    output_all_columns=True,
)

In [20]:
batch_size = 128

train_data_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_data_loader = get_data_loader(valid_data, batch_size, pad_index)
test_data_loader = get_data_loader(test_data, batch_size, pad_index)

## Building the Model

### Encoder

In [21]:
class Encoder(nn.Module):
    def __init__(self, rnn_type, input_dim, embedding_dim, hidden_dim, n_layers, dropout, bidirectional=False):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        if rnn_type == 'lstm':
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        elif rnn_type == 'rnn':
            self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        elif rnn_type == 'gru':
            self.rnn = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src length, batch size]
        embedded = self.dropout(self.embedding(src))
        # embedded = [src length, batch size, embedding dim]
        outputs, (hidden, cell) = self.rnn(embedded)
        # outputs = [src length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # outputs are always from the top hidden layer
        return hidden, cell

### Decoder

In [22]:
class Decoder(nn.Module):
    def __init__(self, rnn_type, output_dim, embedding_dim, hidden_dim, n_layers, dropout, bidirectional=False):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        if rnn_type == 'lstm':
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        elif rnn_type == 'rnn':
            self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        elif rnn_type == 'gru':
            self.rnn = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # n directions in the decoder will both always be 1, therefore:
        # hidden = [n layers, batch size, hidden dim]
        # context = [n layers, batch size, hidden dim]
        input = input.unsqueeze(0)
        # input = [1, batch size]
        embedded = self.dropout(self.embedding(input))
        # embedded = [1, batch size, embedding dim]
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        # output = [seq length, batch size, hidden dim * n directions]
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # seq length and n directions will always be 1 in this decoder, therefore:
        # output = [1, batch size, hidden dim]
        # hidden = [n layers, batch size, hidden dim]
        # cell = [n layers, batch size, hidden dim]
        prediction = self.fc_out(output.squeeze(0))
        # prediction = [batch size, output dim]
        return prediction, hidden, cell

### Seq2Seq

In [23]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # tensor to store decoder outputs
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(self.device)
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        # hidden = [n layers * n directions, batch size, hidden dim]
        # cell = [n layers * n directions, batch size, hidden dim]
        # first input to the decoder is the <sos> tokens
        input = trg[0, :]
        # input = [batch size]
        for t in range(1, trg_length):
            # insert input token embedding, previous hidden and previous cell states
            # receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            # output = [batch size, output dim]
            # hidden = [n layers, batch size, hidden dim]
            # cell = [n layers, batch size, hidden dim]
            # place predictions in a tensor holding predictions for each token
            outputs[t] = output
            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1)
            # if teacher forcing, use actual next token as next input
            # if not, use predicted token
            input = trg[t] if teacher_force else top1
            # input = [batch size]
        return outputs

## Training the Model

### Train functions

In [24]:
# weight initialization
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

In [25]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [26]:
# train loop

def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(tqdm.tqdm(data_loader)):
        src = batch["eqs"].to(device)
        trg = batch["ans"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)
        # output = [trg length, batch size, trg vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(trg length - 1) * batch size, trg vocab size]
        trg = trg[1:].view(-1)
        # trg = [(trg length - 1) * batch size]
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

### Eval functions

In [27]:
# eval loop

def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["eqs"].to(device)
            trg = batch["ans"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            output = model(src, trg, 0)  # turn off teacher forcing
            # output = [trg length, batch size, trg vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(trg length - 1) * batch size, trg vocab size]
            trg = trg[1:].view(-1)
            # trg = [(trg length - 1) * batch size]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [28]:
def detokenize(sentence):
    full_str = tokenizer.decode(sentence)
    answer = full_str[5:]
    end_token_idx = answer.find("<eos>")
    answer = answer[:end_token_idx]
    answer = answer.replace("<pad>", "")
    return answer

def tokenize(sentence):
    res = tokenizer.encode('<sos>' + sentence + '<eos>')
    while len(res) < seq_length:
        res.append(tokenizer.encode("<pad>")[0])
    return res

In [29]:
def translate_sentence(
    sentence,
    model,
    input_is_tokenized=False,
    device='cuda',
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        if input_is_tokenized:
            ids = sentence
        else:
            ids = tokenize(sentence)
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        hidden, cell = model.encoder(tensor)
        inputs = tokenizer.encode('<sos>')
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == tokenizer.encode('<eos>')[0]:
                break
        tokens = detokenize(inputs)[4:]
    return tokens

In [30]:
import evaluate

bleu = evaluate.load("bleu")

val_references=[detokenize(sentence.tolist()) for sentence in valid_data['ans']]
test_references=[detokenize(sentence.tolist()) for sentence in test_data['ans']]

def bleu_score(preds, refs):
    return bleu.compute(predictions=preds, references=refs)['bleu']

def accuracy(preds, refs):
    equal_count = 0
    for i in range(len(refs)):
        if preds[i] == refs[i]:
            equal_count += 1
    return equal_count/len(refs)

In [33]:
# result saving

result = {'rnn_type': [], 'n_layers': [], 'hidden_dim': [], 'epoch': [], 'val_bleu': [], 'val_accuracy': [], 'test_bleu': [], 'test_accuracy': []}

def update_result():
    result['rnn_type'].append(rnn_type)
    result['n_layers'].append(n_layers)
    result['hidden_dim'].append(hidden_dim)
    result['epoch'].append(epoch+1)
    result['val_bleu'].append(round(val_bleu, 3))
    result['test_bleu'].append(round(test_bleu, 3))
    result['val_accuracy'].append(round(val_accuracy, 3))
    result['test_accuracy'].append(round(test_accuracy, 3))
    
    res_df = pd.DataFrame(result)
    res_df.to_csv('results.csv', index=False)

## Train

In [34]:
input_dim = VOCAB_SIZE
output_dim = VOCAB_SIZE
encoder_embedding_dim = 256
decoder_embedding_dim = 256
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

rnn_type_options = ['lstm']
n_layers_options = [2, 4, 6, 8]
hidden_dim_options = [256, 512, 1024]
for rnn_type in rnn_type_options:
    for hidden_dim in hidden_dim_options:
        for n_layers in n_layers_options:
            encoder = Encoder(
                rnn_type,
                input_dim,
                encoder_embedding_dim,
                hidden_dim,
                n_layers,
                encoder_dropout,
            )

            decoder = Decoder(
                rnn_type,
                output_dim,
                decoder_embedding_dim,
                hidden_dim,
                n_layers,
                decoder_dropout,
            )

            model = Seq2Seq(encoder, decoder, device).to(device)
            model.apply(init_weights)
            print(f"The model has {count_parameters(model):,} trainable parameters")

            optimizer = optim.Adam(model.parameters())
            criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

            n_epochs = 6
            clip = 1.0
            teacher_forcing_ratio = 0.1 # 0.5
            best_valid_loss = float("inf")
            for epoch in range(n_epochs):
                print(f'EPOCH {epoch+1}')
                train_loss = train_fn(
                    model,
                    train_data_loader,
                    optimizer,
                    criterion,
                    clip,
                    teacher_forcing_ratio,
                    device,
                )
                valid_loss = evaluate_fn(
                    model,
                    valid_data_loader,
                    criterion,
                    device,
                )
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    torch.save(model.state_dict(), f"encoder_decoder_models/model_layers-{n_layers}_epoch-{epoch+1}.pt")
                print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
                print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

                # compute metrics
                val_predictions = [translate_sentence(sentence, model, True) for sentence in valid_data['eqs']]
                test_predictions = [translate_sentence(sentence, model, True) for sentence in test_data['eqs']]
                val_bleu = bleu_score(preds=val_predictions, refs=val_references)
                test_bleu = bleu_score(preds=test_predictions, refs=test_references)
                val_accuracy = accuracy(preds=val_predictions, refs=val_references)
                test_accuracy = accuracy(preds=test_predictions, refs=test_references)

                print(f"\tValid BLEU: {val_bleu:7.3f} | Valid Accuracy: {val_accuracy:7.3f}")
                update_result()

The model has 5,255,168 trainable parameters
EPOCH 1


100%|██████████| 67/67 [00:13<00:00,  4.96it/s]


	Train Loss:   4.314 | Train PPL:  74.774
	Valid Loss:   3.460 | Valid PPL:  31.824
	Valid BLEU:   0.254 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [00:14<00:00,  4.76it/s]


	Train Loss:   3.295 | Train PPL:  26.990
	Valid Loss:   4.193 | Valid PPL:  66.188
	Valid BLEU:   0.216 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [00:13<00:00,  4.92it/s]


	Train Loss:   3.306 | Train PPL:  27.282
	Valid Loss:   3.274 | Valid PPL:  26.417
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [00:14<00:00,  4.77it/s]


	Train Loss:   3.164 | Train PPL:  23.657
	Valid Loss:   3.242 | Valid PPL:  25.592
	Valid BLEU:   0.418 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [00:14<00:00,  4.71it/s]


	Train Loss:   3.135 | Train PPL:  22.979
	Valid Loss:   3.232 | Valid PPL:  25.339
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [00:14<00:00,  4.71it/s]


	Train Loss:   3.204 | Train PPL:  24.630
	Valid Loss:   3.249 | Valid PPL:  25.775
	Valid BLEU:   0.413 | Valid Accuracy:   0.000
The model has 7,360,512 trainable parameters
EPOCH 1


100%|██████████| 67/67 [00:17<00:00,  3.79it/s]


	Train Loss:   4.339 | Train PPL:  76.641
	Valid Loss:   3.557 | Valid PPL:  35.052
	Valid BLEU:   0.272 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [00:17<00:00,  3.73it/s]


	Train Loss:   3.355 | Train PPL:  28.653
	Valid Loss:   3.313 | Valid PPL:  27.457
	Valid BLEU:   0.371 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [00:17<00:00,  3.78it/s]


	Train Loss:   3.104 | Train PPL:  22.286
	Valid Loss:   3.146 | Valid PPL:  23.247
	Valid BLEU:   0.420 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [00:17<00:00,  3.73it/s]


	Train Loss:   2.959 | Train PPL:  19.278
	Valid Loss:   3.010 | Valid PPL:  20.292
	Valid BLEU:   0.375 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [00:17<00:00,  3.77it/s]


	Train Loss:   2.914 | Train PPL:  18.424
	Valid Loss:   2.996 | Valid PPL:  20.010
	Valid BLEU:   0.374 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [00:17<00:00,  3.82it/s]


	Train Loss:   2.891 | Train PPL:  18.010
	Valid Loss:   2.977 | Valid PPL:  19.637
	Valid BLEU:   0.199 | Valid Accuracy:   0.000
The model has 9,465,856 trainable parameters
EPOCH 1


100%|██████████| 67/67 [00:22<00:00,  3.03it/s]


	Train Loss:   4.281 | Train PPL:  72.333
	Valid Loss:   3.515 | Valid PPL:  33.631
	Valid BLEU:   0.255 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [00:22<00:00,  3.03it/s]


	Train Loss:   3.313 | Train PPL:  27.475
	Valid Loss:   3.317 | Valid PPL:  27.582
	Valid BLEU:   0.418 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [00:22<00:00,  3.04it/s]


	Train Loss:   3.204 | Train PPL:  24.642
	Valid Loss:   3.271 | Valid PPL:  26.348
	Valid BLEU:   0.438 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [00:21<00:00,  3.08it/s]


	Train Loss:   3.165 | Train PPL:  23.683
	Valid Loss:   3.246 | Valid PPL:  25.693
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [00:21<00:00,  3.07it/s]


	Train Loss:   3.143 | Train PPL:  23.172
	Valid Loss:   3.233 | Valid PPL:  25.365
	Valid BLEU:   0.216 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [00:22<00:00,  3.03it/s]


	Train Loss:   3.131 | Train PPL:  22.896
	Valid Loss:   3.234 | Valid PPL:  25.377
	Valid BLEU:   0.216 | Valid Accuracy:   0.000
The model has 11,571,200 trainable parameters
EPOCH 1


100%|██████████| 67/67 [00:25<00:00,  2.61it/s]


	Train Loss:   4.322 | Train PPL:  75.318
	Valid Loss:   3.565 | Valid PPL:  35.342
	Valid BLEU:   0.254 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [00:25<00:00,  2.60it/s]


	Train Loss:   3.347 | Train PPL:  28.417
	Valid Loss:   3.335 | Valid PPL:  28.064
	Valid BLEU:   0.440 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [00:25<00:00,  2.62it/s]


	Train Loss:   3.215 | Train PPL:  24.908
	Valid Loss:   3.279 | Valid PPL:  26.550
	Valid BLEU:   0.438 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [00:25<00:00,  2.59it/s]


	Train Loss:   3.170 | Train PPL:  23.813
	Valid Loss:   3.253 | Valid PPL:  25.868
	Valid BLEU:   0.438 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [00:26<00:00,  2.57it/s]


	Train Loss:   3.149 | Train PPL:  23.323
	Valid Loss:   3.235 | Valid PPL:  25.411
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [00:26<00:00,  2.57it/s]


	Train Loss:   3.132 | Train PPL:  22.919
	Valid Loss:   3.229 | Valid PPL:  25.266
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
The model has 11,554,816 trainable parameters
EPOCH 1


100%|██████████| 67/67 [00:19<00:00,  3.41it/s]


	Train Loss:   3.875 | Train PPL:  48.165
	Valid Loss:   3.338 | Valid PPL:  28.162
	Valid BLEU:   0.420 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [00:19<00:00,  3.40it/s]


	Train Loss:   3.197 | Train PPL:  24.459
	Valid Loss:   3.246 | Valid PPL:  25.697
	Valid BLEU:   0.438 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [00:19<00:00,  3.39it/s]


	Train Loss:   3.088 | Train PPL:  21.925
	Valid Loss:   3.029 | Valid PPL:  20.678
	Valid BLEU:   0.355 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [00:19<00:00,  3.47it/s]


	Train Loss:   2.895 | Train PPL:  18.084
	Valid Loss:   2.976 | Valid PPL:  19.605
	Valid BLEU:   0.375 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [00:19<00:00,  3.48it/s]


	Train Loss:   2.948 | Train PPL:  19.073
	Valid Loss:   2.996 | Valid PPL:  20.011
	Valid BLEU:   0.390 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [00:19<00:00,  3.44it/s]


	Train Loss:   2.862 | Train PPL:  17.491
	Valid Loss:   2.982 | Valid PPL:  19.719
	Valid BLEU:   0.379 | Valid Accuracy:   0.000
The model has 19,959,808 trainable parameters
EPOCH 1


100%|██████████| 67/67 [00:29<00:00,  2.27it/s]


	Train Loss:   3.823 | Train PPL:  45.735
	Valid Loss:   3.328 | Valid PPL:  27.882
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [00:29<00:00,  2.29it/s]


	Train Loss:   3.236 | Train PPL:  25.439
	Valid Loss:   3.780 | Valid PPL:  43.808
	Valid BLEU:   0.371 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [00:29<00:00,  2.28it/s]


	Train Loss:   3.167 | Train PPL:  23.730
	Valid Loss:   3.241 | Valid PPL:  25.556
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [00:29<00:00,  2.29it/s]


	Train Loss:   3.137 | Train PPL:  23.042
	Valid Loss:   3.228 | Valid PPL:  25.236
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [00:29<00:00,  2.28it/s]


	Train Loss:   3.119 | Train PPL:  22.619
	Valid Loss:   3.221 | Valid PPL:  25.043
	Valid BLEU:   0.438 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [00:29<00:00,  2.27it/s]


	Train Loss:   3.111 | Train PPL:  22.447
	Valid Loss:   3.221 | Valid PPL:  25.047
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
The model has 28,364,800 trainable parameters
EPOCH 1


100%|██████████| 67/67 [00:39<00:00,  1.71it/s]


	Train Loss:   3.807 | Train PPL:  45.030
	Valid Loss:   3.323 | Valid PPL:  27.738
	Valid BLEU:   0.254 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [00:39<00:00,  1.70it/s]


	Train Loss:   3.196 | Train PPL:  24.426
	Valid Loss:   3.260 | Valid PPL:  26.037
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [00:39<00:00,  1.71it/s]


	Train Loss:   3.150 | Train PPL:  23.348
	Valid Loss:   3.241 | Valid PPL:  25.558
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [00:39<00:00,  1.70it/s]


	Train Loss:   3.135 | Train PPL:  22.998
	Valid Loss:   3.233 | Valid PPL:  25.344
	Valid BLEU:   0.438 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [00:39<00:00,  1.71it/s]


	Train Loss:   3.121 | Train PPL:  22.667
	Valid Loss:   3.218 | Valid PPL:  24.986
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [00:39<00:00,  1.70it/s]


	Train Loss:   3.109 | Train PPL:  22.388
	Valid Loss:   3.220 | Valid PPL:  25.037
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
The model has 36,769,792 trainable parameters
EPOCH 1


100%|██████████| 67/67 [00:48<00:00,  1.38it/s]


	Train Loss:   3.854 | Train PPL:  47.203
	Valid Loss:   3.338 | Valid PPL:  28.158
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [00:48<00:00,  1.37it/s]


	Train Loss:   3.198 | Train PPL:  24.474
	Valid Loss:   3.250 | Valid PPL:  25.787
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [00:48<00:00,  1.38it/s]


	Train Loss:   3.147 | Train PPL:  23.274
	Valid Loss:   3.239 | Valid PPL:  25.518
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [00:48<00:00,  1.37it/s]


	Train Loss:   3.132 | Train PPL:  22.927
	Valid Loss:   3.225 | Valid PPL:  25.161
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [00:48<00:00,  1.38it/s]


	Train Loss:   3.120 | Train PPL:  22.653
	Valid Loss:   3.221 | Valid PPL:  25.044
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [00:48<00:00,  1.38it/s]


	Train Loss:   3.108 | Train PPL:  22.380
	Valid Loss:   3.213 | Valid PPL:  24.857
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
The model has 33,591,296 trainable parameters
EPOCH 1


100%|██████████| 67/67 [00:39<00:00,  1.70it/s]


	Train Loss:   3.649 | Train PPL:  38.432
	Valid Loss:   3.298 | Valid PPL:  27.064
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [00:39<00:00,  1.70it/s]


	Train Loss:   3.167 | Train PPL:  23.731
	Valid Loss:   3.246 | Valid PPL:  25.688
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [00:39<00:00,  1.69it/s]


	Train Loss:   3.131 | Train PPL:  22.901
	Valid Loss:   3.234 | Valid PPL:  25.383
	Valid BLEU:   0.438 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [00:39<00:00,  1.69it/s]


	Train Loss:   3.205 | Train PPL:  24.646
	Valid Loss:   3.229 | Valid PPL:  25.265
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [00:39<00:00,  1.69it/s]


	Train Loss:   3.092 | Train PPL:  22.030
	Valid Loss:   3.224 | Valid PPL:  25.134
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [00:39<00:00,  1.69it/s]


	Train Loss:   3.080 | Train PPL:  21.758
	Valid Loss:   3.206 | Valid PPL:  24.685
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
The model has 67,178,496 trainable parameters
EPOCH 1


100%|██████████| 67/67 [01:12<00:00,  1.08s/it]


	Train Loss:   3.611 | Train PPL:  37.005
	Valid Loss:   3.292 | Valid PPL:  26.889
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 2


100%|██████████| 67/67 [01:12<00:00,  1.08s/it]


	Train Loss:   3.174 | Train PPL:  23.891
	Valid Loss:   3.253 | Valid PPL:  25.861
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 3


100%|██████████| 67/67 [01:12<00:00,  1.08s/it]


	Train Loss:   3.139 | Train PPL:  23.071
	Valid Loss:   3.234 | Valid PPL:  25.382
	Valid BLEU:   0.419 | Valid Accuracy:   0.000
EPOCH 4


100%|██████████| 67/67 [01:11<00:00,  1.07s/it]


	Train Loss:   3.139 | Train PPL:  23.075
	Valid Loss:   3.222 | Valid PPL:  25.079
	Valid BLEU:   0.438 | Valid Accuracy:   0.000
EPOCH 5


100%|██████████| 67/67 [01:12<00:00,  1.07s/it]


	Train Loss:   3.107 | Train PPL:  22.356
	Valid Loss:   3.208 | Valid PPL:  24.726
	Valid BLEU:   0.439 | Valid Accuracy:   0.000
EPOCH 6


100%|██████████| 67/67 [01:12<00:00,  1.08s/it]


	Train Loss:   3.093 | Train PPL:  22.040
	Valid Loss:   3.217 | Valid PPL:  24.944
	Valid BLEU:   0.419 | Valid Accuracy:   0.001
The model has 100,765,696 trainable parameters
EPOCH 1


  1%|▏         | 1/67 [00:02<02:18,  2.09s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB (GPU 0; 7.78 GiB total capacity; 5.08 GiB already allocated; 180.62 MiB free; 5.58 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF