In [1]:
from pathlib import Path
import os
import torch
from transformers import MarianMTModel, MarianTokenizer
import logging
from datetime import datetime

In [2]:
# connect to google colab
from google.colab import drive
drive.mount("/content/gdrive")


Mounted at /content/gdrive


In [3]:
print(f'@@Configuration START@@')

@@Configuration START@@


In [4]:
# Define configuration
batch_size = 128
d_model = 256
n_head = 8
max_len = 80
ffn_hidden = 512
n_layers=4
drop_prob=0.1
epochs=30
init_lr = 0.00 # having warmup step
eps = 5e-9
weight_decay = 5e-4
warmup_steps=1500
clip = 1

print(f'batch_size: {batch_size}')
print(f'd_model: {d_model}')
print(f'n_head: {n_head}')
print(f'max_len: {max_len}')
print(f'ffn_hidden: {ffn_hidden}')
print(f'n_layers: {n_layers}')
print(f'drop_prob: {drop_prob}')
print(f'epochs: {epochs}')
print(f'init_lr: {init_lr}')
print(f'weight_decay: {weight_decay}')
print(f'clip: {clip}')

batch_size: 128
d_model: 256
n_head: 8
max_len: 80
ffn_hidden: 512
n_layers: 4
drop_prob: 0.1
epochs: 30
init_lr: 0.0
weight_decay: 0.0005
clip: 1


In [5]:
# Configure Device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA as device")
else:
    # Check that MPS is available
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                  "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
                  "and/or you do not have an MPS-enabled device on this machine.")
        device = torch.device("cpu")
        print("Using CPU as device")
    else:
        device = torch.device("mps")
        print("Using MPS as device")

torch.set_default_device(device)

Using CUDA as device


In [6]:
# Define tokenizers
TOKENIZERS_PARALLELISM = True

kr_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ko-en')
en_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ko-en')

enc_voc_size = kr_tokenizer.vocab_size
dec_voc_size = en_tokenizer.vocab_size

print(f'Using kr_tokenizer: Helsinki-NLP/opus-mt-ko-en')
print(f'kr_tokenizer_voc_size(enc_voc_size): {enc_voc_size}')

print(f'Using en_tokenizer: Helsinki-NLP/opus-mt-ko-en')
print(f'en_tokenizer_voc_size(dec_voc_size): {dec_voc_size}')

# Define some variables that are going to be used in future
src_pad_token = kr_tokenizer.pad_token_id
src_eos_token = kr_tokenizer.eos_token_id

trg_pad_token = en_tokenizer.pad_token_id
trg_sos_token = en_tokenizer.bos_token_id
trg_eos_token = en_tokenizer.eos_token_id

print(f'src_pad_token: {src_pad_token}')
print(f'src_eos_token: {src_eos_token}')
print(f'trg_pad_token: {trg_pad_token}')
print(f'trg_sos_token: {trg_sos_token}')
print(f'trg_eos_token: {trg_eos_token}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



Using kr_tokenizer: Helsinki-NLP/opus-mt-ko-en
kr_tokenizer_voc_size(enc_voc_size): 65001
Using en_tokenizer: Helsinki-NLP/opus-mt-ko-en
en_tokenizer_voc_size(dec_voc_size): 65001
src_pad_token: 65000
src_eos_token: 0
trg_pad_token: 65000
trg_sos_token: None
trg_eos_token: 0


In [7]:
# Define path configuration for the project
project_dir = Path("/content/gdrive/MyDrive/Colab Notebooks/en2kr-Translator")
data_dir = project_dir / "data"
model_dir = project_dir / "models"

data_dir.mkdir(parents=True, exist_ok=True)
model_dir.mkdir(parents=True, exist_ok=True)

print(f'project_dir: {project_dir}')
print(f'data_dir: {data_dir}')
print(f'model_dir: {model_dir}')

project_dir: /content/gdrive/MyDrive/Colab Notebooks/en2kr-Translator
data_dir: /content/gdrive/MyDrive/Colab Notebooks/en2kr-Translator/data
model_dir: /content/gdrive/MyDrive/Colab Notebooks/en2kr-Translator/models


In [8]:
# Configure Logger
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

logging_dir = project_dir / "logs"

logging_dir.mkdir(parents=True, exist_ok=True)

log_file = logging_dir / f'log_{timestamp}.log'

logger = logging.getLogger('transformer_log')
logger.setLevel(logging.INFO)

file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)

In [9]:
# Define learning rate scheduler.
# If you want to modify the logic of Scheduler, please modify this class

class LRScheduler:
    def __init__(self, optimizer, d_model, warmup_steps, LR_scale=1):
        self.optimizer = optimizer
        self.step_count = 0
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.LR_scale = LR_scale
        self._d_model_factor = self.LR_scale * (self.d_model ** -0.5)
    def step(self):
        self.step_count += 1
        lr = self.calculate_learning_rate()
        self.optimizer.param_groups[0]['lr'] = lr
    def calculate_learning_rate(self):
        minimum_factor = min(self.step_count ** -0.5, self.step_count * self.warmup_steps ** -1.5)
        return self._d_model_factor * minimum_factor

In [10]:
print(f'@@Configuration END@@')

@@Configuration END@@


In [11]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch

In [12]:
train_data_path = data_dir / "train.parquet"
test_data_path = data_dir / "test.parquet"

df_train = pd.read_parquet(path=train_data_path)
df_test = pd.read_parquet(path=test_data_path)

class en2kr_Train_Dataset(Dataset):
    def __init__(self, max_len):
        self.data = df_train

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()

        return kr_sentence, en_sentence

class en2kr_Test_Dataset(Dataset):
    def __init__(self, max_len):
        self.data = df_test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()

        return kr_sentence, en_sentence



In [13]:
train_dataset = en2kr_Train_Dataset(max_len=max_len)
test_dataset = en2kr_Test_Dataset(max_len=max_len)

train_dataloader = DataLoader(train_dataset,pin_memory=True, drop_last=True, batch_size=batch_size, shuffle=True, generator=torch.Generator(device=device))
test_dataloader = DataLoader(test_dataset,pin_memory=True, drop_last=True, batch_size=batch_size, generator=torch.Generator(device=device))

# Transformer Model Implementation


In [14]:
# import required packages
import torch
import torch.nn as nn
import math

In [15]:
# Define a Token Embedding
class TokenEmbeddings(nn.Embedding):
    """
    Converting token into embedding vector
    """
    def __init__(self, vocab_size, d_model):
        """
        class for token embedding without positional encoding
        This layer transforms an seq_len token_ids -> (seq_len, d_model)
        Assigning (vector of size d_model) to each tokens

        :param vocab_size: number of vocabs that TokenEmbeddings can handle
        :param d_model: dimension of embedding vector
        """
        super(TokenEmbeddings, self).__init__(vocab_size, d_model, padding_idx=65000)

# Define Positional Encoding
class PositionalEncoding(nn.Module):
    """
    compute reusable sinusoid positional encoding
    """
    def __init__(self, d_model, max_len, device):
        """
        construct sinusoid positional encoding that is going to be reused everytime when it is needed

        :param d_model: dimension of embedding vector
        :param max_len: maximum sequence length of token(a.k.a window size of attention method)
        """
        super(PositionalEncoding, self).__init__()

        # define a max_len * d_model size encoding matrix
        self.encoding = torch.zeros(max_len, d_model, device=device)

        # since positional encoding is not learnable, we turn off the gradient engine
        self.encoding.requires_grad = False

        # define a position at the sequence
        pos = torch.arange(0, max_len, device=device)
        # expand the max_len vector to max_len * 1 matrix
        pos = pos.float().unsqueeze(dim=1)

        _2i = torch.arange(0, d_model, step=2, device=device).float()

        # define a sinusoid positional encoding
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))

    def forward(self, x):
        batch_size, seq_len = x.shape

        return self.encoding[:seq_len, :]

# Define Transformer Embedding
class TransformerEmbedding(nn.Module):
    """
    token embedding + positional encoding
    """
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        """
        initialize the embedding class for word+position embedding

        :param vocab_size: number of vocabs that TokenEmbeddings can handle
        :param d_model: dimension of embedding vector
        :param max_len: maximum sequence length of token(a.k.a window size of attention method)
        :param drop_prob: dropout probability to reduce overfitting
        """
        super(TransformerEmbedding, self).__init__()
        self.token_emb = TokenEmbeddings(vocab_size, d_model)
        self.position_emb = PositionalEncoding(d_model, max_len, device)
        self.dropout = nn.Dropout(p=drop_prob)
        self.scale = torch.sqrt(torch.tensor(d_model, dtype=torch.float32))

    def forward(self, x):
        tok_emb = self.scale * self.token_emb(x)
        pos_emb = self.position_emb(x)

        return self.dropout(tok_emb+pos_emb)


In [16]:
# Define Attention Block
class AttentionBlock(nn.Module):
    """
    compute scale dot product attention for Query, Key, Value
    """
    def __init__(self):
        super(AttentionBlock, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, eps=1e-12):
        batch_size, head, length, d_tensor = k.shape

        # calculate the k_T
        k_T = k.transpose(2, 3)

        # calculate the attention weight
        att_weight = (q @ k_T) / math.sqrt(d_tensor)

        # if there are any masks that needs to be applied
        if mask is not None:
            att_weight = att_weight.masked_fill(mask == 0, -1e10)

        # calculate the softmax
        # att_weight shape: batch_size, head, seq_len_query, seq_len_key
        att_weight = self.softmax(att_weight)

        # att_weight @ v shape: batch_size, head, seq_len_query, d_tensor
        return att_weight @ v, att_weight

# Define MultiHeadAttention Block
class MultiHeadAttentionBlock(nn.Module):
    """
    define multi head attention block using AttentionBlock module
    """
    def __init__(self, d_model, n_head):
        """
        Multi-head self-attention utilize the parallelism of GPU

        :param d_model: dimension of embedding vector
        :param n_head: number of heads
        """
        super(MultiHeadAttentionBlock, self).__init__()
        self.n_head = n_head
        self.attention = AttentionBlock()
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)

        # in the paper, d_v = d_k = d_q
        self.Wv = nn.Linear(d_model, d_model)

        self.Wconcat = nn.Linear(d_model, d_model)

    def split(self, tensor):
        """
        split the tensor by number of head

        :param tensor: tensor of shape batch_size  * seq_len * d_model
        :return: return tensor of shape batch_size * n_head * seq_len * d_tensor
        """
        batch_size, seq_len, d_model = tensor.shape

        d_tensor = d_model // self.n_head

        tensor = tensor.view(batch_size, seq_len, self.n_head, d_tensor).transpose(1, 2)

        return tensor

    def concat(self, tensor):
        """
        concat tensor. Inverse operation of split

        :param tensor: tensor of shape batch_size * n_head * seq_len * d_tensor
        :return: return tensor of shape batch_size * seq_len * d_model
        """
        batch_size, n_head, seq_len, d_tensor = tensor.shape

        d_model = n_head * d_tensor
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        return tensor


    def forward(self, q, k, v, mask=None):
        # apply linear transformation to derive q, k, v
        q, k, v = self.Wq(q), self.Wk(k), self.Wv(v)

        # split the tensor by number of heads
        q, k, v = self.split(q), self.split(k), self.split(v)

        # apply attention to q, k, v
        out, attn_weights = self.attention(q, k, v, mask=mask)

        # current attn_weights shape is batch_size * n_head * q_len * k_len
        # mean it by dim 1
        # eventually changing shape into batch_size * q_len * k_len
        attn_weight = attn_weights.mean(dim=1)

        # concat
        out = self.concat(out)

        # apply concat weight
        out = self.Wconcat(out)
        return out, attn_weight


In [17]:
# define FeedForward Network
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(FeedForwardBlock, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [18]:
# Define Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(drop_prob)

        self.ffn = FeedForwardBlock(d_model, ffn_hidden, drop_prob)
        self.dropout2 = nn.Dropout(drop_prob)

    def forward(self, x, src_mask):
        residual = x
        x = self.norm(x)
        x, attn_weight = self.attention(q=x, k=x, v=x, mask=src_mask)

        x = self.dropout1(x)
        x = self.norm(x + residual)

        residual = x
        x = self.ffn(x)

        x =  self.dropout2(x)
        x = x + residual

        return x, attn_weight

# Define Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.enc_dec_attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = FeedForwardBlock(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask):
        residual = dec
        dec = self.norm1(dec)

        x, attn_weight1 = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)

        x = self.dropout1(x)
        x = self.norm2(x + residual)

        residual = x
        x, attn_weight2 = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
        x = self.dropout2(x)
        x = self.norm3(x + residual)

        residual = x
        x = self.ffn(x)
        x = self.dropout3(x)
        x = x + residual

        return x, attn_weight1, attn_weight2

In [19]:
# Define Encoder Model
class Encoder(nn.Module):
    """
    Encoder for Transformer
    """
    def __init__(self, embedding, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super(Encoder, self).__init__()
        self.emb = embedding
        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob) for _ in range(n_layers)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, src_mask):
        x = self.emb(x)
        attn_weights = []
        # get the mean of attention map batch_size * seq_len_src * seq_len_src
        for layer in self.layers:
            x, attn_weight = layer(x, src_mask)
            attn_weights.append(attn_weight)

        x = self.norm(x)

        return x, torch.mean(torch.stack(attn_weights), dim=0)


class Decoder(nn.Module):
    """
    Decoder for Transformer
    """
    def __init__(self, embedding, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super(Decoder, self).__init__()
        self.emb = embedding

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob) for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.emb(trg)
        attn_weights_1 = []
        attn_weights_2 = []
        for layer in self.layers:
            trg, attn_weight_1, attn_weight_2 = layer(trg, enc_src, trg_mask, src_mask)
            attn_weights_1.append(attn_weight_1)
            attn_weights_2.append(attn_weight_2)

        trg = self.norm(trg)
        output = self.linear(trg)
        return output, torch.mean(torch.stack(attn_weights_1), dim=0), torch.mean(torch.stack(attn_weights_2), dim=0)


# Define Transformer Model
class Transformer(nn.Module):
    """
    Transformer Model
    """
    def __init__(self, src_pad_token, trg_pad_token, enc_voc_size, dec_voc_size, n_head, max_len, d_model, ffn_hidden, n_layers, drop_prob, device):
        """
        Constructing Transformer Model

        :param src_pad_token: embedding vector that represents <pad> in source
        :param trg_pad_token: embedding vector that represents <pad> in target
        :params enc_voc_size: number of vocabs that encoderEmbedder can handle
        :params dec_voc_size: number of vocabs that decoderEmbedder can handle
        :params ffn_hidden: hidden vector dimension for fastfeedforward layer
        :params n_layers: number of EncoderLayer/DecoderLayer used
        :params drop_prob: dropout probability
        """
        super(Transformer, self).__init__()

        self.emb = TransformerEmbedding(d_model=d_model, max_len=max_len, vocab_size=dec_voc_size, drop_prob=drop_prob, device=device)
        self.src_pad_token = src_pad_token
        self.trg_pad_token = trg_pad_token
        self.device = device
        self.encoder = Encoder(embedding=self.emb, d_model=d_model, n_head=n_head, max_len=max_len, ffn_hidden=ffn_hidden, enc_voc_size=enc_voc_size, drop_prob=drop_prob, n_layers=n_layers, device=device)
        self.decoder = Decoder(embedding=self.emb, d_model=d_model, n_head=n_head, max_len=max_len, ffn_hidden=ffn_hidden, dec_voc_size=dec_voc_size, drop_prob=drop_prob, n_layers=n_layers, device=device)

    def make_src_mask(self, src):
        # print(f'src: {src}')
        # print(f'src_pad_token: {self.src_pad_token}')
        # print(f'src != self.src_pad_token: {src != self.src_pad_token}')
        src_mask = (src != self.src_pad_token).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_token).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]

        # make a look-ahead mask using torch.tril
        # [[1 0 0]
        #  [1 1 0]
        #  [1 1 1]]
        trg_sub_mask = torch.tril(torch.ones(trg_len, trg_len)).type(torch.ByteTensor).to(self.device)

        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask


    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src, enc_self_attn_weight = self.encoder(src, src_mask)
        output, dec_self_attn_weight, enc_dec_attn_weight = self.decoder(trg, enc_src, trg_mask, src_mask)

        # current output shape is batch_size * dec_voc_size, which is desirable.
        # We don't need to apply softmax because we are going to use CrossEntropyLoss as loss function
        # which automatically applies log-softmax during calculation

        return output

# Train the Model using datas

In [20]:
from torch.optim import Adam
from datetime import datetime
import torch
from tqdm import tqdm

In [21]:
# Prepare the model
model = Transformer(
    src_pad_token=src_pad_token,
    trg_pad_token=trg_pad_token,
    enc_voc_size=enc_voc_size,
    dec_voc_size=dec_voc_size,
    n_head=n_head,
    max_len=max_len,
    d_model=d_model,
    ffn_hidden=ffn_hidden,
    n_layers=n_layers,
    drop_prob=drop_prob,
    device=device).to(device)

model.train()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

logger.info(f'model parameter #: {count_parameters(model)}')


INFO:transformer_log:model parameter #: 38616041


In [22]:
# Setup optimizer
optimizer = Adam(params=model.parameters(), lr=init_lr, weight_decay=weight_decay, eps=eps, betas=(0.9, 0.98))

# Set Noam Scheduler
scheduler = LRScheduler(optimizer, d_model, warmup_steps)
# Setup loss function for training
loss_func = nn.CrossEntropyLoss(ignore_index=src_pad_token)




In [23]:
# store lr rate history per steps
lr_history = []
# store loss history per steps
train_loss_history = []

In [24]:
def train_epoch(epoch_num):
    model.train()
    train_epoch_loss = 0

    for step, (kr_sentences, en_sentences) in tqdm(enumerate(train_dataloader)):

        # tokenize kr_sentence
        kr_tokenized = kr_tokenizer(kr_sentences, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids

        # tokenize en_sentence
        # make en_sentence start with eos token(this is because current tokenizer don't have an sos token.)
        en_sentences = ['</s> ' + s for s in en_sentences]
        en_tokenized = en_tokenizer(en_sentences, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids


        kr_tokenized = kr_tokenized.to(device)
        en_tokenized = en_tokenized.to(device)

        # out is the dec_voc_size vector
        # during training, we exclude sep token

        # remove eos token if the sentence is too long, and gets truncated.
        # so we can prevent early-stopping(early-eos)
        # out: batch_size * max_len * dec_voc_size
        out = model(kr_tokenized, en_tokenized[:, :-1])

        # remove sos token from en_tokenized when calculating loss because out will not include eos token in front of the sentence.
        # en_tokenized: batch_size * (max_len-1)
        en_tokenized = en_tokenized[:, 1:].to(device)

        # out: batch_size * (max_len - 1) * dec_voc_size
        out = out.permute(0, 2, 1).to(device)

        loss = loss_func(out, en_tokenized)
        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        scheduler.step()

        train_epoch_loss += loss.item()

        if step % 200 == 0:
            print(f'    EPOCH #{epoch_num} STEP #{step} | loss: {loss.item()}, avg_loss: {train_epoch_loss / (step + 1)}')
            logger.info(f'    EPOCH #{epoch_num} STEP #{step} | loss: {loss.item()}, avg_loss: {train_epoch_loss / (step + 1)}')



    train_step_loss = train_epoch_loss / (step+1)
    # After training epoch, do evaluation

    return train_step_loss

In [25]:
# evaluate the model
def evaluate():
    model.eval()
    test_epoch_loss = 0
    test_bleu_loss = 0

    with torch.no_grad():
        for step, (kr_sentences, en_sentences) in tqdm(enumerate(test_dataloader)):
            # tokenize kr_sentence
            kr_tokenized = kr_tokenizer(kr_sentences, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids

            # tokenize en_sentence
            # make en_sentence start with eos token(this is because current tokenizer don't have an sos token.)
            en_sentences = ['</s> ' + s for s in en_sentences]
            en_tokenized = en_tokenizer(en_sentences, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids

            kr_tokenized = kr_tokenized.to(device)
            en_tokenized = en_tokenized.to(device)


            # this does not remove the eos token
            # FIXME
            out = model(kr_tokenized, en_tokenized[:, :-1])


            # remove sos token from en_tokenized when calculating loss because out will not include sos token.
            en_tokenized = en_tokenized[:, 1:].to(device)

            out = out.permute(0, 2, 1).to(device)

            loss = loss_func(out, en_tokenized)
            test_epoch_loss += loss.item()

            # calcuate the bleu
            # TODO
        test_step_loss = test_epoch_loss / (step + 1)
    return test_step_loss

In [26]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

for epoch in range(epochs):
    print(f'Epoch #{epoch} Start: current LR {optimizer.param_groups[0]["lr"]}')
    logger.info(f'Epoch #{epoch} Start: current LR {optimizer.param_groups[0]["lr"]}')

    train_loss = train_epoch(epoch)
    test_loss = evaluate()
    lr_history.append(optimizer.param_groups[0]["lr"])
    train_loss_history.append(train_loss)

    logger.info(f'Epoch #{epoch} End: Train Loss {train_loss}, Test Loss {test_loss}')

    model_path = model_dir / f'model_{timestamp}_{epoch}'
    torch.save(model.state_dict(), model_path)


INFO:transformer_log:Epoch #0 Start: current LR 0.0


Epoch #0 Start: current LR 0.0


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #0 STEP #0 | loss: 11.192879676818848, avg_loss: 11.192879676818848
1it [00:02,  2.64s/it]

    EPOCH #0 STEP #0 | loss: 11.192879676818848, avg_loss: 11.192879676818848


200it [03:31,  1.08s/it]INFO:transformer_log:    EPOCH #0 STEP #200 | loss: 5.794293403625488, avg_loss: 8.4648562545207
201it [03:32,  1.08s/it]

    EPOCH #0 STEP #200 | loss: 5.794293403625488, avg_loss: 8.4648562545207


400it [07:10,  1.09s/it]INFO:transformer_log:    EPOCH #0 STEP #400 | loss: 4.258087635040283, avg_loss: 6.6905902175237415
401it [07:11,  1.09s/it]

    EPOCH #0 STEP #400 | loss: 4.258087635040283, avg_loss: 6.6905902175237415


600it [10:48,  1.10s/it]INFO:transformer_log:    EPOCH #0 STEP #600 | loss: 3.7373459339141846, avg_loss: 5.779763215393473
601it [10:49,  1.09s/it]

    EPOCH #0 STEP #600 | loss: 3.7373459339141846, avg_loss: 5.779763215393473


751it [13:32,  1.08s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #0 End: Train Loss 5.3613887623686605, Test Loss 3.517535721134936
INFO:transformer_log:Epoch #1 Start: current LR 0.0008079473591671584


Epoch #1 Start: current LR 0.0008079473591671584


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #1 STEP #0 | loss: 3.6640496253967285, avg_loss: 3.6640496253967285
1it [00:01,  1.10s/it]

    EPOCH #1 STEP #0 | loss: 3.6640496253967285, avg_loss: 3.6640496253967285


200it [03:37,  1.10s/it]INFO:transformer_log:    EPOCH #1 STEP #200 | loss: 3.357304096221924, avg_loss: 3.4735156979726916
201it [03:39,  1.09s/it]

    EPOCH #1 STEP #200 | loss: 3.357304096221924, avg_loss: 3.4735156979726916


400it [07:15,  1.11s/it]INFO:transformer_log:    EPOCH #1 STEP #400 | loss: 3.248926877975464, avg_loss: 3.3852961687673058
401it [07:16,  1.10s/it]

    EPOCH #1 STEP #400 | loss: 3.248926877975464, avg_loss: 3.3852961687673058


600it [10:53,  1.07s/it]INFO:transformer_log:    EPOCH #1 STEP #600 | loss: 3.164234161376953, avg_loss: 3.3193627098039067
601it [10:54,  1.08s/it]

    EPOCH #1 STEP #600 | loss: 3.164234161376953, avg_loss: 3.3193627098039067


751it [13:37,  1.09s/it]
117it [00:46,  2.49it/s]
INFO:transformer_log:Epoch #1 End: Train Loss 3.2787253085211336, Test Loss 3.007636869055593
INFO:transformer_log:Epoch #2 Start: current LR 0.0016126683068472122


Epoch #2 Start: current LR 0.0016126683068472122


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #2 STEP #0 | loss: 3.0943310260772705, avg_loss: 3.0943310260772705
1it [00:01,  1.06s/it]

    EPOCH #2 STEP #0 | loss: 3.0943310260772705, avg_loss: 3.0943310260772705


200it [03:36,  1.09s/it]INFO:transformer_log:    EPOCH #2 STEP #200 | loss: 3.0065340995788574, avg_loss: 3.0544604353643767
201it [03:38,  1.09s/it]

    EPOCH #2 STEP #200 | loss: 3.0065340995788574, avg_loss: 3.0544604353643767


400it [07:14,  1.10s/it]INFO:transformer_log:    EPOCH #2 STEP #400 | loss: 2.9626917839050293, avg_loss: 3.0245151852729015
401it [07:15,  1.08s/it]

    EPOCH #2 STEP #400 | loss: 2.9626917839050293, avg_loss: 3.0245151852729015


600it [10:51,  1.09s/it]INFO:transformer_log:    EPOCH #2 STEP #600 | loss: 2.8922317028045654, avg_loss: 2.993729266073065
601it [10:52,  1.10s/it]

    EPOCH #2 STEP #600 | loss: 2.8922317028045654, avg_loss: 2.993729266073065


751it [13:35,  1.09s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #2 End: Train Loss 2.9739354075826756, Test Loss 2.7817565681587935
INFO:transformer_log:Epoch #3 Start: current LR 0.0013167381587112536


Epoch #3 Start: current LR 0.0013167381587112536


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #3 STEP #0 | loss: 2.93554949760437, avg_loss: 2.93554949760437
1it [00:01,  1.08s/it]

    EPOCH #3 STEP #0 | loss: 2.93554949760437, avg_loss: 2.93554949760437


200it [03:37,  1.09s/it]INFO:transformer_log:    EPOCH #3 STEP #200 | loss: 2.802155017852783, avg_loss: 2.8477361273409714
201it [03:38,  1.10s/it]

    EPOCH #3 STEP #200 | loss: 2.802155017852783, avg_loss: 2.8477361273409714


400it [07:14,  1.08s/it]INFO:transformer_log:    EPOCH #3 STEP #400 | loss: 2.8594045639038086, avg_loss: 2.82850431444639
401it [07:15,  1.08s/it]

    EPOCH #3 STEP #400 | loss: 2.8594045639038086, avg_loss: 2.82850431444639


600it [10:51,  1.09s/it]INFO:transformer_log:    EPOCH #3 STEP #600 | loss: 2.8233559131622314, avg_loss: 2.8138847402645624
601it [10:52,  1.09s/it]

    EPOCH #3 STEP #600 | loss: 2.8233559131622314, avg_loss: 2.8138847402645624


751it [13:35,  1.09s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #3 End: Train Loss 2.800794782397274, Test Loss 2.6188059933165198
INFO:transformer_log:Epoch #4 Start: current LR 0.0011403286955762918


Epoch #4 Start: current LR 0.0011403286955762918


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #4 STEP #0 | loss: 2.7193191051483154, avg_loss: 2.7193191051483154
1it [00:01,  1.07s/it]

    EPOCH #4 STEP #0 | loss: 2.7193191051483154, avg_loss: 2.7193191051483154


200it [03:36,  1.09s/it]INFO:transformer_log:    EPOCH #4 STEP #200 | loss: 2.750437021255493, avg_loss: 2.708526180751288
201it [03:37,  1.09s/it]

    EPOCH #4 STEP #200 | loss: 2.750437021255493, avg_loss: 2.708526180751288


400it [07:13,  1.08s/it]INFO:transformer_log:    EPOCH #4 STEP #400 | loss: 2.6326487064361572, avg_loss: 2.692179409345784
401it [07:14,  1.08s/it]

    EPOCH #4 STEP #400 | loss: 2.6326487064361572, avg_loss: 2.692179409345784


600it [10:49,  1.08s/it]INFO:transformer_log:    EPOCH #4 STEP #600 | loss: 2.6497390270233154, avg_loss: 2.676328188567709
601it [10:51,  1.08s/it]

    EPOCH #4 STEP #600 | loss: 2.6497390270233154, avg_loss: 2.676328188567709


751it [13:33,  1.08s/it]
117it [00:46,  2.54it/s]
INFO:transformer_log:Epoch #4 End: Train Loss 2.6663938238522342, Test Loss 2.476601083054502
INFO:transformer_log:Epoch #5 Start: current LR 0.001019940992000901


Epoch #5 Start: current LR 0.001019940992000901


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #5 STEP #0 | loss: 2.598076105117798, avg_loss: 2.598076105117798
1it [00:01,  1.15s/it]

    EPOCH #5 STEP #0 | loss: 2.598076105117798, avg_loss: 2.598076105117798


200it [03:36,  1.08s/it]INFO:transformer_log:    EPOCH #5 STEP #200 | loss: 2.614712953567505, avg_loss: 2.5888289254696213
201it [03:37,  1.08s/it]

    EPOCH #5 STEP #200 | loss: 2.614712953567505, avg_loss: 2.5888289254696213


400it [07:12,  1.08s/it]INFO:transformer_log:    EPOCH #5 STEP #400 | loss: 2.604459762573242, avg_loss: 2.5794092193803286
401it [07:14,  1.07s/it]

    EPOCH #5 STEP #400 | loss: 2.604459762573242, avg_loss: 2.5794092193803286


600it [10:49,  1.08s/it]INFO:transformer_log:    EPOCH #5 STEP #600 | loss: 2.529207468032837, avg_loss: 2.5710525917332503
601it [10:50,  1.10s/it]

    EPOCH #5 STEP #600 | loss: 2.529207468032837, avg_loss: 2.5710525917332503


751it [13:32,  1.08s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #5 End: Train Loss 2.566030092467957, Test Loss 2.3918925289414887
INFO:transformer_log:Epoch #6 Start: current LR 0.0009310744810718159


Epoch #6 Start: current LR 0.0009310744810718159


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #6 STEP #0 | loss: 2.546046495437622, avg_loss: 2.546046495437622
1it [00:01,  1.08s/it]

    EPOCH #6 STEP #0 | loss: 2.546046495437622, avg_loss: 2.546046495437622


200it [03:36,  1.10s/it]INFO:transformer_log:    EPOCH #6 STEP #200 | loss: 2.5118184089660645, avg_loss: 2.5106667440329025
201it [03:37,  1.08s/it]

    EPOCH #6 STEP #200 | loss: 2.5118184089660645, avg_loss: 2.5106667440329025


400it [07:12,  1.11s/it]INFO:transformer_log:    EPOCH #6 STEP #400 | loss: 2.5679330825805664, avg_loss: 2.509803544850718
401it [07:13,  1.11s/it]

    EPOCH #6 STEP #400 | loss: 2.5679330825805664, avg_loss: 2.509803544850718


600it [10:48,  1.07s/it]INFO:transformer_log:    EPOCH #6 STEP #600 | loss: 2.5690512657165527, avg_loss: 2.504730463424657
601it [10:49,  1.07s/it]

    EPOCH #6 STEP #600 | loss: 2.5690512657165527, avg_loss: 2.504730463424657


751it [13:31,  1.08s/it]
117it [00:46,  2.53it/s]
INFO:transformer_log:Epoch #6 End: Train Loss 2.5004557928296127, Test Loss 2.326510859350873
INFO:transformer_log:Epoch #7 Start: current LR 0.0008620074689615852


Epoch #7 Start: current LR 0.0008620074689615852


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #7 STEP #0 | loss: 2.513542652130127, avg_loss: 2.513542652130127
1it [00:01,  1.07s/it]

    EPOCH #7 STEP #0 | loss: 2.513542652130127, avg_loss: 2.513542652130127


200it [03:36,  1.07s/it]INFO:transformer_log:    EPOCH #7 STEP #200 | loss: 2.527355194091797, avg_loss: 2.4584266961510504
201it [03:37,  1.08s/it]

    EPOCH #7 STEP #200 | loss: 2.527355194091797, avg_loss: 2.4584266961510504


400it [07:12,  1.08s/it]INFO:transformer_log:    EPOCH #7 STEP #400 | loss: 2.546833038330078, avg_loss: 2.459385071609383
401it [07:13,  1.08s/it]

    EPOCH #7 STEP #400 | loss: 2.546833038330078, avg_loss: 2.459385071609383


600it [10:48,  1.08s/it]INFO:transformer_log:    EPOCH #7 STEP #600 | loss: 2.459254503250122, avg_loss: 2.456580246546106
601it [10:49,  1.08s/it]

    EPOCH #7 STEP #600 | loss: 2.459254503250122, avg_loss: 2.456580246546106


751it [13:31,  1.08s/it]
117it [00:46,  2.54it/s]
INFO:transformer_log:Epoch #7 End: Train Loss 2.454974147514719, Test Loss 2.286653400486351
INFO:transformer_log:Epoch #8 Start: current LR 0.0008063341534236061


Epoch #8 Start: current LR 0.0008063341534236061


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #8 STEP #0 | loss: 2.3810677528381348, avg_loss: 2.3810677528381348
1it [00:01,  1.08s/it]

    EPOCH #8 STEP #0 | loss: 2.3810677528381348, avg_loss: 2.3810677528381348


200it [03:36,  1.06s/it]INFO:transformer_log:    EPOCH #8 STEP #200 | loss: 2.3813986778259277, avg_loss: 2.4223272373427207
201it [03:37,  1.06s/it]

    EPOCH #8 STEP #200 | loss: 2.3813986778259277, avg_loss: 2.4223272373427207


400it [07:12,  1.08s/it]INFO:transformer_log:    EPOCH #8 STEP #400 | loss: 2.3053183555603027, avg_loss: 2.421563883374754
401it [07:13,  1.09s/it]

    EPOCH #8 STEP #400 | loss: 2.3053183555603027, avg_loss: 2.421563883374754


600it [10:48,  1.09s/it]INFO:transformer_log:    EPOCH #8 STEP #600 | loss: 2.3699846267700195, avg_loss: 2.4220751525955073
601it [10:49,  1.10s/it]

    EPOCH #8 STEP #600 | loss: 2.3699846267700195, avg_loss: 2.4220751525955073


751it [13:31,  1.08s/it]
117it [00:46,  2.53it/s]
INFO:transformer_log:Epoch #8 End: Train Loss 2.42227483367158, Test Loss 2.25637934758113
INFO:transformer_log:Epoch #9 Start: current LR 0.0007602191303841945


Epoch #9 Start: current LR 0.0007602191303841945


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #9 STEP #0 | loss: 2.3033785820007324, avg_loss: 2.3033785820007324
1it [00:01,  1.07s/it]

    EPOCH #9 STEP #0 | loss: 2.3033785820007324, avg_loss: 2.3033785820007324


200it [03:36,  1.09s/it]INFO:transformer_log:    EPOCH #9 STEP #200 | loss: 2.3482370376586914, avg_loss: 2.39664029956457
201it [03:37,  1.08s/it]

    EPOCH #9 STEP #200 | loss: 2.3482370376586914, avg_loss: 2.39664029956457


400it [07:11,  1.07s/it]INFO:transformer_log:    EPOCH #9 STEP #400 | loss: 2.407325029373169, avg_loss: 2.400298870709769
401it [07:13,  1.09s/it]

    EPOCH #9 STEP #400 | loss: 2.407325029373169, avg_loss: 2.400298870709769


600it [10:47,  1.08s/it]INFO:transformer_log:    EPOCH #9 STEP #600 | loss: 2.367227077484131, avg_loss: 2.397463801299871
601it [10:48,  1.08s/it]

    EPOCH #9 STEP #600 | loss: 2.367227077484131, avg_loss: 2.397463801299871


751it [13:30,  1.08s/it]
117it [00:46,  2.53it/s]
INFO:transformer_log:Epoch #9 End: Train Loss 2.3973726851644908, Test Loss 2.22817685869005
INFO:transformer_log:Epoch #10 Start: current LR 0.0007212071918539712


Epoch #10 Start: current LR 0.0007212071918539712


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #10 STEP #0 | loss: 2.3131399154663086, avg_loss: 2.3131399154663086
1it [00:01,  1.05s/it]

    EPOCH #10 STEP #0 | loss: 2.3131399154663086, avg_loss: 2.3131399154663086


200it [03:36,  1.07s/it]INFO:transformer_log:    EPOCH #10 STEP #200 | loss: 2.4162118434906006, avg_loss: 2.369162501387335
201it [03:37,  1.07s/it]

    EPOCH #10 STEP #200 | loss: 2.4162118434906006, avg_loss: 2.369162501387335


400it [07:11,  1.09s/it]INFO:transformer_log:    EPOCH #10 STEP #400 | loss: 2.2996764183044434, avg_loss: 2.3725855427787192
401it [07:12,  1.08s/it]

    EPOCH #10 STEP #400 | loss: 2.2996764183044434, avg_loss: 2.3725855427787192


600it [10:47,  1.09s/it]INFO:transformer_log:    EPOCH #10 STEP #600 | loss: 2.3802244663238525, avg_loss: 2.375287556211881
601it [10:48,  1.10s/it]

    EPOCH #10 STEP #600 | loss: 2.3802244663238525, avg_loss: 2.375287556211881


751it [13:30,  1.08s/it]
117it [00:46,  2.51it/s]
INFO:transformer_log:Epoch #10 End: Train Loss 2.376867717814033, Test Loss 2.2078211470546885
INFO:transformer_log:Epoch #11 Start: current LR 0.00068764407652763


Epoch #11 Start: current LR 0.00068764407652763


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #11 STEP #0 | loss: 2.3187649250030518, avg_loss: 2.3187649250030518
1it [00:01,  1.07s/it]

    EPOCH #11 STEP #0 | loss: 2.3187649250030518, avg_loss: 2.3187649250030518


200it [03:36,  1.07s/it]INFO:transformer_log:    EPOCH #11 STEP #200 | loss: 2.3160223960876465, avg_loss: 2.350449442270383
201it [03:37,  1.07s/it]

    EPOCH #11 STEP #200 | loss: 2.3160223960876465, avg_loss: 2.350449442270383


400it [07:11,  1.09s/it]INFO:transformer_log:    EPOCH #11 STEP #400 | loss: 2.3689560890197754, avg_loss: 2.356463586303064
401it [07:12,  1.09s/it]

    EPOCH #11 STEP #400 | loss: 2.3689560890197754, avg_loss: 2.356463586303064


600it [10:47,  1.08s/it]INFO:transformer_log:    EPOCH #11 STEP #600 | loss: 2.434105157852173, avg_loss: 2.359317590710327
601it [10:48,  1.09s/it]

    EPOCH #11 STEP #600 | loss: 2.434105157852173, avg_loss: 2.359317590710327


751it [13:29,  1.08s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #11 End: Train Loss 2.360064449069027, Test Loss 2.1942401462131076
INFO:transformer_log:Epoch #12 Start: current LR 0.0006583690793556268


Epoch #12 Start: current LR 0.0006583690793556268


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #12 STEP #0 | loss: 2.339672565460205, avg_loss: 2.339672565460205
1it [00:01,  1.08s/it]

    EPOCH #12 STEP #0 | loss: 2.339672565460205, avg_loss: 2.339672565460205


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #12 STEP #200 | loss: 2.3094494342803955, avg_loss: 2.3340278229310143
201it [03:36,  1.07s/it]

    EPOCH #12 STEP #200 | loss: 2.3094494342803955, avg_loss: 2.3340278229310143


400it [07:10,  1.08s/it]INFO:transformer_log:    EPOCH #12 STEP #400 | loss: 2.3554375171661377, avg_loss: 2.3430111955228887
401it [07:11,  1.08s/it]

    EPOCH #12 STEP #400 | loss: 2.3554375171661377, avg_loss: 2.3430111955228887


600it [10:46,  1.09s/it]INFO:transformer_log:    EPOCH #12 STEP #600 | loss: 2.2718756198883057, avg_loss: 2.344238138040171
601it [10:47,  1.09s/it]

    EPOCH #12 STEP #600 | loss: 2.2718756198883057, avg_loss: 2.344238138040171


751it [13:28,  1.08s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #12 End: Train Loss 2.3460400914066484, Test Loss 2.1765591812948895
INFO:transformer_log:Epoch #13 Start: current LR 0.0006325405511974286


Epoch #13 Start: current LR 0.0006325405511974286


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #13 STEP #0 | loss: 2.290393829345703, avg_loss: 2.290393829345703
1it [00:01,  1.09s/it]

    EPOCH #13 STEP #0 | loss: 2.290393829345703, avg_loss: 2.290393829345703


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #13 STEP #200 | loss: 2.3801331520080566, avg_loss: 2.323786180410812
201it [03:36,  1.07s/it]

    EPOCH #13 STEP #200 | loss: 2.3801331520080566, avg_loss: 2.323786180410812


400it [07:10,  1.08s/it]INFO:transformer_log:    EPOCH #13 STEP #400 | loss: 2.3452816009521484, avg_loss: 2.331804698244889
401it [07:11,  1.07s/it]

    EPOCH #13 STEP #400 | loss: 2.3452816009521484, avg_loss: 2.331804698244889


600it [10:46,  1.08s/it]INFO:transformer_log:    EPOCH #13 STEP #600 | loss: 2.2771265506744385, avg_loss: 2.3314522820185504
601it [10:47,  1.10s/it]

    EPOCH #13 STEP #600 | loss: 2.2771265506744385, avg_loss: 2.3314522820185504


751it [13:29,  1.08s/it]
117it [00:46,  2.51it/s]
INFO:transformer_log:Epoch #13 End: Train Loss 2.333092386649547, Test Loss 2.1646342725835295
INFO:transformer_log:Epoch #14 Start: current LR 0.0006095313267361893


Epoch #14 Start: current LR 0.0006095313267361893


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #14 STEP #0 | loss: 2.303311824798584, avg_loss: 2.303311824798584
1it [00:01,  1.08s/it]

    EPOCH #14 STEP #0 | loss: 2.303311824798584, avg_loss: 2.303311824798584


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #14 STEP #200 | loss: 2.2706010341644287, avg_loss: 2.3190982899262536
201it [03:36,  1.07s/it]

    EPOCH #14 STEP #200 | loss: 2.2706010341644287, avg_loss: 2.3190982899262536


400it [07:10,  1.08s/it]INFO:transformer_log:    EPOCH #14 STEP #400 | loss: 2.325218439102173, avg_loss: 2.321465338257483
401it [07:11,  1.08s/it]

    EPOCH #14 STEP #400 | loss: 2.325218439102173, avg_loss: 2.321465338257483


600it [10:46,  1.08s/it]INFO:transformer_log:    EPOCH #14 STEP #600 | loss: 2.3415634632110596, avg_loss: 2.322407292050252
601it [10:47,  1.08s/it]

    EPOCH #14 STEP #600 | loss: 2.3415634632110596, avg_loss: 2.322407292050252


751it [13:28,  1.08s/it]
117it [00:46,  2.50it/s]
INFO:transformer_log:Epoch #14 End: Train Loss 2.3228148394989745, Test Loss 2.1538144258352427
INFO:transformer_log:Epoch #15 Start: current LR 0.000588863206289254


Epoch #15 Start: current LR 0.000588863206289254


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #15 STEP #0 | loss: 2.2336416244506836, avg_loss: 2.2336416244506836
1it [00:01,  1.09s/it]

    EPOCH #15 STEP #0 | loss: 2.2336416244506836, avg_loss: 2.2336416244506836


200it [03:35,  1.06s/it]INFO:transformer_log:    EPOCH #15 STEP #200 | loss: 2.31746768951416, avg_loss: 2.3029082865264283
201it [03:36,  1.07s/it]

    EPOCH #15 STEP #200 | loss: 2.31746768951416, avg_loss: 2.3029082865264283


400it [07:11,  1.08s/it]INFO:transformer_log:    EPOCH #15 STEP #400 | loss: 2.3472044467926025, avg_loss: 2.307257553585747
401it [07:12,  1.09s/it]

    EPOCH #15 STEP #400 | loss: 2.3472044467926025, avg_loss: 2.307257553585747


600it [10:47,  1.08s/it]INFO:transformer_log:    EPOCH #15 STEP #600 | loss: 2.290898323059082, avg_loss: 2.3102143532821224
601it [10:48,  1.08s/it]

    EPOCH #15 STEP #600 | loss: 2.290898323059082, avg_loss: 2.3102143532821224


751it [13:30,  1.08s/it]
117it [00:46,  2.53it/s]
INFO:transformer_log:Epoch #15 End: Train Loss 2.3127788205280124, Test Loss 2.1454758766369944
INFO:transformer_log:Epoch #16 Start: current LR 0.0005701643477881459


Epoch #16 Start: current LR 0.0005701643477881459


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #16 STEP #0 | loss: 2.250819444656372, avg_loss: 2.250819444656372
1it [00:01,  1.08s/it]

    EPOCH #16 STEP #0 | loss: 2.250819444656372, avg_loss: 2.250819444656372


200it [03:35,  1.08s/it]INFO:transformer_log:    EPOCH #16 STEP #200 | loss: 2.3250396251678467, avg_loss: 2.2978903108568334
201it [03:37,  1.09s/it]

    EPOCH #16 STEP #200 | loss: 2.3250396251678467, avg_loss: 2.2978903108568334


400it [07:11,  1.07s/it]INFO:transformer_log:    EPOCH #16 STEP #400 | loss: 2.2670650482177734, avg_loss: 2.3015007895424477
401it [07:12,  1.06s/it]

    EPOCH #16 STEP #400 | loss: 2.2670650482177734, avg_loss: 2.3015007895424477


600it [10:47,  1.09s/it]INFO:transformer_log:    EPOCH #16 STEP #600 | loss: 2.3461990356445312, avg_loss: 2.302342875031584
601it [10:48,  1.11s/it]

    EPOCH #16 STEP #600 | loss: 2.3461990356445312, avg_loss: 2.302342875031584


751it [13:31,  1.08s/it]
117it [00:46,  2.53it/s]
INFO:transformer_log:Epoch #16 End: Train Loss 2.303367731256904, Test Loss 2.1331202566114245
INFO:transformer_log:Epoch #17 Start: current LR 0.0005531406658569243


Epoch #17 Start: current LR 0.0005531406658569243


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #17 STEP #0 | loss: 2.280168056488037, avg_loss: 2.280168056488037
1it [00:01,  1.11s/it]

    EPOCH #17 STEP #0 | loss: 2.280168056488037, avg_loss: 2.280168056488037


200it [03:35,  1.08s/it]INFO:transformer_log:    EPOCH #17 STEP #200 | loss: 2.271796703338623, avg_loss: 2.286269983842005
201it [03:37,  1.08s/it]

    EPOCH #17 STEP #200 | loss: 2.271796703338623, avg_loss: 2.286269983842005


400it [07:11,  1.07s/it]INFO:transformer_log:    EPOCH #17 STEP #400 | loss: 2.274602174758911, avg_loss: 2.2908872202447523
401it [07:12,  1.07s/it]

    EPOCH #17 STEP #400 | loss: 2.274602174758911, avg_loss: 2.2908872202447523


600it [10:47,  1.10s/it]INFO:transformer_log:    EPOCH #17 STEP #600 | loss: 2.319093704223633, avg_loss: 2.2933179526876493
601it [10:48,  1.11s/it]

    EPOCH #17 STEP #600 | loss: 2.319093704223633, avg_loss: 2.2933179526876493


751it [13:30,  1.08s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #17 End: Train Loss 2.294818141647725, Test Loss 2.1259492421761537
INFO:transformer_log:Epoch #18 Start: current LR 0.000537556102282404


Epoch #18 Start: current LR 0.000537556102282404


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #18 STEP #0 | loss: 2.2741899490356445, avg_loss: 2.2741899490356445
1it [00:01,  1.05s/it]

    EPOCH #18 STEP #0 | loss: 2.2741899490356445, avg_loss: 2.2741899490356445


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #18 STEP #200 | loss: 2.287102460861206, avg_loss: 2.275792684128035
201it [03:36,  1.09s/it]

    EPOCH #18 STEP #200 | loss: 2.287102460861206, avg_loss: 2.275792684128035


400it [07:10,  1.06s/it]INFO:transformer_log:    EPOCH #18 STEP #400 | loss: 2.3129820823669434, avg_loss: 2.282081961929055
401it [07:12,  1.06s/it]

    EPOCH #18 STEP #400 | loss: 2.3129820823669434, avg_loss: 2.282081961929055


600it [10:46,  1.08s/it]INFO:transformer_log:    EPOCH #18 STEP #600 | loss: 2.3266544342041016, avg_loss: 2.285452700692683
601it [10:47,  1.09s/it]

    EPOCH #18 STEP #600 | loss: 2.3266544342041016, avg_loss: 2.285452700692683


751it [13:28,  1.08s/it]
117it [00:46,  2.50it/s]
INFO:transformer_log:Epoch #18 End: Train Loss 2.2873931318402447, Test Loss 2.1174146181497817
INFO:transformer_log:Epoch #19 Start: current LR 0.0005232186890985908


Epoch #19 Start: current LR 0.0005232186890985908


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #19 STEP #0 | loss: 2.1636691093444824, avg_loss: 2.1636691093444824
1it [00:01,  1.07s/it]

    EPOCH #19 STEP #0 | loss: 2.1636691093444824, avg_loss: 2.1636691093444824


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #19 STEP #200 | loss: 2.2613890171051025, avg_loss: 2.274138622616061
201it [03:36,  1.08s/it]

    EPOCH #19 STEP #200 | loss: 2.2613890171051025, avg_loss: 2.274138622616061


400it [07:10,  1.07s/it]INFO:transformer_log:    EPOCH #19 STEP #400 | loss: 2.1894822120666504, avg_loss: 2.2825956522972506
401it [07:11,  1.07s/it]

    EPOCH #19 STEP #400 | loss: 2.1894822120666504, avg_loss: 2.2825956522972506


600it [10:45,  1.07s/it]INFO:transformer_log:    EPOCH #19 STEP #600 | loss: 2.230677604675293, avg_loss: 2.2796501173155876
601it [10:46,  1.07s/it]

    EPOCH #19 STEP #600 | loss: 2.230677604675293, avg_loss: 2.2796501173155876


751it [13:28,  1.08s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #19 End: Train Loss 2.280840906417481, Test Loss 2.1093885969911885
INFO:transformer_log:Epoch #20 Start: current LR 0.0005099704960004505


Epoch #20 Start: current LR 0.0005099704960004505


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #20 STEP #0 | loss: 2.3570659160614014, avg_loss: 2.3570659160614014
1it [00:01,  1.06s/it]

    EPOCH #20 STEP #0 | loss: 2.3570659160614014, avg_loss: 2.3570659160614014


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #20 STEP #200 | loss: 2.2301692962646484, avg_loss: 2.263437144198821
201it [03:36,  1.07s/it]

    EPOCH #20 STEP #200 | loss: 2.2301692962646484, avg_loss: 2.263437144198821


400it [07:11,  1.08s/it]INFO:transformer_log:    EPOCH #20 STEP #400 | loss: 2.213413715362549, avg_loss: 2.268369811431428
401it [07:12,  1.09s/it]

    EPOCH #20 STEP #400 | loss: 2.213413715362549, avg_loss: 2.268369811431428


600it [10:47,  1.08s/it]INFO:transformer_log:    EPOCH #20 STEP #600 | loss: 2.3123791217803955, avg_loss: 2.2720053977458528
601it [10:48,  1.07s/it]

    EPOCH #20 STEP #600 | loss: 2.3123791217803955, avg_loss: 2.2720053977458528


751it [13:31,  1.08s/it]
117it [00:46,  2.53it/s]
INFO:transformer_log:Epoch #20 End: Train Loss 2.273905244870446, Test Loss 2.1068759473979983
INFO:transformer_log:Epoch #21 Start: current LR 0.0004976802442484392


Epoch #21 Start: current LR 0.0004976802442484392


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #21 STEP #0 | loss: 2.1625354290008545, avg_loss: 2.1625354290008545
1it [00:01,  1.06s/it]

    EPOCH #21 STEP #0 | loss: 2.1625354290008545, avg_loss: 2.1625354290008545


200it [03:35,  1.09s/it]INFO:transformer_log:    EPOCH #21 STEP #200 | loss: 2.1870782375335693, avg_loss: 2.2585325347843455
201it [03:36,  1.10s/it]

    EPOCH #21 STEP #200 | loss: 2.1870782375335693, avg_loss: 2.2585325347843455


400it [07:11,  1.07s/it]INFO:transformer_log:    EPOCH #21 STEP #400 | loss: 2.274221658706665, avg_loss: 2.2618035342627927
401it [07:12,  1.08s/it]

    EPOCH #21 STEP #400 | loss: 2.274221658706665, avg_loss: 2.2618035342627927


600it [10:46,  1.09s/it]INFO:transformer_log:    EPOCH #21 STEP #600 | loss: 2.1730995178222656, avg_loss: 2.2662681390759154
601it [10:47,  1.09s/it]

    EPOCH #21 STEP #600 | loss: 2.1730995178222656, avg_loss: 2.2662681390759154


751it [13:29,  1.08s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #21 End: Train Loss 2.268457015884542, Test Loss 2.0971909190854454
INFO:transformer_log:Epoch #22 Start: current LR 0.0004862377895554484


Epoch #22 Start: current LR 0.0004862377895554484


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #22 STEP #0 | loss: 2.259512424468994, avg_loss: 2.259512424468994
1it [00:01,  1.08s/it]

    EPOCH #22 STEP #0 | loss: 2.259512424468994, avg_loss: 2.259512424468994


200it [03:35,  1.06s/it]INFO:transformer_log:    EPOCH #22 STEP #200 | loss: 2.2242183685302734, avg_loss: 2.2510370745587704
201it [03:36,  1.06s/it]

    EPOCH #22 STEP #200 | loss: 2.2242183685302734, avg_loss: 2.2510370745587704


400it [07:11,  1.07s/it]INFO:transformer_log:    EPOCH #22 STEP #400 | loss: 2.24467134475708, avg_loss: 2.257526742549906
401it [07:12,  1.06s/it]

    EPOCH #22 STEP #400 | loss: 2.24467134475708, avg_loss: 2.257526742549906


600it [10:46,  1.09s/it]INFO:transformer_log:    EPOCH #22 STEP #600 | loss: 2.3237271308898926, avg_loss: 2.2612639449400436
601it [10:48,  1.10s/it]

    EPOCH #22 STEP #600 | loss: 2.3237271308898926, avg_loss: 2.2612639449400436


751it [13:29,  1.08s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #22 End: Train Loss 2.262025714079327, Test Loss 2.091952710070162
INFO:transformer_log:Epoch #23 Start: current LR 0.0004755499395811177


Epoch #23 Start: current LR 0.0004755499395811177


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #23 STEP #0 | loss: 2.216994047164917, avg_loss: 2.216994047164917
1it [00:01,  1.09s/it]

    EPOCH #23 STEP #0 | loss: 2.216994047164917, avg_loss: 2.216994047164917


200it [03:35,  1.08s/it]INFO:transformer_log:    EPOCH #23 STEP #200 | loss: 2.2484428882598877, avg_loss: 2.2473377446037026
201it [03:36,  1.09s/it]

    EPOCH #23 STEP #200 | loss: 2.2484428882598877, avg_loss: 2.2473377446037026


400it [07:10,  1.07s/it]INFO:transformer_log:    EPOCH #23 STEP #400 | loss: 2.2096409797668457, avg_loss: 2.250634179745529
401it [07:11,  1.07s/it]

    EPOCH #23 STEP #400 | loss: 2.2096409797668457, avg_loss: 2.250634179745529


600it [10:45,  1.08s/it]INFO:transformer_log:    EPOCH #23 STEP #600 | loss: 2.3178629875183105, avg_loss: 2.2544830964131286
601it [10:47,  1.08s/it]

    EPOCH #23 STEP #600 | loss: 2.3178629875183105, avg_loss: 2.2544830964131286


751it [13:28,  1.08s/it]
117it [00:46,  2.54it/s]
INFO:transformer_log:Epoch #23 End: Train Loss 2.256232444836836, Test Loss 2.0866300927268133
INFO:transformer_log:Epoch #24 Start: current LR 0.00046553724053590795


Epoch #24 Start: current LR 0.00046553724053590795


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #24 STEP #0 | loss: 2.281357526779175, avg_loss: 2.281357526779175
1it [00:01,  1.07s/it]

    EPOCH #24 STEP #0 | loss: 2.281357526779175, avg_loss: 2.281357526779175


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #24 STEP #200 | loss: 2.2679409980773926, avg_loss: 2.245059328885814
201it [03:36,  1.07s/it]

    EPOCH #24 STEP #200 | loss: 2.2679409980773926, avg_loss: 2.245059328885814


400it [07:10,  1.08s/it]INFO:transformer_log:    EPOCH #24 STEP #400 | loss: 2.252469301223755, avg_loss: 2.246543173778087
401it [07:11,  1.08s/it]

    EPOCH #24 STEP #400 | loss: 2.252469301223755, avg_loss: 2.246543173778087


600it [10:45,  1.07s/it]INFO:transformer_log:    EPOCH #24 STEP #600 | loss: 2.2326927185058594, avg_loss: 2.250105325076822
601it [10:46,  1.06s/it]

    EPOCH #24 STEP #600 | loss: 2.2326927185058594, avg_loss: 2.250105325076822


751it [13:27,  1.08s/it]
117it [00:46,  2.50it/s]
INFO:transformer_log:Epoch #24 End: Train Loss 2.2509631796302236, Test Loss 2.0784980792265673
INFO:transformer_log:Epoch #25 Start: current LR 0.0004561314782305167


Epoch #25 Start: current LR 0.0004561314782305167


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #25 STEP #0 | loss: 2.2045509815216064, avg_loss: 2.2045509815216064
1it [00:01,  1.06s/it]

    EPOCH #25 STEP #0 | loss: 2.2045509815216064, avg_loss: 2.2045509815216064


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #25 STEP #200 | loss: 2.294724464416504, avg_loss: 2.235782070539484
201it [03:36,  1.08s/it]

    EPOCH #25 STEP #200 | loss: 2.294724464416504, avg_loss: 2.235782070539484


400it [07:10,  1.07s/it]INFO:transformer_log:    EPOCH #25 STEP #400 | loss: 2.239886999130249, avg_loss: 2.2409254285760056
401it [07:11,  1.06s/it]

    EPOCH #25 STEP #400 | loss: 2.239886999130249, avg_loss: 2.2409254285760056


600it [10:45,  1.10s/it]INFO:transformer_log:    EPOCH #25 STEP #600 | loss: 2.2222914695739746, avg_loss: 2.24394391935796
601it [10:46,  1.08s/it]

    EPOCH #25 STEP #600 | loss: 2.2222914695739746, avg_loss: 2.24394391935796


751it [13:27,  1.08s/it]
117it [00:46,  2.52it/s]
INFO:transformer_log:Epoch #25 End: Train Loss 2.2462209525025796, Test Loss 2.078074803719154
INFO:transformer_log:Epoch #26 Start: current LR 0.0004472737131271783


Epoch #26 Start: current LR 0.0004472737131271783


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #26 STEP #0 | loss: 2.223590612411499, avg_loss: 2.223590612411499
1it [00:01,  1.08s/it]

    EPOCH #26 STEP #0 | loss: 2.223590612411499, avg_loss: 2.223590612411499


200it [03:35,  1.06s/it]INFO:transformer_log:    EPOCH #26 STEP #200 | loss: 2.275543212890625, avg_loss: 2.229415746470589
201it [03:36,  1.07s/it]

    EPOCH #26 STEP #200 | loss: 2.275543212890625, avg_loss: 2.229415746470589


400it [07:10,  1.06s/it]INFO:transformer_log:    EPOCH #26 STEP #400 | loss: 2.2900888919830322, avg_loss: 2.2354471909435016
401it [07:11,  1.06s/it]

    EPOCH #26 STEP #400 | loss: 2.2900888919830322, avg_loss: 2.2354471909435016


600it [10:44,  1.08s/it]INFO:transformer_log:    EPOCH #26 STEP #600 | loss: 2.2595419883728027, avg_loss: 2.2393146957613266
601it [10:45,  1.08s/it]

    EPOCH #26 STEP #600 | loss: 2.2595419883728027, avg_loss: 2.2393146957613266


751it [13:27,  1.07s/it]
117it [00:46,  2.54it/s]
INFO:transformer_log:Epoch #26 End: Train Loss 2.2410100287349817, Test Loss 2.0736853261279244
INFO:transformer_log:Epoch #27 Start: current LR 0.0004389127195704179


Epoch #27 Start: current LR 0.0004389127195704179


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #27 STEP #0 | loss: 2.1538331508636475, avg_loss: 2.1538331508636475
1it [00:01,  1.08s/it]

    EPOCH #27 STEP #0 | loss: 2.1538331508636475, avg_loss: 2.1538331508636475


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #27 STEP #200 | loss: 2.258121967315674, avg_loss: 2.223795948930048
201it [03:36,  1.07s/it]

    EPOCH #27 STEP #200 | loss: 2.258121967315674, avg_loss: 2.223795948930048


400it [07:09,  1.06s/it]INFO:transformer_log:    EPOCH #27 STEP #400 | loss: 2.341005325317383, avg_loss: 2.2315998933559045
401it [07:10,  1.06s/it]

    EPOCH #27 STEP #400 | loss: 2.341005325317383, avg_loss: 2.2315998933559045


600it [10:44,  1.07s/it]INFO:transformer_log:    EPOCH #27 STEP #600 | loss: 2.2349026203155518, avg_loss: 2.234368188607316
601it [10:45,  1.07s/it]

    EPOCH #27 STEP #600 | loss: 2.2349026203155518, avg_loss: 2.234368188607316


751it [13:27,  1.07s/it]
117it [00:46,  2.54it/s]
INFO:transformer_log:Epoch #27 End: Train Loss 2.2363144656154668, Test Loss 2.0709364098361416
INFO:transformer_log:Epoch #28 Start: current LR 0.0004310037344807926


Epoch #28 Start: current LR 0.0004310037344807926


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #28 STEP #0 | loss: 2.209545373916626, avg_loss: 2.209545373916626
1it [00:01,  1.07s/it]

    EPOCH #28 STEP #0 | loss: 2.209545373916626, avg_loss: 2.209545373916626


200it [03:36,  1.10s/it]INFO:transformer_log:    EPOCH #28 STEP #200 | loss: 2.257068395614624, avg_loss: 2.220054334668971
201it [03:37,  1.11s/it]

    EPOCH #28 STEP #200 | loss: 2.257068395614624, avg_loss: 2.220054334668971


400it [07:12,  1.09s/it]INFO:transformer_log:    EPOCH #28 STEP #400 | loss: 2.2130095958709717, avg_loss: 2.22941979327404
401it [07:13,  1.08s/it]

    EPOCH #28 STEP #400 | loss: 2.2130095958709717, avg_loss: 2.22941979327404


600it [10:47,  1.08s/it]INFO:transformer_log:    EPOCH #28 STEP #600 | loss: 2.335045099258423, avg_loss: 2.232526950153852
601it [10:48,  1.09s/it]

    EPOCH #28 STEP #600 | loss: 2.335045099258423, avg_loss: 2.232526950153852


751it [13:30,  1.08s/it]
117it [00:46,  2.50it/s]
INFO:transformer_log:Epoch #28 End: Train Loss 2.233150813931949, Test Loss 2.0651538433172765
INFO:transformer_log:Epoch #29 Start: current LR 0.00042350744551607184


Epoch #29 Start: current LR 0.00042350744551607184


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #29 STEP #0 | loss: 2.202333927154541, avg_loss: 2.202333927154541
1it [00:01,  1.08s/it]

    EPOCH #29 STEP #0 | loss: 2.202333927154541, avg_loss: 2.202333927154541


200it [03:35,  1.07s/it]INFO:transformer_log:    EPOCH #29 STEP #200 | loss: 2.309645891189575, avg_loss: 2.219084382650271
201it [03:36,  1.07s/it]

    EPOCH #29 STEP #200 | loss: 2.309645891189575, avg_loss: 2.219084382650271


400it [07:10,  1.09s/it]INFO:transformer_log:    EPOCH #29 STEP #400 | loss: 2.195406198501587, avg_loss: 2.223116462664711
401it [07:11,  1.09s/it]

    EPOCH #29 STEP #400 | loss: 2.195406198501587, avg_loss: 2.223116462664711


600it [10:46,  1.10s/it]INFO:transformer_log:    EPOCH #29 STEP #600 | loss: 2.207244634628296, avg_loss: 2.225370728434025
601it [10:47,  1.10s/it]

    EPOCH #29 STEP #600 | loss: 2.207244634628296, avg_loss: 2.225370728434025


751it [13:28,  1.08s/it]
117it [00:46,  2.51it/s]
INFO:transformer_log:Epoch #29 End: Train Loss 2.2278473335956925, Test Loss 2.061577013415149
