In [1]:
from pathlib import Path
import os
import torch
from transformers import MarianMTModel, MarianTokenizer
import logging
from datetime import datetime

In [2]:
# connect to google colab
from google.colab import drive
drive.mount("/content/gdrive")


Mounted at /content/gdrive


In [3]:
print(f'@@Configuration START@@')

@@Configuration START@@


In [4]:
# Define configuration
batch_size = 128
d_model = 256
n_head = 8
max_len = 80
ffn_hidden = 512
n_layers=4
drop_prob=0.1
epochs=30
init_lr = 0.00 # having warmup step
eps = 5e-9
weight_decay = 5e-4
warmup_steps=1500
clip = 1

print(f'batch_size: {batch_size}')
print(f'd_model: {d_model}')
print(f'n_head: {n_head}')
print(f'max_len: {max_len}')
print(f'ffn_hidden: {ffn_hidden}')
print(f'n_layers: {n_layers}')
print(f'drop_prob: {drop_prob}')
print(f'epochs: {epochs}')
print(f'init_lr: {init_lr}')
print(f'weight_decay: {weight_decay}')
print(f'clip: {clip}')

batch_size: 128
d_model: 256
n_head: 8
max_len: 80
ffn_hidden: 512
n_layers: 4
drop_prob: 0.1
epochs: 30
init_lr: 0.0
weight_decay: 0.0005
clip: 1


In [5]:
# Configure Device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA as device")
else:
    # Check that MPS is available
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                  "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
                  "and/or you do not have an MPS-enabled device on this machine.")
        device = torch.device("cpu")
        print("Using CPU as device")
    else:
        device = torch.device("mps")
        print("Using MPS as device")

torch.set_default_device(device)

MPS not available because the current PyTorch install was not built with MPS enabled.
Using CPU as device


In [6]:
# Define tokenizers
TOKENIZERS_PARALLELISM = True

kr_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ko-en')
en_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ko-en')

enc_voc_size = kr_tokenizer.vocab_size
dec_voc_size = en_tokenizer.vocab_size

print(f'Using kr_tokenizer: Helsinki-NLP/opus-mt-ko-en')
print(f'kr_tokenizer_voc_size(enc_voc_size): {enc_voc_size}')

print(f'Using en_tokenizer: Helsinki-NLP/opus-mt-ko-en')
print(f'en_tokenizer_voc_size(dec_voc_size): {dec_voc_size}')

# Define some variables that are going to be used in future
src_pad_token = kr_tokenizer.pad_token_id
src_eos_token = kr_tokenizer.eos_token_id

trg_pad_token = en_tokenizer.pad_token_id
trg_sos_token = en_tokenizer.bos_token_id
trg_eos_token = en_tokenizer.eos_token_id

print(f'src_pad_token: {src_pad_token}')
print(f'src_eos_token: {src_eos_token}')
print(f'trg_pad_token: {trg_pad_token}')
print(f'trg_sos_token: {trg_sos_token}')
print(f'trg_eos_token: {trg_eos_token}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/842k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



Using kr_tokenizer: Helsinki-NLP/opus-mt-ko-en
kr_tokenizer_voc_size(enc_voc_size): 65001
Using en_tokenizer: Helsinki-NLP/opus-mt-ko-en
en_tokenizer_voc_size(dec_voc_size): 65001
src_pad_token: 65000
src_eos_token: 0
trg_pad_token: 65000
trg_sos_token: None
trg_eos_token: 0


In [29]:
# Define path configuration for the project
project_dir = Path("/content/gdrive/MyDrive/Colab Notebooks/en2kr-Translator")
data_dir = project_dir / "data"
model_dir = project_dir / "models"

data_dir.mkdir(parents=True, exist_ok=True)
model_dir.mkdir(parents=True, exist_ok=True)

print(f'project_dir: {project_dir}')
print(f'data_dir: {data_dir}')
print(f'model_dir: {model_dir}')

project_dir: /content/gdrive/MyDrive/Colab Notebooks/en2kr-Translator
data_dir: /content/gdrive/MyDrive/Colab Notebooks/en2kr-Translator/data
model_dir: /content/gdrive/MyDrive/Colab Notebooks/en2kr-Translator/models


In [10]:
# Configure Logger
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

logging_dir = project_dir / "logs"

logging_dir.mkdir(parents=True, exist_ok=True)

log_file = logging_dir / f'log_{timestamp}.log'

logger = logging.getLogger('transformer_log')
logger.setLevel(logging.INFO)

file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)

In [9]:
# Define learning rate scheduler.
# If you want to modify the logic of Scheduler, please modify this class

class LRScheduler:
    def __init__(self, optimizer, d_model, warmup_steps, LR_scale=1):
        self.optimizer = optimizer
        self.step_count = 0
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.LR_scale = LR_scale
        self._d_model_factor = self.LR_scale * (self.d_model ** -0.5)
    def step(self):
        self.step_count += 1
        lr = self.calculate_learning_rate()
        self.optimizer.param_groups[0]['lr'] = lr
    def calculate_learning_rate(self):
        minimum_factor = min(self.step_count ** -0.5, self.step_count * self.warmup_steps ** -1.5)
        return self._d_model_factor * minimum_factor

In [11]:
print(f'@@Configuration END@@')

@@Configuration END@@


In [12]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch

In [14]:
train_data_path = data_dir / "train.parquet"
test_data_path = data_dir / "test.parquet"

df_train = pd.read_parquet(path=train_data_path)
df_test = pd.read_parquet(path=test_data_path)

class en2kr_Train_Dataset(Dataset):
    def __init__(self, max_len):
        self.data = df_train

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()

        return kr_sentence, en_sentence

class en2kr_Test_Dataset(Dataset):
    def __init__(self, max_len):
        self.data = df_test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()

        return kr_sentence, en_sentence



In [15]:
train_dataset = en2kr_Train_Dataset(max_len=max_len)
test_dataset = en2kr_Test_Dataset(max_len=max_len)

train_dataloader = DataLoader(train_dataset,pin_memory=True, drop_last=True, batch_size=batch_size, shuffle=True, generator=torch.Generator(device=device))
test_dataloader = DataLoader(test_dataset,pin_memory=True, drop_last=True, batch_size=batch_size, generator=torch.Generator(device=device))

# Transformer Model Implementation


In [16]:
# import required packages
import torch
import torch.nn as nn
import math

In [17]:
# Define a Token Embedding
class TokenEmbeddings(nn.Embedding):
    """
    Converting token into embedding vector
    """
    def __init__(self, vocab_size, d_model):
        """
        class for token embedding without positional encoding
        This layer transforms an seq_len token_ids -> (seq_len, d_model)
        Assigning (vector of size d_model) to each tokens

        :param vocab_size: number of vocabs that TokenEmbeddings can handle
        :param d_model: dimension of embedding vector
        """
        super(TokenEmbeddings, self).__init__(vocab_size, d_model, padding_idx=65000)

# Define Positional Encoding
class PositionalEncoding(nn.Module):
    """
    compute reusable sinusoid positional encoding
    """
    def __init__(self, d_model, max_len, device):
        """
        construct sinusoid positional encoding that is going to be reused everytime when it is needed

        :param d_model: dimension of embedding vector
        :param max_len: maximum sequence length of token(a.k.a window size of attention method)
        """
        super(PositionalEncoding, self).__init__()

        # define a max_len * d_model size encoding matrix
        self.encoding = torch.zeros(max_len, d_model, device=device)

        # since positional encoding is not learnable, we turn off the gradient engine
        self.encoding.requires_grad = False

        # define a position at the sequence
        pos = torch.arange(0, max_len, device=device)
        # expand the max_len vector to max_len * 1 matrix
        pos = pos.float().unsqueeze(dim=1)

        _2i = torch.arange(0, d_model, step=2, device=device).float()

        # define a sinusoid positional encoding
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))

    def forward(self, x):
        batch_size, seq_len = x.shape

        return self.encoding[:seq_len, :]

# Define Transformer Embedding
class TransformerEmbedding(nn.Module):
    """
    token embedding + positional encoding
    """
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        """
        initialize the embedding class for word+position embedding

        :param vocab_size: number of vocabs that TokenEmbeddings can handle
        :param d_model: dimension of embedding vector
        :param max_len: maximum sequence length of token(a.k.a window size of attention method)
        :param drop_prob: dropout probability to reduce overfitting
        """
        super(TransformerEmbedding, self).__init__()
        self.token_emb = TokenEmbeddings(vocab_size, d_model)
        self.position_emb = PositionalEncoding(d_model, max_len, device)
        self.dropout = nn.Dropout(p=drop_prob)
        self.scale = torch.sqrt(torch.tensor(d_model, dtype=torch.float32))

    def forward(self, x):
        tok_emb = self.scale * self.token_emb(x)
        pos_emb = self.position_emb(x)

        return self.dropout(tok_emb+pos_emb)


In [18]:
# Define Attention Block
class AttentionBlock(nn.Module):
    """
    compute scale dot product attention for Query, Key, Value
    """
    def __init__(self):
        super(AttentionBlock, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, eps=1e-12):
        batch_size, head, length, d_tensor = k.shape

        # calculate the k_T
        k_T = k.transpose(2, 3)

        # calculate the attention weight
        att_weight = (q @ k_T) / math.sqrt(d_tensor)

        # if there are any masks that needs to be applied
        if mask is not None:
            att_weight = att_weight.masked_fill(mask == 0, -1e10)

        # calculate the softmax
        # att_weight shape: batch_size, head, seq_len_query, seq_len_key
        att_weight = self.softmax(att_weight)

        # att_weight @ v shape: batch_size, head, seq_len_query, d_tensor
        return att_weight @ v, att_weight

# Define MultiHeadAttention Block
class MultiHeadAttentionBlock(nn.Module):
    """
    define multi head attention block using AttentionBlock module
    """
    def __init__(self, d_model, n_head):
        """
        Multi-head self-attention utilize the parallelism of GPU

        :param d_model: dimension of embedding vector
        :param n_head: number of heads
        """
        super(MultiHeadAttentionBlock, self).__init__()
        self.n_head = n_head
        self.attention = AttentionBlock()
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)

        # in the paper, d_v = d_k = d_q
        self.Wv = nn.Linear(d_model, d_model)

        self.Wconcat = nn.Linear(d_model, d_model)

    def split(self, tensor):
        """
        split the tensor by number of head

        :param tensor: tensor of shape batch_size  * seq_len * d_model
        :return: return tensor of shape batch_size * n_head * seq_len * d_tensor
        """
        batch_size, seq_len, d_model = tensor.shape

        d_tensor = d_model // self.n_head

        tensor = tensor.view(batch_size, seq_len, self.n_head, d_tensor).transpose(1, 2)

        return tensor

    def concat(self, tensor):
        """
        concat tensor. Inverse operation of split

        :param tensor: tensor of shape batch_size * n_head * seq_len * d_tensor
        :return: return tensor of shape batch_size * seq_len * d_model
        """
        batch_size, n_head, seq_len, d_tensor = tensor.shape

        d_model = n_head * d_tensor
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        return tensor


    def forward(self, q, k, v, mask=None):
        # apply linear transformation to derive q, k, v
        q, k, v = self.Wq(q), self.Wk(k), self.Wv(v)

        # split the tensor by number of heads
        q, k, v = self.split(q), self.split(k), self.split(v)

        # apply attention to q, k, v
        out, attn_weights = self.attention(q, k, v, mask=mask)

        # current attn_weights shape is batch_size * n_head * q_len * k_len
        # mean it by dim 1
        # eventually changing shape into batch_size * q_len * k_len
        attn_weight = attn_weights.mean(dim=1)

        # concat
        out = self.concat(out)

        # apply concat weight
        out = self.Wconcat(out)
        return out, attn_weight


In [19]:
# define FeedForward Network
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(FeedForwardBlock, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x

In [20]:
# Define Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(drop_prob)

        self.ffn = FeedForwardBlock(d_model, ffn_hidden, drop_prob)
        self.dropout2 = nn.Dropout(drop_prob)

    def forward(self, x, src_mask):
        residual = x
        x = self.norm(x)
        x, attn_weight = self.attention(q=x, k=x, v=x, mask=src_mask)

        x = self.dropout1(x)
        x = self.norm(x + residual)

        residual = x
        x = self.ffn(x)

        x =  self.dropout2(x)
        x = x + residual

        return x, attn_weight

# Define Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.enc_dec_attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = FeedForwardBlock(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask):
        residual = dec
        dec = self.norm1(dec)

        x, attn_weight1 = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)

        x = self.dropout1(x)
        x = self.norm2(x + residual)

        residual = x
        x, attn_weight2 = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
        x = self.dropout2(x)
        x = self.norm3(x + residual)

        residual = x
        x = self.ffn(x)
        x = self.dropout3(x)
        x = x + residual

        return x, attn_weight1, attn_weight2

In [21]:
# Define Encoder Model
class Encoder(nn.Module):
    """
    Encoder for Transformer
    """
    def __init__(self, embedding, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super(Encoder, self).__init__()
        self.emb = embedding
        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob) for _ in range(n_layers)])
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, src_mask):
        x = self.emb(x)
        attn_weights = []
        # get the mean of attention map batch_size * seq_len_src * seq_len_src
        for layer in self.layers:
            x, attn_weight = layer(x, src_mask)
            attn_weights.append(attn_weight)

        x = self.norm(x)

        return x, torch.mean(torch.stack(attn_weights), dim=0)


class Decoder(nn.Module):
    """
    Decoder for Transformer
    """
    def __init__(self, embedding, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device):
        super(Decoder, self).__init__()
        self.emb = embedding

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob) for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        trg = self.emb(trg)
        attn_weights_1 = []
        attn_weights_2 = []
        for layer in self.layers:
            trg, attn_weight_1, attn_weight_2 = layer(trg, enc_src, trg_mask, src_mask)
            attn_weights_1.append(attn_weight_1)
            attn_weights_2.append(attn_weight_2)

        trg = self.norm(trg)
        output = self.linear(trg)
        return output, torch.mean(torch.stack(attn_weights_1), dim=0), torch.mean(torch.stack(attn_weights_2), dim=0)


# Define Transformer Model
class Transformer(nn.Module):
    """
    Transformer Model
    """
    def __init__(self, src_pad_token, trg_pad_token, enc_voc_size, dec_voc_size, n_head, max_len, d_model, ffn_hidden, n_layers, drop_prob, device):
        """
        Constructing Transformer Model

        :param src_pad_token: embedding vector that represents <pad> in source
        :param trg_pad_token: embedding vector that represents <pad> in target
        :params enc_voc_size: number of vocabs that encoderEmbedder can handle
        :params dec_voc_size: number of vocabs that decoderEmbedder can handle
        :params ffn_hidden: hidden vector dimension for fastfeedforward layer
        :params n_layers: number of EncoderLayer/DecoderLayer used
        :params drop_prob: dropout probability
        """
        super(Transformer, self).__init__()

        self.emb = TransformerEmbedding(d_model=d_model, max_len=max_len, vocab_size=dec_voc_size, drop_prob=drop_prob, device=device)
        self.src_pad_token = src_pad_token
        self.trg_pad_token = trg_pad_token
        self.device = device
        self.encoder = Encoder(embedding=self.emb, d_model=d_model, n_head=n_head, max_len=max_len, ffn_hidden=ffn_hidden, enc_voc_size=enc_voc_size, drop_prob=drop_prob, n_layers=n_layers, device=device)
        self.decoder = Decoder(embedding=self.emb, d_model=d_model, n_head=n_head, max_len=max_len, ffn_hidden=ffn_hidden, dec_voc_size=dec_voc_size, drop_prob=drop_prob, n_layers=n_layers, device=device)

    def make_src_mask(self, src):
        # print(f'src: {src}')
        # print(f'src_pad_token: {self.src_pad_token}')
        # print(f'src != self.src_pad_token: {src != self.src_pad_token}')
        src_mask = (src != self.src_pad_token).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_token).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]

        # make a look-ahead mask using torch.tril
        # [[1 0 0]
        #  [1 1 0]
        #  [1 1 1]]
        trg_sub_mask = torch.tril(torch.ones(trg_len, trg_len)).type(torch.ByteTensor).to(self.device)

        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask


    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src, enc_self_attn_weight = self.encoder(src, src_mask)
        output, dec_self_attn_weight, enc_dec_attn_weight = self.decoder(trg, enc_src, trg_mask, src_mask)

        # current output shape is batch_size * dec_voc_size, which is desirable.
        # We don't need to apply softmax because we are going to use CrossEntropyLoss as loss function
        # which automatically applies log-softmax during calculation

        return output

# Train the Model using datas

In [22]:
from torch.optim import Adam
from datetime import datetime
import torch
from tqdm import tqdm

In [23]:
# Prepare the model
model = Transformer(
    src_pad_token=src_pad_token,
    trg_pad_token=trg_pad_token,
    enc_voc_size=enc_voc_size,
    dec_voc_size=dec_voc_size,
    n_head=n_head,
    max_len=max_len,
    d_model=d_model,
    ffn_hidden=ffn_hidden,
    n_layers=n_layers,
    drop_prob=drop_prob,
    device=device).to(device)

model.train()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

logger.info(f'model parameter #: {count_parameters(model)}')


INFO:transformer_log:model parameter #: 38616041


model parameter #: 38616041


In [24]:
# Setup optimizer
optimizer = Adam(params=model.parameters(), lr=init_lr, weight_decay=weight_decay, eps=eps, betas=(0.9, 0.98))

# Set Noam Scheduler
scheduler = LRScheduler(optimizer, d_model, warmup_steps)
# Setup loss function for training
loss_func = nn.CrossEntropyLoss(ignore_index=src_pad_token)




In [26]:
# store lr rate history per steps
lr_history = []
# store loss history per steps
train_loss_history = []

In [27]:
def train_epoch(epoch_num):
    model.train()
    train_epoch_loss = 0

    for step, (kr_sentences, en_sentences) in tqdm(enumerate(train_dataloader)):

        # tokenize kr_sentence
        kr_tokenized = kr_tokenizer(kr_sentences, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids

        # tokenize en_sentence
        # make en_sentence start with eos token(this is because current tokenizer don't have an sos token.)
        en_sentences = ['</s> ' + s for s in en_sentences]
        en_tokenized = en_tokenizer(en_sentences, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids


        kr_tokenized = kr_tokenized.to(device)
        en_tokenized = en_tokenized.to(device)

        # out is the dec_voc_size vector
        # during training, we exclude sep token

        # remove eos token if the sentence is too long, and gets truncated.
        # so we can prevent early-stopping(early-eos)
        # out: batch_size * max_len * dec_voc_size
        out = model(kr_tokenized, en_tokenized[:, :-1])

        # remove sos token from en_tokenized when calculating loss because out will not include eos token in front of the sentence.
        # en_tokenized: batch_size * (max_len-1)
        en_tokenized = en_tokenized[:, 1:].to(device)

        # out: batch_size * (max_len - 1) * dec_voc_size
        out = out.permute(0, 2, 1).to(device)

        loss = loss_func(out, en_tokenized)
        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        scheduler.step()

        train_epoch_loss += loss.item()

        if step % 200 == 0:
            print(f'    EPOCH #{epoch_num} STEP #{step} | loss: {loss.item()}, avg_loss: {train_epoch_loss / (step + 1)}')
            logger.info(f'    EPOCH #{epoch_num} STEP #{step} | loss: {loss.item()}, avg_loss: {train_epoch_loss / (step + 1)}')



    train_step_loss = train_epoch_loss / (step+1)
    # After training epoch, do evaluation

    return train_step_loss

In [28]:
# evaluate the model
def evaluate():
    model.eval()
    test_epoch_loss = 0
    test_bleu_loss = 0

    with torch.no_grad():
        for step, (kr_sentences, en_sentences) in tqdm(enumerate(test_dataloader)):
            # tokenize kr_sentence
            kr_tokenized = kr_tokenizer(kr_sentences, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids

            # tokenize en_sentence
            # make en_sentence start with eos token(this is because current tokenizer don't have an sos token.)
            en_sentences = ['</s> ' + s for s in en_sentences]
            en_tokenized = en_tokenizer(en_sentences, padding=True, truncation=True, max_length=max_len, return_tensors="pt").input_ids

            kr_tokenized = kr_tokenized.to(device)
            en_tokenized = en_tokenized.to(device)


            # this does not remove the eos token
            # FIXME
            out = model(kr_tokenized, en_tokenized[:, :-1])


            # remove sos token from en_tokenized when calculating loss because out will not include sos token.
            en_tokenized = en_tokenized[:, 1:].to(device)

            out = out.permute(0, 2, 1).to(device)

            loss = loss_func(out, en_tokenized)
            test_epoch_loss += loss.item()

            # calcuate the bleu
            # TODO
        test_step_loss = test_epoch_loss / (step + 1)
    return test_step_loss

In [None]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

for epoch in range(epochs):
    print(f'Epoch #{epoch} Start: current LR {optimizer.param_groups[0]["lr"]}')
    logger.info(f'Epoch #{epoch} Start: current LR {optimizer.param_groups[0]["lr"]}')

    train_loss = train_epoch(epoch)
    test_loss = evaluate()
    lr_history.append(optimizer.param_groups[0]["lr"])
    train_loss_history.append(train_loss)

    logger.info(f'Epoch #{epoch} End: Train Loss {train_loss}, Test Loss {test_loss}')

    model_path = model_dir / f'model_{timestamp}_{epoch}'
    torch.save(model.state_dict(), model_path)


INFO:transformer_log:Epoch #0 Start: current LR 0.0


Epoch #0 Start: current LR 0.0


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #0 STEP #0 | loss: 11.262101173400879, avg_loss: 11.262101173400879
1it [00:02,  2.60s/it]

    EPOCH #0 STEP #0 | loss: 11.262101173400879, avg_loss: 11.262101173400879


200it [03:25,  1.04s/it]INFO:transformer_log:    EPOCH #0 STEP #200 | loss: 5.835330486297607, avg_loss: 8.487609255966262
201it [03:26,  1.04s/it]

    EPOCH #0 STEP #200 | loss: 5.835330486297607, avg_loss: 8.487609255966262


400it [06:54,  1.03s/it]INFO:transformer_log:    EPOCH #0 STEP #400 | loss: 4.338562488555908, avg_loss: 6.745410742010559
401it [06:55,  1.04s/it]

    EPOCH #0 STEP #400 | loss: 4.338562488555908, avg_loss: 6.745410742010559


600it [10:23,  1.04s/it]INFO:transformer_log:    EPOCH #0 STEP #600 | loss: 3.7966251373291016, avg_loss: 5.832336837955799
601it [10:24,  1.04s/it]

    EPOCH #0 STEP #600 | loss: 3.7966251373291016, avg_loss: 5.832336837955799


751it [13:00,  1.04s/it]
117it [00:44,  2.61it/s]
INFO:transformer_log:Epoch #0 End: Train Loss 5.415197627045978, Test Loss 3.6042336663629255


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #1 Start: current LR 0.0008079473591671584


Epoch #1 Start: current LR 0.0008079473591671584


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #1 STEP #0 | loss: 3.740907669067383, avg_loss: 3.740907669067383
1it [00:01,  1.07s/it]

    EPOCH #1 STEP #0 | loss: 3.740907669067383, avg_loss: 3.740907669067383


200it [03:28,  1.05s/it]INFO:transformer_log:    EPOCH #1 STEP #200 | loss: 3.4674575328826904, avg_loss: 3.587228509324107
201it [03:29,  1.05s/it]

    EPOCH #1 STEP #200 | loss: 3.4674575328826904, avg_loss: 3.587228509324107


400it [06:56,  1.03s/it]INFO:transformer_log:    EPOCH #1 STEP #400 | loss: 3.3164093494415283, avg_loss: 3.488378534887794
401it [06:57,  1.03s/it]

    EPOCH #1 STEP #400 | loss: 3.3164093494415283, avg_loss: 3.488378534887794


600it [10:24,  1.04s/it]INFO:transformer_log:    EPOCH #1 STEP #600 | loss: 3.19938588142395, avg_loss: 3.406042192224258
601it [10:25,  1.04s/it]

    EPOCH #1 STEP #600 | loss: 3.19938588142395, avg_loss: 3.406042192224258


751it [13:01,  1.04s/it]
117it [00:44,  2.64it/s]
INFO:transformer_log:Epoch #1 End: Train Loss 3.354925332469407, Test Loss 3.0285583642812877


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #2 Start: current LR 0.0016126683068472122


Epoch #2 Start: current LR 0.0016126683068472122


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #2 STEP #0 | loss: 3.1124417781829834, avg_loss: 3.1124417781829834
1it [00:01,  1.04s/it]

    EPOCH #2 STEP #0 | loss: 3.1124417781829834, avg_loss: 3.1124417781829834


200it [03:27,  1.04s/it]INFO:transformer_log:    EPOCH #2 STEP #200 | loss: 3.039987802505493, avg_loss: 3.080760373404963
201it [03:28,  1.05s/it]

    EPOCH #2 STEP #200 | loss: 3.039987802505493, avg_loss: 3.080760373404963


400it [06:54,  1.03s/it]INFO:transformer_log:    EPOCH #2 STEP #400 | loss: 2.9744746685028076, avg_loss: 3.048292428180761
401it [06:55,  1.03s/it]

    EPOCH #2 STEP #400 | loss: 2.9744746685028076, avg_loss: 3.048292428180761


600it [10:22,  1.04s/it]INFO:transformer_log:    EPOCH #2 STEP #600 | loss: 2.908728837966919, avg_loss: 3.0155440408259184
601it [10:23,  1.04s/it]

    EPOCH #2 STEP #600 | loss: 2.908728837966919, avg_loss: 3.0155440408259184


751it [12:58,  1.04s/it]
117it [00:44,  2.63it/s]
INFO:transformer_log:Epoch #2 End: Train Loss 2.995168057008685, Test Loss 2.79028967099312


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #3 Start: current LR 0.0013167381587112536


Epoch #3 Start: current LR 0.0013167381587112536


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #3 STEP #0 | loss: 2.953566551208496, avg_loss: 2.953566551208496
1it [00:01,  1.09s/it]

    EPOCH #3 STEP #0 | loss: 2.953566551208496, avg_loss: 2.953566551208496


200it [03:26,  1.03s/it]INFO:transformer_log:    EPOCH #3 STEP #200 | loss: 2.828357458114624, avg_loss: 2.8627579129157374
201it [03:27,  1.03s/it]

    EPOCH #3 STEP #200 | loss: 2.828357458114624, avg_loss: 2.8627579129157374


400it [06:53,  1.03s/it]INFO:transformer_log:    EPOCH #3 STEP #400 | loss: 2.8756933212280273, avg_loss: 2.8411093085186736
401it [06:54,  1.03s/it]

    EPOCH #3 STEP #400 | loss: 2.8756933212280273, avg_loss: 2.8411093085186736


600it [10:20,  1.05s/it]INFO:transformer_log:    EPOCH #3 STEP #600 | loss: 2.8225276470184326, avg_loss: 2.825134875572065
601it [10:21,  1.06s/it]

    EPOCH #3 STEP #600 | loss: 2.8225276470184326, avg_loss: 2.825134875572065


751it [12:57,  1.03s/it]
117it [00:44,  2.65it/s]
INFO:transformer_log:Epoch #3 End: Train Loss 2.8114220718251723, Test Loss 2.6191907744122367


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #4 Start: current LR 0.0011403286955762918


Epoch #4 Start: current LR 0.0011403286955762918


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #4 STEP #0 | loss: 2.716238498687744, avg_loss: 2.716238498687744
1it [00:01,  1.11s/it]

    EPOCH #4 STEP #0 | loss: 2.716238498687744, avg_loss: 2.716238498687744


200it [03:26,  1.03s/it]INFO:transformer_log:    EPOCH #4 STEP #200 | loss: 2.7343924045562744, avg_loss: 2.717776502542828
201it [03:27,  1.03s/it]

    EPOCH #4 STEP #200 | loss: 2.7343924045562744, avg_loss: 2.717776502542828


400it [06:53,  1.03s/it]INFO:transformer_log:    EPOCH #4 STEP #400 | loss: 2.655251979827881, avg_loss: 2.7038119832179195
401it [06:54,  1.03s/it]

    EPOCH #4 STEP #400 | loss: 2.655251979827881, avg_loss: 2.7038119832179195


600it [10:20,  1.03s/it]INFO:transformer_log:    EPOCH #4 STEP #600 | loss: 2.680978775024414, avg_loss: 2.6898159453158765
601it [10:21,  1.03s/it]

    EPOCH #4 STEP #600 | loss: 2.680978775024414, avg_loss: 2.6898159453158765


751it [12:56,  1.03s/it]
117it [00:44,  2.65it/s]
INFO:transformer_log:Epoch #4 End: Train Loss 2.681112371018025, Test Loss 2.4962531358767777


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #5 Start: current LR 0.001019940992000901


Epoch #5 Start: current LR 0.001019940992000901


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #5 STEP #0 | loss: 2.608363151550293, avg_loss: 2.608363151550293
1it [00:01,  1.09s/it]

    EPOCH #5 STEP #0 | loss: 2.608363151550293, avg_loss: 2.608363151550293


200it [03:26,  1.03s/it]INFO:transformer_log:    EPOCH #5 STEP #200 | loss: 2.6229848861694336, avg_loss: 2.608959869365787
201it [03:27,  1.03s/it]

    EPOCH #5 STEP #200 | loss: 2.6229848861694336, avg_loss: 2.608959869365787


400it [06:53,  1.04s/it]INFO:transformer_log:    EPOCH #5 STEP #400 | loss: 2.6015169620513916, avg_loss: 2.5998931787257775
401it [06:54,  1.05s/it]

    EPOCH #5 STEP #400 | loss: 2.6015169620513916, avg_loss: 2.5998931787257775


600it [10:20,  1.03s/it]INFO:transformer_log:    EPOCH #5 STEP #600 | loss: 2.547780990600586, avg_loss: 2.5918190261091847
601it [10:21,  1.03s/it]

    EPOCH #5 STEP #600 | loss: 2.547780990600586, avg_loss: 2.5918190261091847


751it [12:56,  1.03s/it]
117it [00:44,  2.65it/s]
INFO:transformer_log:Epoch #5 End: Train Loss 2.58665850064091, Test Loss 2.413894294673561


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #6 Start: current LR 0.0009310744810718159


Epoch #6 Start: current LR 0.0009310744810718159


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #6 STEP #0 | loss: 2.5670981407165527, avg_loss: 2.5670981407165527
1it [00:01,  1.13s/it]

    EPOCH #6 STEP #0 | loss: 2.5670981407165527, avg_loss: 2.5670981407165527


200it [03:26,  1.04s/it]INFO:transformer_log:    EPOCH #6 STEP #200 | loss: 2.5373146533966064, avg_loss: 2.530717410851474
201it [03:27,  1.04s/it]

    EPOCH #6 STEP #200 | loss: 2.5373146533966064, avg_loss: 2.530717410851474


400it [06:52,  1.03s/it]INFO:transformer_log:    EPOCH #6 STEP #400 | loss: 2.596623182296753, avg_loss: 2.5305071507308847
401it [06:53,  1.03s/it]

    EPOCH #6 STEP #400 | loss: 2.596623182296753, avg_loss: 2.5305071507308847


600it [10:18,  1.02s/it]INFO:transformer_log:    EPOCH #6 STEP #600 | loss: 2.6215968132019043, avg_loss: 2.5259674464208315
601it [10:19,  1.03s/it]

    EPOCH #6 STEP #600 | loss: 2.6215968132019043, avg_loss: 2.5259674464208315


751it [12:54,  1.03s/it]
117it [00:44,  2.66it/s]
INFO:transformer_log:Epoch #6 End: Train Loss 2.521704122642385, Test Loss 2.3469215886205688


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #7 Start: current LR 0.0008620074689615852


Epoch #7 Start: current LR 0.0008620074689615852


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #7 STEP #0 | loss: 2.5214755535125732, avg_loss: 2.5214755535125732
1it [00:01,  1.08s/it]

    EPOCH #7 STEP #0 | loss: 2.5214755535125732, avg_loss: 2.5214755535125732


200it [03:26,  1.02s/it]INFO:transformer_log:    EPOCH #7 STEP #200 | loss: 2.539085865020752, avg_loss: 2.480068436902554
201it [03:27,  1.03s/it]

    EPOCH #7 STEP #200 | loss: 2.539085865020752, avg_loss: 2.480068436902554


400it [06:52,  1.04s/it]INFO:transformer_log:    EPOCH #7 STEP #400 | loss: 2.590907573699951, avg_loss: 2.48189290503314
401it [06:53,  1.04s/it]

    EPOCH #7 STEP #400 | loss: 2.590907573699951, avg_loss: 2.48189290503314


600it [10:18,  1.03s/it]INFO:transformer_log:    EPOCH #7 STEP #600 | loss: 2.4955945014953613, avg_loss: 2.4791693219329276
601it [10:19,  1.03s/it]

    EPOCH #7 STEP #600 | loss: 2.4955945014953613, avg_loss: 2.4791693219329276


751it [12:54,  1.03s/it]
117it [00:44,  2.64it/s]
INFO:transformer_log:Epoch #7 End: Train Loss 2.4771473071229124, Test Loss 2.305621306101481


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #8 Start: current LR 0.0008063341534236061


Epoch #8 Start: current LR 0.0008063341534236061


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #8 STEP #0 | loss: 2.3918261528015137, avg_loss: 2.3918261528015137
1it [00:01,  1.08s/it]

    EPOCH #8 STEP #0 | loss: 2.3918261528015137, avg_loss: 2.3918261528015137


200it [03:26,  1.02s/it]INFO:transformer_log:    EPOCH #8 STEP #200 | loss: 2.4026448726654053, avg_loss: 2.444480218697543
201it [03:27,  1.02s/it]

    EPOCH #8 STEP #200 | loss: 2.4026448726654053, avg_loss: 2.444480218697543


400it [06:53,  1.03s/it]INFO:transformer_log:    EPOCH #8 STEP #400 | loss: 2.3294589519500732, avg_loss: 2.4431577763355286
401it [06:54,  1.03s/it]

    EPOCH #8 STEP #400 | loss: 2.3294589519500732, avg_loss: 2.4431577763355286


600it [10:20,  1.04s/it]INFO:transformer_log:    EPOCH #8 STEP #600 | loss: 2.3798153400421143, avg_loss: 2.443874267095734
601it [10:21,  1.06s/it]

    EPOCH #8 STEP #600 | loss: 2.3798153400421143, avg_loss: 2.443874267095734


751it [12:56,  1.03s/it]
117it [00:44,  2.65it/s]
INFO:transformer_log:Epoch #8 End: Train Loss 2.4438089118022894, Test Loss 2.275652767246605


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #9 Start: current LR 0.0007602191303841945


Epoch #9 Start: current LR 0.0007602191303841945


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #9 STEP #0 | loss: 2.3266310691833496, avg_loss: 2.3266310691833496
1it [00:01,  1.06s/it]

    EPOCH #9 STEP #0 | loss: 2.3266310691833496, avg_loss: 2.3266310691833496


200it [03:26,  1.03s/it]INFO:transformer_log:    EPOCH #9 STEP #200 | loss: 2.377389669418335, avg_loss: 2.4155072691428723
201it [03:27,  1.04s/it]

    EPOCH #9 STEP #200 | loss: 2.377389669418335, avg_loss: 2.4155072691428723


400it [06:53,  1.04s/it]INFO:transformer_log:    EPOCH #9 STEP #400 | loss: 2.4148387908935547, avg_loss: 2.4198702725389056
401it [06:54,  1.04s/it]

    EPOCH #9 STEP #400 | loss: 2.4148387908935547, avg_loss: 2.4198702725389056


600it [10:19,  1.02s/it]INFO:transformer_log:    EPOCH #9 STEP #600 | loss: 2.4010980129241943, avg_loss: 2.417525745667951
601it [10:20,  1.01s/it]

    EPOCH #9 STEP #600 | loss: 2.4010980129241943, avg_loss: 2.417525745667951


751it [12:52,  1.03s/it]
117it [00:43,  2.70it/s]
INFO:transformer_log:Epoch #9 End: Train Loss 2.4173902455722285, Test Loss 2.2479656781905737


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #10 Start: current LR 0.0007212071918539712


Epoch #10 Start: current LR 0.0007212071918539712


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #10 STEP #0 | loss: 2.317659854888916, avg_loss: 2.317659854888916
1it [00:01,  1.08s/it]

    EPOCH #10 STEP #0 | loss: 2.317659854888916, avg_loss: 2.317659854888916


200it [03:22,  1.00it/s]INFO:transformer_log:    EPOCH #10 STEP #200 | loss: 2.4331417083740234, avg_loss: 2.389517360658788
201it [03:23,  1.00it/s]

    EPOCH #10 STEP #200 | loss: 2.4331417083740234, avg_loss: 2.389517360658788


400it [06:44,  1.00it/s]INFO:transformer_log:    EPOCH #10 STEP #400 | loss: 2.3021950721740723, avg_loss: 2.392109762700716
401it [06:45,  1.00s/it]

    EPOCH #10 STEP #400 | loss: 2.3021950721740723, avg_loss: 2.392109762700716


600it [10:06,  1.01s/it]INFO:transformer_log:    EPOCH #10 STEP #600 | loss: 2.4070730209350586, avg_loss: 2.394721201771309
601it [10:07,  1.01s/it]

    EPOCH #10 STEP #600 | loss: 2.4070730209350586, avg_loss: 2.394721201771309


751it [12:39,  1.01s/it]
117it [00:43,  2.70it/s]
INFO:transformer_log:Epoch #10 End: Train Loss 2.3963993408391064, Test Loss 2.225695182115604


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #11 Start: current LR 0.00068764407652763


Epoch #11 Start: current LR 0.00068764407652763


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #11 STEP #0 | loss: 2.350355386734009, avg_loss: 2.350355386734009
1it [00:01,  1.09s/it]

    EPOCH #11 STEP #0 | loss: 2.350355386734009, avg_loss: 2.350355386734009


200it [03:22,  1.01s/it]INFO:transformer_log:    EPOCH #11 STEP #200 | loss: 2.3438503742218018, avg_loss: 2.369565511817363
201it [03:23,  1.02s/it]

    EPOCH #11 STEP #200 | loss: 2.3438503742218018, avg_loss: 2.369565511817363


400it [06:44,  1.00s/it]INFO:transformer_log:    EPOCH #11 STEP #400 | loss: 2.3934993743896484, avg_loss: 2.375563057283511
401it [06:45,  1.01s/it]

    EPOCH #11 STEP #400 | loss: 2.3934993743896484, avg_loss: 2.375563057283511


600it [10:06,  1.00s/it]INFO:transformer_log:    EPOCH #11 STEP #600 | loss: 2.4376025199890137, avg_loss: 2.3779592359323867
601it [10:07,  1.00s/it]

    EPOCH #11 STEP #600 | loss: 2.4376025199890137, avg_loss: 2.3779592359323867


751it [12:39,  1.01s/it]
117it [00:43,  2.69it/s]
INFO:transformer_log:Epoch #11 End: Train Loss 2.3788197904071224, Test Loss 2.2084135968460994


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #12 Start: current LR 0.0006583690793556268


Epoch #12 Start: current LR 0.0006583690793556268


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #12 STEP #0 | loss: 2.3528263568878174, avg_loss: 2.3528263568878174
1it [00:01,  1.11s/it]

    EPOCH #12 STEP #0 | loss: 2.3528263568878174, avg_loss: 2.3528263568878174


200it [03:25,  1.04s/it]INFO:transformer_log:    EPOCH #12 STEP #200 | loss: 2.3208167552948, avg_loss: 2.3522817981776907
201it [03:26,  1.03s/it]

    EPOCH #12 STEP #200 | loss: 2.3208167552948, avg_loss: 2.3522817981776907


400it [06:51,  1.04s/it]INFO:transformer_log:    EPOCH #12 STEP #400 | loss: 2.382563829421997, avg_loss: 2.360957660579919
401it [06:52,  1.04s/it]

    EPOCH #12 STEP #400 | loss: 2.382563829421997, avg_loss: 2.360957660579919


600it [10:17,  1.03s/it]INFO:transformer_log:    EPOCH #12 STEP #600 | loss: 2.282118797302246, avg_loss: 2.362019465092613
601it [10:18,  1.03s/it]

    EPOCH #12 STEP #600 | loss: 2.282118797302246, avg_loss: 2.362019465092613


751it [12:53,  1.03s/it]
117it [00:44,  2.64it/s]
INFO:transformer_log:Epoch #12 End: Train Loss 2.3640382077182815, Test Loss 2.1930754877563214


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #13 Start: current LR 0.0006325405511974286


Epoch #13 Start: current LR 0.0006325405511974286


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #13 STEP #0 | loss: 2.3217568397521973, avg_loss: 2.3217568397521973
1it [00:01,  1.08s/it]

    EPOCH #13 STEP #0 | loss: 2.3217568397521973, avg_loss: 2.3217568397521973


200it [03:26,  1.05s/it]INFO:transformer_log:    EPOCH #13 STEP #200 | loss: 2.397812843322754, avg_loss: 2.340242194892162
201it [03:27,  1.04s/it]

    EPOCH #13 STEP #200 | loss: 2.397812843322754, avg_loss: 2.340242194892162


400it [06:52,  1.03s/it]INFO:transformer_log:    EPOCH #13 STEP #400 | loss: 2.3807358741760254, avg_loss: 2.3497255228998655
401it [06:53,  1.02s/it]

    EPOCH #13 STEP #400 | loss: 2.3807358741760254, avg_loss: 2.3497255228998655


600it [10:18,  1.02s/it]INFO:transformer_log:    EPOCH #13 STEP #600 | loss: 2.324002742767334, avg_loss: 2.3498924841698314
601it [10:19,  1.02s/it]

    EPOCH #13 STEP #600 | loss: 2.324002742767334, avg_loss: 2.3498924841698314


751it [12:54,  1.03s/it]
117it [00:44,  2.62it/s]
INFO:transformer_log:Epoch #13 End: Train Loss 2.351563645425078, Test Loss 2.183786559308696


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #14 Start: current LR 0.0006095313267361893


Epoch #14 Start: current LR 0.0006095313267361893


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #14 STEP #0 | loss: 2.341146945953369, avg_loss: 2.341146945953369
1it [00:01,  1.05s/it]

    EPOCH #14 STEP #0 | loss: 2.341146945953369, avg_loss: 2.341146945953369


200it [03:25,  1.03s/it]INFO:transformer_log:    EPOCH #14 STEP #200 | loss: 2.3042144775390625, avg_loss: 2.337575901800127
201it [03:26,  1.02s/it]

    EPOCH #14 STEP #200 | loss: 2.3042144775390625, avg_loss: 2.337575901800127


400it [06:51,  1.02s/it]INFO:transformer_log:    EPOCH #14 STEP #400 | loss: 2.3506784439086914, avg_loss: 2.3399122468848477
401it [06:52,  1.02s/it]

    EPOCH #14 STEP #400 | loss: 2.3506784439086914, avg_loss: 2.3399122468848477


600it [10:17,  1.04s/it]INFO:transformer_log:    EPOCH #14 STEP #600 | loss: 2.36564302444458, avg_loss: 2.3404295250103995
601it [10:18,  1.04s/it]

    EPOCH #14 STEP #600 | loss: 2.36564302444458, avg_loss: 2.3404295250103995


751it [12:53,  1.03s/it]
117it [00:44,  2.65it/s]
INFO:transformer_log:Epoch #14 End: Train Loss 2.3409832723924864, Test Loss 2.1690596209632025


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #15 Start: current LR 0.000588863206289254


Epoch #15 Start: current LR 0.000588863206289254


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #15 STEP #0 | loss: 2.2513625621795654, avg_loss: 2.2513625621795654
1it [00:01,  1.06s/it]

    EPOCH #15 STEP #0 | loss: 2.2513625621795654, avg_loss: 2.2513625621795654


200it [03:25,  1.01s/it]INFO:transformer_log:    EPOCH #15 STEP #200 | loss: 2.32627534866333, avg_loss: 2.3221153062374436
201it [03:26,  1.01s/it]

    EPOCH #15 STEP #200 | loss: 2.32627534866333, avg_loss: 2.3221153062374436


400it [06:51,  1.04s/it]INFO:transformer_log:    EPOCH #15 STEP #400 | loss: 2.3550057411193848, avg_loss: 2.326201130921704
401it [06:52,  1.05s/it]

    EPOCH #15 STEP #400 | loss: 2.3550057411193848, avg_loss: 2.326201130921704


600it [10:18,  1.03s/it]INFO:transformer_log:    EPOCH #15 STEP #600 | loss: 2.3174116611480713, avg_loss: 2.3287653189133883
601it [10:19,  1.03s/it]

    EPOCH #15 STEP #600 | loss: 2.3174116611480713, avg_loss: 2.3287653189133883


751it [12:53,  1.03s/it]
117it [00:44,  2.64it/s]
INFO:transformer_log:Epoch #15 End: Train Loss 2.3309486979015976, Test Loss 2.1610866306174517


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

INFO:transformer_log:Epoch #16 Start: current LR 0.0005701643477881459


Epoch #16 Start: current LR 0.0005701643477881459


0it [00:00, ?it/s]INFO:transformer_log:    EPOCH #16 STEP #0 | loss: 2.2755160331726074, avg_loss: 2.2755160331726074
1it [00:01,  1.09s/it]

    EPOCH #16 STEP #0 | loss: 2.2755160331726074, avg_loss: 2.2755160331726074


200it [03:25,  1.02s/it]INFO:transformer_log:    EPOCH #16 STEP #200 | loss: 2.3466994762420654, avg_loss: 2.3155270891996165
201it [03:26,  1.02s/it]

    EPOCH #16 STEP #200 | loss: 2.3466994762420654, avg_loss: 2.3155270891996165


273it [04:40,  1.05s/it]