In [1]:
from transformers import BertTokenizerFast, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd 
import os 
import torch

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [3]:
# define device 
# configuration 

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA as device")
else:
    # Check that MPS is available
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                  "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
                  "and/or you do not have an MPS-enabled device on this machine.")
        device = torch.device("cpu")
        print("Using CPU as device")
    else:
        device = torch.device("mps")
        print("Using MPS as device")
    
torch.set_default_device(device)

Using CUDA as device


In [4]:
current_path = os.getcwd()
preprocessed_directory = preprocessed_directory = os.path.join(current_path, "preprocessed")

In [5]:

# import tokenizers
kr_tokenizer = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")
en_tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")



In [6]:
# Test tokenizers 
tmp_kr_sentence = "오늘 하교길에 길고양이를 보았는데, 너무 귀여워서 집에 데려가고 싶었다. 하지만 그러지는 않았다."
tmp_en_sentence = "The cat I saw during heading home today was so cute, that I wanted to bring it to home."

tmp_kr_tokenized = kr_tokenizer(tmp_kr_sentence, add_special_tokens=True, padding="max_length", max_length=256, truncation=True)
tmp_en_tokenized = en_tokenizer(tmp_en_sentence, add_special_tokens=True, padding="max_length", max_length=256, truncation=True)

# print(kr_tokenizer.convert_ids_to_tokens(tmp_kr_tokenized.input_ids))
# print(en_tokenizer.convert_ids_to_tokens(tmp_en_tokenized.input_ids))

# print(kr_tokenizer.decode(tmp_kr_tokenized.input_ids, skip_special_tokens=True))

# check if both tokenizer has pad token 
# print(kr_tokenizer.pad_token)
# print(en_tokenizer.pad_token)

In [7]:
df_train = pd.read_parquet(path="/kaggle/input/train.parquet")
df_test = pd.read_parquet(path="/kaggle/input/test.parquet")
df_validation = pd.read_parquet(path="/kaggle/input/validation.parquet")

class en2kr_Train_Dataset(Dataset): 
    def __init__(self, max_len): 
        self.data = df_train
        self.max_len = max_len 
        self.kr_tokenizer = kr_tokenizer
        self.en_tokenizer = en_tokenizer
        
    def __len__(self): 
        return len(self.data) 

    def __getitem__(self, idx): 
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()
        kr_tokenized_ids = self.kr_tokenizer(kr_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids
        en_tokenized_ids = self.en_tokenizer(en_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids

        # kr_tokenized = self.kr_tokenizer.convert_ids_to_tokens(kr_tokenized_ids)
        # en_tokenized = self.en_tokenizer.convert_ids_to_tokens(en_tokenized_ids)

        kr_tokenized_ids = torch.IntTensor(kr_tokenized_ids)
        en_tokenized_ids = torch.IntTensor(en_tokenized_ids)
        return kr_tokenized_ids, en_tokenized_ids
        
class en2kr_Test_Dataset(Dataset): 
    def __init__(self, max_len): 
        self.data = df_test
        self.max_len = max_len 
        self.kr_tokenizer = kr_tokenizer
        self.en_tokenizer = en_tokenizer
        
    def __len__(self): 
        return len(self.data) 

    def __getitem__(self, idx): 
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()
        kr_tokenized_ids = self.kr_tokenizer(kr_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids
        en_tokenized_ids = self.en_tokenizer(en_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids

        # kr_tokenized = self.kr_tokenizer.convert_ids_to_tokens(kr_tokenized_ids)
        # en_tokenized = self.en_tokenizer.convert_ids_to_tokens(en_tokenized_ids)
        
        kr_tokenized_ids = torch.IntTensor(kr_tokenized_ids)
        en_tokenized_ids = torch.IntTensor(en_tokenized_ids)
        
        return kr_tokenized_ids, en_tokenized_ids

class en2kr_Validation_Dataset(Dataset): 
    def __init__(self, max_len): 
        self.data = df_validation
        self.max_len = max_len 
        self.kr_tokenizer = kr_tokenizer
        self.en_tokenizer = en_tokenizer
        
    def __len__(self): 
        return len(self.data) 

    def __getitem__(self, idx): 
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()
        kr_tokenized_ids = self.kr_tokenizer(kr_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids
        en_tokenized_ids = self.en_tokenizer(en_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids

        # kr_tokenized = self.kr_tokenizer.convert_ids_to_tokens(kr_tokenized_ids)
        # en_tokenized = self.en_tokenizer.convert_ids_to_tokens(en_tokenized_ids)
        
        kr_tokenized_ids = torch.IntTensor(kr_tokenized_ids)
        en_tokenized_ids = torch.IntTensor(en_tokenized_ids)
        
        return kr_tokenized_ids, en_tokenized_ids

In [8]:
batch_size = 128

train_dataset = en2kr_Train_Dataset(max_len=128)
test_dataset = en2kr_Test_Dataset(max_len=128)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True,  shuffle=True, generator=torch.Generator(device=device), pin_memory=True, num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True, generator=torch.Generator(device=device), pin_memory=True, num_workers=4)

# Transformer Model Implementation 


In [9]:
# import required packages
import torch 
import torch.nn as nn 
import copy 
import math 
from torch.nn.functional import log_softmax
import pandas as pd 
from torch.utils.data import Dataset, DataLoader
import spacy 


In [10]:
# Define a Token Embedding 
class TokenEmbeddings(nn.Embedding): 
    """
    Converting token into embedding vector
    """
    def __init__(self, vocab_size, d_model):
        """
        class for token embedding without positional encoding

        :param vocab_size: number of vocabs that TokenEmbeddings can handle
        :param d_model: dimension of embedding vector
        """
        super(TokenEmbeddings, self).__init__(vocab_size, d_model, padding_idx=1)

# Define Positional Encoding 
class PositionalEncoding(nn.Module): 
    """ 
    compute reusable sinusoid positional encoding
    """
    def __init__(self, d_model, max_len, device): 
        """
        construct sinusoid positional encoding that is going to be reused everytime when it is needed

        :param d_model: dimension of embedding vector
        :param max_len: maximum sequence length of token(a.k.a window size of attention method)
        """
        super(PositionalEncoding, self).__init__()

        # define a max_len * d_model size encoding matrix
        self.encoding = torch.zeros(max_len, d_model, device=device)

        # since positional encoding is not learnable, we turn off the gradient engine
        self.encoding.requires_grad = False

        # define a position at the sequence
        pos = torch.arange(0, max_len, device=device)
        # expand the max_len vector to max_len * 1 matrix 
        pos = pos.float().unsqueeze(dim=1)

        _2i = torch.arange(0, d_model, step=2, device=device).float()

        # define a sinusoid positional encoding
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))

    def forward(self, x): 
        batch_size, seq_len = x.shape

        return self.encoding[:seq_len, :]

# Define Transformer Embedding 
class TransformerEmbedding(nn.Module): 
    """
    token embedding + positional encoding
    """
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device): 
        """
        initialize the embedding class for word+position embedding

        :param vocab_size: number of vocabs that TokenEmbeddings can handle
        :param d_model: dimension of embedding vector
        :param max_len: maximum sequence length of token(a.k.a window size of attention method)
        :param drop_prob: dropout probability to reduce overfitting
        """
        super(TransformerEmbedding, self).__init__()
        self.token_emb = TokenEmbeddings(vocab_size, d_model)
        self.position_emb = PositionalEncoding(d_model, max_len, device)
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x): 
        tok_emb = self.token_emb(x)
        pos_emb = self.position_emb(x)

        return self.dropout(tok_emb+pos_emb)


In [11]:
# Define Attention Block 
class AttentionBlock(nn.Module): 
    """
    compute scale dot product attention for Query, Key, Value
    """
    def __init__(self):
        super(AttentionBlock, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, eps=1e-12): 
        batch_size, head, length, d_tensor = k.shape

        # calculate the k_T
        k_T = k.transpose(2, 3)

        # calculate the attention weight 
        att_weight = (q @ k_T) / math.sqrt(d_tensor)

        # if there are any masks that needs to be applied
        if mask is not None:
            att_weight = att_weight.masked_fill(mask == 0, -10000)

        # calculate the softmax 
        # att_weight shape: batch_size, head, seq_len, seq_len
        att_weight = self.softmax(att_weight)

        # att_weight @ v shape: batch_size, head, seq_len, d_model
        return att_weight @ v, att_weight

# Define MultiHeadAttention Block 
class MultiHeadAttentionBlock(nn.Module): 
    """
    define multi head attention block using AttentionBlock module
    """
    def __init__(self, d_model, n_head): 
        """
        Multi-head self-attention utilize the parallelism of GPU

        :param d_model: dimension of embedding vector
        :param n_head: number of heads
        """
        super(MultiHeadAttentionBlock, self).__init__()
        self.n_head = n_head
        self.attention = AttentionBlock()
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)

        # in the paper, d_v = d_k = d_q
        self.Wv = nn.Linear(d_model, d_model)

        self.Wconcat = nn.Linear(d_model, d_model)

    def split(self, tensor): 
        """
        split the tensor by number of head 

        :param tensor: tensor of shape batch_size  * seq_len * d_model
        :return: return tensor of shape batch_size * n_head * seq_len * d_tensor
        """
        batch_size, seq_len, d_model = tensor.shape 

        d_tensor = d_model // self.n_head

        tensor = tensor.view(batch_size, seq_len, self.n_head, d_tensor).transpose(1, 2)

        return tensor 

    def concat(self, tensor): 
        """
        concat tensor. Inverse operation of split

        :param tensor: tensor of shape batch_size * n_head * seq_len * d_tensor 
        :return: return tensor of shape batch_size * seq_len * d_model
        """
        batch_size, n_head, seq_len, d_tensor = tensor.shape

        d_model = n_head * d_tensor
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        return tensor 
    

    def forward(self, q, k, v, mask=None): 
        # apply linear transformation to derive q, k, v 
        q, k, v = self.Wq(q), self.Wk(k), self.Wv(v)

        # split the tensor by number of heads
        q, k, v = self.split(q), self.split(k), self.split(v)

        # apply attention to q, k, v 
        out, att_weight = self.attention(q, k, v, mask=mask)

        # concat 
        out = self.concat(out)

        # apply concat weight 
        out = self.Wconcat(out)
        return out 
        

In [12]:
# Define LayerNorm 
class LayerNorm(nn.Module): 
    """
    Normalize all features for each samples. 
    """
    def __init__(self, d_model, eps=1e-12): 
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps 

    def forward(self, x): 
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False)

        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta

        return out 

In [13]:
# define FeedForward Network 
class FeedForwardBlock(nn.Module): 
    def __init__(self, d_model, hidden, drop_prob=0.1): 
        super(FeedForwardBlock, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden) 
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU() 
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x): 
        x = self.linear1(x) 
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x) 
        return x 

In [14]:
# Define Encoder Layer 
class EncoderLayer(nn.Module): 
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob): 
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm1 = LayerNorm(d_model) 
        self.dropout1 = nn.Dropout(drop_prob)

        self.ffn = FeedForwardBlock(d_model, ffn_hidden, drop_prob)
        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(drop_prob)

    def forward(self, x, src_mask): 
        residual = x 
        x = self.attention(q=x, k=x, v=x, mask=src_mask)

        x = self.dropout1(x) 
        x = self.norm1(x + residual)

        residual = x 
        x = self.ffn(x) 

        x =  self.dropout2(x)
        x = self.norm2(x + residual)

        return x 

# Define Decoder Layer 
class DecoderLayer(nn.Module): 
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob): 
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm1 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.enc_dec_attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = FeedForwardBlock(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask): 
        residual = dec
        x = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)

        x = self.dropout1(x)
        x = self.norm1(x + residual)

        if enc is not None: 
            residual = x 
            x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
            x = self.dropout2(x) 
            x = self.norm2(x + residual)

        residual = x 
        x = self.ffn(x)
        x = self.dropout3(x)
        x = self.norm3(x + residual)

        return x 

In [15]:
# Define Encoder Model
class Encoder(nn.Module): 
    """
    Encoder for Transformer
    """
    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device): 
        super(Encoder, self).__init__()
        self.emb = TransformerEmbedding(d_model=d_model, max_len=max_len, vocab_size=enc_voc_size, drop_prob=drop_prob, device=device)
        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob) for _ in range(n_layers)])

    def forward(self, x, src_mask): 
        x = self.emb(x) 
        for layer in self.layers: 
            x = layer(x, src_mask)

        return x

        
class Decoder(nn.Module): 
    """
    Decoder for Transformer
    """
    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device): 
        super(Decoder, self).__init__()
        self.emb = TransformerEmbedding(d_model=d_model, max_len=max_len, vocab_size=dec_voc_size, drop_prob=drop_prob, device=device)

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob) for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, enc_src, trg_mask, src_mask): 
        trg = self.emb(trg)
        for layer in self.layers: 
            trg = layer(trg, enc_src, trg_mask, src_mask) 

        output = self.linear(trg)
        return output
    

# Define Transformer Model 
class Transformer(nn.Module): 
    """
    Transformer Model
    """
    def __init__(self, src_pad_token, trg_pad_token, trg_sos_token, enc_voc_size, dec_voc_size, n_head, max_len, d_model, ffn_hidden, n_layers, drop_prob, device): 
        """
        Constructing Transformer Model 

        :param src_pad_token: embedding vector that represents <pad> in source 
        :param trg_pad_token: embedding vector that represents <pad> in target 
        :param trg_sos_token: embedding vector that represents <sos> in target 
        :params enc_voc_size: number of vocabs that encoderEmbedder can handle
        :params dec_voc_size: number of vocabs that decoderEmbedder can handle
        :params ffn_hidden: hidden vector dimension for fastfeedforward layer 
        :params n_layers: number of EncoderLayer/DecoderLayer used
        :params drop_prob: dropout probability
        """
        super(Transformer, self).__init__()

        self.src_pad_token = src_pad_token
        self.trg_pad_token = trg_pad_token
        self.trg_sos_token = trg_sos_token
        self.device = device
        self.n_head = n_head
        self.encoder = Encoder(d_model=d_model, n_head=n_head, max_len=max_len, ffn_hidden=ffn_hidden, enc_voc_size=enc_voc_size, drop_prob=drop_prob, n_layers=n_layers, device=device)
        self.decoder = Decoder(d_model=d_model, n_head=n_head, max_len=max_len, ffn_hidden=ffn_hidden, dec_voc_size=dec_voc_size, drop_prob=drop_prob, n_layers=n_layers, device=device)

    def make_src_mask(self, src): 
        # print(f'src: {src}')
        # print(f'src_pad_token: {self.src_pad_token}')
        # print(f'src != self.src_pad_token: {src != self.src_pad_token}')
        src_mask = (src != self.src_pad_token).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg): 
        trg_pad_mask = (trg != self.trg_pad_token).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]

        # make a look-ahead mask using torch.tril 
        # [[1 0 0]
        #  [1 1 0]
        #  [1 1 1]]
        trg_sub_mask = torch.tril(torch.ones(trg_len, trg_len)).type(torch.ByteTensor).to(self.device)

        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask
        
    
    def forward(self, src, trg): 
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output

In [16]:
import math 
from collections import Counter 
import numpy 

# compute the statistics for BLEU 
def bleu_stats(hypothesis, reference): 
    stats = [] 
    stats.append(len(hypothesis))
    stats.append(len(reference))

    for n in range(1, 5): 
        s_ngrams = Counter(
            [tuple(hypothesis[i:i+n]) for i in range(len(hypothesis) + 1 - n)]
        )

        r_ngrams = Counter(
            [tuple(reference[i:i+n]) for i in range(len(reference) + 1 - n)]
        )

        stats.append(max([sum((s_ngrams & r_ngrams).values()), 0]))
        stats.append(max([len(hypothesis) + 1 - n, 0]))

    return stats 

def bleu(stats): 
    for i in stats:
        if i == 0:
            return 0 

    (h_len, r_len) = stats[:2]
    log_bleu_prec = sum(
        [math.log(float(x)/y) for x, y in zip(stats[2::2], stats[3::2])]
    ) / 4.

    return math.exp(min([0, 1 - float(r_len) / h_len]) + log_bleu_prec)

def get_bleu(hypotheses, reference):
    """Get validation BLEU score for dev set."""
    stats = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    for hyp, ref in zip(hypotheses, reference):
        stats += np.array(bleu_stats(hyp, ref))
    return 100 * bleu(stats)
    

# Train the Model using datas

In [17]:
from torch.optim import Adam
from datetime import datetime
import torch 
from tqdm import tqdm

In [18]:
# define device 
# configuration 

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA as device")
else:
    # Check that MPS is available
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                  "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
                  "and/or you do not have an MPS-enabled device on this machine.")
        device = torch.device("cpu")
        print("Using CPU as device")
    else:
        device = torch.device("mps")
        print("Using MPS as device")
    
torch.set_default_device(device)

Using CUDA as device


In [19]:
# Define some configuration of training 
d_model = 256 
n_head = 8
max_len = 128
ffn_hidden = 256 
n_layers=6
drop_prob=0.1
epochs=300
init_lr = 1e-3
weight_decay = 5e-4
clip = 1

In [20]:
# Define some configuration of training 

src_pad_token = kr_tokenizer.pad_token_id
trg_pad_token = en_tokenizer.pad_token_id
trg_sos_token = en_tokenizer.sep_token_id
enc_voc_size = kr_tokenizer.vocab_size
dec_voc_size = en_tokenizer.vocab_size

print(f'src_pad_token: {src_pad_token}')
print(f'trg_pad_token: {trg_pad_token}')
print(f'trg_sos_token: {trg_sos_token}')
print(f'enc_voc_size: {enc_voc_size}')
print(f'dec_voc_size: {dec_voc_size}')

src_pad_token: 0
trg_pad_token: 0
trg_sos_token: 102
enc_voc_size: 42000
dec_voc_size: 30522


In [21]:
# Prepare the model 
model = Transformer(src_pad_token, trg_pad_token, trg_sos_token, enc_voc_size, dec_voc_size, n_head, max_len, d_model, ffn_hidden, n_layers, drop_prob, device).to(device)
model.train()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'model parameter #: {count_parameters(model)}')


model parameter #: 32741178


In [22]:
# Setup optimizer 
optimizer = Adam(params=model.parameters(), lr=init_lr, weight_decay=weight_decay)

loss_func = nn.CrossEntropyLoss(ignore_index=src_pad_token)


In [23]:
def train_epoch(epoch_num): 
    train_epoch_loss = 0 

    for step, (kr_tokenized, en_tokenized) in tqdm(enumerate(train_dataloader)): 
        optimizer.zero_grad()

        kr_tokenized = kr_tokenized.to(device)
        en_tokenized = en_tokenized.to(device)

        out = model(kr_tokenized, en_tokenized[:, :-1])

        # remove sos token from en_tokenized when calculating loss because out will not include sos token. 
        en_tokenized = en_tokenized[:, 1:].contiguous().view(-1)

        out = out.contiguous().view(-1, out.shape[-1])

        loss = loss_func(out.to(device), en_tokenized.type(torch.LongTensor).to(device))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        train_epoch_loss += loss.item()
        
        if step % 100 == 0:
            print(f'EPOCH #{epoch_num} STEP #{step} | loss: {loss.item()}, avg_loss: {train_epoch_loss / (step + 1)}')

    train_step_loss = train_epoch_loss / (step+1)
    # After training epoch, do evaluation 

    return train_step_loss
    

# evaluate the model 
def evaluate(): 
    model.eval()
    test_epoch_loss = 0 
    test_bleu_loss = 0
    
    with torch.no_grad(): 
        for step, (kr_tokenized, en_tokenized) in tqdm(enumerate(test_dataloader)): 
            kr_tokenized = kr_tokenized.to(device)
            en_tokenized = en_tokenized.to(device)

            out = model(kr_tokenized, en_tokenized[:, :-1])

            # remove sos token from en_tokenized when calculating loss because out will not include sos token. 
            en_tokenized = en_tokenized[:, 1:].contiguous().view(-1)
    
            out = out.contiguous().view(-1, out.shape[-1])
            loss = loss_func(out.to(device), en_tokenized.type(torch.LongTensor).to(device))
            test_epoch_loss += loss.item()

            # calcuate the bleu 
            # TODO
    return test_step_loss

In [None]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

for epoch in range(epochs):
    train_loss = train_epoch(epoch)
    test_loss = evaluate()

    best_vloss = 100_000_000

    print(f'Epoch {epoch}: Train Loss {train_loss}, Test Loss {test_loss}')

    if test_loss < best_vloss:
        best_vloss = avg_vloss 
        model_path = f'/kaggle/working/model_{timestamp}_{epoch}' 
        torch.save(model.state_dict(), model_path)  

1it [00:01,  1.10s/it]

EPOCH #0 STEP #0 | loss: 10.46674919128418, avg_loss: 10.46674919128418


101it [00:47,  2.15it/s]

EPOCH #0 STEP #100 | loss: 6.184701442718506, avg_loss: 6.895652855976974


201it [01:34,  2.14it/s]

EPOCH #0 STEP #200 | loss: 5.654942512512207, avg_loss: 6.381246419688362


301it [02:20,  2.15it/s]

EPOCH #0 STEP #300 | loss: 5.396417140960693, avg_loss: 6.094745647075564


401it [03:07,  2.15it/s]

EPOCH #0 STEP #400 | loss: 5.4601521492004395, avg_loss: 5.914357733548133


501it [03:53,  2.15it/s]

EPOCH #0 STEP #500 | loss: 5.0429558753967285, avg_loss: 5.7819255668959935


601it [04:40,  2.14it/s]

EPOCH #0 STEP #600 | loss: 5.208008289337158, avg_loss: 5.6821281445799965


701it [05:27,  2.14it/s]

EPOCH #0 STEP #700 | loss: 5.175655841827393, avg_loss: 5.60462308304116


784it [06:05,  2.13it/s]