In [1]:
from transformers import BertTokenizerFast, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import pandas as pd 
import os 
import torch

In [2]:
# define device 
# configuration 

TOKENIZERS_PARALLELISM = True

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA as device")
else:
    # Check that MPS is available
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                  "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
                  "and/or you do not have an MPS-enabled device on this machine.")
        device = torch.device("cpu")
        print("Using CPU as device")
    else:
        device = torch.device("mps")
        print("Using MPS as device")
    
torch.set_default_device(device)

Using CUDA as device


In [3]:
current_path = os.getcwd()
preprocessed_directory = preprocessed_directory = os.path.join(current_path, "preprocessed")

In [4]:

# import tokenizers
kr_tokenizer = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")
en_tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased")



In [5]:
# Test tokenizers 
tmp_kr_sentence = "오늘 하교길에 길고양이를 보았는데, 너무 귀여워서 집에 데려가고 싶었다. 하지만 그러지는 않았다."
tmp_en_sentence = "The cat I saw during heading home today was so cute, that I wanted to bring it to home."

tmp_kr_tokenized = kr_tokenizer(tmp_kr_sentence, add_special_tokens=True, padding="max_length", max_length=256, truncation=True)
tmp_en_tokenized = en_tokenizer(tmp_en_sentence, add_special_tokens=True, padding="max_length", max_length=256, truncation=True)

# print(kr_tokenizer.convert_ids_to_tokens(tmp_kr_tokenized.input_ids))
# print(en_tokenizer.convert_ids_to_tokens(tmp_en_tokenized.input_ids))

# print(kr_tokenizer.decode(tmp_kr_tokenized.input_ids, skip_special_tokens=True))

# check if both tokenizer has pad token 
# print(kr_tokenizer.pad_token)
# print(en_tokenizer.pad_token)

In [6]:
df_train = pd.read_parquet(path="/kaggle/input/en2kr-translation/train.parquet")
df_test = pd.read_parquet(path="/kaggle/input/en2kr-translation/test.parquet")
df_validation = pd.read_parquet(path="/kaggle/input/en2kr-translation/validation.parquet")



class en2kr_Train_Dataset(Dataset): 
    def __init__(self, max_len): 
        self.data = df_train
        self.max_len = max_len 
        self.kr_tokenizer = kr_tokenizer
        self.en_tokenizer = en_tokenizer
        
    def __len__(self): 
        return len(self.data) 

    def __getitem__(self, idx): 
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()
        kr_tokenized_ids = self.kr_tokenizer(kr_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids
        en_tokenized_ids = self.en_tokenizer(en_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids

        # kr_tokenized = self.kr_tokenizer.convert_ids_to_tokens(kr_tokenized_ids)
        # en_tokenized = self.en_tokenizer.convert_ids_to_tokens(en_tokenized_ids)

        kr_tokenized_ids = torch.IntTensor(kr_tokenized_ids)
        en_tokenized_ids = torch.IntTensor(en_tokenized_ids)
        return kr_tokenized_ids, en_tokenized_ids
        
class en2kr_Test_Dataset(Dataset): 
    def __init__(self, max_len): 
        self.data = df_test
        self.max_len = max_len 
        self.kr_tokenizer = kr_tokenizer
        self.en_tokenizer = en_tokenizer
        
    def __len__(self): 
        return len(self.data) 

    def __getitem__(self, idx): 
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()
        kr_tokenized_ids = self.kr_tokenizer(kr_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids
        en_tokenized_ids = self.en_tokenizer(en_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids

        # kr_tokenized = self.kr_tokenizer.convert_ids_to_tokens(kr_tokenized_ids)
        # en_tokenized = self.en_tokenizer.convert_ids_to_tokens(en_tokenized_ids)
        
        kr_tokenized_ids = torch.IntTensor(kr_tokenized_ids)
        en_tokenized_ids = torch.IntTensor(en_tokenized_ids)
        
        return kr_tokenized_ids, en_tokenized_ids

class en2kr_Validation_Dataset(Dataset): 
    def __init__(self, max_len): 
        self.data = df_validation
        self.max_len = max_len 
        self.kr_tokenizer = kr_tokenizer
        self.en_tokenizer = en_tokenizer
        
    def __len__(self): 
        return len(self.data) 

    def __getitem__(self, idx): 
        row = self.data.iloc[[idx]]
        en_sentence = row["english"].item()
        kr_sentence = row["korean"].item()
        kr_tokenized_ids = self.kr_tokenizer(kr_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids
        en_tokenized_ids = self.en_tokenizer(en_sentence, add_special_tokens=True, padding="max_length", max_length=self.max_len, truncation=True).input_ids

        # kr_tokenized = self.kr_tokenizer.convert_ids_to_tokens(kr_tokenized_ids)
        # en_tokenized = self.en_tokenizer.convert_ids_to_tokens(en_tokenized_ids)
        
        kr_tokenized_ids = torch.IntTensor(kr_tokenized_ids)
        en_tokenized_ids = torch.IntTensor(en_tokenized_ids)
        
        return kr_tokenized_ids, en_tokenized_ids

In [7]:
batch_size = 128

train_dataset = en2kr_Train_Dataset(max_len=128)
test_dataset = en2kr_Test_Dataset(max_len=128)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size,drop_last=True,  shuffle=True, generator=torch.Generator(device=device))
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True, generator=torch.Generator(device=device))

# Transformer Model Implementation 


In [8]:
# import required packages
import torch 
import torch.nn as nn 
import copy 
import math 
from torch.nn.functional import log_softmax
import pandas as pd 
from torch.utils.data import Dataset, DataLoader
import spacy 


In [9]:
# Define a Token Embedding 
class TokenEmbeddings(nn.Embedding): 
    """
    Converting token into embedding vector
    """
    def __init__(self, vocab_size, d_model):
        """
        class for token embedding without positional encoding

        :param vocab_size: number of vocabs that TokenEmbeddings can handle
        :param d_model: dimension of embedding vector
        """
        super(TokenEmbeddings, self).__init__(vocab_size, d_model, padding_idx=1)

# Define Positional Encoding 
class PositionalEncoding(nn.Module): 
    """ 
    compute reusable sinusoid positional encoding
    """
    def __init__(self, d_model, max_len, device): 
        """
        construct sinusoid positional encoding that is going to be reused everytime when it is needed

        :param d_model: dimension of embedding vector
        :param max_len: maximum sequence length of token(a.k.a window size of attention method)
        """
        super(PositionalEncoding, self).__init__()

        # define a max_len * d_model size encoding matrix
        self.encoding = torch.zeros(max_len, d_model, device=device)

        # since positional encoding is not learnable, we turn off the gradient engine
        self.encoding.requires_grad = False

        # define a position at the sequence
        pos = torch.arange(0, max_len, device=device)
        # expand the max_len vector to max_len * 1 matrix 
        pos = pos.float().unsqueeze(dim=1)

        _2i = torch.arange(0, d_model, step=2, device=device).float()

        # define a sinusoid positional encoding
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))

    def forward(self, x): 
        batch_size, seq_len = x.shape

        return self.encoding[:seq_len, :]

# Define Transformer Embedding 
class TransformerEmbedding(nn.Module): 
    """
    token embedding + positional encoding
    """
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device): 
        """
        initialize the embedding class for word+position embedding

        :param vocab_size: number of vocabs that TokenEmbeddings can handle
        :param d_model: dimension of embedding vector
        :param max_len: maximum sequence length of token(a.k.a window size of attention method)
        :param drop_prob: dropout probability to reduce overfitting
        """
        super(TransformerEmbedding, self).__init__()
        self.token_emb = TokenEmbeddings(vocab_size, d_model)
        self.position_emb = PositionalEncoding(d_model, max_len, device)
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x): 
        tok_emb = self.token_emb(x)
        pos_emb = self.position_emb(x)

        return self.dropout(tok_emb+pos_emb)


In [10]:
# Define Attention Block 
class AttentionBlock(nn.Module): 
    """
    compute scale dot product attention for Query, Key, Value
    """
    def __init__(self):
        super(AttentionBlock, self).__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, eps=1e-12): 
        batch_size, head, length, d_tensor = k.shape

        # calculate the k_T
        k_T = k.transpose(2, 3)

        # calculate the attention weight 
        att_weight = (q @ k_T) / math.sqrt(d_tensor)

        # if there are any masks that needs to be applied
        if mask is not None:
            att_weight = att_weight.masked_fill(mask == 0, -10000)

        # calculate the softmax 
        # att_weight shape: batch_size, head, seq_len, seq_len
        att_weight = self.softmax(att_weight)

        # att_weight @ v shape: batch_size, head, seq_len, d_model
        return att_weight @ v, att_weight

# Define MultiHeadAttention Block 
class MultiHeadAttentionBlock(nn.Module): 
    """
    define multi head attention block using AttentionBlock module
    """
    def __init__(self, d_model, n_head): 
        """
        Multi-head self-attention utilize the parallelism of GPU

        :param d_model: dimension of embedding vector
        :param n_head: number of heads
        """
        super(MultiHeadAttentionBlock, self).__init__()
        self.n_head = n_head
        self.attention = AttentionBlock()
        self.Wq = nn.Linear(d_model, d_model)
        self.Wk = nn.Linear(d_model, d_model)

        # in the paper, d_v = d_k = d_q
        self.Wv = nn.Linear(d_model, d_model)

        self.Wconcat = nn.Linear(d_model, d_model)

    def split(self, tensor): 
        """
        split the tensor by number of head 

        :param tensor: tensor of shape batch_size  * seq_len * d_model
        :return: return tensor of shape batch_size * n_head * seq_len * d_tensor
        """
        batch_size, seq_len, d_model = tensor.shape 

        d_tensor = d_model // self.n_head

        tensor = tensor.view(batch_size, seq_len, self.n_head, d_tensor).transpose(1, 2)

        return tensor 

    def concat(self, tensor): 
        """
        concat tensor. Inverse operation of split

        :param tensor: tensor of shape batch_size * n_head * seq_len * d_tensor 
        :return: return tensor of shape batch_size * seq_len * d_model
        """
        batch_size, n_head, seq_len, d_tensor = tensor.shape

        d_model = n_head * d_tensor
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        return tensor 
    

    def forward(self, q, k, v, mask=None): 
        # apply linear transformation to derive q, k, v 
        q, k, v = self.Wq(q), self.Wk(k), self.Wv(v)

        # split the tensor by number of heads
        q, k, v = self.split(q), self.split(k), self.split(v)

        # apply attention to q, k, v 
        out, att_weight = self.attention(q, k, v, mask=mask)

        # concat 
        out = self.concat(out)

        # apply concat weight 
        out = self.Wconcat(out)
        return out 
        

In [11]:
# Define LayerNorm 
class LayerNorm(nn.Module): 
    """
    Normalize all features for each samples. 
    """
    def __init__(self, d_model, eps=1e-12): 
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps 

    def forward(self, x): 
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False)

        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta

        return out 

In [12]:
# define FeedForward Network 
class FeedForwardBlock(nn.Module): 
    def __init__(self, d_model, hidden, drop_prob=0.1): 
        super(FeedForwardBlock, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden) 
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU() 
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x): 
        x = self.linear1(x) 
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x) 
        return x 

In [13]:
# Define Encoder Layer 
class EncoderLayer(nn.Module): 
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob): 
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm1 = LayerNorm(d_model) 
        self.dropout1 = nn.Dropout(drop_prob)

        self.ffn = FeedForwardBlock(d_model, ffn_hidden, drop_prob)
        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(drop_prob)

    def forward(self, x, src_mask): 
        residual = x 
        x = self.attention(q=x, k=x, v=x, mask=src_mask)

        x = self.dropout1(x) 
        x = self.norm1(x + residual)

        residual = x 
        x = self.ffn(x) 

        x =  self.dropout2(x)
        x = self.norm2(x + residual)

        return x 

# Define Decoder Layer 
class DecoderLayer(nn.Module): 
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob): 
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm1 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.enc_dec_attention = MultiHeadAttentionBlock(d_model, n_head)
        self.norm2 = LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = FeedForwardBlock(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask): 
        residual = dec
        x = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)

        x = self.dropout1(x)
        x = self.norm1(x + residual)

        if enc is not None: 
            residual = x 
            x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
            x = self.dropout2(x) 
            x = self.norm2(x + residual)

        residual = x 
        x = self.ffn(x)
        x = self.dropout3(x)
        x = self.norm3(x + residual)

        return x 

In [14]:
# Define Encoder Model
class Encoder(nn.Module): 
    """
    Encoder for Transformer
    """
    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device): 
        super(Encoder, self).__init__()
        self.emb = TransformerEmbedding(d_model=d_model, max_len=max_len, vocab_size=enc_voc_size, drop_prob=drop_prob, device=device)
        self.layers = nn.ModuleList([EncoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob) for _ in range(n_layers)])

    def forward(self, x, src_mask): 
        x = self.emb(x) 
        for layer in self.layers: 
            x = layer(x, src_mask)

        return x

        
class Decoder(nn.Module): 
    """
    Decoder for Transformer
    """
    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob, device): 
        super(Decoder, self).__init__()
        self.emb = TransformerEmbedding(d_model=d_model, max_len=max_len, vocab_size=dec_voc_size, drop_prob=drop_prob, device=device)

        self.layers = nn.ModuleList([DecoderLayer(d_model=d_model, ffn_hidden=ffn_hidden, n_head=n_head, drop_prob=drop_prob) for _ in range(n_layers)])

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, enc_src, trg_mask, src_mask): 
        trg = self.emb(trg)
        for layer in self.layers: 
            trg = layer(trg, enc_src, trg_mask, src_mask) 

        output = self.linear(trg)
        return output
    

# Define Transformer Model 
class Transformer(nn.Module): 
    """
    Transformer Model
    """
    def __init__(self, src_pad_token, trg_pad_token, trg_sos_token, enc_voc_size, dec_voc_size, n_head, max_len, ffn_hidden, n_layers, drop_prob, device): 
        """
        Constructing Transformer Model 

        :param src_pad_token: embedding vector that represents <pad> in source 
        :param trg_pad_token: embedding vector that represents <pad> in target 
        :param trg_sos_token: embedding vector that represents <sos> in target 
        :params enc_voc_size: number of vocabs that encoderEmbedder can handle
        :params dec_voc_size: number of vocabs that decoderEmbedder can handle
        :params ffn_hidden: hidden vector dimension for fastfeedforward layer 
        :params n_layers: number of EncoderLayer/DecoderLayer used
        :params drop_prob: dropout probability
        """
        super(Transformer, self).__init__()

        self.src_pad_token = src_pad_token
        self.trg_pad_token = trg_pad_token
        self.trg_sos_token = trg_sos_token
        self.device = device
        self.n_head = n_head
        self.encoder = Encoder(d_model=d_model, n_head=n_head, max_len=max_len, ffn_hidden=ffn_hidden, enc_voc_size=enc_voc_size, drop_prob=drop_prob, n_layers=n_layers, device=device)
        self.decoder = Decoder(d_model=d_model, n_head=n_head, max_len=max_len, ffn_hidden=ffn_hidden, dec_voc_size=dec_voc_size, drop_prob=drop_prob, n_layers=n_layers, device=device)

    def make_src_mask(self, src): 
        # print(f'src: {src}')
        # print(f'src_pad_token: {self.src_pad_token}')
        # print(f'src != self.src_pad_token: {src != self.src_pad_token}')
        src_mask = (src != self.src_pad_token).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg): 
        trg_pad_mask = (trg != self.trg_pad_token).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]

        # make a look-ahead mask using torch.tril 
        # [[1 0 0]
        #  [1 1 0]
        #  [1 1 1]]
        trg_sub_mask = torch.tril(torch.ones(trg_len, trg_len)).type(torch.ByteTensor).to(self.device)

        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask
        
    
    def forward(self, src, trg): 
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output

In [15]:
import math 
from collections import Counter 
import numpy 

# compute the statistics for BLEU 
def bleu_stats(hypothesis, reference): 
    stats = [] 
    stats.append(len(hypothesis))
    stats.append(len(reference))

    for n in range(1, 5): 
        s_ngrams = Counter(
            [tuple(hypothesis[i:i+n]) for i in range(len(hypothesis) + 1 - n)]
        )

        r_ngrams = Counter(
            [tuple(reference[i:i+n]) for i in range(len(reference) + 1 - n)]
        )

        stats.append(max([sum((s_ngrams & r_ngrams).values()), 0]))
        stats.append(max([len(hypothesis) + 1 - n, 0]))

    return stats 

def bleu(stats): 
    for i in stats:
        if i == 0:
            return 0 

    (h_len, r_len) = stats[:2]
    log_bleu_prec = sum(
        [math.log(float(x)/y) for x, y in zip(stats[2::2], stats[3::2])]
    ) / 4.

    return math.exp(min([0, 1 - float(r_len) / h_len]) + log_bleu_prec)

def get_bleu(hypotheses, reference):
    """Get validation BLEU score for dev set."""
    stats = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
    for hyp, ref in zip(hypotheses, reference):
        stats += np.array(bleu_stats(hyp, ref))
    return 100 * bleu(stats)
    

# Train the Model using datas

In [16]:
from torch.optim import Adam
from datetime import datetime
import torch 
from tqdm import tqdm

In [17]:
# define device 
# configuration 

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA as device")
else:
    # Check that MPS is available
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                  "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
                  "and/or you do not have an MPS-enabled device on this machine.")
        device = torch.device("cpu")
        print("Using CPU as device")
    else:
        device = torch.device("mps")
        print("Using MPS as device")
    
torch.set_default_device(device)

Using CUDA as device


In [18]:
# Define some configuration of training 
d_model = 256 
n_head = 8
max_len = 128
ffn_hidden = 128 
n_layers=6
drop_prob=0.1
epochs=300
init_lr = 1e-3
weight_decay = 5e-4
clip = 1

In [19]:
# Define some configuration of training 

src_pad_token = kr_tokenizer.pad_token_id
trg_pad_token = en_tokenizer.pad_token_id
trg_sos_token = en_tokenizer.sep_token_id
enc_voc_size = kr_tokenizer.vocab_size
dec_voc_size = en_tokenizer.vocab_size

print(f'src_pad_token: {src_pad_token}')
print(f'trg_pad_token: {trg_pad_token}')
print(f'trg_sos_token: {trg_sos_token}')
print(f'enc_voc_size: {enc_voc_size}')
print(f'dec_voc_size: {dec_voc_size}')

src_pad_token: 0
trg_pad_token: 0
trg_sos_token: 102
enc_voc_size: 42000
dec_voc_size: 30522


In [20]:
# Prepare the model 
model = Transformer(src_pad_token, trg_pad_token, trg_sos_token, enc_voc_size, dec_voc_size,n_head, max_len, ffn_hidden, n_layers, drop_prob, device).to(device)
model.train()

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'model parameter #: {count_parameters(model)}')


model parameter #: 31953210


In [21]:
# Setup optimizer 
optimizer = Adam(params=model.parameters(), lr=init_lr, weight_decay=weight_decay)

loss_func = nn.CrossEntropyLoss(ignore_index=src_pad_token)


In [22]:
def train_epoch(epoch_num): 
    train_epoch_loss = 0 

    for step, (kr_tokenized, en_tokenized) in tqdm(enumerate(train_dataloader)): 
        optimizer.zero_grad()

        kr_tokenized = kr_tokenized.to(device)
        en_tokenized = en_tokenized.to(device)

        out = model(kr_tokenized, en_tokenized[:, :-1])

        # remove sos token from en_tokenized when calculating loss because out will not include sos token. 
        en_tokenized = en_tokenized[:, 1:].contiguous().view(-1)

        out = out.contiguous().view(-1, out.shape[-1])

        loss = loss_func(out.to(device), en_tokenized.type(torch.LongTensor).to(device))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        train_epoch_loss += loss.item()
        
        if step % 100 == 0:
            print(f'EPOCH #{epoch_num} STEP #{step} | loss: {loss.item()}, avg_loss: {train_epoch_loss / (step + 1)}')

    train_step_loss = train_epoch_loss / (step+1)
    # After training epoch, do evaluation 

    return train_step_loss
    

# evaluate the model 
def evaluate(): 
    model.eval()
    test_epoch_loss = 0 
    test_bleu_loss = 0
    
    with torch.no_grad(): 
        for step, (kr_tokenized, en_tokenized) in tqdm(enumerate(test_dataloader)): 
            kr_tokenized = kr_tokenized.to(device)
            en_tokenized = en_tokenized.to(device)

            out = model(kr_tokenized, en_tokenized[:, :-1])

            # remove sos token from en_tokenized when calculating loss because out will not include sos token. 
            en_tokenized = en_tokenized[:, 1:].contiguous().view(-1)
    
            out = out.contiguous().view(-1, out.shape[-1])
            loss = loss_func(out.to(device), en_tokenized.type(torch.LongTensor).to(device))
            test_epoch_loss += loss.item()

            # calcuate the bleu 
            # TODO
    return test_step_loss

In [23]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

for epoch in range(epochs):
    train_loss = train_epoch(epoch)
    test_loss = evaluate()

    best_vloss = 100_000_000

    print(f'Epoch {epoch}: Train Loss {train_loss}, Test Loss {test_loss}')

    if test_loss < best_vloss:
        best_vloss = avg_vloss 
        model_path = f'models/model_{timestamp}_{epoch}' 
        torch.save(model.state_dict(), model_path)  

1it [00:01,  1.01s/it]

EPOCH #0 STEP #0 | loss: 10.541214942932129, avg_loss: 10.541214942932129


101it [00:56,  1.82it/s]

EPOCH #0 STEP #100 | loss: 6.313336372375488, avg_loss: 7.000858165249966


201it [01:51,  1.80it/s]

EPOCH #0 STEP #200 | loss: 5.790494918823242, avg_loss: 6.506908468939178


301it [02:46,  1.82it/s]

EPOCH #0 STEP #300 | loss: 5.443808078765869, avg_loss: 6.209476363223256


401it [03:41,  1.81it/s]

EPOCH #0 STEP #400 | loss: 5.416319847106934, avg_loss: 6.0133354907618495


501it [04:37,  1.81it/s]

EPOCH #0 STEP #500 | loss: 5.411207675933838, avg_loss: 5.8705325869029155


601it [05:32,  1.80it/s]

EPOCH #0 STEP #600 | loss: 5.20664644241333, avg_loss: 5.762033244337695


701it [06:27,  1.82it/s]

EPOCH #0 STEP #700 | loss: 5.181723117828369, avg_loss: 5.677373403149223


801it [07:22,  1.82it/s]

EPOCH #0 STEP #800 | loss: 5.096766471862793, avg_loss: 5.608622318796451


901it [08:17,  1.82it/s]

EPOCH #0 STEP #900 | loss: 4.993821144104004, avg_loss: 5.552498687253014


1001it [09:12,  1.82it/s]

EPOCH #0 STEP #1000 | loss: 5.038311004638672, avg_loss: 5.504494700398478


1101it [10:07,  1.81it/s]

EPOCH #0 STEP #1100 | loss: 4.971671104431152, avg_loss: 5.463236480924241


1201it [11:03,  1.79it/s]

EPOCH #0 STEP #1200 | loss: 5.03561544418335, avg_loss: 5.427038662836614


1301it [11:58,  1.81it/s]

EPOCH #0 STEP #1300 | loss: 4.888619422912598, avg_loss: 5.397040049357198


1401it [12:54,  1.80it/s]

EPOCH #0 STEP #1400 | loss: 4.945991039276123, avg_loss: 5.370804591999149


1501it [13:49,  1.81it/s]

EPOCH #0 STEP #1500 | loss: 5.102868556976318, avg_loss: 5.346625537732535


1601it [14:44,  1.81it/s]

EPOCH #0 STEP #1600 | loss: 5.013057708740234, avg_loss: 5.326211640717163


1701it [15:39,  1.82it/s]

EPOCH #0 STEP #1700 | loss: 5.014945030212402, avg_loss: 5.3069283759012285


1801it [16:34,  1.81it/s]

EPOCH #0 STEP #1800 | loss: 5.120586395263672, avg_loss: 5.288847078157094


1901it [17:30,  1.81it/s]

EPOCH #0 STEP #1900 | loss: 4.906405925750732, avg_loss: 5.2724330526850585


2001it [18:25,  1.81it/s]

EPOCH #0 STEP #2000 | loss: 5.081276893615723, avg_loss: 5.257734864667199


2101it [19:21,  1.80it/s]

EPOCH #0 STEP #2100 | loss: 5.065474033355713, avg_loss: 5.2440520270219135


2201it [20:16,  1.82it/s]

EPOCH #0 STEP #2200 | loss: 5.084625720977783, avg_loss: 5.230697857147453


2301it [21:12,  1.79it/s]

EPOCH #0 STEP #2300 | loss: 4.996845722198486, avg_loss: 5.21819617322402


2401it [22:07,  1.79it/s]

EPOCH #0 STEP #2400 | loss: 4.797029495239258, avg_loss: 5.206418680876605


2501it [23:03,  1.81it/s]

EPOCH #0 STEP #2500 | loss: 4.789111137390137, avg_loss: 5.194752705759737


2601it [23:58,  1.80it/s]

EPOCH #0 STEP #2600 | loss: 4.946078300476074, avg_loss: 5.183212106478118


2701it [24:54,  1.80it/s]

EPOCH #0 STEP #2700 | loss: 4.8410420417785645, avg_loss: 5.172599741460129


2801it [25:49,  1.79it/s]

EPOCH #0 STEP #2800 | loss: 4.975810527801514, avg_loss: 5.161925049944887


2901it [26:45,  1.81it/s]

EPOCH #0 STEP #2900 | loss: 4.855827808380127, avg_loss: 5.152057188455502


3001it [27:40,  1.80it/s]

EPOCH #0 STEP #3000 | loss: 4.9022369384765625, avg_loss: 5.142554531332573


3101it [28:36,  1.79it/s]

EPOCH #0 STEP #3100 | loss: 4.8263068199157715, avg_loss: 5.133201427822612


3201it [29:32,  1.80it/s]

EPOCH #0 STEP #3200 | loss: 4.748086452484131, avg_loss: 5.125047830148475


3301it [30:27,  1.80it/s]

EPOCH #0 STEP #3300 | loss: 4.929564476013184, avg_loss: 5.116490756840606


3401it [31:23,  1.80it/s]

EPOCH #0 STEP #3400 | loss: 4.896345138549805, avg_loss: 5.10930697913031


3501it [32:19,  1.80it/s]

EPOCH #0 STEP #3500 | loss: 4.718604564666748, avg_loss: 5.102668470466045


3601it [33:14,  1.79it/s]

EPOCH #0 STEP #3600 | loss: 4.810760498046875, avg_loss: 5.0973838025151075


3701it [34:10,  1.81it/s]

EPOCH #0 STEP #3700 | loss: 4.950533390045166, avg_loss: 5.092490433937986


3801it [35:05,  1.79it/s]

EPOCH #0 STEP #3800 | loss: 4.846146106719971, avg_loss: 5.088402448657687


3901it [36:01,  1.80it/s]

EPOCH #0 STEP #3900 | loss: 4.950700283050537, avg_loss: 5.085229725753366


4001it [36:56,  1.80it/s]

EPOCH #0 STEP #4000 | loss: 4.815249443054199, avg_loss: 5.082221902152712


4101it [37:52,  1.82it/s]

EPOCH #0 STEP #4100 | loss: 5.09555721282959, avg_loss: 5.0805277411631335


4201it [38:47,  1.82it/s]

EPOCH #0 STEP #4200 | loss: 5.07959508895874, avg_loss: 5.079366654334991


4301it [39:43,  1.81it/s]

EPOCH #0 STEP #4300 | loss: 5.133204460144043, avg_loss: 5.078668712765858


4401it [40:38,  1.80it/s]

EPOCH #0 STEP #4400 | loss: 5.206789016723633, avg_loss: 5.0783725002629465


4501it [41:33,  1.81it/s]

EPOCH #0 STEP #4500 | loss: 5.012576103210449, avg_loss: 5.078908734468851


4601it [42:29,  1.79it/s]

EPOCH #0 STEP #4600 | loss: 5.075676441192627, avg_loss: 5.080065226145502


4701it [43:24,  1.79it/s]

EPOCH #0 STEP #4700 | loss: 5.181363105773926, avg_loss: 5.081968600839738


4801it [44:20,  1.80it/s]

EPOCH #0 STEP #4800 | loss: 5.306509017944336, avg_loss: 5.084116335040305


4901it [45:16,  1.81it/s]

EPOCH #0 STEP #4900 | loss: 5.222613334655762, avg_loss: 5.086588747962059


5001it [46:11,  1.80it/s]

EPOCH #0 STEP #5000 | loss: 5.2342000007629395, avg_loss: 5.08989161478236


5101it [47:07,  1.79it/s]

EPOCH #0 STEP #5100 | loss: 5.229423999786377, avg_loss: 5.093037656418741


5201it [48:02,  1.81it/s]

EPOCH #0 STEP #5200 | loss: 5.279235363006592, avg_loss: 5.097134287415548


5301it [48:58,  1.76it/s]

EPOCH #0 STEP #5300 | loss: 5.332743167877197, avg_loss: 5.101440882597346


5401it [49:54,  1.81it/s]

EPOCH #0 STEP #5400 | loss: 5.498273849487305, avg_loss: 5.1060260777296


5501it [50:49,  1.80it/s]

EPOCH #0 STEP #5500 | loss: 5.3998517990112305, avg_loss: 5.111038852921184


5601it [51:45,  1.80it/s]

EPOCH #0 STEP #5600 | loss: 5.4574456214904785, avg_loss: 5.116451723408304


5701it [52:41,  1.78it/s]

EPOCH #0 STEP #5700 | loss: 5.429754257202148, avg_loss: 5.12169444931116


5801it [53:37,  1.79it/s]

EPOCH #0 STEP #5800 | loss: 5.391129493713379, avg_loss: 5.1273660608826575


5901it [54:32,  1.80it/s]

EPOCH #0 STEP #5900 | loss: 5.488823413848877, avg_loss: 5.133220429380876


6001it [55:27,  1.81it/s]

EPOCH #0 STEP #6000 | loss: 5.512679576873779, avg_loss: 5.139298893614344


6101it [56:23,  1.80it/s]

EPOCH #0 STEP #6100 | loss: 5.570666313171387, avg_loss: 5.145330566868315


6201it [57:18,  1.81it/s]

EPOCH #0 STEP #6200 | loss: 5.684518337249756, avg_loss: 5.151806959162372


6301it [58:14,  1.81it/s]

EPOCH #0 STEP #6300 | loss: 5.49767541885376, avg_loss: 5.158379687333633


6401it [59:09,  1.81it/s]

EPOCH #0 STEP #6400 | loss: 5.586667537689209, avg_loss: 5.164629202599713


6501it [1:00:05,  1.81it/s]

EPOCH #0 STEP #6500 | loss: 5.442760467529297, avg_loss: 5.170644631038866


6601it [1:01:00,  1.79it/s]

EPOCH #0 STEP #6600 | loss: 5.6463117599487305, avg_loss: 5.176537555774908


6701it [1:01:55,  1.81it/s]

EPOCH #0 STEP #6700 | loss: 5.514750957489014, avg_loss: 5.182439062528122


6801it [1:02:51,  1.81it/s]

EPOCH #0 STEP #6800 | loss: 5.511746406555176, avg_loss: 5.188359956709081


6901it [1:03:46,  1.81it/s]

EPOCH #0 STEP #6900 | loss: 5.596418857574463, avg_loss: 5.19424016149886


7001it [1:04:41,  1.79it/s]

EPOCH #0 STEP #7000 | loss: 5.626104354858398, avg_loss: 5.20006668422924


7101it [1:05:37,  1.80it/s]

EPOCH #0 STEP #7100 | loss: 5.8297271728515625, avg_loss: 5.206271656330766


7201it [1:06:33,  1.80it/s]

EPOCH #0 STEP #7200 | loss: 5.77896785736084, avg_loss: 5.213157742209872


7301it [1:07:28,  1.80it/s]

EPOCH #0 STEP #7300 | loss: 5.768976211547852, avg_loss: 5.219090037469324


7401it [1:08:24,  1.80it/s]

EPOCH #0 STEP #7400 | loss: 5.6462602615356445, avg_loss: 5.224401397328815


7501it [1:09:19,  1.80it/s]

EPOCH #0 STEP #7500 | loss: 5.57719612121582, avg_loss: 5.229337397423955


7601it [1:10:15,  1.81it/s]

EPOCH #0 STEP #7600 | loss: 5.598400115966797, avg_loss: 5.234182435640331


7701it [1:11:10,  1.80it/s]

EPOCH #0 STEP #7700 | loss: 5.62114953994751, avg_loss: 5.238769987034745


7801it [1:12:06,  1.80it/s]

EPOCH #0 STEP #7800 | loss: 5.6087727546691895, avg_loss: 5.243139215378162


7901it [1:13:01,  1.81it/s]

EPOCH #0 STEP #7900 | loss: 5.602408409118652, avg_loss: 5.247336042785838


8001it [1:13:57,  1.82it/s]

EPOCH #0 STEP #8000 | loss: 5.518220901489258, avg_loss: 5.251457450479556


8101it [1:14:52,  1.82it/s]

EPOCH #0 STEP #8100 | loss: 5.470426559448242, avg_loss: 5.255578789029971


8201it [1:15:47,  1.81it/s]

EPOCH #0 STEP #8200 | loss: 5.6926751136779785, avg_loss: 5.260010766686499


8301it [1:16:42,  1.81it/s]

EPOCH #0 STEP #8300 | loss: 5.641053199768066, avg_loss: 5.264280108626586


8401it [1:17:38,  1.79it/s]

EPOCH #0 STEP #8400 | loss: 5.5576019287109375, avg_loss: 5.268352349277565


8501it [1:18:34,  1.78it/s]

EPOCH #0 STEP #8500 | loss: 5.505454063415527, avg_loss: 5.272131330895993


8601it [1:19:29,  1.80it/s]

EPOCH #0 STEP #8600 | loss: 5.646795749664307, avg_loss: 5.275898174329797


8701it [1:20:25,  1.80it/s]

EPOCH #0 STEP #8700 | loss: 5.575753211975098, avg_loss: 5.279477310268348


8764it [1:20:59,  1.80it/s]
0it [00:00, ?it/s]


TypeError: list indices must be integers or slices, not str