In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/transformer/__results__.html
/kaggle/input/transformer/__notebook__.ipynb
/kaggle/input/transformer/__output__.json
/kaggle/input/transformer/model.pt
/kaggle/input/transformer/custom.css


In [104]:
!pip install datasets --quiet
!pip install evaluate --quiet

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

filepath = '/kaggle/working/'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
from datasets import load_dataset

data = load_dataset(
    "mt_eng_vietnamese",
    "iwslt2015-en-vi"
)
data

Downloading builder script:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading and preparing dataset mt_eng_vietnamese/iwslt2015-en-vi (download: 30.83 MiB, generated: 31.59 MiB, post-processed: Unknown size, total: 62.42 MiB) to /root/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-en-vi/1.0.0/53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71...


Downloading data:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/140k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/188k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/132k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/184k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133318 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Dataset mt_eng_vietnamese downloaded and prepared to /root/.cache/huggingface/datasets/mt_eng_vietnamese/iwslt2015-en-vi/1.0.0/53add551a01e9874588066f89d42925f9fad43db347199dad00f7e4b0c905a71. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 133318
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1269
    })
})

In [4]:
train_data = data['train']
test_data = data['test']
valid_data = data['validation']

In [5]:
train_data[3]

{'translation': {'en': 'Headlines that look like this when they have to do with climate change , and headlines that look like this when they have to do with air quality or smog .',
  'vi': 'Có những dòng trông như thế này khi bàn về biến đổi khí hậu , và như thế này khi nói về chất lượng không khí hay khói bụi .'}}

In [6]:
SOURCE_LANG = 'en'
TARGET_LANG = 'vi'

UNK_IDX, PAD_IDX, SOS_IDX, EOS_IDX = 0, 1, 2, 3
SPECIAL_SYMBOLS = ['<unk>', '<pad>', '<sos>', '<eos>']

In [7]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

token_transform = {}
vocab = {}
token_transform[SOURCE_LANG] = get_tokenizer('basic_english')
token_transform[TARGET_LANG] = get_tokenizer('basic_english')

In [8]:
token_transform['en']("hello it's me")

['hello', 'it', "'", 's', 'me']

In [9]:
def tokenize_example(example, sos_token, eos_token, token_transform, src_lang, tgt_lang):
    en_tokens = token_transform['en'](example['translation']['en'])
    vi_tokens = token_transform['vi'](example['translation']['vi'])

    en_tokens = ([sos_token] + en_tokens + [eos_token])
    vi_tokens = ([sos_token] + vi_tokens + [eos_token])

    return {"en_tokens": (en_tokens), "vi_tokens":(vi_tokens)}

In [10]:
fn_kwargs = {
    'sos_token': '<sos>',
    'eos_token': '<eos>',
    'token_transform': token_transform,
    'src_lang': SOURCE_LANG,
    'tgt_lang': TARGET_LANG,
    }
train_data = train_data.map(tokenize_example, fn_kwargs = fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs = fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs = fn_kwargs)

  0%|          | 0/133318 [00:00<?, ?ex/s]

  0%|          | 0/1269 [00:00<?, ?ex/s]

  0%|          | 0/1269 [00:00<?, ?ex/s]

In [11]:
print(train_data[0])

{'translation': {'en': 'Rachel Pike : The science behind a climate headline', 'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'}, 'en_tokens': ['<sos>', 'rachel', 'pike', 'the', 'science', 'behind', 'a', 'climate', 'headline', '<eos>'], 'vi_tokens': ['<sos>', 'khoa', 'học', 'đằng', 'sau', 'một', 'tiêu', 'đề', 'về', 'khí', 'hậu', '<eos>']}


In [12]:
for lang in [SOURCE_LANG, TARGET_LANG]:
    vocab[lang] = build_vocab_from_iterator(
        train_data[lang + '_tokens'],
        min_freq = 1,
        specials = SPECIAL_SYMBOLS,
        special_first = True
    )
    vocab[lang].set_default_index(UNK_IDX)

In [13]:
print(vocab['vi'].get_itos()[:10])
print(len(vocab['vi']))
print(vocab['en'].get_itos()[:10])
print(len(vocab['en']))

['<unk>', '<pad>', '<sos>', '<eos>', ',', '.', 'và', 'tôi', 'là', 'một']
21114
['<unk>', '<pad>', '<sos>', '<eos>', ',', '.', 'the', 'and', 'to', '&apos']
47271


In [15]:
def numericalize_example(example, vocab, src_lang, tgt_lang):
    en_ids = torch.tensor(vocab[src_lang].lookup_indices(example['en_tokens']))
    vi_ids = torch.tensor(vocab[tgt_lang].lookup_indices(example['vi_tokens']))

    return {'en_ids': en_ids, 'vi_ids': vi_ids}

In [16]:
fn_kwargs = {
    'vocab': vocab,
    'src_lang': SOURCE_LANG,
    'tgt_lang': TARGET_LANG,
    }
train_data = train_data.map(numericalize_example, fn_kwargs = fn_kwargs)
test_data = test_data.map(numericalize_example, fn_kwargs = fn_kwargs)
valid_data = valid_data.map(numericalize_example, fn_kwargs = fn_kwargs)

  0%|          | 0/133318 [00:00<?, ?ex/s]

  0%|          | 0/1269 [00:00<?, ?ex/s]

  0%|          | 0/1269 [00:00<?, ?ex/s]

In [17]:
train_data[0]

{'translation': {'en': 'Rachel Pike : The science behind a climate headline',
  'vi': 'Khoa học đằng sau một tiêu đề về khí hậu'},
 'en_tokens': ['<sos>',
  'rachel',
  'pike',
  'the',
  'science',
  'behind',
  'a',
  'climate',
  'headline',
  '<eos>'],
 'vi_tokens': ['<sos>',
  'khoa',
  'học',
  'đằng',
  'sau',
  'một',
  'tiêu',
  'đề',
  'về',
  'khí',
  'hậu',
  '<eos>'],
 'en_ids': [2, 6429, 17576, 6, 295, 553, 11, 682, 5334, 3],
 'vi_ids': [2, 300, 66, 1070, 109, 9, 360, 117, 37, 398, 700, 3]}

In [18]:
def collate_fn(batch):
    source_batch = [torch.tensor(sample[SOURCE_LANG + "_ids"]) for sample in batch]
    target_batch = [torch.tensor(sample[TARGET_LANG + "_ids"]) for sample in batch]

    #source_batch = torch.tensor(source_batch)
    #target_batch = torch.tensor(target_batch)
    # Source batch will have size (batch_size, length of longest sequence)
    # Same for target batch
    source_batch = nn.utils.rnn.pad_sequence(source_batch, padding_value = PAD_IDX, batch_first = True)
    target_batch = nn.utils.rnn.pad_sequence(target_batch, padding_value = PAD_IDX, batch_first = True)

    return source_batch, target_batch

In [19]:
from torch.utils.data import DataLoader

BATCH_SIZE = 16
train_data_loader = torch.utils.data.DataLoader(
    dataset = train_data,
    batch_size = BATCH_SIZE,
    collate_fn = collate_fn,
    shuffle = True
)

test_data_loader = DataLoader(
    dataset = test_data,
    batch_size = BATCH_SIZE,
    collate_fn = collate_fn,
)

valid_data_loader = DataLoader(
    dataset = valid_data,
    batch_size = BATCH_SIZE,
    collate_fn = collate_fn,
)

In [42]:
class MultiHeadAttention(nn.Module):
    def __init__(self, attention_head, model_dimension):
        super(MultiHeadAttention, self).__init__()
        assert model_dimension % attention_head == 0, "dimension of model must be divisible by the attention head"

        self.attention_head = attention_head
        self.model_dimension = model_dimension
        self.d_k = self.model_dimension // self.attention_head

        # All of the below has shape
        self.W_q = nn.Linear(model_dimension, model_dimension, bias = False) # Query transformation
        self.W_k = nn.Linear(model_dimension, model_dimension, bias = False) # Key transformation
        self.W_v = nn.Linear(model_dimension, model_dimension, bias = False) # Value transformation
        self.W_o = nn.Linear(model_dimension, model_dimension) # Output transformation

    def scaled_dot_products(self, Q, K, V, mask = None):
        # Q has shape (batch_size, num_heads, n_q, d_k)
        # K has shape (batch_size, num_heads, n_k, d_k)
        # V has shape (batch_size, num_heads, n_v, d_k)
        # Where n_q, n_k, n_v are seq_len of either src / tgt sequence
        # Mask has shape (batch_size, 1, 1, src_seq_len)
        # Mask has shape (batch_size, 1, tgt_seq_len, tgt_seq_len)
        
        # After tranposing, K has shape (batch_sze, num_heads, d_k, n_k)
        attention_score = (Q @ K.transpose(-2, -1)) / math.sqrt(self.d_k)
        # Attention_score has shape (batch_size, num_heads, n_q, n_k)
        
        if mask is not None:
            mask = mask.unsqueeze(1)
            attention_score = attention_score.masked_fill(mask == 0, value = -1e9)

        # Attention_probability is computed via softmax function
        # Still have shape (batch_size, num_heads, n_q, n_k)
        attention_probability = torch.softmax(attention_score, dim = -1)


        # (batch_size, num_heads, n_q, n_k) * (batch_size, num_heads, n_v, d_k)
        # In multi-head self-attention there are 2 cases:
        # If it's not cross attention, then 
        # Q = X * W_q, K = X * W_k, V = X * W_v
        # If it is then 
        # Q = X * W_q, K = encoder_output * W_k, V = encoder_output * W_k
        # => output has shape (batch_size, num_heads, n_q, d_k) 
        output = (attention_probability @ V)
        return output

    def split_heads(self, X):
        '''
        Reshape input X to have attention_head for multi-head attention
        '''

        # Tensor X has shape (batch_size, seq_len, model_dimension)

        batch_size, seq_len, model_dimension = X.shape
        output = X.view(batch_size, seq_len, self.attention_head, self.d_k)
        output = output.transpose(1, 2)
        # Return Tensor has shape (batch_size, attention_head, seq_len, dimension of each head)
        return output

    def combine_heads(self, X):
        '''
        Reshape input tensor X to have the same dimension before being fed for
        Multi head attention    
        '''
        # Tensor X has shape (batch_size, attention_head, seq_len, dim_each_head)
        # X.tranpose(1, 2) has shape (batch_size, seq_len, attention_head, dim_each_head)
        
        batch_size, _, seq_len, d_k = X.shape
        output = X.transpose(1, 2).contiguous().view(batch_size, seq_len, self.model_dimension)
        return output

    def forward(self, Q, K, V, mask = None):

        # Seems like Q, K, V all has shape (batch_size, n, d)
        # Where n is input sequence length
        # Where d is embedding dimension / model_dimension
        # To simplify this, I will consider model_dimension
        # And embedding dimension the same
        # But if embedding dimension is different then
        # We can change input dimension of Wo, Wv, Wk, Wq

        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attention_output = self.scaled_dot_products(Q, K, V, mask)

        # attention_output has shape (batch_size, num_heads, n_q, d_k)

        output = self.W_o(self.combine_heads(attention_output))
        # combine heads we have (batch_size, n_q, model_dimension)

        return output

In [43]:
class PositionWiseFeedForwardNetwork(nn.Module):
    def __init__(self, model_dimension, feed_forward_dimension):

        super(PositionWiseFeedForwardNetwork, self).__init__()
        self.model_dimension = model_dimension
        self.feed_forward_dimension = feed_forward_dimension

        self.fc1 = nn.Linear(model_dimension, feed_forward_dimension)
        self.fc2 = nn.Linear(feed_forward_dimension, model_dimension)
        self.relu = nn.ReLU()

    def forward(self, X):
        # X has shape (batch_size, seq_len, model_dimension)
        # Return tensor has the same thing
        return self.fc2(self.relu(self.fc1(X)))

In [44]:
class PositionalEncoding(nn.Module):
    def __init__(self, model_dimension, max_seq_len, dropout):
        super(PositionalEncoding, self).__init__()
        self.model_dimension = model_dimension
        self.max_seq_len = max_seq_len
        self.dropout = nn.Dropout(dropout)
        positional_encoding = torch.zeros(max_seq_len, model_dimension)
        position = torch.arange(0, max_seq_len, dtype = torch.float).unsqueeze(1)

        # position has shape (max_seq_len, 1)
        # positional_encoding has shape (max_seq_len, model_dimension)

        # log(pos) - 2i/model * log(10000)
        div_term = torch.exp(torch.arange(0, model_dimension, 2).float() * -(math.log(10000.0) / model_dimension))

        positional_encoding[:, 0::2] = torch.sin(position * div_term)
        positional_encoding[:, 1::2] = torch.cos(position * div_term)
        positional_encoding = positional_encoding.unsqueeze(0)
        
        self.register_buffer('pe', positional_encoding)

    def forward(self, X):
        # X has shape (batch_size, seq_len, model_dimension)
        return self.dropout(X + (self.pe[:, :X.shape[1], :]))

In [45]:
class EncoderBlock(nn.Module):
    def __init__(self, model_dimension, attention_heads, feed_forward_dimension, dropout):
        super(EncoderBlock, self).__init__()

        self.attention = MultiHeadAttention(attention_heads, model_dimension)
        self.feed_forward_network = PositionWiseFeedForwardNetwork(model_dimension, feed_forward_dimension)
        self.norm1 = nn.LayerNorm(model_dimension)
        self.norm2 = nn.LayerNorm(model_dimension)

        self.dropout = nn.Dropout(dropout)

    def forward(self, X, mask):
        # X is a tensor of shape (batch_size, seq_len, model_dimmension)
        attention_output = self.attention(X, X, X, mask)
        X = self.norm1(X + self.dropout(attention_output))
        feed_forward_output = self.feed_forward_network(X)
        X = self.norm2(X + self.dropout(feed_forward_output))

        # Return tensor is the same size
        return X


In [46]:
class DecoderBlock(nn.Module):
    def __init__(self, model_dimension, attention_heads, feed_forward_dimension, dropout):
        super(DecoderBlock, self).__init__()

        self.attention = MultiHeadAttention(attention_heads, model_dimension)
        self.feed_forward_network = PositionWiseFeedForwardNetwork(model_dimension, feed_forward_dimension)
        self.norm1 = nn.LayerNorm(model_dimension)
        self.norm2 = nn.LayerNorm(model_dimension)
        self.norm3 = nn.LayerNorm(model_dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, X, encoder_output, source_mask, target_mask):

        attention_output = self.attention(X, X, X, target_mask)
        X = self.norm1(X + self.dropout(attention_output))
        # X after add & norm layer has the same shape (batch_size, tgt_len, model_dim)    
        # In this scenario, encoder output will be played as key and value
        # This is cross-attention
        # Encoder_output has shape (batch_size, src_len, model_dim)
        attention_output = self.attention(X, encoder_output, encoder_output, source_mask)
        
        # Attention_output has shape (batch_size, tgt_len, model_dim)
        X = self.norm2(X + self.dropout(attention_output))
            
        feed_forward_output = self.feed_forward_network(X)
        X = self.norm3(X + self.dropout(feed_forward_output))

        return X

In [47]:
class Transformer(nn.Module):
    def __init__(self, model_dimension, attention_heads, feed_forward_dimension,
                 source_vocab_size, target_vocab_size, num_layers, max_seq_len, dropout):
        super(Transformer, self).__init__()

        self.model_dimension = model_dimension
        self.attention_heads = attention_heads
        self.feed_forward_dimension = feed_forward_dimension

        self.positional_encoding = PositionalEncoding(model_dimension, max_seq_len, dropout)
        self.encoder_embedding = nn.Embedding(source_vocab_size, model_dimension)
        self.decoder_embedding = nn.Embedding(target_vocab_size, model_dimension)
        self.encoders = nn.ModuleList([EncoderBlock(model_dimension, attention_heads, feed_forward_dimension, dropout) for _ in range(num_layers)])
        self.decoders = nn.ModuleList([DecoderBlock(model_dimension, attention_heads, feed_forward_dimension, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(model_dimension, target_vocab_size)
    
    
    def generate_mask(self, source_sentence, target_sentence):
        # target sentence has shape (batch_size, max_tgt_len)
        # source sentence has shape (batch_size, max_src_len)
        # since batch_size can be broadcasted so we can skip it

        batch_size = source_sentence.shape[0]
        max_target_len = target_sentence.shape[1]
        #print((target_sentence != PAD_IDX).unsqueeze(1))
        # source_sentence has shape (batch_size, 1, max_src_len)
        # = 1 if it's not PAD_IDX, 0 otherwise
        source_mask = (source_sentence != PAD_IDX).unsqueeze(1).int().to(device)
        
        # target_mask has shape (batch_size, 1, max_tgt_len)
        target_mask = (target_sentence != PAD_IDX).unsqueeze(1).int().to(device)

        # no peak mask has shape (batch_size, max_tgt_len, max_tgt_len)
        # = 0 if i >= j, 1 otherwise
        no_peak_mask = 1 - torch.triu(torch.ones((1, max_target_len, max_target_len)), diagonal=1).type(torch.int).to(device)

        # every position (i,j) such that i >= j and not a PAD_IDX
        target_mask = target_mask & no_peak_mask
        # target_mask now has shape (batch_size, max_tgt_len, max_tgt_len)

        return source_mask, target_mask

    def forward(self, source_sentence, target_sentence):

        # source sentence have shape (batch_size, src_seq_len)
        # target sentence have shape (batch_size, tgt_seq_len)
        source_mask, target_mask = self.generate_mask(source_sentence, target_sentence)
        source_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(source_sentence)))
        target_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(target_sentence)))

        # Now source_embedded have shape (batch_size, src_seq_len, model_dimension)
        # Now target_embedded have shape (batch_size, tgt_seq_len, model_dimension)
        
        encoder_output = source_embedded
        for encoder_layer in self.encoders:
            encoder_output = encoder_layer(encoder_output, source_mask)
        # encoder_output has shape (batch_size, src_seq_len, model_dimension)
        
        decoder_output = target_embedded
        for decoder_layer in self.decoders:
            decoder_output = decoder_layer(decoder_output, encoder_output, source_mask, target_mask)

        # Output will have shape (batch_size, max_seq_len, target_vocab_size)
        output = self.fc(decoder_output)
        return output

In [48]:
model_dimension = 512
attention_heads = 8
feed_forward_dimension = 2048
source_vocab_size = len(vocab[SOURCE_LANG])
target_vocab_size = len(vocab[TARGET_LANG])
num_layers = 6
max_seq_len = 1000
dropout = 0.1



model = Transformer(
    model_dimension,
    attention_heads,
    feed_forward_dimension,
    source_vocab_size,
    target_vocab_size,
    num_layers,
    max_seq_len,
    dropout,
)
model= nn.DataParallel(model)
model.to(device)
#model_dimension, attention_heads, feed_forward_dimension,
#source_vocab_size, target_vocab_size, num_layers, max_seq_len, dropout

DataParallel(
  (module): Transformer(
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder_embedding): Embedding(47271, 512)
    (decoder_embedding): Embedding(21114, 512)
    (encoders): ModuleList(
      (0-5): 6 x EncoderBlock(
        (attention): MultiHeadAttention(
          (W_q): Linear(in_features=512, out_features=512, bias=False)
          (W_k): Linear(in_features=512, out_features=512, bias=False)
          (W_v): Linear(in_features=512, out_features=512, bias=False)
          (W_o): Linear(in_features=512, out_features=512, bias=True)
        )
        (feed_forward_network): PositionWiseFeedForwardNetwork(
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affi

In [51]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 83,660,922 trainable parameters


In [52]:
optimizer = optim.Adam(model.parameters(), lr = 1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [59]:
def train_fn(model, data_loader, optimizer, criterion, clip, device):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        
        source_sentence = batch[0].to(device)
        target_sentence = batch[1].to(device)
        # source_sentence has shape (batch_size, max_src_len)
        # target_sentence has shape (batch_size, max_tgt_len)
        batch_size = source_sentence.shape[0]
        target_len = target_sentence.shape[1]
        target_input = target_sentence[:, :-1]
        # Each word i put into transformer will predict word i + 1 
        # So no need for last word 
        target_output = target_sentence[:, 1:]
        # Same for above comment => dont need first word for output
        optimizer.zero_grad()

        # output has shape (batch_size, max_tgt_len - 1, tgt_vocab_size)
        output = model(source_sentence, target_input)
        output = output.reshape(batch_size * (target_len - 1), output.shape[-1])
        target_output = target_output.reshape(batch_size * (target_len - 1))
        loss = criterion(output, target_output)
        
        if i % (len(data_loader) // 10) == 0:
            print(i // (len(data_loader) // 10), "%", end = ' ') 
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
        
    print()
    return epoch_loss / len(data_loader)

In [60]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            source_sentence = batch[0].to(device)
            target_sentence = batch[1].to(device)

            # source_sentence has shape (batch_size, max_src_len)
            # target_sentence has shape (batch_size, max_tgt_len)
            
            batch_size = source_sentence.shape[0]
            target_len = target_sentence.shape[1]
            target_input = target_sentence[:, :-1]
            target_output = target_sentence[:, 1:]
            
            output = model(source_sentence, target_input)
            output = output.reshape(batch_size * (target_len - 1), output.shape[-1])
            target_output = target_output.reshape(batch_size * (target_len - 1))
            
            loss = criterion(output, target_output)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [61]:
import tqdm

n_epochs = 10
clip = 1
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_fn(model, train_data_loader, optimizer, criterion, clip, device)
    valid_loss = evaluate_fn(model, valid_data_loader, criterion, device)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), filepath + "model.pth")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  0%|          | 0/10 [00:00<?, ?it/s]

0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 10%|█         | 1/10 [23:29<3:31:29, 1409.97s/it]

	Train Loss:   4.038 | Train PPL:  56.698
	Valid Loss:   3.312 | Valid PPL:  27.435
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 20%|██        | 2/10 [46:57<3:07:46, 1408.30s/it]

	Train Loss:   3.015 | Train PPL:  20.396
	Valid Loss:   2.816 | Valid PPL:  16.716
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 30%|███       | 3/10 [1:10:34<2:44:45, 1412.23s/it]

	Train Loss:   2.571 | Train PPL:  13.080
	Valid Loss:   2.540 | Valid PPL:  12.684
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 40%|████      | 4/10 [1:33:54<2:20:45, 1407.57s/it]

	Train Loss:   2.294 | Train PPL:   9.917
	Valid Loss:   2.374 | Valid PPL:  10.743
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 50%|█████     | 5/10 [1:57:29<1:57:30, 1410.13s/it]

	Train Loss:   2.100 | Train PPL:   8.168
	Valid Loss:   2.295 | Valid PPL:   9.927
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 60%|██████    | 6/10 [2:21:02<1:34:05, 1411.28s/it]

	Train Loss:   1.954 | Train PPL:   7.055
	Valid Loss:   2.233 | Valid PPL:   9.332
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 70%|███████   | 7/10 [2:44:21<1:10:21, 1407.29s/it]

	Train Loss:   1.836 | Train PPL:   6.274
	Valid Loss:   2.216 | Valid PPL:   9.170
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 80%|████████  | 8/10 [3:07:57<46:59, 1409.94s/it]  

	Train Loss:   1.738 | Train PPL:   5.685
	Valid Loss:   2.186 | Valid PPL:   8.900
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


 90%|█████████ | 9/10 [3:31:15<23:26, 1406.29s/it]

	Train Loss:   1.653 | Train PPL:   5.222
	Valid Loss:   2.177 | Valid PPL:   8.821
0 % 1 % 2 % 3 % 4 % 5 % 6 % 7 % 8 % 9 % 10 % 


100%|██████████| 10/10 [3:54:50<00:00, 1409.06s/it]

	Train Loss:   1.577 | Train PPL:   4.840
	Valid Loss:   2.175 | Valid PPL:   8.806





In [62]:
filepath = '/kaggle/working/'
torch.save(model.state_dict(), filepath + "model.pt")

In [63]:
filepath = '/kaggle/working/'
model.load_state_dict(torch.load(filepath + "model.pt", map_location = device))
model

DataParallel(
  (module): Transformer(
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder_embedding): Embedding(47271, 512)
    (decoder_embedding): Embedding(21114, 512)
    (encoders): ModuleList(
      (0-5): 6 x EncoderBlock(
        (attention): MultiHeadAttention(
          (W_q): Linear(in_features=512, out_features=512, bias=False)
          (W_k): Linear(in_features=512, out_features=512, bias=False)
          (W_v): Linear(in_features=512, out_features=512, bias=False)
          (W_o): Linear(in_features=512, out_features=512, bias=True)
        )
        (feed_forward_network): PositionWiseFeedForwardNetwork(
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (relu): ReLU()
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affi

In [64]:
test_loss = evaluate_fn(model, test_data_loader, criterion, device)
print(f"\tTest Loss: {test_loss:7.3f} | Train PPL: {np.exp(test_loss):7.3f}")

	Test Loss:   2.175 | Train PPL:   8.806


In [65]:
def get_pad_mask(sentence, pad_idx):
    mask = (sentence != pad_idx).unsqueeze(1).int().to(device)
    return mask

def get_no_peak_mask(sentence):
    sentence_len = sentence.shape[1]
    no_peak_mask = 1 - torch.triu(torch.ones((1, sentence_len, sentence_len)), diagonal=1).type(torch.int).to(device)
    return no_peak_mask

In [127]:
import torch.nn.functional as F

def greedy_decode(model, sentence, max_len = 100):
    model.eval()
    
    input_tokens = token_transform['en'](sentence)
    input_ids = [SOS_IDX] + vocab['en'].lookup_indices(input_tokens) + [EOS_IDX]
    input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)
    input_mask = get_pad_mask(input_tensor, PAD_IDX)
    with torch.no_grad():
        input_embedding = model.module.encoder_embedding(input_tensor)
        input_embedding = model.module.positional_encoding(input_embedding)
        encoder_output = input_embedding
        for encoder_layer in model.module.encoders:
            encoder_output = encoder_layer(encoder_output, input_mask)
            
    output_ids = [SOS_IDX]
    for i in range(max_len):
        output_tensor = torch.tensor(output_ids).unsqueeze(0).to(device)
        output_mask = get_pad_mask(output_tensor, PAD_IDX) & get_no_peak_mask(output_tensor)
        with torch.no_grad():
            output_embedding = model.module.decoder_embedding(output_tensor)
            output_embedding = model.module.positional_encoding(output_embedding)
            decoder_output = output_embedding
            for decoder_layer in model.module.decoders:
                decoder_output = decoder_layer(decoder_output, encoder_output, input_mask, output_mask)
            output = model.module.fc(decoder_output)
            
        output = F.softmax(output, dim = -1) 
        output_id = output.argmax(dim = -1)[:, -1].item()
        output_ids.append(output_id)
        
        if len(output_ids) > max_len or output_id == EOS_IDX:
            break
    
    output_tokens = [vocab['vi'].get_itos()[idx] for idx in output_ids]
    return output_tokens

In [128]:
text = 'i am studying artificial intelligence'
print(' '.join(greedy_decode(model, text, 500)))

<sos> tôi đang nghiên cứu về trí thông minh nhân tạo <eos>


In [129]:
translations = [greedy_decode(model, example['translation']['en'].lower()) for example in test_data]

In [131]:
references = [example['vi_tokens'] for example in test_data]

In [132]:
print(references[0])
print(translations[0])

['<sos>', 'khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'nghĩ', 'rằng', 'bắctriều', 'tiên', 'là', 'đất', 'nước', 'tốt', 'nhất', 'trên', 'thế', 'giới', 'và', 'tôi', 'thường', 'hát', 'bài', '&quot', 'chúng', 'ta', 'chẳng', 'có', 'gì', 'phải', 'ghen', 'tị', '.', '&quot', '<eos>']
['<sos>', 'khi', 'tôi', 'còn', 'nhỏ', ',', 'tôi', 'nghĩ', 'đất', 'nước', 'tôi', 'là', 'người', 'tốt', 'nhất', 'trên', 'hành', 'tinh', 'này', ',', 'và', 'tôi', 'lớn', 'lên', 'hát', 'một', 'bài', 'hát', 'có', 'tên', '&quot', 'không', 'gì', 'để', 'ghen', 'tị', '.', '&quot', '<eos>']


In [133]:
predictions = [example[1:-1] for example in translations]
references = [example[1: -1] for example in references]

In [135]:
references = [[example] for example in references]
print(predictions[900])
print(references[900])

['nhưng', 'sư', 'tử', 'rất', 'thông', 'minh', '.']
[['nhưng', 'sư', 'tử', 'rất', 'thông', 'minh', '.']]


In [136]:
from torchtext.data.metrics import bleu_score

score = bleu_score(predictions, references)
print(score)

0.2640448933379371
