### **Libraries**

In [1]:
import codecs
import csv
import re
from tqdm import tqdm
import sys
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
import torch.nn.functional as F
from typing import List
import numpy as np
from torch.optim import Adam
import matplotlib.pyplot as plt
import torch.optim.lr_scheduler as LRSchedule



### **Load Data**

In [2]:
# Read the whole corpus into memory as a list of raw lines, then report how many there are.
with open("train_tieng_viet.txt", mode='r', encoding='utf-8') as corpus_file:
    train_outputs = corpus_file.readlines()
print(len(train_outputs))

3624432


In [3]:
# Accented Vietnamese letters, ordered so that position k maps to position k of the
# matching plain-letter string below.
_TONE_SRC_LOWER = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
_TONE_SRC_UPPER = "ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ"
_TONE_DST_LOWER = "a"*17 + "o"*17 + "e"*11 + "u"*11 + "i"*5 + "y"*5 + "d"
_TONE_DST_UPPER = "A"*17 + "O"*17 + "E"*11 + "U"*11 + "I"*5 + "Y"*5 + "D"

# Built once when the cell runs instead of on every call: the function is applied to
# hundreds of thousands of lines, and the mapping never changes.
_TONE_TABLE = str.maketrans(dict(zip(_TONE_SRC_LOWER + _TONE_SRC_UPPER,
                                     _TONE_DST_LOWER + _TONE_DST_UPPER)))


def remove_tone_line(utf8_str):
    """Replace every accented Vietnamese letter in ``utf8_str`` with its plain ASCII letter.

    Args:
        utf8_str: Text to strip. Characters outside the mapping (digits, spaces,
            punctuation, unaccented letters) are left unchanged.

    Returns:
        A new string of the same length with accents and tone marks removed.
    """
    return utf8_str.translate(_TONE_TABLE)


print(remove_tone_line("Đi một ngày đàng học 1 sàng không"))


Di mot ngay dang hoc 1 sang khong


In [4]:
# One group of three parallel lists per split: sentence id, accent-stripped input, accented target.
train_idx_500k = []
train_ipt_500k = []
train_opt_500k = []

val_idx_50k = []
val_ipt_50k = []
val_opt_50k = []

test_idx_50k = []
test_ipt_50k = []
test_opt_50k = []

In [5]:
# Route the first 600k corpus lines into train (first 500k), validation (next 50k)
# and test (last 50k) lists.
for i in tqdm(range(600000)):
    try:
        # Each line is expected to be "<id>\t<sentence>"; a line with a different
        # number of tab-separated fields raises ValueError on unpacking.
        idx, origin_seq = train_outputs[i].split('\t')
        non_acc_seq = remove_tone_line(origin_seq)
    except Exception as err:
        print(f"Error at {i}: {err}")
        # Skip the bad line. The original `next` was a no-op expression, so execution
        # fell through and appended a stale (or undefined) non_acc_seq.
        continue
    if i < 500000:
        train_idx_500k.append(idx)
        train_opt_500k.append(origin_seq)
        train_ipt_500k.append(non_acc_seq)
    elif i < 550000:
        val_idx_50k.append(idx)
        val_opt_50k.append(origin_seq)
        val_ipt_50k.append(non_acc_seq)
    else:
        test_idx_50k.append(idx)
        test_opt_50k.append(origin_seq)
        test_ipt_50k.append(non_acc_seq)

  0%|          | 0/600000 [00:00<?, ?it/s]

100%|██████████| 600000/600000 [00:24<00:00, 24837.84it/s]


### **Build Dataset**

In [6]:
class CustomDataset(Dataset):
    """Paired dataset of token-id sequences for (input sentence, output sentence).

    Every sentence is split on whitespace, each token is mapped through the matching
    vocabulary's ``encode``, and a start id (``len(vocab)``) and an end id
    (``len(vocab) + 1``) are added around the sequence.

    Args:
        input: Source sentences, one string per example.
        output: Target sentences, aligned index-by-index with ``input``.
        vocab_input: Object providing ``encode(token)`` and ``__len__`` for the source side.
        vocab_output: Same, for the target side.
    """

    def __init__(self, input: List[str], output: List[str], vocab_input, vocab_output) -> None:
        self.vocab_input = vocab_input
        self.vocab_output = vocab_output
        self.input = [[vocab_input.encode(token) for token in text.split()] for text in input]
        self.output = [[vocab_output.encode(token) for token in text.split()] for text in output]
        self._add_start_end()

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.input)

    def _add_start_end(self):
        """Wrap every sequence with the start/end ids placed just past the vocabulary range."""
        self.input = [[len(self.vocab_input)] + text + [len(self.vocab_input) + 1] for text in self.input]
        self.output = [[len(self.vocab_output)] + text + [len(self.vocab_output) + 1] for text in self.output]

    def __getitem__(self, idx):
        """Return ``(input_ids, output_ids)`` for example ``idx`` as 1-D int64 tensors."""
        # Token ids are indices (embedding lookups, cross-entropy class targets), so they
        # are emitted as int64 rather than float32.
        input = torch.tensor(self.input[idx], dtype=torch.long)
        # The target comes from self.output; the original returned self.input here,
        # so the model was trained to reproduce its own input.
        output = torch.tensor(self.output[idx], dtype=torch.long)
        return input, output

In [7]:
class BuildVocab:
    """Whitespace-token vocabulary built from an iterable of sentences.

    Index 0 is reserved for ``<pad>`` and index 1 for ``<unk>``. The rest of the notebook
    pads batches with 0 and masks positions equal to 0 in both attention and the loss;
    with ``<unk>`` as the only special token it also received index 0, so every
    out-of-vocabulary word was treated as padding.
    """

    def __init__(self, data):
        # Specials are inserted first, in the order given, so "<pad>" -> 0 and "<unk>" -> 1.
        self.vocab = build_vocab_from_iterator(self.input_iterator(data), specials=["<pad>", "<unk>"])
        # Unknown tokens resolve to "<unk>" instead of raising.
        self.vocab.set_default_index(self.vocab["<unk>"])

    def __len__(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    def input_iterator(self, data):
        """Yield the whitespace-split token list of every sentence in ``data``."""
        for text in data:
            yield text.split()

    def encode(self, token):
        """Return the integer id of ``token`` (the ``<unk>`` id when it is not in the vocabulary)."""
        return self.vocab[token]

In [8]:
# Two vocabularies from the training split only: one over the accent-stripped
# sentences (model input) and one over the original accented sentences (model output).
vocab_input = BuildVocab(train_ipt_500k)
vocab_output = BuildVocab(train_opt_500k)

In [9]:
# The validation sentences are encoded with the vocabularies built from the training split.
train_dataset = CustomDataset(train_ipt_500k, train_opt_500k, vocab_input, vocab_output)
val_dataset = CustomDataset(val_ipt_50k, val_opt_50k, vocab_input, vocab_output)

In [10]:
def custom_collate(batch):
    """Collate ``(input, output)`` pairs of 1-D tensors into two right-padded 2-D tensors.

    Args:
        batch: Non-empty sequence of ``(input_tensor, output_tensor)`` pairs whose
            tensors may differ in length.

    Returns:
        ``(input_padded, output_padded)``: each has shape ``(len(batch), longest)``,
        where ``longest`` is the longest sequence on that side of the batch. Shorter
        sequences are filled on the right with 0, and the dtype of the samples is kept.
    """
    from torch.nn.utils.rnn import pad_sequence

    input, output = zip(*batch)
    # pad_sequence replaces the two hand-written padding loops (and their unreachable
    # truncation branch), and pads in the samples' own dtype.
    input_padded = pad_sequence(list(input), batch_first=True, padding_value=0)
    output_padded = pad_sequence(list(output), batch_first=True, padding_value=0)
    return input_padded, output_padded

In [11]:
# Batches of 32; only the training loader shuffles. custom_collate pads each batch.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=custom_collate)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate)

### **Position Encoding**

$PE(pos, 2i) = \sin\left(\frac{pos}{1000^{2i/d_{model}}}\right)$\
\
$PE(pos, 2i+1) = \cos\left(\frac{pos}{1000^{2i/d_{model}}}\right)$

In [12]:
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles ``pos / 1000 ** (2 * (i // 2) / d_model)``.

    ``i // 2`` gives each even/odd pair of dimension indices the same rate.
    NOTE(review): the base is 1000, matching this notebook's markdown formula; the
    original Transformer paper uses 10000 - confirm which is intended.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    rate = 1 / np.power(1000, exponent)
    return pos * rate

In [13]:
def position_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Width of each encoding vector.

    Returns:
        float32 tensor of shape ``(1, position, d_model)``: sine of the angle on
        even-indexed columns, cosine on odd-indexed columns.
    """
    rows = np.arange(position)[:, np.newaxis]
    cols = np.arange(d_model)[np.newaxis, :]
    angles = get_angles(rows, cols, d_model)
    # Pick sin for even columns and cos for odd ones in a single pass.
    table = np.where(cols % 2 == 0, np.sin(angles), np.cos(angles))
    return torch.tensor(table[np.newaxis, ...], dtype=torch.float32)

### **Masking**

In [14]:
def create_padding_mask(seq):
    """Return a float mask that is 1.0 where ``seq`` equals 0 and 0.0 elsewhere.

    Two singleton axes are inserted after the first axis, so a ``(batch, seq_len)``
    input gives a ``(batch, 1, 1, seq_len)`` mask.
    """
    is_pad = seq.eq(0).float()
    return is_pad[:, None, None]

In [15]:
def create_look_ahead_mask(seq_len):
    """Return a boolean ``(1, 1, seq_len, seq_len)`` mask that is True strictly above the diagonal.

    Entry ``[.., i, j]`` is True when ``j > i``, i.e. for positions after position ``i``.
    """
    future = torch.ones(seq_len, seq_len).triu(diagonal=1).bool()
    return future[None, None]

### **Scaled Dot Product Attention**

$Attention(Q, K, V) = softmax_k\left(\frac{QK^T}{\sqrt{d_k}}\right)V$

In [16]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        q: Queries, shape ``(..., seq_len_q, depth)``.
        k: Keys, shape ``(..., seq_len_k, depth)``.
        v: Values, shape ``(..., seq_len_k, depth_v)``.
        mask: Optional tensor broadcastable to ``(..., seq_len_q, seq_len_k)``; positions
            where it is 1 get -1e9 added to their logit and so receive ~0 weight.
            Pass ``None`` for no masking.

    Returns:
        ``(output, attention_weight)`` with shapes ``(..., seq_len_q, depth_v)`` and
        ``(..., seq_len_q, seq_len_k)``.
    """
    matmul_qk = torch.matmul(q, k.transpose(-2, -1))

    dk = k.shape[-1]
    scaled_attention_logits = matmul_qk / (dk ** 0.5)

    if mask is not None:
        # Out-of-place add, so the tensor holding the scaled logits is not modified.
        scaled_attention_logits = scaled_attention_logits + (mask * -1e9)

    # Normalise over the key axis. Without an explicit dim, softmax picks dim=1 for
    # 4-D input, which is the heads axis of (batch, heads, seq_q, seq_k).
    attention_weight = F.softmax(scaled_attention_logits, dim=-1)
    output = torch.matmul(attention_weight, v)
    return output, attention_weight

In [17]:
# Demo: adding -1e9 to the masked logits (mask == 1) drives their softmax weight to zero.
mask = torch.tensor([[0, 1, 1],
                    [0, 0, 1],
                    [0, 0, 0]], dtype = torch.float32)

scaled_attention_logit = torch.tensor([[1, 3, 10],
                                        [1, 2, 5],
                                        [1, 1, 5]], dtype = torch.float32)

scaled_attention_logit += (mask * -1e9)
# Explicit dim: normalise each row; this also removes the implicit-dim warning.
attention_weights = F.softmax(scaled_attention_logit, dim=-1)
print('scaled_attention_logit: ', scaled_attention_logit)
print('attention_weights: ', attention_weights)


scaled_attention_logit:  tensor([[ 1.0000e+00, -1.0000e+09, -1.0000e+09],
        [ 1.0000e+00,  2.0000e+00, -1.0000e+09],
        [ 1.0000e+00,  1.0000e+00,  5.0000e+00]])
attention_weights:  tensor([[1.0000, 0.0000, 0.0000],
        [0.2689, 0.7311, 0.0000],
        [0.0177, 0.0177, 0.9647]])


  attention_weights = F.softmax(scaled_attention_logit)


In [18]:
torch.set_printoptions(precision=5, sci_mode=False)


def print_out(q, k, v):
    """Run unmasked attention on ``(q, k, v)`` and print the weights, then the output."""
    attn_output, attn_weights = scaled_dot_product_attention(q, k, v, None)
    print ('Attention weights are:')
    print (attn_weights)
    print ('Output is:')
    print (attn_output)


# Four keys of width 3; the last two are identical.
temp_k = torch.tensor(
    [[10, 0, 0],
     [0, 10, 0],
     [0, 0, 10],
     [0, 0, 10]],
    dtype=torch.float32,
)  # (4, 3)

# One value row of width 2 per key.
temp_v = torch.tensor(
    [[1, 0],
     [10, 0],
     [100, 5],
     [1000, 6]],
    dtype=torch.float32,
)  # (4, 2)

# The query lines up with the second key, so the second value row comes back.
temp_q = torch.tensor([[0, 10, 0]], dtype=torch.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)

Attention weights are:
tensor([[    0.00000,     1.00000,     0.00000,     0.00000]])
Output is:
tensor([[   10.00000,     0.00000]])


  attention_weight = F.softmax(scaled_attention_logits)


### **Multihead Attention**

In [19]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project q/k/v, attend per head, concatenate, project again.

    Args:
        d_model: Width of the input and output vectors.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            # Otherwise num_heads * depth != d_model, and the -1 in split_heads' reshape
            # either fails or silently yields a wrong sequence length.
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model

        self.depth = d_model // self.num_heads

        self.wq = nn.Linear(in_features=d_model, out_features=d_model)
        self.wk = nn.Linear(in_features=d_model, out_features=d_model)
        self.wv = nn.Linear(in_features=d_model, out_features=d_model)

        self.linear = nn.Linear(in_features=d_model, out_features=d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``(batch, seq_len, d_model)`` to ``(batch, num_heads, seq_len, depth)``."""
        x = x.reshape(batch_size, -1, self.num_heads, self.depth)
        return x.transpose(1, 2)

    def forward(self, v, k, q, mask):
        """Attend from ``q`` over ``k``/``v``.

        Note the argument order: values first, then keys, then queries.

        Returns:
            ``(output, attention_weights)``: output is ``(batch, seq_len_q, d_model)``,
            weights are ``(batch, num_heads, seq_len_q, seq_len_k)``.
        """
        batch_size = q.shape[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        # Back to (batch_size, seq_len, num_heads, depth) so the heads can be merged.
        scaled_attention = scaled_attention.permute(0, 2, 1, 3)
        concat_attention = scaled_attention.reshape(batch_size, -1, self.d_model)
        output = self.linear(concat_attention)
        return output, attention_weights

In [20]:
# Smoke test: self-attention (v, k and q are the same tensor) with no mask.
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = torch.rand((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape

  attention_weight = F.softmax(scaled_attention_logits)


(torch.Size([1, 60, 512]), torch.Size([1, 8, 60, 60]))

### **Feed Forward Network**

In [21]:
def ffn(d_model, dff):
    """Return the feed-forward block: Linear(d_model -> dff), ReLU, Linear(dff -> d_model)."""
    expand = nn.Linear(d_model, dff)
    activation = nn.ReLU()
    project = nn.Linear(dff, d_model)
    return nn.Sequential(expand, activation, project)

### **Encoder layer**
1. Multi-head Attention (với padding mask)
2. Point wise feed forward network.

In [22]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added to the sub-layer input and is
    then layer-normalised.
    """

    def __init__(self, d_model, num_heads, dff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = ffn(d_model, dff)

        self.layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(d_model, eps=1e-6)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply the block to ``x`` using ``mask`` for self-attention; output has the shape of ``x``."""
        # Sub-layer 1: self-attention (the weights are discarded).
        attended = self.mha(x, x, x, mask)[0]
        normed_attn = self.layer_norm1(x + self.dropout1(attended))

        # Sub-layer 2: position-wise feed-forward.
        transformed = self.ffn(normed_attn)
        return self.layer_norm2(normed_attn + self.dropout2(transformed))

In [23]:
# Smoke test: one encoder layer (d_model=512, 8 heads, dff=2048) on a random
# (64, 43, 512) batch with no mask.
sample_encoder_layer = EncoderLayer(512, 8, 2048)

sample_encoder_layer_output = sample_encoder_layer(
   torch.rand((64, 43, 512)), None)

  attention_weight = F.softmax(scaled_attention_logits)


In [24]:
# Show the output shape of the encoder-layer smoke test.
sample_encoder_layer_output.shape

torch.Size([64, 43, 512])

### **Encoder**
1. Input Embedding
2. Positional Encoding
3. N encoder layers

In [25]:
class Encoder(nn.Module):
    """Token embedding + positional encoding followed by ``num_layers`` encoder layers.

    Args:
        num_layers: Number of stacked ``EncoderLayer`` blocks.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        dff: Inner width of each feed-forward block.
        input_vocab_size: Number of rows in the embedding table.
        maxinum_pe: Number of positions in the precomputed positional-encoding table.
        dropout: Dropout probability.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maxinum_pe, dropout=0.1):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_vocab_size, d_model)
        # Plain tensor attribute (not a parameter); moved to the right device in forward.
        self.pe = position_encoding(maxinum_pe, self.d_model)

        layers = [EncoderLayer(d_model, num_heads, dff, dropout) for _ in range(num_layers)]
        self.enc_layers = nn.ModuleList(layers)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode token ids ``x`` of shape (batch, seq_len) into (batch, seq_len, d_model)."""
        seq_len = x.shape[1]

        hidden = self.embedding(x.type(torch.int))
        # Scale the embeddings by sqrt(d_model) before adding the positional encoding.
        hidden *= torch.sqrt(torch.tensor(self.d_model))
        hidden += self.pe[:, :seq_len, :].to(hidden.device)

        hidden = self.dropout(hidden)

        for layer in self.enc_layers:
            hidden = layer(hidden, mask)
        return hidden

In [26]:
# Smoke test: a 2-layer encoder on random token ids.
sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8, 
                         dff=2048, input_vocab_size=8500,
                         maxinum_pe=10000)

# Random integer tensor of shape 64 x 62 with ids in [0, 2000).
temp_input = torch.randint(0, 2000, (64, 62))

sample_encoder_output = sample_encoder(temp_input, mask=None)

print (sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)


torch.Size([64, 62, 512])


  attention_weight = F.softmax(scaled_attention_logits)


### **Decoder Layer**
1. Masked multi-head attention (với look ahead mask và padding mask).
2. Multi-head attention (với padding mask). Ma trận V, K cùng lấy output từ Encoder và ma trận  Q nhận output từ masked multi-head attention.
3. Point wise feed forward network

In [27]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention, feed-forward.

    Every sub-layer output goes through dropout, is added to the sub-layer input
    (residual connection) and is then layer-normalised, the same pattern
    ``EncoderLayer`` uses.
    """

    def __init__(self, d_model, num_heads, dff, dropout = 0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = ffn(d_model, dff)

        self.layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm3 = nn.LayerNorm(d_model, eps=1e-6)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, encoding_output, look_ahead_mask, padding_mask):
        """Run the block.

        Args:
            x: Decoder hidden states, (batch, target_len, d_model).
            encoding_output: Encoder output, (batch, input_len, d_model).
            look_ahead_mask: Mask for the self-attention over ``x``.
            padding_mask: Mask for the attention over ``encoding_output``.

        Returns:
            ``(out3, attn_weight_block1, attn_weight_block2)``: the block output plus
            the attention weights of the two attention sub-layers.
        """
        attn1, attn_weight_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1)
        # Residual connection: the original normalised attn1 alone, dropping x.
        out1 = self.layer_norm1(x + attn1)

        # Values and keys come from the encoder; queries from the previous sub-layer.
        attn2, attn_weight_block2 = self.mha2(encoding_output, encoding_output, out1, padding_mask)
        attn2 = self.dropout2(attn2)
        # Residual connection: the original normalised attn2 alone, dropping out1.
        out2 = self.layer_norm2(out1 + attn2)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output)
        out3 = self.layer_norm3(out2 + ffn_output)

        return out3, attn_weight_block1, attn_weight_block2
    

### **Decoder**
1. Output Embedding
2. Positional Embedding
3. N decoder layers

In [28]:
class Decoder(nn.Module):
    """Target embedding + positional encoding followed by ``num_layers`` decoder layers.

    Args:
        num_layers: Number of stacked ``DecoderLayer`` blocks.
        d_model: Embedding / hidden width.
        num_heads: Attention heads per layer.
        dff: Inner width of each feed-forward block.
        target_vocab_size: Number of rows in the embedding table.
        maximum_pe: Number of positions in the precomputed positional-encoding table.
        dropout: Dropout probability.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_pe, dropout=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = nn.Embedding(target_vocab_size, d_model)
        # Plain tensor attribute (not a parameter); moved to the right device in forward.
        self.pe = position_encoding(maximum_pe, d_model)

        layers = [DecoderLayer(d_model, num_heads, dff, dropout) for _ in range(self.num_layers)]
        self.dec_layers = nn.ModuleList(layers)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, look_ahead_mask, padding_mask):
        """Decode token ids ``x`` against ``enc_output``.

        Returns:
            ``(hidden, attention_weights)``: hidden is (batch, target_len, d_model);
            attention_weights maps ``'decoder_layer{i}_block1'`` / ``'..._block2'`` to
            the weights of layer ``i``'s two attention sub-layers.
        """
        seq_len = x.shape[1]
        attention_weights = {}

        hidden = self.embedding(x.type(torch.int))
        # Scale the embeddings by sqrt(d_model) before adding the positional encoding.
        hidden *= torch.sqrt(torch.tensor(self.d_model))
        hidden += self.pe[:, :seq_len, :].to(hidden.device)

        hidden = self.dropout(hidden)

        for i, layer in enumerate(self.dec_layers):
            hidden, block1, block2 = layer(hidden, enc_output, look_ahead_mask, padding_mask)
            attention_weights[f'decoder_layer{i}_block1'] = block1
            attention_weights[f'decoder_layer{i}_block2'] = block2

        return hidden, attention_weights
        

In [29]:
# Smoke test: a 2-layer decoder on random target ids, attending over the encoder
# smoke-test output, with no masks.
sample_decoder = Decoder(num_layers=2, d_model=512, num_heads=8, 
                         dff=2048, target_vocab_size=8000,
                         maximum_pe=5000)
temp_input = torch.randint(0, 200, (64, 26))

output, attn = sample_decoder(temp_input, 
                              enc_output=sample_encoder_output, 
                              look_ahead_mask=None, 
                              padding_mask=None)

  attention_weight = F.softmax(scaled_attention_logits)


### **Transformer**

In [30]:
class Transformer(nn.Module):
    """Encoder-decoder model with a final linear projection to target-vocabulary logits.

    ``pe_input`` / ``pe_target`` are forwarded to the encoder / decoder as the size of
    their positional-encoding tables.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, dropout=0.1):
        super(Transformer, self).__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, dropout)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, dropout)
        self.final_layer = nn.Linear(d_model, target_vocab_size)

    def forward(self, input, target, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """Return ``(logits, attention_weights)``; logits are (batch, target_len, target_vocab_size)."""
        memory = self.encoder(input, enc_padding_mask)
        decoded, attention_weights = self.decoder(target, memory, look_ahead_mask, dec_padding_mask)
        return self.final_layer(decoded), attention_weights

In [31]:
# Smoke test: full model on random source/target ids of different lengths, no masks.
sample_transformer = Transformer(
    num_layers=2, d_model=512, num_heads=8, dff=2048, 
    input_vocab_size=8500, target_vocab_size=8000, 
    pe_input=10000, pe_target=6000)

temp_input = torch.randint(0, 200, (64, 38))
temp_target = torch.randint(0, 200, (64, 36))

fn_out, _ = sample_transformer(temp_input, temp_target, 
                               enc_padding_mask=None, 
                               look_ahead_mask=None,
                               dec_padding_mask=None)


  attention_weight = F.softmax(scaled_attention_logits)


### **Hyperparameters**

In [32]:
# Model size used for the real training run (smaller than the smoke tests above).
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

# +2 makes room for the start/end ids that CustomDataset places at len(vocab) and len(vocab) + 1.
input_vocab_size = len(vocab_input) + 2
target_vocab_size = len(vocab_output)+ 2
dropout_rate = 0.1

In [33]:
# NOTE(review): pe_input / pe_target set the number of positions in the positional-encoding
# tables (they reach position_encoding via Encoder/Decoder). Vocabulary sizes are passed
# here; presumably the maximum sequence length was intended - confirm.
transformer = Transformer(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff,
                          input_vocab_size=input_vocab_size, target_vocab_size=target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size,
                          dropout=dropout_rate)

### **Optimizer**

In [34]:
class CustomSchedule(torch.optim.lr_scheduler.LRScheduler):
    """Warm-up / inverse-square-root learning-rate schedule.

    For every parameter group:
        lr = base_lr * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

    The rate grows linearly for ``warmup_steps`` steps and then decays as
    ``step**-0.5``. Use a base lr of 1.0 in the optimizer so the formula is the
    effective learning rate.

    Args:
        optimizer: Wrapped optimizer.
        d_model: Model width used in the scale factor.
        warmup_steps: Number of warm-up steps.
        last_epoch: Index of the last step taken (-1 to start fresh).
    """

    def __init__(self, optimizer, d_model, warmup_steps=4000, last_epoch=-1):
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the learning rate of every parameter group as plain Python floats."""
        # last_epoch counts scheduler steps from 0; clamp so step is never 0.
        step = max(1, self.last_epoch + 1)
        arg1 = step ** (-0.5)
        arg2 = step * (self.warmup_steps ** (-1.5))
        # d_model ** -0.5, not sqrt(d_model); plain float math keeps lr from becoming
        # a 0-dim tensor.
        scale = (self.d_model ** (-0.5)) * min(arg1, arg2)
        return [base_lr * scale for base_lr in self.base_lrs]


# Base lr of 1.0 so the schedule alone determines the learning rate.
optimizer = Adam(transformer.parameters(), lr=1.0)
# Use the width of the model actually being trained rather than a hard-coded 512.
schedule = CustomSchedule(optimizer, d_model=d_model)

  from .autonotebook import tqdm as notebook_tqdm


### **Loss Function**

In [47]:
def loss_function(y_hat, y):
    """Cross-entropy averaged over the non-padding target positions.

    Args:
        y_hat: Logits of shape ``(batch, seq_len, num_classes)``.
        y: Target class ids of shape ``(batch, seq_len)``; id 0 marks padding and is
            excluded from the average. Floating-point ids are cast to int64.

    Returns:
        0-dim tensor: the sum of per-token losses over non-padding positions divided by
        their count (0 when every position is padding).
    """
    y = y.long()
    mask = (y != 0).float()
    # reduction='none' keeps one loss per token so padding can be masked out; with the
    # default 'mean' the result is a scalar, and multiplying it in place by the mask fails.
    loss = F.cross_entropy(y_hat.transpose(1, 2), y, reduction='none')
    loss = loss * mask
    # Guard against a batch that contains only padding.
    return loss.sum() / mask.sum().clamp(min=1.0)

### **Create Mask**

In [36]:
def create_masks(input, target, device=None):
    """Build the three attention masks for one batch.

    Args:
        input: Encoder token ids, (batch, input_len); 0 marks padding.
        target: Decoder token ids, (batch, target_len); 0 marks padding.
        device: Device for the returned masks. Defaults to ``target.device``.

    Returns:
        ``(enc_padding_mask, combined_mask, dec_padding_mask)``:
        - enc_padding_mask: padding mask of ``input`` for encoder self-attention.
        - combined_mask: element-wise maximum of the look-ahead mask and the padding
          mask of ``target``, for decoder self-attention.
        - dec_padding_mask: padding mask of ``input`` for the decoder's attention over
          the encoder output, whose keys are encoder positions.
    """
    if device is None:
        device = target.device

    enc_padding_mask = create_padding_mask(input).to(device)
    # Built from `input`: in encoder-decoder attention the keys are encoder positions.
    # The original built it from `target`, which only lines up when both lengths match.
    dec_padding_mask = create_padding_mask(input).to(device)

    look_ahead_mask = create_look_ahead_mask(target.size(1)).type(torch.float).to(device)
    dec_target_padding_mask = create_padding_mask(target).to(device)
    combined_mask = torch.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask

### **Training**

In [37]:
def train(model, optimizer, loss_fn, train_loader, val_loader, scheduler, epochs, device):
    """Train ``model`` with teacher forcing and report the mean loss per epoch.

    Args:
        model: Called as ``model(input, target, enc_mask, look_ahead_mask, dec_mask)``
            and returning ``(logits, attention_weights)``.
        optimizer: Optimizer over the model parameters.
        loss_fn: ``loss_fn(logits, target_ids) -> scalar tensor``.
        train_loader: Iterable of ``(input, target)`` batches for training.
        val_loader: Iterable of ``(input, target)`` batches for validation.
        scheduler: Learning-rate scheduler, stepped once per training batch.
        epochs: Number of passes over ``train_loader``.
        device: Device the model and batches are moved to.

    Returns:
        ``(train_losses, val_losses)``: lists with the mean batch loss of every epoch.
    """
    model.to(device)
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        # Re-enable dropout at the start of every epoch (validation switches it off).
        model.train()
        train_loss = 0
        for input, target in train_loader:
            input = input.to(device)
            target = target.to(device)
            # Teacher forcing: the decoder reads tokens [0, n-1) and is scored on tokens
            # [1, n). Feeding and scoring the same tensor lets it copy its own input.
            tar_inp = target[:, :-1]
            tar_real = target[:, 1:]
            enc_padding_mask, look_ahead_mask, _ = create_masks(input, tar_inp, device)

            optimizer.zero_grad()
            # The encoder padding mask is also the mask for attention over the encoder output.
            output, _ = model(input, tar_inp, enc_padding_mask, look_ahead_mask, enc_padding_mask)
            loss = loss_fn(output, tar_real)
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss += loss.item()
        train_losses.append(train_loss/len(train_loader))

        # Validation: no dropout, no gradient tracking.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for input, target in val_loader:
                input = input.to(device)
                target = target.to(device)
                tar_inp = target[:, :-1]
                tar_real = target[:, 1:]
                enc_padding_mask, look_ahead_mask, _ = create_masks(input, tar_inp, device)
                output, _ = model(input, tar_inp, enc_padding_mask, look_ahead_mask, enc_padding_mask)
                loss = loss_fn(output, tar_real)
                val_loss += loss.item()
        val_losses.append(val_loss/len(val_loader))
        print(f'Epoch {epoch + 1}: Training Loss: {train_losses[-1]} - Validation Loss: {val_losses[-1]}')
    return train_losses, val_losses

In [None]:
# Train for 10 epochs on the CPU.
device = 'cpu'
train(transformer, optimizer, loss_function, train_loader, val_loader, schedule, 10, device)