In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import os
import math
from torch.utils.data import DataLoader, Dataset
import json
import re
from sklearn.model_selection import train_test_split

In [2]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model=512):
        super(Embedder, self).__init__()
        self.vocab_size = vocab_size
        self.embed_dim = d_model

        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        out = self.embed(x)
        return out

In [3]:
class PositionalEncoder(nn.Module):
    def __init__(self, seq_len, d_model):
        super(PositionalEncoder, self).__init__()
        self.d_model = d_model

        pe = torch.zeros(seq_len, self.d_model)
        for pos in range(seq_len):
            for i in range(0, self.d_model, 2):
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/self.d_model)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/self.d_model)))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)


    def forward(self, x):
        x = x * math.sqrt(self.d_model)
        seq_len = x.size(1)
        x = x + torch.autograd.Variable(self.pe[:, :seq_len], requires_grad=False)
        return x


# test
# a = torch.rand(size=(32, 10, 512))
# net = PositionalEncoder(seq_len=10, d_model=512)
# b = net(a)
# b.requires_grad
# b.shape

In [4]:
class MultiheadAttention(nn.Module):
  def __init__(self, d_model=512, n_head=8):
    super(MultiheadAttention, self).__init__()
    self.d_model = d_model
    self.n_head = n_head
    self.d_k = int(d_model/n_head)

    self.q_matrix = nn.Linear(d_model, d_model)
    self.k_matrix = nn.Linear(d_model, d_model)
    self.v_matrix = nn.Linear(d_model, d_model)
    self.o_matrix = nn.Linear(d_model, d_model)

  def split_head(self, x):
    # x: (32, 10, 512)
    batch_size = x.shape[0]
    return x.view(batch_size, -1, self.n_head, self.d_k).permute(0, 2, 1, 3) #(32, 10, 512) => (32, 10, 8, 64) => (32, 8, 10, 64)

  def forward(self, q, k, v, mask = None):
    """
    q, k, v: (batch_size, seq_len, d_model)
    mask
    """

    batch_size = q.shape[0]

    q = self.q_matrix(q)
    k = self.k_matrix(k)
    v = self.v_matrix(v)

    q, k, v = self.split_head(q), self.split_head(k), self.split_head(v) #(32, 8, 10, 64)
    k = k.transpose(-2, -1) #(32, 8, 64, 10)

    score = torch.matmul(q, k)/math.sqrt(self.d_model) #(32, 8, 10, 10)
    if mask is not None:
      score = score.masked_fill(mask == 0, -1e9)
    score = nn.functional.softmax(score, dim=-1)

    attn_score = torch.matmul(score, v) #(32, 8, 10, 64)
    attn_score = attn_score.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

    output = self.o_matrix(attn_score)
    return output

# x = torch.rand(size=(32, 10, 512))
# net = MultiheadAttention(d_model=512, n_head=8)
# a = net(k=x, q=x, v=x)
# a.shape

In [5]:
class TransformerBlock(nn.Module):
  def __init__(self, d_model=512, n_head=8, factor=4):
    super(TransformerBlock, self).__init__()

    self.d_model = d_model
    self.n_head = n_head
    self.factor = factor

    self.multihead_attention = MultiheadAttention(d_model, n_head)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)

    self.feed_forward = nn.Sequential(
        nn.Linear(d_model, factor*d_model),
        nn.ReLU(),
        nn.Linear(factor*d_model, d_model)
    )

    self.dropout1 = nn.Dropout(0.2)
    self.dropout2 = nn.Dropout(0.2)

  def forward(self, q, k, v):
    """
    x: (batch_size, sequence length, embedded dimension)
    """

    attention_out = self.multihead_attention(q, k, v)
    attention_res_out = attention_out + v
    norm1_out = self.dropout1(self.norm1(attention_res_out))
    fw_out = self.feed_forward(norm1_out)
    fw_res_out = norm1_out + fw_out
    norm2_out = self.dropout2(self.norm2(fw_res_out))

    return norm2_out



x = torch.rand(size=(32, 10, 512))
net = TransformerBlock(d_model=512, n_head=8)
a = net(x, x, x)
print(a.shape)

torch.Size([32, 10, 512])


In [6]:
class TransformerEncoder(nn.Module):
    def __init__(self, seq_len, vocab_size, d_model=512, num_layer=6, factor=4, n_head=8):
        super(TransformerEncoder, self).__init__()
        self.seq_len = seq_len
        self.vocab_size = vocab_size
        self.num_layer = num_layer
        self.factor = factor
        self.n_head = n_head
        self.d_model = d_model

        self.embedding_layer = Embedder(vocab_size=vocab_size, d_model=d_model)
        self.positional_encoder = PositionalEncoder(seq_len=seq_len, d_model=d_model)

        self.layers = nn.ModuleList([TransformerBlock(d_model=d_model, n_head=n_head, factor=factor) for i in range(num_layer)])

    def forward(self, x):
        embed_out = self.embedding_layer(x)
        out = self.positional_encoder(embed_out)
        for layer in self.layers:
            out = layer(out, out, out)

        return out

x = torch.randint(size=(32, 10), low=1, high=100)
net = TransformerEncoder(seq_len=10, vocab_size=200, num_layer=1)
a = net(x)
print(a.shape)

torch.Size([32, 10, 512])


In [7]:
class DecoderBlock(nn.Module):
  def __init__(self, d_model=512, factor=4, n_head=8):
    super(DecoderBlock, self).__init__()
    self.d_model = d_model
    self.factor = factor
    self.n_head = n_head

    self.attention = MultiheadAttention(d_model, n_head=8)
    self.norm = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(0.2)
    self.transformer_block = TransformerBlock(d_model, factor=factor, n_head=n_head)

  def forward(self, x, k, v, mask):
    attention = self.attention(x, x, x, mask=mask)
    q = self.dropout(self.norm(attention + x))
    out = self.transformer_block(q, k, v)

    return out

In [8]:
class TransformerDecoder(nn.Module):
  def __init__(self, seq_len, vocab_size, d_model = 512, num_layer=2, factor=4, n_head=8):
    super(TransformerDecoder, self).__init__()
    self.vocab_size = vocab_size
    self.d_model = d_model
    self.num_layer = num_layer
    self.factor = factor
    self.n_head = n_head

    self.embedding_layer = Embedder(vocab_size=vocab_size, d_model=d_model)
    self.positional_encoder = PositionalEncoder(seq_len=seq_len, d_model=d_model)

    self.layers = nn.ModuleList([
        DecoderBlock(d_model, factor=factor, n_head=n_head)
        for i in range(self.num_layer)
    ])

    self.fc_out = nn.Linear(d_model, vocab_size)
    self.dropout = nn.Dropout(0.2)

  def forward(self, x, encoder_out, mask):
    x = self.embedding_layer(x)
    x = self.positional_encoder(x)
    x = self.dropout(x)

    for layer in self.layers:
      x = layer(x=x, k=encoder_out, v=encoder_out, mask=mask)

    out = nn.functional.softmax(self.fc_out(x), dim=-1) #batch_size, seq_len, vocab_size

    return out

In [9]:
class Transformer(nn.Module):
  def __init__(self, d_model, src_vocab_size, target_vocab_size, seq_len, num_layer=6, factor=4, n_head=8):
    super(Transformer, self).__init__()
    self.d_model = d_model
    self.src_vocab_size = src_vocab_size
    self.target_vocab_size = target_vocab_size
    self.seq_len = seq_len
    self.num_laye = num_layer
    self.factor = factor
    self.n_head = n_head

    self.encoder = TransformerEncoder(seq_len=seq_len, vocab_size=src_vocab_size, d_model=d_model, num_layer=num_layer, factor=factor, n_head=n_head)
    self.decoder = TransformerDecoder(seq_len=seq_len, vocab_size=target_vocab_size, d_model=d_model, num_layer=num_layer, factor=factor, n_head=n_head)

  def make_target_mask(self, trg):
    device = trg.device
    tgt_mask = (trg != 0).unsqueeze(1).unsqueeze(3).to(device)
    seq_length = trg.size(1)
    nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().to(device)
    tgt_mask = tgt_mask & nopeak_mask
    return tgt_mask.to(trg.device)

  def decode(self, src):


        batch_size, seq_len = src.shape[0], src.shape[1]
        trg = torch.zeros(size=(batch_size, seq_len), dtype=torch.int)
        trg[:, 0] = 1

        enc_out = self.encoder(src)

        for i in range(1, seq_len):
          trg_mask = self.make_target_mask(trg)
          out = self.decoder(x=trg, encoder_out=enc_out, mask=trg_mask)
          out = out.argmax(-1)[:, i]
          trg[:, i] = out

        return trg

  def forward(self, src, trg):

    trg_mask = self.make_target_mask(trg)
    enc_out = self.encoder(src)

    outputs = self.decoder(trg, enc_out, trg_mask)
    return outputs 

In [10]:
en = []
vi = []
en_path = "/kaggle/input/transformer/en_sents.txt"
vi_path = "/kaggle/input/transformer/vi_sents.txt"

num_word_vi_path = "/kaggle/input/transformer/num_word_vi.json"
word_num_vi_path = "/kaggle/input/transformer/word_num_vi.json"
num_word_en_path = "/kaggle/input/transformer/num_word_en.json"
word_num_en_path = "/kaggle/input/transformer/word_num_en.json"

In [11]:
def read_json(path):
  with open(path) as f:
    d = json.load(f)

  return d

def get_dict(lang: str):
  if lang == "vi":
    num_word_vi_path = "/kaggle/input/transformer/num_word_vi.json"
    word_num_vi_path = "/kaggle/input/transformer/word_num_vi.json"

    num_word, word_num = read_json(num_word_vi_path), read_json(word_num_vi_path)
    return word_num, num_word

  if lang == "en":
    num_word_en_path = "/kaggle/input/transformer/num_word_en.json"
    word_num_en_path = "/kaggle/input/transformer/word_num_en.json"
    num_word, word_num = read_json(num_word_en_path), read_json(word_num_en_path)
    return word_num, num_word


word_num_en, num_word_en = get_dict(lang="en")
word_num_vi, num_word_vi = get_dict(lang="vi")

In [12]:
def process_sentence(sentence):
  sentence = re.sub(
  r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
  sentence = re.sub(r',', '', sentence)
  sentence = re.sub(r"[ ]+", " ", sentence)
  sentence = re.sub(r"\!+", "!", sentence)
  sentence = re.sub(r"\,+", ",", sentence)
  sentence = re.sub(r"\?+", "?", sentence)
  sentence = re.sub(r"\.$", "", sentence)
  sentence = re.sub(r"\!$", "", sentence)
  sentence = re.sub(r"\?$", "", sentence)
  sentence = sentence.lower()

  return sentence

sentence="tôi yêu em, nhưng tôi ngu"
process_sentence(sentence)

'tôi yêu em nhưng tôi ngu'

In [13]:
def text_to_tensor(sentence, lang, seq_len = 64):
  word_num = None
  num_word = None
  if lang == "en": 
    word_num, num_word = word_num_en, num_word_en
  elif lang == "vi": 
    word_num, num_word = word_num_vi, num_word_vi
  
  sentence = process_sentence(sentence)
  sentence = sentence.split()
  sentence = ['<START>'] + sentence + ["<EOS>"]
  if len(sentence) < seq_len:
    spare_len = seq_len - len(sentence)
    sentence = sentence + ["<PAD>"] * spare_len

  for i in range(len(sentence)):
    if sentence[i] not in word_num:
      sentence[i] = word_num['<UNK>']
    sentence[i] = word_num[sentence[i]]

  sentence = torch.tensor(sentence)

  return sentence

def read_data(path):
    data = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(line.strip())
    return data

vi_data = read_data(vi_path)
en_data = read_data(en_path)
print(vi_data[:10])
print(text_to_tensor("i love you!", lang="en"))

['xin vui lòng đặt người quét rác trong tủ chổi', 'im lặng một lát', 'đọc này', 'tom thuyết phục người quản lý cửa hàng trả lại tiền cho anh ta.', 'tình bạn bao gồm sự hiểu biết lẫn nhau', 'ngày mai bạn có đến không', 'nhìn thấy vấn đề này ngay lập tức, bạn sẽ?', 'tôi đã cho bạn bè của tôi xem những tấm bưu thiếp hình ảnh.', 'mary là em út trong ba chị em', 'anh ấy có hai người dì ở bên mẹ.']
tensor([  1,  43, 284,  34,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0])


In [14]:
def data_to_tensor(data, lang):
  tensor_data = []
  for i in range(len(data)):
    tensor_sent = text_to_tensor(sentence=data[i], lang=lang).unsqueeze(0)
    tensor_data.append(tensor_sent)

  tensor_data = torch.cat(tensor_data, dim=0)
  return tensor_data

vi_data_tensor = data_to_tensor(vi_data, lang="vi")
en_data_tensor = data_to_tensor(en_data, lang="en")
print(vi_data_tensor.shape)
vi_data_tensor[:10, :]

torch.Size([254090, 64])


tensor([[ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,  2,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 14, 15, 16, 17,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 18, 19,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1, 20, 21, 22,  8, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,  2,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,

In [15]:
print(len(num_word_vi))
print(len(num_word_en))

7907
22199


In [16]:
class CustomDataset(Dataset):
  def __init__(self, data_vi_tensor, data_en_tensor):
    self.data_vi = data_vi_tensor 
    self.data_en = data_en_tensor

  def __len__(self):
    return self.data_vi.shape[0]

  def __getitem__(self, index):
    return {
        "input_vi": self.data_vi[index],
        "input_en": self.data_en[index]
    }


In [17]:
train_vi, test_vi, train_en, test_en = train_test_split(vi_data_tensor, en_data_tensor, test_size=0.2, random_state=42)
train_vi, valid_vi, train_en, valid_en = train_test_split(train_vi, train_en, test_size=0.1, random_state=42)
print(train_vi[0], train_en[0])

tensor([   1,  148,   55,  236,  244,  579,  166,  276, 1069, 1070,  342,  183,
         184,   44,  149,    2,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]) tensor([   1,  149,   43,  151,   59,    6,  689,   30, 1585, 1290,  156,  510,
          22,  134,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])


In [18]:
train_dataset = CustomDataset(data_vi_tensor=train_vi, data_en_tensor=train_en)
valid_dataset = CustomDataset(data_vi_tensor=valid_vi, data_en_tensor=valid_en)
test_dataset = CustomDataset(data_vi_tensor=test_vi, data_en_tensor=test_en)

In [19]:
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False)

In [20]:
def train(model, epochs, optimizer, criterion, train_loader: DataLoader, device):
    model.train()
    for epoch in range(epochs):
        loss = None
        for i, data in enumerate(train_loader): 
            input_vi = data['input_vi'].to(device)
            input_en = data['input_en'].to(device)
            output = model(input_vi, input_en) #batch_size, seq_len, trg_vocab_size
            batch_size, seq_len, trg_vocab = output.shape

            input_target = torch.zeros(batch_size, seq_len, trg_vocab).to(device)

            for batch in range(batch_size):
                for len in range(seq_len): 
                    a = torch.zeros(trg_vocab).to(device)
                    a[input_en[batch, len]] = 1.
                    input_target[batch, len] = a 

            loss = criterion(output, input_target)  

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            

            if i % 128 == 0: 
                path = '/kaggle/working/model.pth'
                torch.save(model.state_dict(), path)
                print(f"Epoch: {epoch}, step: {i}, train loss: {loss.item()}")


In [21]:
SEQ_LEN = 64 
D_MODEL = 128 
SRC_VOCAB = len(num_word_vi)
TRG_VOCAB = len(num_word_en)
EPOCHS = 10
LR = 2e-4
NUM_LAYERS = 2


device = torch.device("cuda")
model = Transformer(d_model=D_MODEL, src_vocab_size=SRC_VOCAB, target_vocab_size=TRG_VOCAB, seq_len=SEQ_LEN, num_layer=NUM_LAYERS).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.BCELoss()

train(model=model, epochs=EPOCHS, optimizer=optimizer, criterion=criterion, train_loader=train_loader, device=device)

Epoch: 0, step: 0, train loss: 0.0004943166277371347
Epoch: 0, step: 128, train loss: 0.00010715109965531155
Epoch: 0, step: 256, train loss: 7.530010043410584e-05
Epoch: 0, step: 384, train loss: 5.970893471385352e-05
Epoch: 0, step: 512, train loss: 4.8616930143907666e-05
Epoch: 0, step: 640, train loss: 4.234011430526152e-05
Epoch: 0, step: 768, train loss: 4.003898720839061e-05
Epoch: 0, step: 896, train loss: 3.948991434299387e-05
Epoch: 0, step: 1024, train loss: 3.881322481902316e-05
Epoch: 0, step: 1152, train loss: 3.9614984416402876e-05
Epoch: 0, step: 1280, train loss: 3.934179767384194e-05
Epoch: 0, step: 1408, train loss: 3.7908455851720646e-05
Epoch: 1, step: 0, train loss: 3.804587322520092e-05
Epoch: 1, step: 128, train loss: 3.5336855944478884e-05
Epoch: 1, step: 256, train loss: 3.441991066210903e-05
Epoch: 1, step: 384, train loss: 3.5026907426072285e-05
Epoch: 1, step: 512, train loss: 3.453363024163991e-05
Epoch: 1, step: 640, train loss: 3.4288299502804875e-05
Epo