In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

from tqdm import tqdm
import math


tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [2]:
if tokenizer.bos_token is None:
    tokenizer.bos_token = '<bos>'
    tokenizer.bos_token_id = tokenizer.convert_tokens_to_ids('<bos>')

if tokenizer.eos_token is None:
    tokenizer.eos_token = '<eos>'
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids('<eos>')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

SOS_token = tokenizer.bos_token_id


In [3]:
class Encoder(nn.Module):
  def __init__(self, src_vocab_size, embedding_dim, hidden_dim, dropout):
    super(Encoder, self).__init__()
    self.src_vocab_size = src_vocab_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    self.embedding = nn.Embedding(src_vocab_size, embedding_dim)
    self.gru = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src_input):
    embedded = self.dropout(self.embedding(src_input))
    output, hidden = self.gru(embedded)
    return output, hidden


class Decoder(nn.Module):
  def __init__(self, tgt_vocab_size, hidden_dim, dropout):
    super(Decoder, self).__init__()
    self.hidden_dim = hidden_dim
    self.tgt_vocab_size = tgt_vocab_size
    self.embedding = nn.Embedding(tgt_vocab_size, hidden_dim)
    self.dropout = nn.Dropout(dropout)

    self.gru = nn.GRU(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
    self.out = nn.Linear(hidden_dim, tgt_vocab_size)

  def forward(self, input, hidden):
    embedded = self.dropout(self.embedding(input))
    output, hidden = self.gru(embedded, hidden)
    output = self.out(output)
    return output, hidden


class Seq2seq(nn.Module):
  def __init__(self, src_vocab_size, tgt_vocab_size, embedding_dim, hidden_dim, dropout):
    super(Seq2seq, self).__init__()
    self.encoder = Encoder(src_vocab_size, embedding_dim, hidden_dim, dropout)
    self.decoder = Decoder(tgt_vocab_size, hidden_dim, dropout)

  def forward(self, src_input, tgt_input):
    max_length = tgt_input.shape[1]

    batch_size = src_input.shape[0]

    enc_output, enc_hidden = self.encoder(src_input)
    dec_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)

    dec_hidden = enc_hidden

    outputs = []
    for i in range(max_length):
      dec_output, dec_hidden = self.decoder(dec_input, dec_hidden)
      outputs.append(dec_output)
      dec_input = tgt_input[:, i].unsqueeze(1)

    outputs = torch.cat(outputs, dim=1)
    outputs = nn.functional.log_softmax(outputs, dim=-1)
    return outputs

#test
a = torch.randint(size=(32, 64), low=0, high=256).to(device)
b = torch.randint(size=(32, 64), low=0, high=256).to(device)
model = Seq2seq(1000, 1000, 256, 512, 0.1).to(device)
output = model(a, b)

print(output.shape)

torch.Size([32, 64, 1000])


In [4]:
class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, tokenizer, max_length=64):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]

        src_encoding = self.tokenizer.encode_plus(src_text, return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)
        tgt_encoding = self.tokenizer.encode_plus(tgt_text, return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)

        src_input_ids = src_encoding['input_ids'].squeeze()
        tgt_input_ids = tgt_encoding['input_ids'].squeeze()

        return src_input_ids, tgt_input_ids

def collate_fn(batch):
    src_batch = [item[0] for item in batch]
    tgt_batch = [item[1] for item in batch]
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    return src_batch, tgt_batch

def loadData(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return [line.strip() for line in lines]

en_sentences = loadData("/kaggle/input/dataset/train.en")
vi_sentences = loadData("/kaggle/input/dataset/train.vi")


In [5]:
en_train = en_sentences[0: 12800]
vi_train = vi_sentences[0: 12800] 

en_test = en_sentences[12800: 12800 + 300] 
vi_test = vi_sentences[12800: 12800 + 300]

en_val = en_sentences[12800 + 300: 12800 + 300 + 300] 
vi_val = vi_sentences[12800 + 300: 12800 + 300 + 300] 

In [6]:
train_dataset = TranslationDataset(en_train, vi_train, tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128, collate_fn=collate_fn)

val_dataset = TranslationDataset(en_val, vi_val, tokenizer)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=128, collate_fn=collate_fn)

test_dataset = TranslationDataset(en_test, vi_test, tokenizer)

In [7]:
src_vocab_size = tokenizer.vocab_size
tgt_vocab_size = tokenizer.vocab_size

embedding_dim = 256
hidden_dim = 512
dropout = 0.1

model = Seq2seq(src_vocab_size, tgt_vocab_size, embedding_dim, hidden_dim, dropout)
model = model.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 50

for epoch in range(epochs):
  total_loss = 0
  model.train()
  for src_input, tgt_input in tqdm(train_dataloader):
    src_input = src_input.to(device)
    tgt_input = tgt_input.to(device)

    optimizer.zero_grad()

    outputs = model(src_input, tgt_input)

    outputs = outputs.view(-1, src_vocab_size)

    tgt_input = tgt_input.view(-1)

    loss = criterion(outputs, tgt_input)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  avg_train_loss = total_loss / len(train_dataloader)
  print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss}")

  total_val_loss = 0
  model.eval()
  with torch.no_grad():
   for src_input, tgt_input in tqdm(train_dataloader):
    src_input = src_input.to(device)
    tgt_input = tgt_input.to(device)

    optimizer.zero_grad()

    outputs = model(src_input, tgt_input)

    outputs = outputs.view(-1, src_vocab_size)
    tgt_input = tgt_input.view(-1)

    loss = criterion(outputs, tgt_input)
    total_val_loss += loss.item()

  avg_val_loss = total_val_loss / len(val_dataloader)
  print(f"Epoch {epoch+1}, Eval Loss: {avg_val_loss}")

torch.save(model.state_dict(), "/kaggle/working/model.pth")

100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


Epoch 1, Train Loss: 6.057565503120422


100%|██████████| 100/100 [00:34<00:00,  2.88it/s]


Epoch 1, Eval Loss: 168.02517970403036


100%|██████████| 100/100 [01:34<00:00,  1.06it/s]


Epoch 2, Train Loss: 4.800240745544434


100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Epoch 2, Eval Loss: 148.2323611577352


100%|██████████| 100/100 [01:34<00:00,  1.05it/s]


Epoch 3, Train Loss: 4.371626844406128


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 3, Eval Loss: 136.75490967432657


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 4, Train Loss: 4.090284337997437


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 4, Eval Loss: 128.35619179407755


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 5, Train Loss: 3.8698006653785706


100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Epoch 5, Eval Loss: 121.10094706217448


100%|██████████| 100/100 [01:34<00:00,  1.05it/s]


Epoch 6, Train Loss: 3.6825115966796873


100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


Epoch 6, Eval Loss: 114.97996505101521


100%|██████████| 100/100 [01:34<00:00,  1.05it/s]


Epoch 7, Train Loss: 3.5183021807670594


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 7, Eval Loss: 109.504332224528


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 8, Train Loss: 3.3690885162353514


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 8, Eval Loss: 104.47916666666667


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 9, Train Loss: 3.2311485314369204


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 9, Eval Loss: 99.70310266812642


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 10, Train Loss: 3.1006514978408815


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 10, Eval Loss: 95.21292241414388


100%|██████████| 100/100 [01:34<00:00,  1.05it/s]


Epoch 11, Train Loss: 2.980485830307007


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 11, Eval Loss: 91.02735177675883


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 12, Train Loss: 2.865658061504364


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 12, Eval Loss: 86.99141017595927


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 13, Train Loss: 2.7580989480018614


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 13, Eval Loss: 83.24215523401897


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 14, Train Loss: 2.656025745868683


100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


Epoch 14, Eval Loss: 79.67561912536621


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 15, Train Loss: 2.5608630990982055


100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


Epoch 15, Eval Loss: 76.42583457628886


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 16, Train Loss: 2.4691732811927793


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 16, Eval Loss: 73.0643428961436


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 17, Train Loss: 2.379225833415985


100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


Epoch 17, Eval Loss: 69.9464180469513


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 18, Train Loss: 2.297469234466553


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 18, Eval Loss: 67.00379558404286


100%|██████████| 100/100 [01:34<00:00,  1.05it/s]


Epoch 19, Train Loss: 2.221345398426056


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 19, Eval Loss: 64.30651370684306


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 20, Train Loss: 2.1471122610569


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 20, Eval Loss: 61.71052384376526


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 21, Train Loss: 2.079525773525238


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 21, Eval Loss: 59.26821176211039


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 22, Train Loss: 2.0144553065299986


100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


Epoch 22, Eval Loss: 56.946348349253334


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 23, Train Loss: 1.9518517565727234


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 23, Eval Loss: 54.908960501352944


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 24, Train Loss: 1.8928236556053162


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 24, Eval Loss: 52.72146077950796


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 25, Train Loss: 1.8373034715652465


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 25, Eval Loss: 50.759365995725


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 26, Train Loss: 1.7849050748348236


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 26, Eval Loss: 48.9176131884257


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 27, Train Loss: 1.7355118668079377


100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


Epoch 27, Eval Loss: 47.15660512447357


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 28, Train Loss: 1.687611492872238


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 28, Eval Loss: 45.44792620340983


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 29, Train Loss: 1.6429794645309448


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 29, Eval Loss: 43.91401179631551


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 30, Train Loss: 1.6088108026981354


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 30, Eval Loss: 42.57701416810354


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 31, Train Loss: 1.561667355298996


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 31, Eval Loss: 41.11813223361969


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 32, Train Loss: 1.5221706092357636


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 32, Eval Loss: 39.68386288483938


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 33, Train Loss: 1.4851665556430818


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 33, Eval Loss: 38.451826095581055


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 34, Train Loss: 1.4480081081390381


100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


Epoch 34, Eval Loss: 37.19574328263601


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 35, Train Loss: 1.4145998442173005


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 35, Eval Loss: 36.016033947467804


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 36, Train Loss: 1.3830730652809142


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 36, Eval Loss: 34.80869082609812


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 37, Train Loss: 1.3528497505187989


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 37, Eval Loss: 33.829817394415535


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 38, Train Loss: 1.3195328307151795


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 38, Eval Loss: 32.82978485027949


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 39, Train Loss: 1.2946374893188477


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 39, Eval Loss: 31.824846665064495


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 40, Train Loss: 1.2639966869354249


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 40, Eval Loss: 30.855754216512043


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 41, Train Loss: 1.239521737098694


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 41, Eval Loss: 30.044210076332092


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 42, Train Loss: 1.2156154537200927


100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Epoch 42, Eval Loss: 29.10393915573756


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 43, Train Loss: 1.1897105038166047


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 43, Eval Loss: 28.432717045148213


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 44, Train Loss: 1.165582342147827


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 44, Eval Loss: 27.54764676094055


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 45, Train Loss: 1.1422358572483062


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 45, Eval Loss: 26.812563180923462


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 46, Train Loss: 1.12058061003685


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 46, Eval Loss: 26.119137684504192


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 47, Train Loss: 1.100774322748184


100%|██████████| 100/100 [00:35<00:00,  2.86it/s]


Epoch 47, Eval Loss: 25.421197791894276


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 48, Train Loss: 1.0782735633850098


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 48, Eval Loss: 24.702286799748737


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 49, Train Loss: 1.0604415148496629


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 49, Eval Loss: 24.108668347199757


100%|██████████| 100/100 [01:35<00:00,  1.05it/s]


Epoch 50, Train Loss: 1.0414984863996506


100%|██████████| 100/100 [00:35<00:00,  2.85it/s]


Epoch 50, Eval Loss: 23.55108352502187


In [8]:
# Translation function
def translate_sentence(model, sentence, tokenizer, max_length=64):
    model.eval()

    with torch.no_grad():
      src_input = tokenizer.encode_plus(sentence, return_tensors="pt", padding="max_length", max_length=max_length, truncation=True)
      src_input = src_input['input_ids'].to(device)

      enc_output, enc_hidden = model.encoder(src_input)
      dec_input = torch.empty(1, 1, dtype=torch.long, device=device).fill_(SOS_token).to(device)

      dec_hidden = enc_hidden

      outputs = []
      for i in range(max_length):
        dec_output, dec_hidden = model.decoder(dec_input, dec_hidden) #dec_output : (1, src_vocab_size)
        
        res = dec_output.argmax(dim=-1).item()
        if res == tokenizer.eos_token_id:
          break
        outputs.append(res)
        dec_input = torch.tensor([res]).reshape(1, 1).to(device)

      translation = tokenizer.decode(outputs, skip_special_tokens=True)
    return translation

In [9]:
translations = []

for sen in test_dataset.src_texts:
  translations.append(translate_sentence(model, sen, tokenizer))

tgt = []
for i in range(len(translations)):
  a = tokenizer.decode(test_dataset.__getitem__(i)[1], skip_special_tokens=True)
  tgt.append([a])

2024-06-18 17:31:11.626230: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-18 17:31:11.626328: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-18 17:31:11.750677: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
print(translations[10])
print(tgt[10]) 

Bạn biết đấy, chúng ta đã không có ý nghĩ nào.
['Các bạn biết đấy là vì họ không thể nghe tiếng ngáy ngủ nữa']


In [11]:
!pip install evaluate

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [12]:
import evaluate
bleu = evaluate.load("bleu")
bleu.compute(predictions=translations, references=tgt)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.014809472659057169,
 'precisions': [0.22305389221556887,
  0.03674801362088536,
  0.0071121647651503925,
  0.0009300883583940474],
 'brevity_penalty': 0.9705036457993693,
 'length_ratio': 0.9709302325581395,
 'translation_length': 7348,
 'reference_length': 7568}