In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split
from tqdm import tqdm
import math

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

if tokenizer.bos_token is None:
    tokenizer.bos_token = '<bos>'
    tokenizer.bos_token_id = tokenizer.convert_tokens_to_ids('<bos>')

if tokenizer.eos_token is None:
    tokenizer.eos_token = '<eos>'
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids('<eos>')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        
        output = self.W_o(self.combine_heads(attn_output))
        return output

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.d_model = d_model  
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
        
        tgt_sub_mask = torch.tril(torch.ones((tgt.size(1), tgt.size(1)), device=tgt.device)).bool()
        tgt_mask = tgt_mask & tgt_sub_mask.unsqueeze(0).unsqueeze(0)
        
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        
        src = self.encoder_embedding(src) * math.sqrt(self.d_model)
        src = self.dropout(self.positional_encoding(src))
        
        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        
        tgt = self.decoder_embedding(tgt) * math.sqrt(self.d_model)
        tgt = self.dropout(self.positional_encoding(tgt))
        
        for layer in self.decoder_layers:
            tgt = layer(tgt, src, src_mask, tgt_mask)
        
        output = self.fc(tgt)
        return output




config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [2]:
class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, tokenizer, max_length=64):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]

        src_encoding = self.tokenizer.encode_plus(src_text, return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)
        tgt_encoding = self.tokenizer.encode_plus(tgt_text, return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)
        
        src_input_ids = src_encoding['input_ids'].squeeze()
        tgt_input_ids = tgt_encoding['input_ids'].squeeze()

        return src_input_ids, tgt_input_ids

def collate_fn(batch):
    src_batch = [item[0] for item in batch]
    tgt_batch = [item[1] for item in batch]
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    return src_batch, tgt_batch

def loadData(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return [line.strip() for line in lines]

vi_sentences = loadData("/kaggle/input/datasettrans/train.vi")
en_sentences = loadData("/kaggle/input/datasettrans/train.en")



In [3]:
en_train = en_sentences[0: 12800]
vi_train = vi_sentences[0: 12800] 

en_test = en_sentences[12800: 12800 + 300] 
vi_test = vi_sentences[12800: 12800 + 300]

en_val = en_sentences[12800 + 300: 12800 + 300 + 300] 
vi_val = vi_sentences[12800 + 300: 12800 + 300 + 300] 

In [4]:
# Create datasets and dataloaders
train_dataset = TranslationDataset(en_train, vi_train, tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128, collate_fn=collate_fn)

val_dataset = TranslationDataset(en_val, vi_val, tokenizer)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=128, collate_fn=collate_fn)

In [5]:
train_dataset.__getitem__(4)

(tensor([    0,  1827,  6489,  9679,  4157,  4525, 48571,  1388,  7093, 28834,
          1913,  2292,  1302, 10516, 11277,  1913,  3148, 15494, 30934,  4933,
         37712, 11851,     5,     2,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]),
 tensor([   0, 1351,   82,  131,    8,   16, 4153,    7,   81,   16, 7362, 5191,
           12,  255, 2054,  222, 1660, 6248,    5,    2,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1]))

In [6]:
train_dataset.src_texts[4]

'They are both two branches of the same field of atmospheric science .'

In [7]:
train_dataset.tgt_texts[4]

'Cả hai đều là một nhánh của cùng một lĩnh vực trong ngành khoa học khí quyển .'

In [8]:
# Initialize model
src_vocab_size = tokenizer.vocab_size
tgt_vocab_size = tokenizer.vocab_size
d_model = 512
num_heads = 8
num_layers = 4
d_ff = 2048
max_seq_length = 64
dropout = 0.1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
epochs = 50
for epoch in range(epochs):
    total_train_loss = 0
    model.train()
    for src_batch, tgt_batch in tqdm(train_dataloader):
        src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
        optimizer.zero_grad()
        
        output = model(src_batch, tgt_batch[:, :-1])
        output = output.transpose(1, 2)
        loss = criterion(output, tgt_batch[:, 1:])
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss}")
    
    total_val_loss = 0
    model.eval()
    with torch.no_grad():
        for src_batch, tgt_batch in tqdm(val_dataloader):
            src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
            
            output = model(src_batch, tgt_batch[:, :-1])
            output = output.transpose(1, 2)
            val_loss = criterion(output, tgt_batch[:, 1:])
            total_val_loss += val_loss.item()
    
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Eval Loss: {avg_val_loss}")
    

    
PATH = '/kaggle/working/trans.pth'
torch.save(model.state_dict(), PATH)

cuda


100%|██████████| 100/100 [02:43<00:00,  1.64s/it]


Epoch 1, Train Loss: 7.416339235305786


100%|██████████| 3/3 [00:01<00:00,  2.41it/s]


Epoch 1, Eval Loss: 5.713122367858887


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 2, Train Loss: 5.555399537086487


100%|██████████| 3/3 [00:01<00:00,  2.44it/s]


Epoch 2, Eval Loss: 5.046143690745036


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 3, Train Loss: 5.049206099510193


100%|██████████| 3/3 [00:01<00:00,  2.38it/s]


Epoch 3, Eval Loss: 4.702757517496745


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 4, Train Loss: 4.718872761726379


100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 4, Eval Loss: 4.496665636698405


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 5, Train Loss: 4.480376181602478


100%|██████████| 3/3 [00:01<00:00,  2.38it/s]


Epoch 5, Eval Loss: 4.367178122202556


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 6, Train Loss: 4.287568554878235


100%|██████████| 3/3 [00:01<00:00,  2.38it/s]


Epoch 6, Eval Loss: 4.274004618326823


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 7, Train Loss: 4.1252974915504455


100%|██████████| 3/3 [00:01<00:00,  2.46it/s]


Epoch 7, Eval Loss: 4.18869670232137


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 8, Train Loss: 3.9805177545547483


100%|██████████| 3/3 [00:01<00:00,  2.39it/s]


Epoch 8, Eval Loss: 4.122767368952434


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 9, Train Loss: 3.845264630317688


100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 9, Eval Loss: 4.074082454045613


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 10, Train Loss: 3.7150524997711183


100%|██████████| 3/3 [00:01<00:00,  2.43it/s]


Epoch 10, Eval Loss: 4.036865313847859


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 11, Train Loss: 3.592560393810272


100%|██████████| 3/3 [00:01<00:00,  2.37it/s]


Epoch 11, Eval Loss: 3.9972101052602134


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 12, Train Loss: 3.4775629091262816


100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 12, Eval Loss: 3.9674972693125405


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 13, Train Loss: 3.364148962497711


100%|██████████| 3/3 [00:01<00:00,  2.45it/s]


Epoch 13, Eval Loss: 3.9347421328226724


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 14, Train Loss: 3.2544044399261476


100%|██████████| 3/3 [00:01<00:00,  2.43it/s]


Epoch 14, Eval Loss: 3.9272659619649253


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 15, Train Loss: 3.144320924282074


100%|██████████| 3/3 [00:01<00:00,  2.44it/s]


Epoch 15, Eval Loss: 3.9199702739715576


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 16, Train Loss: 3.038717358112335


100%|██████████| 3/3 [00:01<00:00,  2.38it/s]


Epoch 16, Eval Loss: 3.897386074066162


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 17, Train Loss: 2.9384568405151366


100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 17, Eval Loss: 3.912545124689738


100%|██████████| 100/100 [02:45<00:00,  1.66s/it]


Epoch 18, Train Loss: 2.8365317940711976


100%|██████████| 3/3 [00:01<00:00,  2.43it/s]


Epoch 18, Eval Loss: 3.907090425491333


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 19, Train Loss: 2.7327635216712953


100%|██████████| 3/3 [00:01<00:00,  2.44it/s]


Epoch 19, Eval Loss: 3.9231099287668862


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 20, Train Loss: 2.635183050632477


100%|██████████| 3/3 [00:01<00:00,  2.45it/s]


Epoch 20, Eval Loss: 3.9395782947540283


100%|██████████| 100/100 [02:45<00:00,  1.66s/it]


Epoch 21, Train Loss: 2.5352901220321655


100%|██████████| 3/3 [00:01<00:00,  2.40it/s]


Epoch 21, Eval Loss: 3.949622313181559


100%|██████████| 100/100 [02:45<00:00,  1.66s/it]


Epoch 22, Train Loss: 2.443274359703064


100%|██████████| 3/3 [00:01<00:00,  2.43it/s]


Epoch 22, Eval Loss: 3.985475460688273


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 23, Train Loss: 2.3469714856147768


100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 23, Eval Loss: 4.00720755259196


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 24, Train Loss: 2.258991277217865


100%|██████████| 3/3 [00:01<00:00,  2.45it/s]


Epoch 24, Eval Loss: 4.026662985483806


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 25, Train Loss: 2.1649688601493837


100%|██████████| 3/3 [00:01<00:00,  2.44it/s]


Epoch 25, Eval Loss: 4.035322348276774


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 26, Train Loss: 2.0736373615264894


100%|██████████| 3/3 [00:01<00:00,  2.40it/s]


Epoch 26, Eval Loss: 4.0890318552653


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 27, Train Loss: 1.9843728411197663


100%|██████████| 3/3 [00:01<00:00,  2.43it/s]


Epoch 27, Eval Loss: 4.104293505350749


100%|██████████| 100/100 [02:45<00:00,  1.66s/it]


Epoch 28, Train Loss: 1.8979486739635467


100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 28, Eval Loss: 4.135605176289876


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 29, Train Loss: 1.8104706919193267


100%|██████████| 3/3 [00:01<00:00,  2.41it/s]


Epoch 29, Eval Loss: 4.189175128936768


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 30, Train Loss: 1.7265040838718415


100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 30, Eval Loss: 4.240418752034505


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 31, Train Loss: 1.6456129050254822


100%|██████████| 3/3 [00:01<00:00,  2.46it/s]


Epoch 31, Eval Loss: 4.28283707300822


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 32, Train Loss: 1.5680766546726226


100%|██████████| 3/3 [00:01<00:00,  2.48it/s]


Epoch 32, Eval Loss: 4.340606053670247


100%|██████████| 100/100 [02:47<00:00,  1.67s/it]


Epoch 33, Train Loss: 1.4886435306072234


100%|██████████| 3/3 [00:01<00:00,  2.40it/s]


Epoch 33, Eval Loss: 4.35266129175822


100%|██████████| 100/100 [02:47<00:00,  1.67s/it]


Epoch 34, Train Loss: 1.4121513080596924


100%|██████████| 3/3 [00:01<00:00,  2.37it/s]


Epoch 34, Eval Loss: 4.419138272603353


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 35, Train Loss: 1.3401305782794952


100%|██████████| 3/3 [00:01<00:00,  2.37it/s]


Epoch 35, Eval Loss: 4.446452617645264


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 36, Train Loss: 1.2654249024391175


100%|██████████| 3/3 [00:01<00:00,  2.35it/s]


Epoch 36, Eval Loss: 4.5536314646403


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 37, Train Loss: 1.1984056282043456


100%|██████████| 3/3 [00:01<00:00,  2.44it/s]


Epoch 37, Eval Loss: 4.5606842041015625


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 38, Train Loss: 1.1299130934476853


100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 38, Eval Loss: 4.621124744415283


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 39, Train Loss: 1.06885284781456


100%|██████████| 3/3 [00:01<00:00,  2.43it/s]


Epoch 39, Eval Loss: 4.672598520914714


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 40, Train Loss: 1.0027736401557923


100%|██████████| 3/3 [00:01<00:00,  2.40it/s]


Epoch 40, Eval Loss: 4.704363187154134


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 41, Train Loss: 0.9471097421646119


100%|██████████| 3/3 [00:01<00:00,  2.42it/s]


Epoch 41, Eval Loss: 4.758705774943034


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 42, Train Loss: 0.8903688752651214


100%|██████████| 3/3 [00:01<00:00,  2.45it/s]


Epoch 42, Eval Loss: 4.815125624338786


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 43, Train Loss: 0.8313466066122055


100%|██████████| 3/3 [00:01<00:00,  2.47it/s]


Epoch 43, Eval Loss: 4.871533393859863


100%|██████████| 100/100 [02:46<00:00,  1.66s/it]


Epoch 44, Train Loss: 0.7814788848161698


100%|██████████| 3/3 [00:01<00:00,  2.41it/s]


Epoch 44, Eval Loss: 4.955506483713786


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 45, Train Loss: 0.7347620224952698


100%|██████████| 3/3 [00:01<00:00,  2.46it/s]


Epoch 45, Eval Loss: 4.986998399098714


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 46, Train Loss: 0.6849058127403259


100%|██████████| 3/3 [00:01<00:00,  2.39it/s]


Epoch 46, Eval Loss: 5.039760907491048


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 47, Train Loss: 0.6436994075775146


100%|██████████| 3/3 [00:01<00:00,  2.35it/s]


Epoch 47, Eval Loss: 5.0959954261779785


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 48, Train Loss: 0.6004395723342896


100%|██████████| 3/3 [00:01<00:00,  2.43it/s]


Epoch 48, Eval Loss: 5.113900025685628


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 49, Train Loss: 0.5632187166810035


100%|██████████| 3/3 [00:01<00:00,  2.39it/s]


Epoch 49, Eval Loss: 5.212279319763184


100%|██████████| 100/100 [02:46<00:00,  1.67s/it]


Epoch 50, Train Loss: 0.5267600268125534


100%|██████████| 3/3 [00:01<00:00,  2.41it/s]


Epoch 50, Eval Loss: 5.255292574564616


In [9]:
# Translation function
def translate_sentence(model, sentence, tokenizer, max_length=64):
    model.eval()
    with torch.no_grad():
        encoded_input = tokenizer.encode_plus(sentence, return_tensors="pt", padding="max_length", max_length=max_length, truncation=True)
        src_tensor = encoded_input['input_ids'].to(device)
        
        tgt_tokens = [tokenizer.bos_token_id]  # Use BOS token instead of CLS token
        for i in range(max_length):
            tgt_tensor = torch.tensor([tgt_tokens], dtype=torch.long).to(device)
            output = model(src_tensor, tgt_tensor)
            output_token = output[0, -1, :].argmax(dim=-1).item()
            if output_token == tokenizer.eos_token_id:
                break
            tgt_tokens.append(output_token)
        
        translation = tokenizer.decode(tgt_tokens, skip_special_tokens=True)
        return translation

# Test translation
test_sentences = ["Hello friend", "How are you?", "I am fine, thank you."]
for sentence in test_sentences:
    translation = translate_sentence(model, sentence, tokenizer)
    print(f"Input: {sentence}")
    print(f"Translation: {translation}")

2024-06-18 17:12:20.793073: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-18 17:12:20.793201: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-18 17:12:20.928135: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Input: Hello friend
Translation: Xin chào
Input: How are you?
Translation: Còn sao?
Input: I am fine, thank you.
Translation: Xin chào.


In [10]:
test_dataset = TranslationDataset(en_test, vi_test, tokenizer)
test_dataset.__getitem__(4)
    

(tensor([    0, 31400,  1388,  7441,  1118,  1517, 15814,     4,  4014,  1565,
          1388,  7441,  1118,  1517,  1236, 23999,     5,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]),
 tensor([   0,  345,   73,  443,  188, 5714,    4,   18,   73,  238,  188, 2084,
         1411,    2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1]))

In [11]:
print(test_dataset.src_texts[4])

He buys a ring , she buys a dress .


In [12]:
translations = []

for sen in test_dataset.src_texts:
  translations.append(translate_sentence(model, sen, tokenizer))

tgt = []
for i in range(len(translations)):
  a = tokenizer.decode(test_dataset.__getitem__(i)[1], skip_special_tokens=True)
  tgt.append([a])

In [13]:
print(translations[10])
print(tgt[10]) 

Bạn không biết đấy, bởi vì chúng có thể nghe thấy những chiếc bình thường.
['Các bạn biết đấy là vì họ không thể nghe tiếng ngáy ngủ nữa']


In [14]:
!pip install evaluate

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [15]:
import evaluate
bleu = evaluate.load("bleu")
bleu.compute(predictions=translations, references=tgt)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.05814941682867853,
 'precisions': [0.3691331923890063,
  0.1158204562178072,
  0.034334103156274054,
  0.010169491525423728],
 'brevity_penalty': 0.9355069850316178,
 'length_ratio': 0.9375,
 'translation_length': 7095,
 'reference_length': 7568}