In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

from sklearn.model_selection import train_test_split
from tqdm import tqdm
import math

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

if tokenizer.bos_token is None:
    tokenizer.bos_token = '<bos>'
    tokenizer.bos_token_id = tokenizer.convert_tokens_to_ids('<bos>')

if tokenizer.eos_token is None:
    tokenizer.eos_token = '<eos>'
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids('<eos>')

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        
        output = self.W_o(self.combine_heads(attn_output))
        return output

class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.d_model = d_model  
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)
        
        tgt_sub_mask = torch.tril(torch.ones((tgt.size(1), tgt.size(1)), device=tgt.device)).bool()
        tgt_mask = tgt_mask & tgt_sub_mask.unsqueeze(0).unsqueeze(0)
        
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        
        src = self.encoder_embedding(src) * math.sqrt(self.d_model)
        src = self.dropout(self.positional_encoding(src))
        
        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        
        tgt = self.decoder_embedding(tgt) * math.sqrt(self.d_model)
        tgt = self.dropout(self.positional_encoding(tgt))
        
        for layer in self.decoder_layers:
            tgt = layer(tgt, src, src_mask, tgt_mask)
        
        output = self.fc(tgt)
        return output




config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [2]:
class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts, tokenizer, max_length=64):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src_text = self.src_texts[idx]
        tgt_text = self.tgt_texts[idx]

        src_encoding = self.tokenizer.encode_plus(src_text, return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)
        tgt_encoding = self.tokenizer.encode_plus(tgt_text, return_tensors="pt", padding="max_length", max_length=self.max_length, truncation=True)
        
        src_input_ids = src_encoding['input_ids'].squeeze()
        tgt_input_ids = tgt_encoding['input_ids'].squeeze()

        return src_input_ids, tgt_input_ids

def collate_fn(batch):
    src_batch = [item[0] for item in batch]
    tgt_batch = [item[1] for item in batch]
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    return src_batch, tgt_batch

def loadData(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return [line.strip() for line in lines]

vi_sentences = loadData("/kaggle/input/datasettrans/train.vi")
en_sentences = loadData("/kaggle/input/datasettrans/train.en")

vi_train, vi_val, en_train, en_val = train_test_split(vi_sentences, en_sentences, test_size=0.1, random_state=42)

# Create datasets and dataloaders
train_dataset = TranslationDataset(en_train, vi_train, tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128, collate_fn=collate_fn)

val_dataset = TranslationDataset(en_val, vi_val, tokenizer)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=128, collate_fn=collate_fn)

In [3]:
train_dataset.__getitem__(4)

(tensor([    0,   870,  5353, 60824, 12188, 19530, 30498,  2292,  4177,  1659,
          4975, 26748,  5474,  4339,  2049,     4,   246, 13315,  6333, 26431,
             5,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1]),
 tensor([    0,   870,  5353, 60824,    74,    50,    10,  2151,  2856,    52,
           229,    40,    77,    70,    14,   437, 13315,     5,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             

In [4]:
train_dataset.src_texts[4]

'Sri Lanka still had the civil war going on , so Bali it was .'

In [5]:
# Initialize model
src_vocab_size = tokenizer.vocab_size
tgt_vocab_size = tokenizer.vocab_size
d_model = 512
num_heads = 8
num_layers = 4
d_ff = 2048
max_seq_length = 64
dropout = 0.1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
epochs = 20
for epoch in range(epochs):
    total_train_loss = 0
    model.train()
    for src_batch, tgt_batch in tqdm(train_dataloader):
        src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
        optimizer.zero_grad()
        
        output = model(src_batch, tgt_batch[:, :-1])
        output = output.transpose(1, 2)
        loss = criterion(output, tgt_batch[:, 1:])
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss}")
    
    total_val_loss = 0
    model.eval()
    with torch.no_grad():
        for src_batch, tgt_batch in tqdm(val_dataloader):
            src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)
            
            output = model(src_batch, tgt_batch[:, :-1])
            output = output.transpose(1, 2)
            val_loss = criterion(output, tgt_batch[:, 1:])
            total_val_loss += val_loss.item()
    
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}, Eval Loss: {avg_val_loss}")
    

    
PATH = '/kaggle/working/trans.pth'
torch.save(model.state_dict(), PATH)

cuda


100%|██████████| 938/938 [25:17<00:00,  1.62s/it]


Epoch 1, Train Loss: 5.056412428935199


100%|██████████| 105/105 [00:49<00:00,  2.14it/s]


Epoch 1, Eval Loss: 4.195433269228254


100%|██████████| 938/938 [25:13<00:00,  1.61s/it]


Epoch 2, Train Loss: 3.9797180959664935


100%|██████████| 105/105 [00:48<00:00,  2.15it/s]


Epoch 2, Eval Loss: 3.7522595927828832


100%|██████████| 938/938 [25:14<00:00,  1.61s/it]


Epoch 3, Train Loss: 3.607441318823076


100%|██████████| 105/105 [00:48<00:00,  2.14it/s]


Epoch 3, Eval Loss: 3.508116168067569


100%|██████████| 938/938 [25:14<00:00,  1.61s/it]


Epoch 4, Train Loss: 3.3598559327217052


100%|██████████| 105/105 [00:48<00:00,  2.15it/s]


Epoch 4, Eval Loss: 3.3258542106265114


100%|██████████| 938/938 [25:15<00:00,  1.62s/it]


Epoch 5, Train Loss: 3.1736074802972105


100%|██████████| 105/105 [00:49<00:00,  2.14it/s]


Epoch 5, Eval Loss: 3.206379699707031


100%|██████████| 938/938 [25:17<00:00,  1.62s/it]


Epoch 6, Train Loss: 3.0203761909562132


100%|██████████| 105/105 [00:49<00:00,  2.14it/s]


Epoch 6, Eval Loss: 3.120989415759132


100%|██████████| 938/938 [25:15<00:00,  1.62s/it]


Epoch 7, Train Loss: 2.889595826551604


100%|██████████| 105/105 [00:49<00:00,  2.14it/s]


Epoch 7, Eval Loss: 3.058172180539086


100%|██████████| 938/938 [25:15<00:00,  1.62s/it]


Epoch 8, Train Loss: 2.774248494522404


100%|██████████| 105/105 [00:48<00:00,  2.14it/s]


Epoch 8, Eval Loss: 3.004350653148833


100%|██████████| 938/938 [25:16<00:00,  1.62s/it]


Epoch 9, Train Loss: 2.670287130229763


100%|██████████| 105/105 [00:48<00:00,  2.14it/s]


Epoch 9, Eval Loss: 2.9662366253989085


100%|██████████| 938/938 [25:15<00:00,  1.62s/it]


Epoch 10, Train Loss: 2.5768876637477103


100%|██████████| 105/105 [00:49<00:00,  2.12it/s]


Epoch 10, Eval Loss: 2.9382364363897415


100%|██████████| 938/938 [25:16<00:00,  1.62s/it]


Epoch 11, Train Loss: 2.488025595130188


100%|██████████| 105/105 [00:49<00:00,  2.14it/s]


Epoch 11, Eval Loss: 2.9216610613323395


100%|██████████| 938/938 [25:14<00:00,  1.61s/it]


Epoch 12, Train Loss: 2.4068735685429847


100%|██████████| 105/105 [00:48<00:00,  2.15it/s]


Epoch 12, Eval Loss: 2.9074859800792874


100%|██████████| 938/938 [25:15<00:00,  1.62s/it]


Epoch 13, Train Loss: 2.329885006459283


100%|██████████| 105/105 [00:48<00:00,  2.14it/s]


Epoch 13, Eval Loss: 2.90221327600025


100%|██████████| 938/938 [25:14<00:00,  1.61s/it]


Epoch 14, Train Loss: 2.256668046339234


100%|██████████| 105/105 [00:49<00:00,  2.13it/s]


Epoch 14, Eval Loss: 2.8965962114788235


100%|██████████| 938/938 [25:15<00:00,  1.62s/it]


Epoch 15, Train Loss: 2.1885699656472277


100%|██████████| 105/105 [00:49<00:00,  2.13it/s]


Epoch 15, Eval Loss: 2.901572983605521


100%|██████████| 938/938 [25:14<00:00,  1.61s/it]


Epoch 16, Train Loss: 2.1232171511090896


100%|██████████| 105/105 [00:48<00:00,  2.14it/s]


Epoch 16, Eval Loss: 2.906813739594959


100%|██████████| 938/938 [25:15<00:00,  1.62s/it]


Epoch 17, Train Loss: 2.0609925377851863


100%|██████████| 105/105 [00:49<00:00,  2.13it/s]


Epoch 17, Eval Loss: 2.9167130084264845


100%|██████████| 938/938 [25:16<00:00,  1.62s/it]


Epoch 18, Train Loss: 2.00273276976685


100%|██████████| 105/105 [00:49<00:00,  2.13it/s]


Epoch 18, Eval Loss: 2.92745737348284


100%|██████████| 938/938 [25:16<00:00,  1.62s/it]


Epoch 19, Train Loss: 1.945607002228816


100%|██████████| 105/105 [00:49<00:00,  2.13it/s]


Epoch 19, Eval Loss: 2.945558720543271


100%|██████████| 938/938 [25:16<00:00,  1.62s/it]


Epoch 20, Train Loss: 1.8910626089140805


100%|██████████| 105/105 [00:49<00:00,  2.14it/s]


Epoch 20, Eval Loss: 2.9649926798684256


In [6]:
# Translation function
def translate_sentence(model, sentence, tokenizer, max_length=64):
    model.eval()
    with torch.no_grad():
        encoded_input = tokenizer.encode_plus(sentence, return_tensors="pt", padding="max_length", max_length=max_length, truncation=True)
        src_tensor = encoded_input['input_ids'].to(device)
        
        tgt_tokens = [tokenizer.bos_token_id]  # Use BOS token instead of CLS token
        for i in range(max_length):
            tgt_tensor = torch.tensor([tgt_tokens], dtype=torch.long).to(device)
            output = model(src_tensor, tgt_tensor)
            output_token = output[0, -1, :].argmax(dim=-1).item()
            if output_token == tokenizer.eos_token_id:
                break
            tgt_tokens.append(output_token)
        
        translation = tokenizer.decode(tgt_tokens, skip_special_tokens=True)
        return translation

# Test translation
test_sentences = ["Hello friend", "How are you?", "I am fine, thank you."]
for sentence in test_sentences:
    translation = translate_sentence(model, sentence, tokenizer)
    print(f"Input: {sentence}")
    print(f"Translation: {translation}")

2024-06-16 23:49:28.715361: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-16 23:49:28.715498: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-16 23:49:28.844947: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Input: Hello friend
Translation: Xin chào bạn
Input: How are you?
Translation: trẻ tuổi nào?
Input: I am fine, thank you.
Translation: Tôi là trẻ nhỏ, cảm ơn.
