In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

data = pd.read_csv('en-fr.csv')
data = data[data['source_language'].isin(['en'])]

train_data, val_data = train_test_split(data, test_size=0.2)


In [2]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# Initialize tokenizer for English
tokenizer_en = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace()
trainer_en = trainers.WordPieceTrainer(vocab_size=6000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
tokenizer_en.train_from_iterator(train_data['source'].tolist(), trainer_en)

# Initialize tokenizer for French
tokenizer_fr = Tokenizer(models.WordPiece(unk_token="[UNK]"))
tokenizer_fr.pre_tokenizer = pre_tokenizers.Whitespace()
trainer_fr = trainers.WordPieceTrainer(vocab_size=6000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
tokenizer_fr.train_from_iterator(train_data['reference'].tolist(), trainer_fr)

# Save the tokenizers
tokenizer_en.save("tokenizer_en.json")
tokenizer_fr.save("tokenizer_fr.json")





In [3]:
def encode_and_pad(tokenizer, texts, max_len):
    # Encode texts
    encoded_texts = [tokenizer.encode(text).ids for text in texts]
    
    # Pad encoded texts
    padded_texts = [seq + [tokenizer.token_to_id("[PAD]")] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in encoded_texts]
    return padded_texts

max_length = 50  # Define the maximum sequence length
train_data['source_encoded'] = encode_and_pad(tokenizer_en, train_data['source'], max_length)
train_data['target_encoded'] = encode_and_pad(tokenizer_fr, train_data['reference'], max_length)
val_data['source_encoded'] = encode_and_pad(tokenizer_en, val_data['source'], max_length)
val_data['target_encoded'] = encode_and_pad(tokenizer_fr, val_data['reference'], max_length)


In [4]:
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, src_data, tgt_data):
        self.src_data = src_data
        self.tgt_data = tgt_data

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, index):
        src = torch.tensor(self.src_data[index])
        tgt = torch.tensor(self.tgt_data[index])
        return src, tgt

train_dataset = TranslationDataset(train_data['source_encoded'].tolist(), train_data['target_encoded'].tolist())
val_dataset = TranslationDataset(val_data['source_encoded'].tolist(), val_data['target_encoded'].tolist())

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


In [5]:
import torch.nn as nn
import torch.optim as optim

class Seq2SeqGRU(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size, hidden_size, num_layers):
        super(Seq2SeqGRU, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, embed_size)
        self.encoder = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.decoder = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.out = nn.Linear(hidden_size, tgt_vocab_size)

    def forward(self, src, tgt):
        embedded_src = self.encoder_embedding(src)
        embedded_tgt = self.decoder_embedding(tgt)
        _, hidden = self.encoder(embedded_src)
        output, _ = self.decoder(embedded_tgt, hidden)
        return self.out(output)
    
    
src_vocab_size = 6000
tgt_vocab_size = 6000
embed_dim = 128  # Size of the embeddings
hidden_dim = 256
n_layers = 3

model = Seq2SeqGRU(src_vocab_size, tgt_vocab_size, embed_dim, hidden_dim, n_layers)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer_fr.token_to_id("[PAD]"))
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [70]:
def train_model(model, train_loader, val_loader, criterion, optimizer, n_epochs=10):
    for epoch in range(n_epochs):
        model.train()
        train_loss = 0
        for src, tgt in train_loader:
            src, tgt = src.long(), tgt.long()  # Ensure inputs are in long format for embeddings
            optimizer.zero_grad()
            outputs = model(src, tgt[:, :-1])
            loss = criterion(outputs.view(-1, tgt_vocab_size), tgt[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for src, tgt in val_loader:
                # print(src.shape, tgt.shape)
                src, tgt = src.long(), tgt.long()  # Ensure inputs are in long format for embeddings
                outputs = model(src, tgt[:, :-1])
                loss = criterion(outputs.view(-1, tgt_vocab_size), tgt[:, 1:].reshape(-1))
                val_loss += loss.item()

        print(f'Epoch {epoch + 1}, Train Loss: {train_loss / len(train_loader)}, Val Loss: {val_loss / len(val_loader)}')

train_model(model, train_loader, val_loader, criterion, optimizer, n_epochs=10)


Epoch 1, Train Loss: 5.669390930175782, Val Loss: 5.401816219091415
Epoch 2, Train Loss: 5.042625526428223, Val Loss: 4.804058834910393
Epoch 3, Train Loss: 4.331138456344605, Val Loss: 4.089646831154823
Epoch 4, Train Loss: 3.559778356552124, Val Loss: 3.392179347574711
Epoch 5, Train Loss: 2.8656258583068848, Val Loss: 2.880200032144785
Epoch 6, Train Loss: 2.3665108690261842, Val Loss: 2.5587290078401566
Epoch 7, Train Loss: 1.9911110172271729, Val Loss: 2.34603931568563
Epoch 8, Train Loss: 1.702771348953247, Val Loss: 2.1865750085562468
Epoch 9, Train Loss: 1.4552901000976564, Val Loss: 2.0931572504341602
Epoch 10, Train Loss: 1.2492290813922882, Val Loss: 2.024342691525817
