In [4]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import uuid
import time

torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

file_path = "eng_-french.csv"
df = pd.read_csv(file_path)
print(df.head())


def build_vocab(sentences, min_freq=2):
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    counter = Counter()
    for sentence in sentences:
        for word in sentence.lower().split():
            word = word.strip(".,!?;:()\"'")
            counter[word] += 1
    idx = len(vocab)
    for word, freq in counter.items():
        if freq >= min_freq and word not in vocab:
            vocab[word] = idx
            idx += 1
    return vocab


english_sentences = df['English words/sentences'].tolist()
french_sentences = df['French words/sentences'].tolist()
eng_vocab = build_vocab(english_sentences, min_freq=2)
fr_vocab = build_vocab(french_sentences, min_freq=2)
print(f"English vocab size: {len(eng_vocab)}")
print(f"French vocab size: {len(fr_vocab)}")


def tokenize_sentence(sentence, vocab, add_sos_eos=False, max_len=30):
    tokens = []
    if add_sos_eos:
        tokens.append(vocab["<sos>"])
    for word in sentence.lower().split():
        word = word.strip(".,!?;:()\"'")
        tokens.append(vocab.get(word, vocab["<unk>"]))
    if add_sos_eos:
        tokens.append(vocab["<eos>"])
    if len(tokens) > max_len:
        tokens = tokens[:max_len]
    else:
        tokens += [vocab["<pad>"]] * (max_len - len(tokens))
    return tokens


class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, src_vocab, tgt_vocab, max_len=30):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        src = self.source_sentences[idx]
        tgt = self.target_sentences[idx]
        src_tokens = tokenize_sentence(src, self.src_vocab, add_sos_eos=False, max_len=self.max_len)
        tgt_tokens = tokenize_sentence(tgt, self.tgt_vocab, add_sos_eos=True, max_len=self.max_len)
        return torch.tensor(src_tokens, dtype=torch.long), torch.tensor(tgt_tokens, dtype=torch.long)


train_src, val_src, train_tgt, val_tgt = train_test_split(
    english_sentences, french_sentences, test_size=0.2, random_state=42)

train_dataset = TranslationDataset(train_src, train_tgt, eng_vocab, fr_vocab, max_len=30)
val_dataset = TranslationDataset(val_src, val_tgt, eng_vocab, fr_vocab, max_len=30)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


Using device: cuda
  English words/sentences French words/sentences
0                     Hi.                 Salut!
1                    Run!                Cours !
2                    Run!               Courez !
3                    Who?                  Qui ?
4                    Wow!             Ça alors !
English vocab size: 10109
French vocab size: 18110


In [5]:

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=2, dropout=0.5, max_len=30):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(max_len, emb_dim)
        self.layer_norm = nn.LayerNorm(emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=n_layers,
                            dropout=dropout if n_layers > 1 else 0,
                            batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hid_dim * 2, hid_dim)
        self.dropout = nn.Dropout(dropout)
        self.max_len = max_len

    def forward(self, src):
        batch_size, seq_len = src.size()
        positions = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(src.device)
        embedded = self.embedding(src) + self.pos_embedding(positions)
        embedded = self.dropout(self.layer_norm(embedded))
        outputs, (hidden, cell) = self.lstm(embedded)
        # Combine bidirectional hidden states
        hidden_cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)  # (batch, hid_dim*2)
        hidden = torch.tanh(self.fc(hidden_cat))  # (batch, hid_dim)
        # Prepare hidden and cell for decoder with n_layers
        hidden = hidden.unsqueeze(0).repeat(self.lstm.num_layers, 1, 1)  # (n_layers, batch, hid_dim)
        cell_cat = torch.cat((cell[-2, :, :], cell[-1, :, :]), dim=1)
        cell = torch.tanh(self.fc(cell_cat)).unsqueeze(0).repeat(self.lstm.num_layers, 1, 1)
        return outputs, (hidden, cell)


class LuongAttention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 3, hid_dim)  # Adjusted for bidir encoder
        self.v = nn.Parameter(torch.rand(hid_dim))
        self.layer_norm = nn.LayerNorm(hid_dim)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        energy = self.layer_norm(energy)
        energy = energy.permute(0, 2, 1)
        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        attention = torch.bmm(v, energy).squeeze(1)
        return torch.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=2, dropout=0.5):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.layer_norm = nn.LayerNorm(emb_dim)
        # Input to LSTM will be emb + context vector (hid_dim * 2)
        self.lstm = nn.LSTM(emb_dim + hid_dim * 2, hid_dim, num_layers=n_layers,
                            dropout=dropout if n_layers > 1 else 0,
                            batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 3, output_dim)  # output, context concat
        self.attention = LuongAttention(hid_dim)
        self.dropout = nn.Dropout(dropout)
        # Learnable initial hidden and cell states
        self.init_hidden = nn.Parameter(torch.randn(n_layers, 1, hid_dim))
        self.init_cell = nn.Parameter(torch.randn(n_layers, 1, hid_dim))

    def forward(self, input, hidden, cell, encoder_outputs):
        input = input.unsqueeze(1)  # (batch, 1)
        embedded = self.dropout(self.layer_norm(self.embedding(input)))  # (batch, 1, emb_dim)

        attn_weights = self.attention(hidden[-1], encoder_outputs)  # (batch, src_len)
        attn_weights = attn_weights.unsqueeze(1)  # (batch, 1, src_len)

        context = torch.bmm(attn_weights, encoder_outputs)  # (batch, 1, hid_dim*2)

        lstm_input = torch.cat((embedded, context), dim=2)  # (batch, 1, emb_dim + hid_dim*2)

        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))  # output: (batch,1,hid_dim)

        output = output.squeeze(1)  # (batch, hid_dim)
        context = context.squeeze(1)  # (batch, hid_dim*2)

        output = self.fc_out(torch.cat((output, context), dim=1))  # (batch, output_dim)

        return output, hidden, cell, attn_weights.squeeze(1)


In [6]:

EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5
MAX_LEN = 30
INPUT_DIM = len(eng_vocab)
OUTPUT_DIM = len(fr_vocab)

encoder = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, max_len=MAX_LEN).to(device)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT).to(device)

encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    encoder_optimizer, mode='min', factor=0.5, patience=3)

criterion = nn.CrossEntropyLoss(ignore_index=fr_vocab["<pad>"])
pad_idx = fr_vocab["<pad>"]

In [8]:
def train_loop(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, device, pad_idx,
               teacher_forcing_ratio=0.5):
    encoder.train()
    decoder.train()
    total_loss, total_correct, total_tokens = 0, 0, 0

    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, (hidden, cell) = encoder(src)

        # Use learnable initial hidden + cell if needed (for first step, already set inside decoder)

        input_tok = tgt[:, 0]  # <sos>
        batch_size = src.shape[0]
        max_len = tgt.shape[1]
        outputs = torch.zeros(batch_size, max_len, OUTPUT_DIM).to(device)

        for t in range(1, max_len):
            output, hidden, cell, attn = decoder(input_tok, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            teacher_force = np.random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input_tok = tgt[:, t] if teacher_force else top1

        # flatten outputs and targets to calculate loss
        outputs = outputs[:, 1:].reshape(-1, OUTPUT_DIM)
        tgt = tgt[:, 1:].reshape(-1)
        loss = criterion(outputs, tgt)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1)

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item() * batch_size

        preds = outputs.argmax(1)
        non_pad = tgt != pad_idx
        total_correct += (preds[non_pad] == tgt[non_pad]).sum().item()
        total_tokens += non_pad.sum().item()

    avg_loss = total_loss / total_tokens
    accuracy = total_correct / total_tokens
    return avg_loss, accuracy


def evaluate(dataloader, encoder, decoder, criterion, device, pad_idx):
    encoder.eval()
    decoder.eval()
    total_loss, total_correct, total_tokens = 0, 0, 0

    with torch.no_grad():
        for src, tgt in dataloader:
            src, tgt = src.to(device), tgt.to(device)
            encoder_outputs, (hidden, cell) = encoder(src)

            input_tok = tgt[:, 0]
            batch_size = src.shape[0]
            max_len = tgt.shape[1]
            outputs = torch.zeros(batch_size, max_len, OUTPUT_DIM).to(device)

            for t in range(1, max_len):
                output, hidden, cell, attn = decoder(input_tok, hidden, cell, encoder_outputs)
                outputs[:, t] = output
                top1 = output.argmax(1)
                input_tok = top1

            outputs = outputs[:, 1:].reshape(-1, OUTPUT_DIM)
            tgt = tgt[:, 1:].reshape(-1)
            loss = criterion(outputs, tgt)

            total_loss += loss.item() * batch_size

            preds = outputs.argmax(1)
            non_pad = tgt != pad_idx
            total_correct += (preds[non_pad] == tgt[non_pad]).sum().item()
            total_tokens += non_pad.sum().item()

    avg_loss = total_loss / total_tokens
    accuracy = total_correct / total_tokens
    return avg_loss, accuracy


EPOCHS = 10

for epoch in range(1, EPOCHS + 1):
    start_time = time.time()
    train_loss, train_acc = train_loop(train_loader, encoder, decoder,
                                       encoder_optimizer, decoder_optimizer, criterion, device, pad_idx)
    val_loss, val_acc = evaluate(val_loader, encoder, decoder, criterion, device, pad_idx)
    scheduler.step(val_loss)
    end_time = time.time()

    print(
        f"Epoch {epoch} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Time: {end_time - start_time:.1f}s")


Epoch 1 | Train Loss: 0.5357 | Train Acc: 0.3457 | Val Loss: 0.4305 | Val Acc: 0.4139 | Time: 1130.6s
Epoch 2 | Train Loss: 0.3378 | Train Acc: 0.4967 | Val Loss: 0.3736 | Val Acc: 0.4633 | Time: 1142.2s
Epoch 3 | Train Loss: 0.2722 | Train Acc: 0.5518 | Val Loss: 0.3501 | Val Acc: 0.4891 | Time: 1140.5s
Epoch 4 | Train Loss: 0.2350 | Train Acc: 0.5892 | Val Loss: 0.3421 | Val Acc: 0.5051 | Time: 1143.0s
Epoch 5 | Train Loss: 0.2131 | Train Acc: 0.6134 | Val Loss: 0.3399 | Val Acc: 0.5129 | Time: 1141.8s
Epoch 6 | Train Loss: 0.1970 | Train Acc: 0.6337 | Val Loss: 0.3411 | Val Acc: 0.5165 | Time: 1149.4s
Epoch 7 | Train Loss: 0.1826 | Train Acc: 0.6532 | Val Loss: 0.3389 | Val Acc: 0.5227 | Time: 1145.6s
Epoch 8 | Train Loss: 0.1722 | Train Acc: 0.6668 | Val Loss: 0.3367 | Val Acc: 0.5278 | Time: 1149.1s
Epoch 9 | Train Loss: 0.1628 | Train Acc: 0.6810 | Val Loss: 0.3408 | Val Acc: 0.5290 | Time: 1170.7s
Epoch 10 | Train Loss: 0.1550 | Train Acc: 0.6926 | Val Loss: 0.3364 | Val Acc: 0.

In [None]:
MODEL_PATH = "eng_french_translation.pth"

torch.save({
    'encoder_state_dict': encoder.state_dict(),
    'decoder_state_dict': decoder.state_dict(),
    'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
    'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
    'eng_vocab': eng_vocab,
    'fr_vocab': fr_vocab,
}, MODEL_PATH)

print("Model saved.")

In [7]:
# Make sure model is initialized with the same architecture
encoder = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT, max_len=MAX_LEN).to(device)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT).to(device)

encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=1e-3)

checkpoint = torch.load("eng_french_translation.pth", map_location=device)

encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])
encoder_optimizer.load_state_dict(checkpoint['encoder_optimizer_state_dict'])
decoder_optimizer.load_state_dict(checkpoint['decoder_optimizer_state_dict'])

eng_vocab = checkpoint['eng_vocab']
fr_vocab = checkpoint['fr_vocab']

encoder.eval()
decoder.eval()

print("Model loaded.")


Model loaded.


In [9]:
def translate_sentence(sentence, encoder, decoder, eng_vocab, fr_vocab, device, max_len=30):
    encoder.eval()
    decoder.eval()

    tokens = tokenize_sentence(sentence, eng_vocab, add_sos_eos=False, max_len=max_len)
    src_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        encoder_outputs, (hidden, cell) = encoder(src_tensor)

    input_tok = torch.tensor([fr_vocab["<sos>"]], dtype=torch.long).to(device)
    output_sentence = []

    for _ in range(max_len):
        with torch.no_grad():
            output, hidden, cell, _ = decoder(input_tok, hidden, cell, encoder_outputs)
        top1 = output.argmax(1).item()

        if top1 == fr_vocab["<eos>"]:
            break

        output_sentence.append(top1)
        input_tok = torch.tensor([top1], dtype=torch.long).to(device)

    # Reverse vocab
    idx2word = {idx: word for word, idx in fr_vocab.items()}
    translated = [idx2word.get(idx, "<unk>") for idx in output_sentence]
    return ' '.join(translated)

# Example usage:
print(translate_sentence("I am going to school", encoder, decoder, eng_vocab, fr_vocab, device))


je vais à l'école
