In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import uuid
torch.manual_seed(42)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
# Replace with your actual file path
file_path = "eng_-french.csv"
df = pd.read_csv(file_path)
print(df.head())

Using device: cuda
  English words/sentences French words/sentences
0                     Hi.                 Salut!
1                    Run!                Cours !
2                    Run!               Courez !
3                    Who?                  Qui ?
4                    Wow!             Ça alors !


In [2]:
def build_vocab(sentences, min_freq=2):
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    counter = Counter()

    for sentence in sentences:
        for word in sentence.lower().split():
            word = word.strip(".,!?;:()\"'")
            counter[word] += 1

    idx = len(vocab)
    for word, freq in counter.items():
        if freq >= min_freq and word not in vocab:
            vocab[word] = idx
            idx += 1
    return vocab


In [3]:
english_sentences = df['English words/sentences'].tolist()
french_sentences = df['French words/sentences'].tolist()
eng_vocab = build_vocab(english_sentences, min_freq=2)
fr_vocab = build_vocab(french_sentences, min_freq=2)
print(f"English vocab size: {len(eng_vocab)}")
print(f"French vocab size: {len(fr_vocab)}")

def tokenize_sentence(sentence, vocab, add_sos_eos=False, max_len=30):
    tokens = []
    if add_sos_eos:
        tokens.append(vocab["<sos>"])

    for word in sentence.lower().split():
        word = word.strip(".,!?;:()\"'")
        tokens.append(vocab.get(word, vocab["<unk>"]))

    if add_sos_eos:
        tokens.append(vocab["<eos>"])

    if len(tokens) > max_len:
        tokens = tokens[:max_len]
    else:
        tokens += [vocab["<pad>"]] * (max_len - len(tokens))

    return tokens

# Test tokenization
print(tokenize_sentence("I love this movie.", eng_vocab))
print(tokenize_sentence("J'aime ce film.", fr_vocab, add_sos_eos=True))


English vocab size: 10109
French vocab size: 18110
[16, 269, 120, 2249, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 537, 723, 4304, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [4]:
class TranslationDataset(Dataset):
    def __init__(self, source_sentences, target_sentences, src_vocab, tgt_vocab, max_len=30):
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, idx):
        src = self.source_sentences[idx]
        tgt = self.target_sentences[idx]
        src_tokens = tokenize_sentence(src, self.src_vocab, add_sos_eos=False, max_len=self.max_len)
        tgt_tokens = tokenize_sentence(tgt, self.tgt_vocab, add_sos_eos=True, max_len=self.max_len)
        return torch.tensor(src_tokens, dtype=torch.long), torch.tensor(tgt_tokens, dtype=torch.long)


In [5]:
train_src, val_src, train_tgt, val_tgt = train_test_split(
    english_sentences, french_sentences, test_size=0.2, random_state=42)

train_dataset = TranslationDataset(train_src, train_tgt, eng_vocab, fr_vocab, max_len=30)
val_dataset = TranslationDataset(val_src, val_tgt, eng_vocab, fr_vocab, max_len=30)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

In [6]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers=2, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.layer_norm = nn.LayerNorm(emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, num_layers=n_layers,
                         dropout=dropout if n_layers > 1 else 0,
                         batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hid_dim * 2, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.layer_norm(self.embedding(src)))
        outputs, hidden = self.gru(embedded)

        # Combine bidirectional hidden states
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)))
        hidden = hidden.unsqueeze(0).repeat(self.gru.num_layers, 1, 1)
        return outputs, hidden

class LuongAttention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 3, hid_dim)  # Adjusted for bidirectional encoder
        self.v = nn.Parameter(torch.rand(hid_dim))
        self.layer_norm = nn.LayerNorm(hid_dim)

    def forward(self, hidden, encoder_outputs):
        batch_size = encoder_outputs.shape[0]
        src_len = encoder_outputs.shape[1]

        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        energy = self.layer_norm(energy)
        energy = energy.permute(0, 2, 1)

        v = self.v.repeat(batch_size, 1).unsqueeze(1)
        attention = torch.bmm(v, energy).squeeze(1)
        return torch.softmax(attention, dim=1)

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=2, dropout=0.5):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.layer_norm = nn.LayerNorm(emb_dim)
        # Adjusted input dimension for GRU to account for bidirectional encoder
        self.gru = nn.GRU(emb_dim + hid_dim * 2, hid_dim, num_layers=n_layers,
                         dropout=dropout if n_layers > 1 else 0, batch_first=True)
        self.fc_out = nn.Linear(hid_dim * 3, output_dim)  # Adjusted for context
        self.attention = LuongAttention(hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.layer_norm(self.embedding(input)))
        attn_weights = self.attention(hidden[-1], encoder_outputs)
        attn_weights = attn_weights.unsqueeze(1)

        context = torch.bmm(attn_weights, encoder_outputs)  # context: (batch_size, 1, hid_dim * 2)
        gru_input = torch.cat((embedded, context), dim=2)  # gru_input: (batch_size, 1, emb_dim + hid_dim * 2)

        output, hidden = self.gru(gru_input, hidden)
        output = output.squeeze(1)
        context = context.squeeze(1)

        output = self.fc_out(torch.cat((output, context), dim=1))
        return output, hidden, attn_weights.squeeze(1)

In [7]:
EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5
INPUT_DIM = len(eng_vocab)
OUTPUT_DIM = len(fr_vocab)

encoder = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT).to(device)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, N_LAYERS, DROPOUT).to(device)

# Optimizers and scheduler
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=1e-3)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    encoder_optimizer, mode='min', factor=0.5, patience=3)

criterion = nn.CrossEntropyLoss(ignore_index=fr_vocab["<pad>"])
pad_idx = fr_vocab["<pad>"]

In [8]:
def train_loop(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, device, pad_idx, teacher_forcing_ratio=0.5):
    encoder.train()
    decoder.train()
    total_loss, total_correct, total_tokens = 0, 0, 0

    for src_tokens, tgt_tokens in dataloader:
        src_tokens, tgt_tokens = src_tokens.to(device), tgt_tokens.to(device)
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, hidden = encoder(src_tokens)
        batch_size = src_tokens.size(0)
        tgt_len = tgt_tokens.size(1)
        output_dim = decoder.output_dim

        input = tgt_tokens[:, 0]
        outputs = torch.zeros(batch_size, tgt_len - 1, output_dim).to(device)

        for t in range(1, tgt_len):
            output, hidden, _ = decoder(input, hidden, encoder_outputs)
            outputs[:, t-1, :] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt_tokens[:, t] if teacher_force else top1

        output_flat = outputs.reshape(-1, output_dim)
        target_flat = tgt_tokens[:, 1:].reshape(-1)

        loss = criterion(output_flat, target_flat)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(encoder.parameters(), max_norm=1)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), max_norm=1)

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()
        preds = output_flat.argmax(dim=1)
        mask = target_flat != pad_idx
        total_correct += ((preds == target_flat) & mask).sum().item()
        total_tokens += mask.sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0
    return avg_loss, accuracy

In [9]:
@torch.no_grad()
def evaluate_loop(dataloader, encoder, decoder, criterion, device, pad_idx):
    encoder.eval()
    decoder.eval()
    total_loss, total_correct, total_tokens = 0, 0, 0

    for src_tokens, tgt_tokens in dataloader:
        src_tokens, tgt_tokens = src_tokens.to(device), tgt_tokens.to(device)
        encoder_outputs, hidden = encoder(src_tokens)
        batch_size = src_tokens.size(0)
        tgt_len = tgt_tokens.size(1)
        output_dim = decoder.output_dim

        input = tgt_tokens[:, 0]
        outputs = torch.zeros(batch_size, tgt_len - 1, output_dim).to(device)

        for t in range(1, tgt_len):
            output, hidden, _ = decoder(input, hidden, encoder_outputs)
            outputs[:, t-1, :] = output
            top1 = output.argmax(1)
            input = top1

        output_flat = outputs.reshape(-1, output_dim)
        target_flat = tgt_tokens[:, 1:].reshape(-1)

        loss = criterion(output_flat, target_flat)
        total_loss += loss.item()
        preds = output_flat.argmax(dim=1)
        mask = target_flat != pad_idx
        total_correct += ((preds == target_flat) & mask).sum().item()
        total_tokens += mask.sum().item()

    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_tokens if total_tokens > 0 else 0
    return avg_loss, accuracy

@torch.no_grad()
def translate_sentence(sentence, src_vocab, tgt_vocab, encoder, decoder, device, max_len=30):
    encoder.eval()
    decoder.eval()

    tokens = tokenize_sentence(sentence, src_vocab, add_sos_eos=False, max_len=max_len)
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)

    encoder_outputs, hidden = encoder(src_tensor)

    input_token = tgt_vocab["<sos>"]
    output_tokens = []

    for _ in range(max_len):
        input_tensor = torch.LongTensor([input_token]).to(device)
        output, hidden, _ = decoder(input_tensor, hidden, encoder_outputs)
        pred_token = output.argmax(1).item()

        if pred_token == tgt_vocab["<eos>"]:
            break

        output_tokens.append(pred_token)
        input_token = pred_token

    # Convert tokens to words
    inv_tgt_vocab = {v: k for k, v in tgt_vocab.items()}
    translated = [inv_tgt_vocab.get(token, "<unk>") for token in output_tokens]
    return " ".join(translated)

# Training loop with early stopping
num_epochs = 50
best_val_loss = float('inf')
patience = 5
patience_counter = 0
best_model_state = None

for epoch in range(num_epochs):
    train_loss, train_acc = train_loop(train_loader, encoder, decoder,
                                     encoder_optimizer, decoder_optimizer,
                                     criterion, device, pad_idx)
    val_loss, val_acc = evaluate_loop(val_loader, encoder, decoder,
                                    criterion, device, pad_idx)

    scheduler.step(val_loss)

    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"  Val   Loss: {val_loss:.4f} | Val   Acc: {val_acc:.4f}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        best_model_state = {
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
        }
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs")
            break


Epoch 1/50
  Train Loss: 3.5468 | Train Acc: 0.3930
  Val   Loss: 3.2092 | Val   Acc: 0.3971
Epoch 2/50
  Train Loss: 2.5336 | Train Acc: 0.4828
  Val   Loss: 3.1427 | Val   Acc: 0.4169
Epoch 3/50
  Train Loss: 2.3018 | Train Acc: 0.5054
  Val   Loss: 3.1874 | Val   Acc: 0.4214
Epoch 4/50
  Train Loss: 2.1901 | Train Acc: 0.5203
  Val   Loss: 3.2353 | Val   Acc: 0.4251
Epoch 5/50
  Train Loss: 2.1462 | Train Acc: 0.5249
  Val   Loss: 3.2252 | Val   Acc: 0.4318
Epoch 6/50
  Train Loss: 2.1152 | Train Acc: 0.5294
  Val   Loss: 3.3864 | Val   Acc: 0.4166
Epoch 7/50
  Train Loss: 2.0406 | Train Acc: 0.5418
  Val   Loss: 3.4043 | Val   Acc: 0.4268
Early stopping triggered after 7 epochs


In [27]:
test_sentence = "I am going to school"
translated = translate_sentence(test_sentence, eng_vocab, fr_vocab, encoder, decoder, device)
print(f"Input: {test_sentence}")
print(f"Translated: {translated}")

Input: I am going to school
Translated: je vais l'école l'école


In [17]:
if best_model_state:
    encoder.load_state_dict(best_model_state['encoder'])
    decoder.load_state_dict(best_model_state['decoder'])

    # Save model state dictionaries
    torch.save({
        'encoder_state_dict': encoder.state_dict(),
        'decoder_state_dict': decoder.state_dict(),
        'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
        'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
        'best_val_loss': best_val_loss,
        'epoch': epoch + 1
    }, 'translation_model.pth')

    # Save vocabularies
    import pickle
    with open('eng_vocab.pkl', 'wb') as f:
        pickle.dump(eng_vocab, f)
    with open('fr_vocab.pkl', 'wb') as f:
        pickle.dump(fr_vocab, f)
    print("Model and vocabularies saved successfully.")

# Test translation


# Optional: Code to load the model and vocabularies later
def load_model_and_vocabs(model_path, eng_vocab_path, fr_vocab_path, device):
    # Load vocabularies
    with open(eng_vocab_path, 'rb') as f:
        eng_vocab = pickle.load(f)
    with open(fr_vocab_path, 'rb') as f:
        fr_vocab = pickle.load(f)

    # Initialize model with same architecture
    encoder = Encoder(len(eng_vocab), EMB_DIM, HID_DIM, N_LAYERS, DROPOUT).to(device)
    decoder = Decoder(len(fr_vocab), EMB_DIM, HID_DIM, N_LAYERS, DROPOUT).to(device)

    # Load model state dictionaries
    checkpoint = torch.load(model_path, map_location=device)
    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])

    return encoder, decoder, eng_vocab, fr_vocab

Model and vocabularies saved successfully.
Input: What is your name
Translated: comment s'appelle ton nom 
