# Translation with Sequence -> Sequence Model

In this task, we will use a Seq2Seq model for machine translation.

#### Import relevant libraries/packages

In [None]:
import os
import time

import sentencepiece as spm
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset


In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA Available:", torch.cuda.is_available())
# torch.cuda.get_device_name() raises on machines without CUDA, so it is only
# queried when a GPU was actually selected above.
print("Using device:", torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu")


#### Load data into train, val, and test splits

In [None]:
def load_data(src_path, tgt_path, num_samples=None):
    """Read two line-aligned UTF-8 text files into a list of (src, tgt) string pairs.

    Args:
        src_path: path of the source-language file, one sentence per line.
        tgt_path: path of the target-language file, one sentence per line.
        num_samples: if given, only the first `num_samples` kept pairs are returned.

    Returns:
        A list of `(src_str, tgt_str)` tuples with surrounding whitespace stripped.
        A pair is dropped when either side is empty after stripping.

    Raises:
        AssertionError: if the two files do not have the same number of lines.
    """
    with open(src_path, encoding='utf-8') as source_file:
        with open(tgt_path, encoding='utf-8') as target_file:
            source_lines = source_file.readlines()
            target_lines = target_file.readlines()

    assert len(source_lines) == len(target_lines), "Source and target files must have the same number of lines."

    pairs = []
    for raw_src, raw_tgt in zip(source_lines, target_lines):
        src_text, tgt_text = raw_src.strip(), raw_tgt.strip()
        # Keep the pair only when both sides still contain text.
        if src_text and tgt_text:
            pairs.append((src_text, tgt_text))

    if num_samples is None:
        return pairs
    return pairs[:num_samples]

# Read the `.en`/`.fr` training files into (src_str, tgt_str) pairs, hold out
# 10% of them for validation (random_state is fixed so the split is repeatable),
# then read the separate test files the same way.
all_train_pairs = load_data("./train.en", "./train.fr")
train_data, val_data = train_test_split(all_train_pairs, test_size=0.1, random_state=42)
test_data = load_data("./test.en", "./test.fr")


#### Using SentencePiece for subword (BPE) tokenization

In [None]:
# Train the joint BPE model only when no trained model file exists yet.
# NOTE(review): the tokenizer is trained on files/JRC-Acquis.* while the data cell
# reads ./train.en and ./train.fr -- confirm both refer to the same corpus.
if not os.path.exists("./spm_joint.model"):
    spm.SentencePieceTrainer.Train(
        input="files/JRC-Acquis.English-French.en,files/JRC-Acquis.French-English.fr",
        model_prefix="spm_joint",
        model_type="bpe",
        vocab_size=16000,
        character_coverage=1.0,
        # Fixed ids for the special tokens.
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3,
    )


In [None]:
# Load the trained SentencePiece model from disk.
sp = spm.SentencePieceProcessor()
sp.Load("./spm_joint.model")

# Cache the vocabulary size and the special-token ids reported by the loaded model
# (pad=0, bos=1, eos=2, unk=3 with the training settings used in this notebook).
VOCAB_SIZE = sp.GetPieceSize()
PAD_ID, BOS_ID, EOS_ID, UNK_ID = sp.pad_id(), sp.bos_id(), sp.eos_id(), sp.unk_id()


#### Showing how the tokenizing step is currently working

In [None]:
# Tokenize the third training pair as pieces and as ids, then decode the ids
# back to text to show the round trip.
sample_en, sample_fr = train_data[2]

src_tokens_en = sp.EncodeAsPieces(sample_en, add_bos=True, add_eos=True)
src_tokens_fr = sp.EncodeAsPieces(sample_fr, add_bos=True, add_eos=True)
# EncodeAsIds adds no BOS/EOS here, so they are wrapped around the ids by hand.
src_ids_en = [BOS_ID, *sp.EncodeAsIds(sample_en), EOS_ID]
src_ids_fr = [BOS_ID, *sp.EncodeAsIds(sample_fr), EOS_ID]

for shown in (src_tokens_en, src_tokens_fr,
              src_ids_en, src_ids_fr,
              sp.DecodeIds(src_ids_en), sp.DecodeIds(src_ids_fr)):
    print(shown)


#### Creating the Translation dataset

In [63]:
class TranslationDataset(Dataset):
    """Dataset of (source, target) sentence pairs encoded as SentencePiece ids.

    Every item is wrapped as ``[BOS] + ids + [EOS]`` and capped at ``max_len``
    tokens in total (BOS and EOS included). The BOS/EOS ids are taken from the
    tokenizer that is passed in, so the class does not depend on notebook globals.
    """

    def __init__(self, pairs, sp, max_len=64):
        """
        pairs:   list of (src_str, tgt_str)
        sp:      loaded spm.SentencePieceProcessor
        max_len: maximum number of ids per sequence, counting BOS and EOS

        Raises:
            ValueError: if ``max_len`` is smaller than 2. A smaller value would turn
                the truncation slice below into a negative index and silently
                stop enforcing the length cap.
        """
        if max_len < 2:
            raise ValueError(f"max_len must be at least 2 to fit BOS and EOS, got {max_len}")

        self.pairs   = pairs
        self.sp      = sp
        self.max_len = max_len

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.pairs)

    def _encode(self, text):
        """Encode one sentence as [BOS] + truncated ids + [EOS]."""
        # EncodeAsIds returns a plain list of ids without BOS/EOS; two slots are
        # reserved for them when truncating.
        body = self.sp.EncodeAsIds(text)[:self.max_len - 2]
        return [self.sp.bos_id()] + body + [self.sp.eos_id()]

    def __getitem__(self, idx):
        """Return the idx-th pair as two 1-D ``torch.long`` tensors (src_ids, tgt_ids)."""
        src, tgt = self.pairs[idx]

        return torch.tensor(self._encode(src), dtype=torch.long), \
               torch.tensor(self._encode(tgt), dtype=torch.long)

def collate_fn(batch):
    """Turn a list of (src_ids, tgt_ids) tensor pairs into two padded batch tensors.

    Each side is padded with PAD_ID up to the longest sequence on that side of
    the batch (not to a fixed length), laid out batch-first, and moved to the
    notebook's `device`.

    Returns:
        (src_padded, tgt_padded)
    """
    src_seqs, tgt_seqs = zip(*batch)

    def _pad_and_move(seqs):
        # pad_sequence right-pads every sequence to the longest one in `seqs`.
        return pad_sequence(seqs, batch_first=True, padding_value=PAD_ID).to(device)

    return _pad_and_move(src_seqs), _pad_and_move(tgt_seqs)


# Wrap every split in the BPE dataset class.
train_dataset = TranslationDataset(train_data, sp)
val_dataset = TranslationDataset(val_data, sp)
test_dataset = TranslationDataset(test_data, sp)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
# The test loader yields one sentence pair per batch.
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)


#### Printing a sample batch from the DataLoader

In [78]:
# Fetch a single batch from the training loader and look at its first row.
src_batch, tgt_batch = next(iter(train_loader))
first_src, first_tgt = src_batch[0], tgt_batch[0]

print("Source batch shape:", src_batch.shape)
print("Target batch shape:", tgt_batch.shape)
print("Source batch example (first row):", first_src)
print("Target batch example (first row):", first_tgt)

# Decode the same rows back to text (padding ids included) as a sanity check.
print("Source batch example (first row) decoded:", sp.DecodeIds(first_src.tolist()))
print("Target batch example (first row) decoded:", sp.DecodeIds(first_tgt.tolist()))


Source batch shape: torch.Size([32, 64])
Target batch shape: torch.Size([32, 64])
Source batch example (first row): tensor([    1,    54, 15674, 15694,     9,  8418,   584,  4779,    34,    21,
         4317,  2805, 15683,  1782,    53,  4713,  6028, 13543,    21,  3182,
           54,   224,  6510,    22,    34,  5766, 15683, 13079,    51,    98,
        12223, 15683,  8537,    51,    98,  6954, 15683,  7638, 15686,  5890,
            2,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0], device='cuda:0')
Target batch example (first row): tensor([    1,    16, 15694,   313,  3522,    84,  2294,  8831,   926,   313,
         2449, 10762,  3828,   313,  4713,  3796,    66,    78,  2702,  6628,
         5007,    54,   224,  6510,    56, 15683, 11920,  3241, 13425, 15683,
         7770,    69,  9400, 15683,  7638, 15146,     2,     0,     0,     0,
            0,  

In [60]:
class EncoderRNN(nn.Module):
    """Bidirectional, batch-first, multi-layer LSTM encoder over embedded token ids."""

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        """
        input_dim: vocabulary size of the embedding table
        emb_dim:   embedding width
        hid_dim:   LSTM hidden size per direction
        n_layers:  number of stacked LSTM layers
        dropout:   dropout probability for the embeddings and between LSTM layers
        """
        super().__init__()

        # nn.LSTM applies dropout only between stacked layers, so it is switched
        # off for a single-layer encoder.
        inter_layer_dropout = dropout if n_layers > 1 else 0

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source ids.

        src: [batch_size, src_len]

        Returns:
            outputs: [batch_size, src_len, hid_dim * 2]  (batch_first=True)
            hidden:  [n_layers * 2, batch_size, hid_dim]
            cell:    [n_layers * 2, batch_size, hid_dim]
        """
        token_vectors = self.embedding(src)        # [batch_size, src_len, emb_dim]
        embedded = self.dropout(token_vectors)

        outputs, final_state = self.rnn(embedded)
        hidden, cell = final_state

        return outputs, hidden, cell


### Attention Mechanism

In [61]:
class Attention(nn.Module):
    """Additive attention: scores every encoder position against one query vector."""

    def __init__(self, hid_dim):
        super().__init__()

        # Query (hid_dim * 2) and encoder output (hid_dim * 2) are concatenated
        # before projection, hence the hid_dim * 4 input width.
        self.attn = nn.Linear(hid_dim * 4, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights over the source positions.

        hidden:          [batch_size, hid_dim * 2]
        encoder_outputs: [batch_size, src_len, hid_dim * 2]

        Returns:
            [batch_size, src_len], each row summing to 1.

        NOTE(review): padded source positions are not masked here, so they take
        part in the softmax like any other position.
        """
        steps = encoder_outputs.size(1)

        # Broadcast the single query vector across every source position.
        query = hidden.unsqueeze(1).expand(-1, steps, -1)       # [batch_size, src_len, hid_dim * 2]

        features = torch.cat((query, encoder_outputs), dim=2)   # [batch_size, src_len, hid_dim * 4]
        energy = torch.tanh(self.attn(features))                # [batch_size, src_len, hid_dim]
        scores = self.v(energy).squeeze(2)                      # [batch_size, src_len]

        return torch.softmax(scores, dim=1)


#### Decoder

In [62]:
class DecoderRNN(nn.Module):
    """LSTM decoder that conditions every step on one attention context vector.

    The context is computed once from `hidden_cat` and the encoder outputs, then
    reused unchanged at every target position.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, attention):
        """
        output_dim: target vocabulary size
        emb_dim:    embedding width
        hid_dim:    LSTM hidden size
        n_layers:   number of stacked LSTM layers
        dropout:    dropout probability for the embeddings and between LSTM layers
        attention:  module called as attention(hidden_cat, encoder_outputs) that
                    returns weights of shape [batch_size, src_len]
        """
        super().__init__()

        self.output_dim = output_dim
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, emb_dim)
        # Projects the hid_dim * 2 wide attention context down to hid_dim.
        self.reduce_weighted = nn.Linear(hid_dim * 2, hid_dim)
        # nn.LSTM applies dropout only between stacked layers and warns when it is
        # non-zero for a single layer; guard it the same way the encoder does.
        self.rnn = nn.LSTM(emb_dim + hid_dim, hid_dim, n_layers,
                           dropout=dropout if n_layers > 1 else 0, batch_first=True)
        self.fc_out = nn.Linear(emb_dim + hid_dim + hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_token, hidden_cat, hidden, cell, encoder_outputs):
        """Inference step: processes one token at a time.

        input_token: [batch_size, 1]

        A single step is the sequence computation with seq_len == 1, so this
        delegates to `forward_train` instead of repeating its body.

        Returns:
            prediction [batch_size, 1, output_dim], hidden, cell
        """
        return self.forward_train(input_token, hidden_cat, hidden, cell, encoder_outputs)

    def forward_train(self, input_seq, hidden_cat, hidden, cell, encoder_outputs):
        """Training step: processes an entire sequence of decoder input tokens.

        input_seq:       [batch_size, seq_len] decoder input ids
        hidden_cat:      [batch_size, hid_dim * 2] query for the attention module
        hidden, cell:    initial LSTM state, each [n_layers, batch_size, hid_dim]
        encoder_outputs: [batch_size, src_len, hid_dim * 2]

        Returns:
            prediction [batch_size, seq_len, output_dim], hidden, cell
        """
        seq_len = input_seq.shape[1]

        embedded = self.dropout(self.embedding(input_seq))  # [batch, seq_len, emb_dim]

        a = self.attention(hidden_cat, encoder_outputs)  # [batch, src_len]
        a = a.unsqueeze(1)  # [batch, 1, src_len]

        weighted = torch.bmm(a, encoder_outputs)  # [batch, 1, hid_dim*2]
        weighted_reduced = self.reduce_weighted(weighted.squeeze(1))  # [batch, hid_dim]
        # The same context vector is shared by every target position.
        weighted_seq = weighted_reduced.unsqueeze(1).expand(-1, seq_len, -1)  # [batch, seq_len, hid_dim]

        rnn_input = torch.cat((embedded, weighted_seq), dim=2)  # [batch, seq_len, emb_dim + hid_dim]

        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))  # [batch, seq_len, hid_dim]

        prediction = self.fc_out(torch.cat((output, weighted_seq, embedded), dim=2))  # [batch, seq_len, output_dim]

        return prediction, hidden, cell


#### Seq2Seq wrapper

In [79]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that produces teacher-forced logits for a target batch."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Compute next-token logits for every target position.

        src: [batch_size, src_len] source ids
        trg: [batch_size, trg_len] target ids, starting with BOS
        teacher_forcing_ratio: accepted for interface compatibility but not used;
            the decoder is always fed the ground-truth prefix in a single pass.

        Returns:
            [batch_size, trg_len, vocab_size] logits where ``out[:, t]`` is the
            prediction for ``trg[:, t]`` given ``trg[:, :t]``. ``out[:, 0]`` is all
            zeros because nothing precedes BOS; callers skip it with ``[:, 1:]``.
        """
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        encoder_outputs, hidden, cell = self.encoder(src)

        # Reduce bidirectional -> unidirectional by keeping every second entry of
        # the stacked encoder state (one entry per layer remains).
        hidden = hidden[::2].contiguous()  # [n_layers, batch_size, hid_dim]
        cell   = cell[::2].contiguous()    # [n_layers, batch_size, hid_dim]

        # Query vector for the attention module.
        # NOTE(review): after the reduction above `hidden` holds one state per layer,
        # so this concatenates the states of the top two layers, not the forward and
        # backward states of the last layer. With a single-layer encoder the fallback
        # is only hid_dim wide, which does not match an attention module expecting
        # hid_dim * 2. Left unchanged because the step-by-step inference code in this
        # notebook builds hidden_cat the same way and the two must stay in sync.
        hidden_cat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1) if hidden.shape[0] >= 2 else hidden[-1, :, :]

        if trg_len < 2:
            # No decoder input can be formed from a target that is only BOS (or empty).
            return torch.zeros(batch_size, trg_len, trg_vocab_size, device=self.device)

        # Feed tokens 0..T-2 and read predictions for tokens 1..T-1. Feeding the
        # full target and scoring it against the same positions would show the
        # decoder the very token it is asked to predict.
        step_logits, hidden, cell = self.decoder.forward_train(
            trg[:, :-1], hidden_cat, hidden, cell, encoder_outputs)

        # Prepend a zero slot for position 0 so the output keeps trg's length.
        bos_slot = step_logits.new_zeros(batch_size, 1, trg_vocab_size)
        return torch.cat((bos_slot, step_logits), dim=1)


#### Creating the model and loading the saved weights

In [81]:
# hyper-parameters
INPUT_DIM  = VOCAB_SIZE   # source and target share the joint SentencePiece vocabulary
OUTPUT_DIM = VOCAB_SIZE
EMB_DIM = 256
HID_DIM = 512
attn = Attention(HID_DIM)
num_epochs = 20

encoder = EncoderRNN(INPUT_DIM, EMB_DIM, HID_DIM, 2, dropout=0.3).to(device)
decoder = DecoderRNN(OUTPUT_DIM, EMB_DIM, HID_DIM, 2, dropout=0.3, attention=attn).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

print(f"Model device: {next(model.parameters()).device}")

optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
# Padding positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)

# Restore saved weights only when a checkpoint exists, so a fresh run reaches the
# training cell instead of failing here. map_location lets a checkpoint written
# on a GPU load on a CPU-only machine.
if os.path.exists("seq2seq_model.pth"):
    model.load_state_dict(torch.load("seq2seq_model.pth", map_location=device))
else:
    print("No checkpoint found at seq2seq_model.pth; starting from freshly initialised weights.")


Model device: cuda:0


<All keys matched successfully>

## New training

In [82]:
def train(model, iterator, optimizer, criterion, tf_ratio):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: called as ``model(src, tgt, teacher_forcing_ratio=tf_ratio)`` and
            expected to return logits of shape [batch, tgt_len, vocab].
        iterator: iterable of ``(src, tgt)`` batches.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking ``(logits [N, vocab], targets [N])``.
        tf_ratio: teacher-forcing ratio passed through to the model.

    Returns:
        The sum of batch losses divided by the number of batches.

    Raises:
        ZeroDivisionError: if ``iterator`` yields no batches.
    """
    model.train()
    epoch_loss = 0.0
    num_batches = 0

    for src, tgt in iterator:
        optimizer.zero_grad()
        preds = model(src, tgt, teacher_forcing_ratio=tf_ratio)

        # Flatten for the loss and skip position 0 (<sos>), which is never predicted.
        # The vocabulary size is read from the logits themselves rather than from
        # a notebook-level constant.
        vocab_size = preds.shape[-1]
        out = preds[:, 1:].reshape(-1, vocab_size)
        trg = tgt[:, 1:].reshape(-1)

        loss = criterion(out, trg)
        loss.backward()
        # Clip the global gradient norm to 1 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    return epoch_loss / num_batches

def evaluate(model, val_loader, criterion):
    """Return the mean per-batch loss of `model` over `val_loader`.

    The model is switched to eval mode and run without gradient tracking.
    Position 0 (BOS) of every sequence is left out of the loss.

    Raises:
        ZeroDivisionError: if `val_loader` yields no batches.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for src, tgt in val_loader:
            src, tgt = src.to(device), tgt.to(device)

            logits = model(src, tgt)

            # Flatten to [N, vocab] / [N] and drop the BOS position from both.
            vocab = logits.shape[-1]
            flat_logits = logits[:, 1:].reshape(-1, vocab)
            flat_targets = tgt[:, 1:].reshape(-1)

            batch_losses.append(criterion(flat_logits, flat_targets).item())

    return sum(batch_losses) / len(batch_losses)

CHECKPOINT_PATH = "seq2seq_model.pth"

best_loss = float('inf')
patience = 3      # stop after this many consecutive epochs without improvement
counter = 0

# Training is skipped entirely when a checkpoint already exists on disk.
if not os.path.exists(CHECKPOINT_PATH):
    for epoch in range(num_epochs):
        # linearly decay teacher forcing from 0.9 -> 0.5 (epoch is 0-based, so the
        # first epoch uses exactly 0.9)
        tf_ratio = max(0.5, 0.9 - 0.02 * epoch)
        start_time = time.time()
        train_loss = train(model, train_loader, optimizer, criterion, tf_ratio)
        end_time = time.time()
        val_loss = evaluate(model, val_loader, criterion)
        # Report every epoch, including the one that triggers early stopping.
        print(f"Epoch {epoch} | Training Loss {train_loss:.4f} | Validation Loss {val_loss:.4f} | TF={tf_ratio:.2f} | Time: {(end_time-start_time):.2f}s", flush=True)

        if val_loss < best_loss:
            best_loss = val_loss
            counter = 0
            # Checkpoint on every improvement so the file always holds the best
            # weights seen, not whatever the model looked like when training ended.
            torch.save(model.state_dict(), CHECKPOINT_PATH)
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stopping at epoch {epoch} with patience {patience}.")
                break

    # Fallback: if no epoch ever improved (e.g. the validation loss was NaN), still
    # write a checkpoint so the load below does not fail.
    if not os.path.exists(CHECKPOINT_PATH):
        torch.save(model.state_dict(), CHECKPOINT_PATH)

# load the model for evaluation
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))


<All keys matched successfully>

## Evaluate your model

#### BLEU score

In [None]:
def evaluate(model, test_loader, criterion):
    """Report test loss and corpus BLEU, print one example, and return the mean loss.

    This definition shadows the validation `evaluate` defined earlier in the
    notebook. It returns the average loss as well, so code written against that
    earlier contract (e.g. re-running the training cell) keeps working.

    Every sentence in every batch is scored, not just the first row.

    NOTE(review): hypotheses are the argmax of `model(src, tgt)`, i.e. the model
    sees the reference prefix at every step. BLEU computed this way is not the
    BLEU of free-running (greedy/beam) translations -- confirm this is intended.

    Args:
        model: called as ``model(src, tgt)``, returning [batch, tgt_len, vocab] logits.
        test_loader: iterable of ``(src, tgt)`` id batches.
        criterion: loss taking ``(logits [N, vocab], targets [N])``.

    Returns:
        Mean per-batch loss as a float.

    Raises:
        ValueError: if `test_loader` yields no batches.
    """
    model.eval()
    special_ids = (PAD_ID, BOS_ID, EOS_ID)
    total_loss = 0
    num_batches = 0
    example_shown = False

    references = []
    hypotheses = []

    with torch.no_grad():
        for src, tgt in test_loader:
            output = model(src, tgt)  # shape: [batch_size, trg_len, vocab_size]
            loss = criterion(output[:, 1:].reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))
            total_loss += loss.item()
            num_batches += 1

            predicted = output.argmax(2).tolist()

            for src_row, tgt_row, pred_row in zip(src.tolist(), tgt.tolist(), predicted):
                # Position 0 is excluded from the loss and therefore never trained, and
                # positions where the reference is padding have nothing to compare
                # against; keep only predictions aligned with real target tokens.
                predicted_ids_clean = [
                    pred_id
                    for pred_id, tgt_id in zip(pred_row[1:], tgt_row[1:])
                    if tgt_id != PAD_ID and pred_id not in special_ids
                ]
                tgt_ids_clean = [i for i in tgt_row if i not in special_ids]

                # Decode to sentence (string)
                predicted_sentence = sp.DecodeIds(predicted_ids_clean)
                tgt_sentence = sp.DecodeIds(tgt_ids_clean)

                # For BLEU: whitespace-tokenize the decoded strings
                hypotheses.append(predicted_sentence.strip().split())
                references.append([tgt_sentence.strip().split()])  # one list of references per hypothesis

                if not example_shown:
                    src_ids_clean = [i for i in src_row if i not in special_ids]
                    src_sentence = sp.DecodeIds(src_ids_clean)
                    print(f"English: {src_sentence}\nTarget French: {tgt_sentence}\nModel Prediction: {predicted_sentence}\n")
                    example_shown = True

    if num_batches == 0:
        raise ValueError("test_loader produced no batches; nothing to evaluate")

    avg_loss = total_loss / num_batches
    print(f"Average Test Loss: {avg_loss:.4f}")

    # Compute BLEU
    bleu_score = corpus_bleu(references, hypotheses)
    print(f"Corpus BLEU Score: {bleu_score:.4f}")

    return avg_loss

test_loss = evaluate(model, test_loader, criterion)


#### Step-by-step (greedy) inference

In [87]:
def evaluate_model_on_sentences(model, sentences, sp, max_len=64, device=None):
    """
    Greedily translate each input sentence, print the result, and return the translations.

    Args:
        model: your Seq2Seq model (exposes `.encoder` and `.decoder`)
        sentences: list of English source sentences (strings)
        sp: loaded SentencePiece tokenizer; its BOS/EOS/PAD ids are used
        max_len: max source length (including BOS/EOS) and max number of decoding steps
        device: computation device; defaults to the device of the model's parameters

    Returns:
        List of decoded translations, one per input sentence, in input order.
    """
    if device is None:
        # Resolve at call time instead of freezing whatever the notebook-level
        # `device` happened to be when this function was defined.
        device = next(model.parameters()).device

    bos_id, eos_id, pad_id = sp.bos_id(), sp.eos_id(), sp.pad_id()
    special_ids = (pad_id, bos_id, eos_id)

    model.eval()
    translations = []

    for i, sentence in enumerate(sentences):
        # 1. Encode input as [BOS] + ids + [EOS], truncated to max_len tokens
        src_ids = [bos_id] + sp.EncodeAsIds(sentence)[:max_len-2] + [eos_id]
        src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)

        outputs = []

        with torch.no_grad():
            # 2. Encoder forward
            encoder_outputs, hidden, cell = model.encoder(src_tensor)

            # 3. Reduce the bidirectional state to one entry per layer
            hidden = hidden[::2].contiguous()
            cell   = cell[::2].contiguous()

            # 4. Attention query, built the same way as in Seq2Seq.forward
            if hidden.shape[0] >= 2:
                hidden_cat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
            else:
                hidden_cat = hidden[-1,:,:]

            # 5. Greedy decoding: start from BOS and feed back the argmax token
            input_token = torch.tensor([[bos_id]], dtype=torch.long, device=device)

            for _ in range(max_len):
                output, hidden, cell = model.decoder(input_token, hidden_cat, hidden, cell, encoder_outputs)
                pred_token = output.argmax(2)  # [1,1]
                pred_token_id = pred_token.item()

                outputs.append(pred_token_id)

                if pred_token_id == eos_id:
                    break

                input_token = pred_token

        # 6. Decode token ids, leaving out the special tokens
        decoded_sentence = sp.DecodeIds([token_id for token_id in outputs if token_id not in special_ids])
        translations.append(decoded_sentence)

        # 7. Print nicely
        print(f"[Example {i+1}]")
        print(f"English: {sentence}")
        print(f"Model Prediction: {decoded_sentence}")
        print()

    return translations


# English sentences to translate with the step-by-step decoder.
input_sentences = [
    "of 17 March 2005",
    "The difference between the spot and forward rates shall be treated as interest payable or receivable.",
    "(4) Council Regulation (EC) No 1039/2003 of 2 June 2003 adopting autonomous and transitional measures.",
]

# Translate each sentence and print the source next to the model's prediction.
evaluate_model_on_sentences(model=model, sentences=input_sentences, sp=sp)


[Example 1]
English: of 17 March 2005
Model Prediction: ________________________________________________________________

[Example 2]
English: The difference between the spot and forward rates shall be treated as interest payable or receivable.
Model Prediction: ________________________________________________________________

[Example 3]
English: (4) Council Regulation (EC) No 1039/2003 of 2 June 2003 adopting autonomous and transitional measures.
Model Prediction: ________________________________________________________________



In [84]:
# Look up the vocabulary piece stored under token id 15722.
inspected_id = 15722
print(sp.IdToPiece(inspected_id))


_


#### Showing examples of nltk.translate (sentence_bleu & corpus_bleu)

In [None]:
# `sentence_bleu` and `corpus_bleu` are imported in the notebook's import cell.

# Reference and candidate sentences, whitespace-tokenized
reference = "this is a simple example".split()
candidate = "this is an example".split()

# Calculate BLEU score for a single sentence.
# sentence_bleu takes a list of references for the one candidate.
bleu_score = sentence_bleu([reference], candidate)
print("BLEU Score (sentence):", bleu_score)

# Calculate BLEU score for a corpus (list of candidate sentences).
# corpus_bleu takes one list of references per candidate, hence the extra nesting.
corpus_reference = [["this is a simple example".split()]]
corpus_candidate = ["this is an example".split()]

corpus_bleu_score = corpus_bleu(corpus_reference, corpus_candidate)
print("BLEU Score (corpus):", corpus_bleu_score)
