# English–Spanish Translation Dataset Example

This Jupyter Notebook demonstrates how to build and use a neural network with a Transformer architecture to translate from English to Spanish.

To begin, download the `spa.txt` dataset file into your working directory. This file contains paired English–Spanish sentences. You can download it from the following link:

[Download English–Spanish Dataset](https://www.kaggle.com/datasets/lonnieqin/englishspanish-translation-dataset/data)

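Make sure the downloaded file is placed in the same directory as this notebook to ensure the dataset loads correctly. Note that the loading cell below opens the file as `examples/spa.txt`, relative to the directory the notebook is run from, so adjust that path if your layout differs.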


In [None]:
from itertools import chain
import re

from src.tensor import Tensor, op
from src.optimizer import Adam
from src.scheduler import CosineScheduler
from src.preprocessing import DataLoader, Tokenizer, train_test_split
from src.encode import OneHotEncoder
from src.structure import Layer, Embedding, Transformer, Dense
from src.loss import CategoricalCrossentropy

In [None]:
class TransformerSeq2Seq(Layer):
    """Sequence-to-sequence translation model built around ``Transformer``.

    The model owns one ``Tokenizer`` and one ``Embedding`` per language, a
    ``Transformer`` that is handed both embeddings, and a ``Dense`` output
    projection sized to the target vocabulary.
    """

    def __init__(
        self,
        src_vocab: set | tuple | list,
        tgt_vocab: set | tuple | list,
        d_model=64,
        nhead=4,
        dim_feedforward=128,
        num_encoder_layers=2,
        num_decoder_layers=2,
        dropout=0.1,
    ):
        """Create the tokenizers, embeddings, transformer and output head.

        Args:
            src_vocab: Source-language tokens used to build the source tokenizer.
            tgt_vocab: Target-language tokens used to build the target tokenizer.
            d_model: Forwarded to both ``Embedding`` layers and to ``Transformer``.
            nhead: Forwarded to ``Transformer``.
            dim_feedforward: Forwarded to ``Transformer``.
            num_encoder_layers: Forwarded to ``Transformer``.
            num_decoder_layers: Forwarded to ``Transformer``.
            dropout: Forwarded to ``Transformer``.
        """
        # NOTE(review): Layer.__init__ is not invoked here — confirm the base
        # class does not need initialisation.
        self.d_model = d_model
        self.src_tokenizer = Tokenizer(list(src_vocab))
        self.tgt_tokenizer = Tokenizer(list(tgt_vocab))

        # Sizes come from the tokenizer vocabularies rather than the raw inputs.
        self.src_embedding = Embedding(d_model, len(self.src_tokenizer.vocab))
        self.tgt_embedding = Embedding(d_model, len(self.tgt_tokenizer.vocab))

        self.transformer = Transformer(
            d_model,
            nhead,
            num_encoder_layers,
            num_decoder_layers,
            dim_feedforward,
            self.src_embedding,
            self.tgt_embedding,
            dropout,
        )
        self.out_proj = Dense(len(self.tgt_tokenizer.vocab))

        # One-hot encoder over every target token id; used to build loss targets.
        self.encoder = OneHotEncoder(tuple(range(len(self.tgt_tokenizer.vocab))))

    def __call__(self, src: Tensor, tgt: Tensor | None = None, max_len: int = -1) -> Tensor:
        """Run the model on ``src``, with or without decoder inputs.

        Args:
            src: Source token ids.
            tgt: Decoder input token ids. Required while ``Tensor.training`` is
                set. When ``None``, the call delegates to
                ``self.transformer.generate``.
            max_len: Forwarded to ``self.transformer.generate``; unused when
                ``tgt`` is given.

        Returns:
            With ``tgt``: ``self.out_proj`` applied to the transformer output.
            Without ``tgt``: the result of ``self.transformer.generate``
            (``validate`` decodes it as target token ids).
        """
        if Tensor.training:
            assert tgt is not None, "Target must not be None when training."

        src_pad_mask = self._generate_padding_mask(src, self.src_tokenizer.word2idx[Tokenizer.PAD])

        if tgt is not None:
            # NOTE(review): no decoder mask is supplied here — presumably the
            # transformer applies its own causal mask; confirm.
            return self.out_proj(self.transformer(src, tgt, attn_mask_encoder=src_pad_mask, attn_mask_decoder=None))
        return self.transformer.generate(
            src,
            tgt,
            max_len=max_len,
            out_proj=self.out_proj,
            sos_token_id=self.tgt_tokenizer.word2idx[Tokenizer.SOS],
            eos_token_id=self.tgt_tokenizer.word2idx[Tokenizer.EOS],
            pad_token_id=self.tgt_tokenizer.word2idx[Tokenizer.PAD],
            attn_mask_encoder=src_pad_mask,
        )

    @Tensor.train()
    def train(self, src: list[list[str]], tgt: list[list[str]], batch_size: int) -> None:
        """Fit the model on paired sentences and print progress after each epoch.

        Args:
            src: Source sentences, each a list of tokens.
            tgt: Target sentences, each a list of tokens, aligned with ``src``.
            batch_size: Batch size handed to the training ``DataLoader``.
        """
        src_train, tgt_train, src_test, tgt_test = train_test_split(src, tgt)

        # Use the Tokenizer constant for the pad token, as the rest of the class does.
        loss = CategoricalCrossentropy(ignore_token_id=self.tgt_tokenizer.word2idx[Tokenizer.PAD])
        loader = DataLoader(src_train, tgt_train, batch_size=batch_size)
        optimizer = Adam(list(self.parameters), CosineScheduler(1e-3, max_steps=4000), b2=0.98)
        optimizer.params_requires_grad(True)

        num_epochs = 100
        # The range is inclusive of num_epochs, so epochs 0..num_epochs are run.
        for epoch in range(num_epochs + 1):
            for src_batch, tgt_batch in loader:
                src_batch, tgt_batch, tgt_out = self._tokenize(src_batch, tgt_batch)
                optimizer.zero_grad()

                outputs = self(src_batch, tgt_batch)

                outputs = op.softmax(outputs)

                target_one_hot = self.encoder(tgt_out)

                # Flatten (rows, positions) into one axis for the loss. Distinct
                # local names keep the ``batch_size`` argument from being rebound.
                n_rows, seq_len, vocab_size = outputs.shape
                predicted = op.reshape(outputs, (n_rows * seq_len, vocab_size))
                expected = op.reshape(target_one_hot, (n_rows * seq_len, vocab_size))

                loss_val = loss(predicted, expected)

                loss_val.backward()
                optimizer.step()

                optimizer.zero_grad()

            # Reports the loss of the last batch of the epoch only.
            print(f"\nEpoch {epoch}, Train Loss: {loss_val.mean().item():.4f}")
            self.validate(src_test, tgt_test)

    @Tensor.no_grad()
    def validate(self, src_val, tgt_val):
        """Translate one validation batch and print source, target and prediction.

        Args:
            src_val: Validation source sentences, each a list of tokens.
            tgt_val: Validation target sentences, aligned with ``src_val``.
        """
        print("Validating...")

        val_loader = DataLoader(src_val, tgt_val, batch_size=1)

        src_batch, tgt_batch, _ = self._tokenize(*next(val_loader))

        # Generation calls the model without a target, which __call__ rejects in
        # training mode. Switch it off for the call and always put the previous
        # mode back, even if generation raises.
        was_training = Tensor.training
        Tensor.set_training(False)
        try:
            predicted = self(src_batch, None, max_len=50)
        finally:
            Tensor.set_training(was_training)

        src = map(self.src_tokenizer.decode, src_batch)
        decoded_true = map(self.tgt_tokenizer.decode, tgt_batch)
        decoded_pred = map(self.tgt_tokenizer.decode, predicted)

        for src_, truth, guess in zip(src, decoded_true, decoded_pred):
            print("Source:   ", " ".join(src_))
            print("Target:   ", " ".join(truth))
            print("Predicted:", " ".join(guess))
            print()

    def _tokenize(self, src: list[list[str]], tgt: list[list[str]]) -> tuple[Tensor, Tensor, Tensor]:
        """Convert a batch of token lists into id tensors.

        Args:
            src: Source sentences, each a list of tokens.
            tgt: Target sentences, each a list of tokens.

        Returns:
            ``(src_ids, tgt_in_ids, tgt_out_ids)``: source ids with ``<eos>``
            requested, target ids with ``<sos>`` requested (decoder input), and
            target ids with ``<eos>`` requested (expected output).
        """
        # The tokenizers receive these lengths — presumably as the padded length;
        # confirm against Tokenizer.
        max_src = max(len(s) for s in src) + 1  # +1 for <eos>
        max_tgt = max(len(s) for s in tgt) + 2  # +1 sos +1 eos
        src_ids = [self.src_tokenizer(s, max_src, add_sos=False, add_eos=True) for s in src]

        tgt_in_ids, tgt_out_ids = [], []
        for sentence in tgt:
            in_ids = self.tgt_tokenizer(sentence, max_tgt, add_sos=True, add_eos=False)
            out_ids = self.tgt_tokenizer(sentence, max_tgt, add_sos=False, add_eos=True)
            tgt_in_ids.append(in_ids)
            tgt_out_ids.append(out_ids)

        return (
            Tensor(src_ids, dtype=int, requires_grad=True),
            Tensor(tgt_in_ids, dtype=int, requires_grad=True),
            Tensor(tgt_out_ids, dtype=int, requires_grad=False),
        )

    def _generate_padding_mask(self, tokens: Tensor, pad_token_id: int) -> Tensor:
        """Build a float mask that is ``-1e9`` at padding positions and 0 elsewhere.

        Args:
            tokens: Token ids to inspect.
            pad_token_id: Id that marks padding.

        Returns:
            A float ``Tensor`` derived elementwise from ``tokens``.
        """
        # The factor must be a large negative number: the previous ``-1e-9`` is
        # effectively zero and left padding unmasked.
        # Assumes the transformer adds this mask to its attention scores — confirm.
        is_pad = (tokens == pad_token_id).astype(int)
        return Tensor(is_pad * -1e9, dtype=float)



In [None]:
# Matches a single punctuation mark; clean_sentence uses it to surround the
# mark with spaces (it does not delete anything, despite the name).
remove_puntuation = re.compile(r"([?.!,¿¡])")
# Matches any run of whitespace.
remove_extra_spaces = re.compile(r"\s+")


def clean_sentence(sentence: str) -> list[str]:
    """Split a raw sentence into lowercase tokens.

    Each of ``? . ! , ¿ ¡`` becomes its own token; all other text is split
    on whitespace.
    """
    spaced = remove_puntuation.sub(r" \1 ", sentence)
    collapsed = remove_extra_spaces.sub(" ", spaced)
    normalized = collapsed.strip().lower()
    return normalized.split()


In [None]:
Tensor.set_default_device("cuda")

# Each usable line holds "<english>\t<spanish>" and may carry further
# tab-separated columns, which are ignored.
lines = []
with open("examples/spa.txt", "r", encoding="utf-8") as f:
    for line in f:
        if "\t" not in line:
            continue
        # Strip only the line terminator. Stripping all whitespace first would
        # also remove a leading or trailing tab, collapsing the line to a single
        # field and breaking the two-column unpacking below.
        fields = line.rstrip("\r\n").split("\t")
        english, spanish = fields[0], fields[1]
        # A pair with a blank side cannot be used for translation; skip it.
        if not english.strip() or not spanish.strip():
            continue
        lines.append([english, spanish])

# Tokenise both sides; clean_sentence also trims surrounding whitespace.
src_data = [clean_sentence(english) for english, _ in lines]
tgt_data = [clean_sentence(spanish) for _, spanish in lines]

# Vocabularies are the distinct tokens seen on each side.
src_vocab = set(chain.from_iterable(src_data))
tgt_vocab = set(chain.from_iterable(tgt_data))



In [None]:
# Hyper-parameters passed to the model constructor.
model_config = {
    "d_model": 256,
    "nhead": 4,
    "dim_feedforward": 256,
    "num_encoder_layers": 3,
    "num_decoder_layers": 3,
    "dropout": 0.1,
}

model = TransformerSeq2Seq(src_vocab=src_vocab, tgt_vocab=tgt_vocab, **model_config)

# Train on the tokenised sentence pairs in batches of 32.
model.train(src_data, tgt_data, batch_size=32)
