In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import unicodedata
import re

In [None]:
# --------- Hyperparameters ---------
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 64              # sentence pairs per optimisation step
EMBED_SIZE = 256             # width of the token embeddings
HIDDEN_SIZE = 512            # width of the LSTM hidden / cell state
NUM_EPOCHS = 20              # full passes over the training data
MAX_LEN = 15                 # sequences are truncated / padded to this many tokens
TEACHER_FORCING_RATIO = 0.5  # probability of feeding the gold token back in while training

# Seed PyTorch's global generator so weight initialisation, DataLoader
# shuffling and the teacher-forcing draws are repeatable across
# "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

In [None]:
# --------- Data Preprocessing ---------
def normalize_text(s):
    """Return a lower-cased, ASCII-letter-only version of ``s`` ready for whitespace tokenisation.

    Steps:
      1. Strip diacritics (NFD decomposition, then drop combining marks).
      2. Surround sentence punctuation (``. ! ? ¿ ¡``) with spaces so each
         mark becomes its own token instead of staying glued to a word
         ("doing?" and "doing" would otherwise be two vocabulary entries).
      3. Replace every run of other non-letter characters with one space.
      4. Collapse whitespace, trim, and lower-case.

    Args:
        s: Raw sentence.

    Returns:
        The normalised sentence; may be the empty string.
    """
    # Decompose accented characters and discard the combining marks ("é" -> "e").
    s = ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    # Isolate punctuation; "¡" is handled like "¿" so both inverted marks survive.
    s = re.sub(r"([.!?¿¡])", r" \1 ", s)
    # Everything that is neither an ASCII letter nor kept punctuation becomes a space.
    s = re.sub(r"[^a-zA-Z.!?¿¡]+", r" ", s)
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

In [None]:
# Load and preprocess data
pairs = []
with open('spa.txt', encoding='utf-8') as f:
    # Stream the file line by line rather than reading it whole and splitting.
    for line in f:
        fields = line.rstrip('\n').split('\t')
        # Only the first two tab-separated columns (English, Spanish) are used;
        # blank or malformed lines are skipped instead of crashing on unpacking.
        if len(fields) < 2:
            continue
        eng, spa = normalize_text(fields[0]), normalize_text(fields[1])
        # Drop pairs where either side normalises to nothing: they would
        # become all-padding examples.
        if not eng or not spa:
            continue
        # Target sentences carry explicit start / end markers for the decoder.
        pairs.append((eng, f"<sos> {spa} <eos>"))

if not pairs:
    raise ValueError("no usable sentence pairs were found in 'spa.txt'")

eng_sentences, spa_sentences = zip(*pairs)
# A fixed random_state keeps the train/validation split -- and therefore the
# vocabularies built from the training part -- identical between runs.
train_eng, val_eng, train_spa, val_spa = train_test_split(
    eng_sentences, spa_sentences, test_size=0.1, random_state=42
)


In [None]:
# Tokenizer
from collections import Counter

def build_vocab(sentences, min_freq=1):
    """Build token <-> index lookup tables from whitespace-tokenised sentences.

    Index 0 is always ``'<pad>'`` and index 1 is always ``'<unk>'``; the
    remaining tokens follow in sorted order.

    Args:
        sentences: Iterable of strings; each is split on whitespace.
        min_freq: Minimum number of occurrences a token needs to be kept.
            The default of 1 keeps every token.

    Returns:
        ``(word2idx, idx2word)``: a dict from token to index and its inverse.
    """
    specials = ['<pad>', '<unk>']
    word_counts = Counter()
    for sentence in sentences:
        word_counts.update(sentence.split())
    # Exclude the reserved tokens from the data-derived part: if one of them
    # occurred in the data it would be listed twice, and the dict
    # comprehension below would remap it to the later index, moving '<pad>'
    # off index 0 and leaving a hole in idx2word.
    words = sorted(
        word for word, count in word_counts.items()
        if count >= min_freq and word not in specials
    )
    vocab = specials + words
    word2idx = {w: i for i, w in enumerate(vocab)}
    idx2word = {i: w for w, i in word2idx.items()}
    return word2idx, idx2word

# Vocabularies are fitted on the training split only; tokens that appear
# solely in the validation split are encoded as '<unk>'.
src_word2idx, src_idx2word = build_vocab(train_eng)
tgt_word2idx, tgt_idx2word = build_vocab(train_spa)
# Vocabulary sizes determine the embedding tables and the output layer width.
SRC_VOCAB_SIZE, TGT_VOCAB_SIZE = len(src_word2idx), len(tgt_word2idx)

In [None]:
def encode(sentence, word2idx):
    """Convert a sentence into a list of at most MAX_LEN token indices.

    The sentence is split on whitespace and cut to its first MAX_LEN tokens;
    tokens missing from ``word2idx`` are mapped to ``word2idx['<unk>']``.
    The cut is applied blindly, so any trailing marker beyond MAX_LEN tokens
    is dropped as well.
    """
    indices = []
    for token in sentence.split()[:MAX_LEN]:
        indices.append(word2idx.get(token, word2idx['<unk>']))
    return indices

def pad(seq):
    """Return a new list: ``seq`` right-padded with 0 up to MAX_LEN entries.

    A sequence already longer than MAX_LEN comes back unextended, because a
    negative repeat count yields an empty padding list.
    """
    missing = MAX_LEN - len(seq)
    padding = [0] * missing
    return seq + padding

In [None]:
class TranslationDataset(Dataset):
    """Parallel corpus stored as fixed-length lists of token indices.

    Both sides are encoded and padded once in the constructor; tensors are
    created per item on access.
    """

    def __init__(self, src, tgt):
        """Encode and pad every source / target sentence with its own vocabulary."""
        def to_padded_ids(sentences, vocab):
            return [pad(encode(text, vocab)) for text in sentences]

        self.src = to_padded_ids(src, src_word2idx)
        self.tgt = to_padded_ids(tgt, tgt_word2idx)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.src)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` index tensors for pair ``idx``."""
        source_ids = torch.tensor(self.src[idx])
        target_ids = torch.tensor(self.tgt[idx])
        return source_ids, target_ids

# Mini-batches over the training pairs, reshuffled every epoch.
train_loader = DataLoader(
    dataset=TranslationDataset(train_eng, train_spa),
    batch_size=BATCH_SIZE,
    shuffle=True,
)


In [None]:
# --------- Encoder ---------
class Encoder(nn.Module):
    """Single-layer LSTM encoder that summarises a source sentence.

    Args:
        vocab_size: Number of source tokens; defaults to ``SRC_VOCAB_SIZE``.
        embed_size: Embedding width; defaults to ``EMBED_SIZE``.
        hidden_size: LSTM state width; defaults to ``HIDDEN_SIZE``.
    """

    def __init__(self, vocab_size=None, embed_size=None, hidden_size=None):
        super().__init__()
        vocab_size = SRC_VOCAB_SIZE if vocab_size is None else vocab_size
        embed_size = EMBED_SIZE if embed_size is None else embed_size
        hidden_size = HIDDEN_SIZE if hidden_size is None else hidden_size
        # Index 0 is the padding token: keep its embedding at zero and untrained.
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

    def forward(self, x):
        """Encode a batch of right-padded index sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)`` padded with 0 on
                the right.

        Returns:
            ``(h, c)``: final hidden and cell state, each of shape
            ``(1, batch, hidden_size)``, taken at the last real token of
            every sequence.
        """
        embedded = self.embed(x)
        # Real length = number of non-pad positions. An all-padding row is
        # treated as length 1 because packing rejects zero-length sequences.
        # pack_padded_sequence expects the lengths on the CPU.
        lengths = (x != 0).sum(dim=1).clamp(min=1).cpu()
        # Packing stops the LSTM at each sequence's true end, so the returned
        # state does not depend on how much padding follows the sentence.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h, c) = self.lstm(packed)
        return h, c

# --------- Decoder ---------
class Decoder(nn.Module):
    """Single-layer LSTM decoder that is advanced one target token at a time."""

    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=TGT_VOCAB_SIZE, embedding_dim=EMBED_SIZE)
        self.lstm = nn.LSTM(input_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, batch_first=True)
        self.fc = nn.Linear(in_features=HIDDEN_SIZE, out_features=TGT_VOCAB_SIZE)

    def forward(self, x, h, c):
        """Run one decoding step.

        Args:
            x: 1-D tensor of the previous token index for each batch element.
            h, c: Current LSTM hidden and cell state.

        Returns:
            ``(logits, h, c)``: unnormalised vocabulary scores for the next
            token and the updated LSTM state.
        """
        # Insert a length-1 time axis so the batch_first LSTM sees one step.
        step_tokens = x.unsqueeze(dim=1)
        step_embedded = self.embed(step_tokens)
        step_output, state = self.lstm(step_embedded, (h, c))
        h, c = state
        # Drop the time axis again before projecting onto the vocabulary.
        logits = self.fc(step_output.squeeze(dim=1))
        return logits, h, c


In [None]:
# --------- Seq2Seq ---------
class Seq2Seq(nn.Module):
    """Encoder-decoder translation model trained with scheduled teacher forcing."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, src, tgt, teacher_forcing_ratio=None):
        """Decode ``tgt`` step by step, conditioned on ``src``.

        Args:
            src: Source index tensor of shape ``(batch, src_len)``.
            tgt: Target index tensor of shape ``(batch, tgt_len)``; column 0
                is used as the first decoder input.
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the gold token instead of the
                model's own prediction. Defaults to ``TEACHER_FORCING_RATIO``;
                pass ``0.0`` for free-running decoding.

        Returns:
            Tensor of shape ``(batch, tgt_len, vocab)`` holding the logits for
            positions ``1 .. tgt_len - 1``; position 0 is left as zeros.
        """
        if teacher_forcing_ratio is None:
            teacher_forcing_ratio = TEACHER_FORCING_RATIO

        # Use the device the model actually lives on and move both inputs
        # there once, so every slice of tgt used below is on the same device
        # as the decoder (previously only tgt[:, 0] was moved).
        model_device = next(self.parameters()).device
        src = src.to(model_device)
        tgt = tgt.to(model_device)
        batch_size, tgt_len = tgt.shape

        h, c = self.encoder(src)
        x = tgt[:, 0]

        outputs = None
        for t in range(1, tgt_len):
            output, h, c = self.decoder(x, h, c)
            if outputs is None:
                # Allocate lazily so dtype, device and vocabulary width follow
                # the decoder's real output.
                outputs = output.new_zeros(batch_size, tgt_len, output.size(-1))
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            x = tgt[:, t] if teacher_force else output.argmax(1)

        if outputs is None:
            # tgt_len < 2: nothing was decoded; return the all-zero tensor.
            outputs = torch.zeros(batch_size, tgt_len, TGT_VOCAB_SIZE, device=model_device)
        return outputs


In [None]:
# --------- Training ---------
model = Seq2Seq().to(device)
optimizer = optim.Adam(model.parameters())
# Index 0 is the padding id, so padded target positions do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for src_batch, tgt_batch in train_loader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        optimizer.zero_grad()
        logits = model(src_batch, tgt_batch)
        # The model fills positions 1.. only (position 0 is the start token),
        # so drop column 0 from both sides and flatten to (batch * steps, ...).
        step_logits = logits[:, 1:].reshape(-1, TGT_VOCAB_SIZE)
        step_targets = tgt_batch[:, 1:].reshape(-1)
        loss = criterion(step_logits, step_targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

Epoch 1, Loss: 5.3167
Epoch 2, Loss: 3.8150
Epoch 3, Loss: 3.0193
Epoch 4, Loss: 2.4428
Epoch 5, Loss: 2.0017
Epoch 6, Loss: 1.6759
Epoch 7, Loss: 1.4310
Epoch 8, Loss: 1.2421
Epoch 9, Loss: 1.0975
Epoch 10, Loss: 0.9807
Epoch 11, Loss: 0.8792
Epoch 12, Loss: 0.7982
Epoch 13, Loss: 0.7251
Epoch 14, Loss: 0.6597
Epoch 15, Loss: 0.6180
Epoch 16, Loss: 0.5689
Epoch 17, Loss: 0.5336
Epoch 18, Loss: 0.4987
Epoch 19, Loss: 0.4697
Epoch 20, Loss: 0.4441


In [None]:
# --------- Translation ---------
def translate(sentence):
    """Greedily translate one English sentence with the trained model.

    The sentence is normalised, encoded and padded like the training data,
    then decoded token by token (arg-max at each step) until ``<eos>`` is
    predicted or MAX_LEN tokens have been produced.

    Args:
        sentence: Raw English text.

    Returns:
        The predicted target tokens joined by single spaces, without the
        ``<sos>`` / ``<eos>`` markers.

    Note:
        Switches ``model`` to evaluation mode and leaves it there.
    """
    model.eval()
    sos_idx = tgt_word2idx['<sos>']
    eos_idx = tgt_word2idx['<eos>']

    src = pad(encode(normalize_text(sentence), src_word2idx))
    src_tensor = torch.tensor([src]).to(device)

    tgt_indices = [sos_idx]
    # Inference only: without no_grad() every decoding step would record an
    # autograd graph that is never used.
    with torch.no_grad():
        h, c = model.encoder(src_tensor)
        for _ in range(MAX_LEN):
            x = torch.tensor([tgt_indices[-1]]).to(device)
            output, h, c = model.decoder(x, h, c)
            pred = output.argmax(1).item()
            if pred == eos_idx:
                break
            tgt_indices.append(pred)

    # Skip the leading <sos> when rendering the result.
    return ' '.join(tgt_idx2word[i] for i in tgt_indices[1:])


In [None]:
# Sample Translations
# Quick qualitative check on a couple of hand-picked sentences.
for sample in ("Hey! What are you doing?", "Run! There is danger ahead"):
    print(f"Translate: '{sample}'", "->", translate(sample))


Translate: 'Hey! What are you doing?' -> ¿que ¿que significa haciendo?
Translate: 'Run! There is danger ahead' -> algo alla es bajo el aire.
