# LSTM Traductor

Replicar y extender el traductor:
- Replicar el modelo en PyTorch.
- Extender el entrenamiento a más datos y tamaños de
secuencias mayores.
- Explorar el impacto de la cantidad de neuronas en
las capas recurrentes.
- Mostrar 5 ejemplos de traducciones generadas.
- Extras que se pueden probar: embeddings
preentrenados para los dos idiomas; cambiar la
estrategia de generación (por ejemplo, muestreo
aleatorio).

**1) Librerías**

In [None]:
import os
import re
import torch
import random
import urllib.request
import zipfile
import requests
import gzip
import shutil
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm


**2) Descargar y preparar dataset spa-eng**

In [None]:
# Locations of the spa-eng archive, the folder it unpacks to, and the
# tab-separated sentence file the rest of the notebook reads.
dataset_zip = "spa-eng.zip"
dataset_folder = "spa-eng"
dataset_txt = os.path.join(dataset_folder, "spa.txt")

# Check for the text file itself rather than just the folder, so that a
# half-extracted folder triggers a fresh extraction instead of a failure.
if not os.path.exists(dataset_txt):
    if not os.path.exists(dataset_zip):
        print("Descargando spa-eng.zip …")
        urllib.request.urlretrieve("http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip", dataset_zip)
    print("Descomprimiendo …")
    with zipfile.ZipFile(dataset_zip, "r") as z:
        z.extractall()
else:
    print("Dataset spa-eng ya presente")

# Explicit raise instead of `assert`: assertions are removed under `python -O`.
if not os.path.exists(dataset_txt):
    raise FileNotFoundError("No se encontró el archivo spa.txt después de la descarga.")


Descargando spa-eng.zip …
Descomprimiendo …


**3) Lectura y preprocesamiento**

In [None]:
# Read the tab-separated corpus into a list of (english, spanish) tuples,
# lower-cased and stripped of surrounding whitespace.
pairs = []
with open(dataset_txt, encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        # Blank or malformed lines have fewer than two columns; skip them
        # instead of crashing on the tuple unpacking below.
        if len(fields) < 2:
            continue
        # Any columns after the first two are ignored.
        eng, spa = fields[:2]
        eng = eng.lower().strip()
        spa = spa.lower().strip()
        pairs.append((eng, spa))

print(f"Total de pares cargados: {len(pairs)}")
if pairs:
    print("Ejemplo:", pairs[0])


Total de pares cargados: 118964
Ejemplo: ('go.', 've.')


**4) Tokenización y vocabularios**

In [None]:
# Compiled once at import time; matches maximal runs of word characters.
_WORD_PATTERN = re.compile(r'\b\w+\b')


def tokenize(text):
    """Return the lower-cased word tokens of *text*.

    Punctuation is discarded and acts as a separator, so a contraction such
    as "I'm" yields the two tokens "i" and "m".
    """
    lowered = text.lower()
    return _WORD_PATTERN.findall(lowered)

def build_vocab(sentences, min_freq=2):
    """Build a token -> id mapping from an iterable of sentences.

    Ids 0-3 are reserved for '<pad>', '<sos>', '<eos>' and '<unk>'. Every
    token that occurs at least *min_freq* times gets the next free id, in
    order of first appearance.
    """
    # First pass: count how often each token occurs across all sentences.
    counts = {}
    for sentence in sentences:
        for token in tokenize(sentence):
            if token in counts:
                counts[token] += 1
            else:
                counts[token] = 1

    # Second pass: assign consecutive ids to the tokens that are frequent enough.
    vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    frequent_tokens = (tok for tok, n in counts.items() if n >= min_freq)
    for tok in frequent_tokens:
        vocab[tok] = len(vocab)
    return vocab

# One vocabulary per language, each built from its own side of the pairs.
english_sentences = [pair[0] for pair in pairs]
spanish_sentences = [pair[1] for pair in pairs]
eng_vocab = build_vocab(english_sentences)
spa_vocab = build_vocab(spanish_sentences)

print("Vocab ENG:", len(eng_vocab))
print("Vocab ESP:", len(spa_vocab))


Vocab ENG: 8845
Vocab ESP: 15254


**5) Codificación y Dataset**

In [None]:
def encode_sentence(sentence, vocab, max_len=20):
    """Convert *sentence* into a list of token ids padded to *max_len*.

    The result is '<sos>' + token ids + '<eos>', with the tokens truncated
    to ``max_len - 2`` so the two markers fit, then right-padded with
    '<pad>'. Tokens missing from *vocab* map to '<unk>'.
    """
    token_ids = [vocab.get(tok, vocab['<unk>']) for tok in tokenize(sentence)]
    encoded = [vocab['<sos>'], *token_ids[:max_len - 2], vocab['<eos>']]
    # A non-positive multiplier produces an empty list, i.e. no padding.
    padding = [vocab['<pad>']] * (max_len - len(encoded))
    return encoded + padding

# Fixed length (including <sos>/<eos>) of every encoded sentence.
MAX_LEN = 20

# Encode each sentence pair as (english ids, spanish ids).
data = [
    (
        encode_sentence(pair[0], eng_vocab, MAX_LEN),
        encode_sentence(pair[1], spa_vocab, MAX_LEN),
    )
    for pair in pairs
]

# 80/20 train/validation split with a fixed seed.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

class TranslationDataset(Dataset):
    """Dataset over a sequence of (source ids, target ids) pairs.

    Each item is returned as a pair of tensors built from the stored lists.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        source = torch.tensor(example[0])
        target = torch.tensor(example[1])
        return source, target

# Mini-batch size shared by the training and validation loaders.
BATCH_SIZE = 64

train_dataset = TranslationDataset(train_data)
val_dataset = TranslationDataset(val_data)

# Only the training loader shuffles its examples.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


**6) Carga de Embeddings (GloVe inglés, FastText español)**

In [None]:
def _download_file(url, dest_path, chunk_size=1 << 20):
    """Stream *url* to *dest_path* without holding the whole body in memory.

    The data is written to a temporary ``.part`` file that is renamed once
    the transfer finishes, so an interrupted download never leaves a
    truncated file under the final name. Raises ``requests.HTTPError`` on a
    non-success HTTP status.
    """
    tmp_path = dest_path + ".part"
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(tmp_path, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                out_file.write(chunk)
    os.replace(tmp_path, dest_path)


os.makedirs("embeddings", exist_ok=True)

# Download the English GloVe vectors (only the 300d file is extracted).
glove_path = "embeddings/glove.6B.300d.txt"
if not os.path.exists(glove_path):
    print("Descargando GloVe 300d (en inglés)...")
    glove_zip = "embeddings/glove.6B.zip"
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    _download_file(url, glove_zip)
    with zipfile.ZipFile(glove_zip, "r") as zip_ref:
        zip_ref.extract("glove.6B.300d.txt", "embeddings")

# Download the Spanish FastText vectors and decompress them.
fasttext_path = "embeddings/cc.es.300.vec"
if not os.path.exists(fasttext_path):
    print("Descargando FastText español...")
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.es.300.vec.gz"
    fasttext_gz = "embeddings/cc.es.300.vec.gz"
    _download_file(url, fasttext_gz)
    # Decompress to a temporary name first: the existence check above must
    # never see a partially written .vec file as a finished one.
    fasttext_tmp = fasttext_path + ".part"
    with gzip.open(fasttext_gz, "rb") as f_in, open(fasttext_tmp, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(fasttext_tmp, fasttext_path)

print("Embeddings listos.")


Descargando GloVe 300d (en inglés)...
Descargando FastText español...
Embeddings listos.


**7) Cargar los vectores a memoria**

In [None]:
def load_embeddings(file_path, vocab, dim=300):
    """Build a ``(len(vocab), dim)`` float32 embedding tensor for *vocab*.

    Every row starts as a random normal vector (scale 0.6). Rows whose word
    appears in the text file at *file_path* are overwritten with the first
    *dim* values found on that word's line; all other rows, including the
    special tokens unless the file contains them, keep the random values.
    """
    weights = np.random.normal(scale=0.6, size=(len(vocab), dim))
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in tqdm(handle, desc=f"Cargando {file_path}"):
            fields = raw_line.rstrip().split(' ')
            # Lines with too few fields (such as a header line) cannot hold
            # a word plus `dim` values, so they are ignored.
            if len(fields) > dim and fields[0] in vocab:
                row = vocab[fields[0]]
                weights[row] = np.asarray(fields[1:dim + 1], dtype='float32')
    return torch.tensor(weights, dtype=torch.float32)

# Pretrained embedding matrices aligned with each vocabulary's ids.
emb_eng = load_embeddings(file_path=glove_path, vocab=eng_vocab)
emb_spa = load_embeddings(file_path=fasttext_path, vocab=spa_vocab)


Cargando embeddings/glove.6B.300d.txt: 400000it [00:10, 36576.54it/s]
Cargando embeddings/cc.es.300.vec: 2000001it [00:42, 47423.52it/s]


**8) Modelo Encoder-Decoder**

In [None]:
class Encoder(nn.Module):
    """LSTM encoder over frozen pretrained embeddings.

    *vocab_size* is accepted for signature symmetry with the decoder but is
    not used; the embedding table size comes from *embeddings*.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, embeddings):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=True)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        """Return the final hidden and cell states for the id batch *x*."""
        embedded = self.embedding(x)
        # The per-step outputs are discarded; only the final state is used.
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell

class Decoder(nn.Module):
    """LSTM decoder over frozen pretrained embeddings.

    A linear layer projects each LSTM output to *vocab_size* logits.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, embeddings):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=True)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, h, c):
        """Run the LSTM on ids *x* from state (*h*, *c*).

        Returns the vocabulary logits and the updated hidden and cell states.
        """
        embedded = self.embedding(x)
        lstm_out, (next_h, next_c) = self.lstm(embedded, (h, c))
        logits = self.fc(lstm_out)
        return logits, next_h, next_c

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one step at a time.

    The decoder must expose its output projection as ``fc`` (an
    ``nn.Linear``); the width of the returned logits is read from it.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return logits of shape ``(batch, trg_len, vocab)`` for *trg*.

        Position 0 of the output is left as zeros; decoding starts from
        ``trg[:, 0]``. At every step the next decoder input is the
        ground-truth token with probability *teacher_forcing_ratio*,
        otherwise the decoder's own argmax prediction.
        """
        batch_size, trg_len = trg.shape
        # Take the output width from the decoder itself rather than from a
        # module-level vocabulary, so the model is self-contained.
        vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(batch_size, trg_len, vocab_size, device=self.device)

        h, c = self.encoder(src)
        x = trg[:, 0].unsqueeze(1)
        for t in range(1, trg_len):
            out, h, c = self.decoder(x, h, c)
            outputs[:, t] = out.squeeze(1)
            use_teacher = np.random.rand() < teacher_forcing_ratio
            x = trg[:, t].unsqueeze(1) if use_teacher else out.argmax(2)
        return outputs


**9) Entrenamiento**

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

def train_model(hidden_size, epochs=10):
    """Train a Seq2Seq translator with *hidden_size* LSTM units.

    Uses the module-level vocabularies, embedding matrices, ``train_loader``
    and ``DEVICE``. Prints the mean training loss after every epoch, saves
    the weights to ``seq2seq_hidden{hidden_size}.pt`` and returns the model.
    """
    print(f"\nEntrenando modelo con HIDDEN_SIZE = {hidden_size}\n")

    # Read the embedding width from the matrices themselves: the LSTM input
    # size has to match it, and this avoids relying on a separate constant.
    encoder = Encoder(len(eng_vocab), emb_eng.shape[1], hidden_size, emb_eng)
    decoder = Decoder(len(spa_vocab), emb_spa.shape[1], hidden_size, emb_spa)
    model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)

    # Padding positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=spa_vocab['<pad>'])
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for src, trg in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            src, trg = src.to(DEVICE), trg.to(DEVICE)
            optimizer.zero_grad()
            output = model(src, trg)
            output_dim = output.shape[-1]
            # Position 0 holds <sos> in the target and zeros in the output,
            # so both are sliced from position 1 onwards.
            loss = criterion(output[:, 1:].reshape(-1, output_dim), trg[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1} | Loss: {epoch_loss/len(train_loader):.4f}")

    # Save the weights only (state_dict), not the pickled model object.
    model_path = f"seq2seq_hidden{hidden_size}.pt"
    torch.save(model.state_dict(), model_path)
    print(f"Modelo guardado en: {model_path}\n")

    return model


In [None]:
# Train the 256-unit configuration for 10 epochs.
model_256 = train_model(256, epochs=10)


Entrenando modelo con HIDDEN_SIZE = 256



Epoch 1/10: 100%|██████████| 1488/1488 [02:42<00:00,  9.13it/s]


Epoch 1 | Loss: 5.4048


Epoch 2/10: 100%|██████████| 1488/1488 [02:42<00:00,  9.15it/s]


Epoch 2 | Loss: 4.3525


Epoch 3/10: 100%|██████████| 1488/1488 [02:41<00:00,  9.24it/s]


Epoch 3 | Loss: 3.7351


Epoch 4/10: 100%|██████████| 1488/1488 [02:41<00:00,  9.23it/s]


Epoch 4 | Loss: 3.2881


Epoch 5/10: 100%|██████████| 1488/1488 [02:43<00:00,  9.10it/s]


Epoch 5 | Loss: 2.9540


Epoch 6/10: 100%|██████████| 1488/1488 [02:41<00:00,  9.20it/s]


Epoch 6 | Loss: 2.7023


Epoch 7/10: 100%|██████████| 1488/1488 [02:40<00:00,  9.27it/s]


Epoch 7 | Loss: 2.4910


Epoch 8/10: 100%|██████████| 1488/1488 [02:41<00:00,  9.19it/s]


Epoch 8 | Loss: 2.3190


Epoch 9/10: 100%|██████████| 1488/1488 [02:43<00:00,  9.09it/s]


Epoch 9 | Loss: 2.1831


Epoch 10/10: 100%|██████████| 1488/1488 [02:43<00:00,  9.09it/s]


Epoch 10 | Loss: 2.0540
Modelo guardado en: seq2seq_hidden256.pt



In [None]:
# Train the 512-unit configuration for 10 epochs.
model_512 = train_model(512, epochs=10)


Entrenando modelo con HIDDEN_SIZE = 512



Epoch 1/10: 100%|██████████| 1488/1488 [03:03<00:00,  8.09it/s]


Epoch 1 | Loss: 5.2214


Epoch 2/10: 100%|██████████| 1488/1488 [03:04<00:00,  8.08it/s]


Epoch 2 | Loss: 3.8036


Epoch 3/10: 100%|██████████| 1488/1488 [03:03<00:00,  8.09it/s]


Epoch 3 | Loss: 3.0549


Epoch 4/10: 100%|██████████| 1488/1488 [03:03<00:00,  8.09it/s]


Epoch 4 | Loss: 2.5824


Epoch 5/10: 100%|██████████| 1488/1488 [03:04<00:00,  8.08it/s]


Epoch 5 | Loss: 2.2335


Epoch 6/10: 100%|██████████| 1488/1488 [03:03<00:00,  8.11it/s]


Epoch 6 | Loss: 1.9647


Epoch 7/10: 100%|██████████| 1488/1488 [03:03<00:00,  8.11it/s]


Epoch 7 | Loss: 1.7601


Epoch 8/10: 100%|██████████| 1488/1488 [03:03<00:00,  8.12it/s]


Epoch 8 | Loss: 1.6033


Epoch 9/10: 100%|██████████| 1488/1488 [03:04<00:00,  8.06it/s]


Epoch 9 | Loss: 1.4635


Epoch 10/10: 100%|██████████| 1488/1488 [03:02<00:00,  8.18it/s]


Epoch 10 | Loss: 1.3562
Modelo guardado en: seq2seq_hidden512.pt



In [None]:
# Train the 1024-unit configuration for 10 epochs.
model_1024 = train_model(1024, epochs=10)


Entrenando modelo con HIDDEN_SIZE = 1024



Epoch 1/10: 100%|██████████| 1488/1488 [04:06<00:00,  6.03it/s]


Epoch 1 | Loss: 4.8830


Epoch 2/10: 100%|██████████| 1488/1488 [04:07<00:00,  6.01it/s]


Epoch 2 | Loss: 3.2279


Epoch 3/10: 100%|██████████| 1488/1488 [04:07<00:00,  6.01it/s]


Epoch 3 | Loss: 2.4786


Epoch 4/10: 100%|██████████| 1488/1488 [04:07<00:00,  6.02it/s]


Epoch 4 | Loss: 1.9949


Epoch 5/10: 100%|██████████| 1488/1488 [04:06<00:00,  6.03it/s]


Epoch 5 | Loss: 1.6579


Epoch 6/10: 100%|██████████| 1488/1488 [04:07<00:00,  6.01it/s]


Epoch 6 | Loss: 1.4066


Epoch 7/10: 100%|██████████| 1488/1488 [04:07<00:00,  6.01it/s]


Epoch 7 | Loss: 1.2138


Epoch 8/10: 100%|██████████| 1488/1488 [04:07<00:00,  6.00it/s]


Epoch 8 | Loss: 1.0576


Epoch 9/10: 100%|██████████| 1488/1488 [04:07<00:00,  6.00it/s]


Epoch 9 | Loss: 0.9405


Epoch 10/10: 100%|██████████| 1488/1488 [04:07<00:00,  6.00it/s]


Epoch 10 | Loss: 0.8354
Modelo guardado en: seq2seq_hidden1024.pt



**10) Traducción (Inferencia)**

In [None]:
# Translation function
def translate_sentence(sentence, model, max_len=MAX_LEN):
    """Greedily translate an English *sentence* into Spanish with *model*.

    The source is encoded to the module-level ``MAX_LEN``; decoding starts
    from '<sos>' and stops at '<eos>' or after *max_len* steps. Returns the
    generated words joined by spaces ('?' for ids missing from the
    vocabulary). Puts *model* in eval mode as a side effect.
    """
    model.eval()
    generated = []
    with torch.no_grad():
        src_ids = encode_sentence(sentence, eng_vocab, MAX_LEN)
        src = torch.tensor(src_ids).unsqueeze(0).to(DEVICE)
        h, c = model.encoder(src)

        step_input = torch.tensor([[spa_vocab['<sos>']]]).to(DEVICE)
        for _ in range(max_len):
            logits, h, c = model.decoder(step_input, h, c)
            # Greedy choice: the most likely token becomes the next input.
            step_input = logits.argmax(2)
            next_id = step_input.item()
            if next_id == spa_vocab['<eos>']:
                break
            generated.append(next_id)

        id_to_word = {idx: word for word, idx in spa_vocab.items()}
        words = [id_to_word.get(token_id, '?') for token_id in generated]
        return ' '.join(words)


# 🔹 Load trained models
def load_model(hidden_size, model_path):
    """Rebuild a Seq2Seq model and load its weights from *model_path*.

    *hidden_size* must match the value used when the checkpoint was
    trained. The model is moved to ``DEVICE`` and returned in eval mode.
    """
    # Read the embedding width from the matrices themselves: the LSTM input
    # size has to match it, and this avoids relying on a separate constant.
    encoder = Encoder(len(eng_vocab), emb_eng.shape[1], hidden_size, emb_eng)
    decoder = Decoder(len(spa_vocab), emb_spa.shape[1], hidden_size, emb_spa)
    model = Seq2Seq(encoder, decoder, DEVICE).to(DEVICE)
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()
    return model

# Reload the three trained checkpoints from disk.
model_256 = load_model(hidden_size=256, model_path="seq2seq_hidden256.pt")
model_512 = load_model(hidden_size=512, model_path="seq2seq_hidden512.pt")
model_1024 = load_model(hidden_size=1024, model_path="seq2seq_hidden1024.pt")

# 🔹 Test sentences
test_sentences = [
    "I love music.",
    "This book is very interesting.",
    "I need help.",
    "Thanks for everything.",
    "my mother calls me.",
    "You're very nice.",
    "I need to go home.",
    "I'm cleaning my room.",
    "This is very important.",
    "See you later.",
]

# 🔹 Compare the translations of the three models sentence by sentence
for sentence in test_sentences:
    print(f"\nEnglish: {sentence}")
    print(f"🔸 256 units: {translate_sentence(sentence, model_256)}")
    print(f"🔸 512 units: {translate_sentence(sentence, model_512)}")
    print(f"🔸 1024 units: {translate_sentence(sentence, model_1024)}")



English: I love music.
🔸 256 units: me encanta la música
🔸 512 units: me encanta la música
🔸 1024 units: me encanta el música

English: This book is very interesting.
🔸 256 units: este libro es muy interesante
🔸 512 units: este libro es muy interesante
🔸 1024 units: este libro es muy interesante

English: I need help.
🔸 256 units: necesito ayuda
🔸 512 units: necesito ayuda
🔸 1024 units: necesito ayuda

English: Thanks for everything.
🔸 256 units: gracias por todo
🔸 512 units: gracias por todo
🔸 1024 units: gracias por todo

English: my mother calls me.
🔸 256 units: mi madre me invitó
🔸 512 units: mi madre me llamó
🔸 1024 units: mi madre me llama

English: You're very nice.
🔸 256 units: eres muy bien
🔸 512 units: eres muy bueno
🔸 1024 units: eres muy simpático

English: I need to go home.
🔸 256 units: necesito irme a casa
🔸 512 units: necesito ir a casa
🔸 1024 units: necesito ir a casa

English: I'm cleaning my room.
🔸 256 units: estoy limpiando mi cuarto
🔸 512 units: estoy limpiando m

**Conclusiones**

Calidad de las traducciones:

Todos los modelos logran buenas traducciones, pero:

- El modelo con 512 neuronas ofrece un balance ideal entre precisión y eficiencia, produciendo traducciones más naturales y fluidas.

- El modelo con 1024 neuronas, aunque más preciso, a veces genera palabras contextualmente similares pero no tan exactas.

- Aumentar el tamaño de la capa oculta (hidden size) mejora la calidad, pero incrementa el tiempo de entrenamiento.