# Importing Libraries

In [33]:
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
!pip install torchtext==0.18

Looking in indexes: https://download.pytorch.org/whl/cu121


In [34]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Disable tensorflow debugging logs
os.environ["KERAS_BACKEND"] = "torch"
import keras
import torch
import re
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torch.nn as nn
from torch import optim
import time
torch.__version__, keras.__version__


('2.3.1+cu121', '3.4.1')

In [35]:
torch.manual_seed(17)

<torch._C.Generator at 0x782360d96190>

# Processing Data

In [36]:
def process_text(paths):
    total_words = 0

    for path in paths:
        book = open(path, 'rb').read().decode(encoding='utf-8').lower()
        words = book.split()
        print(f'{path} - Words: {len(words)}')
        total_words += len(words)

    print(f'Total Words: {total_words}')

    # Extrae palabras y signos de puntuación de un texto.
    words = re.findall(r'\b\w+\b|[\.,;!?()"\']', book)

    # Genera pares de secuencias de palabras y sus subsecuencias para entrenamiento.
    maxlen = 32
    text_pairs = []
    for i in range(0, len(words), maxlen):
        inp = words[i:i + maxlen]
        out = words[i + maxlen :i + maxlen * 2]
        text_pairs.append((' '.join(inp), ' '.join(out)))
        for j in range(maxlen - 1):
            text_pairs.append((' '.join(inp[j + 1:]), ' '.join(out)))

    # Verificando cómo quedó text_pairs
    for i in range(5):
        print(text_pairs[i])

    return text_pairs

In [39]:
paths = ['./01 Harry Potter and the Sorcerers Stone.txt','./02 Harry Potter and the Chamber of Secrets.txt','./03 Harry Potter and the Prisoner of Azkaban.txt',
         './04 Harry Potter and the Goblet of Fire.txt', './05 Harry Potter and the Order of the Phoenix.txt']
text_pairs = process_text(paths)

./01 Harry Potter and the Sorcerers Stone.txt - Words: 78431
./02 Harry Potter and the Chamber of Secrets.txt - Words: 86258
./03 Harry Potter and the Prisoner of Azkaban.txt - Words: 109563
./04 Harry Potter and the Goblet of Fire.txt - Words: 191798
./05 Harry Potter and the Order of the Phoenix.txt - Words: 261673
Total Words: 727723
('the hottest day of the summer so far was drawing to a close and a drowsy silence lay over the large , square houses of privet drive . cars that were usually', 'gleaming stood dusty in their drives and lawns that were once emerald green lay parched and yellowing for the use of hosepipes had been banned due to drought . deprived of their')
('hottest day of the summer so far was drawing to a close and a drowsy silence lay over the large , square houses of privet drive . cars that were usually', 'gleaming stood dusty in their drives and lawns that were once emerald green lay parched and yellowing for the use of hosepipes had been banned due to drought . d

# Pipeline

In [40]:
# !python -m spacy download en_core_web_sm
# Crea un tokenizador de spaCy en inglés.
eng_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')



In [41]:
# Se construye un vocabulario a partir de un conjunto de texto, contando las frecuencias de las palabras y filtrando aquellas que no alcanzan un umbral mínimo

def build_vocab(text, tokenizers, min_freq=5):
    eng_tokenizer = tokenizers
    eng_counter = Counter()
    for eng_string_prev_, eng_string_post_ in text:
        eng_counter.update(eng_tokenizer(eng_string_prev_))
        eng_counter.update(eng_tokenizer(eng_string_post_))
    eng_vocab = Vocab(eng_counter, min_freq=min_freq, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    return eng_vocab

In [42]:
eng_vocab = build_vocab(text_pairs, eng_tokenizer, min_freq=0)

In [43]:
eng_vocab_size = len(eng_vocab)
eng_vocab_size

12198

In [44]:
# Convertir pares de texto en secuencias tensoriales de índices de vocabulario

def data_process(text,maxlen):
    data = []
    for eng_prev, eng_post in text:
        eng_prev_tensor_ = torch.tensor([eng_vocab[token] for token in eng_tokenizer(eng_prev)],
                                dtype=torch.long)
        eng_post_tensor_ = torch.tensor([eng_vocab[token] for token in eng_tokenizer(eng_post)],
                                dtype=torch.long)

        if eng_prev_tensor_.shape[0] < maxlen + 1:
            data.append((eng_prev_tensor_, eng_post_tensor_))
    return data

In [45]:
train_data = data_process(text_pairs, maxlen=32)

In [46]:
len(train_data)

331922

In [47]:
batch_size = 64
PAD_IDX = eng_vocab['<pad>']
BOS_IDX = eng_vocab['<bos>']
EOS_IDX = eng_vocab['<eos>']

In [48]:
# Organizar un lote de datos para el entrenamiento, agrega tokens de inicio y fin a las salidas, iguala las longitudes de las secuencias y devuelve las entradas y salidas ajustadas.

def generate_batch(data_batch):
    x, y = [], []
    for (x_item, y_item) in data_batch:
        x.append(x_item)
        y.append(torch.cat([torch.tensor([BOS_IDX]),
                            y_item,
                            torch.tensor([EOS_IDX])], dim=0))

    x = pad_sequence(x, batch_first=True, padding_value=PAD_IDX)
    y = pad_sequence(y, batch_first=True, padding_value=PAD_IDX)
    return x, y[:, :-1], y[:, 1:]

In [49]:
# Cargar y preprocesar los datos de entrenamiento en lotes

train_loader = DataLoader(train_data,
                        batch_size = batch_size,
                        shuffle = True,
                        collate_fn = generate_batch,
                        num_workers = 2,
                        pin_memory = True)

In [50]:
enc_batch, dec_batch, target_batch = next(iter(train_loader))

In [51]:
enc_batch.shape, dec_batch.shape, target_batch.shape

(torch.Size([64, 32]), torch.Size([64, 33]), torch.Size([64, 33]))

# Model Construction

In [52]:
emb_dim = 128
model_dim = 256

## Encoder

La clase Encoder implementa un modelo de codificación utilizando embeddings y una LSTM para procesar secuencias de texto, transformando entradas de palabras en vectores y produciendo estados ocultos para su uso en otras partes de un sistema de aprendizaje profundo.

In [53]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, model_dim=512):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                        hidden_size=model_dim,
                        num_layers=1,
                        batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x)
        return (hidden, cell)

In [54]:
encoder = Encoder(eng_vocab_size, emb_dim, model_dim)
state_batch = encoder(enc_batch)
state_batch[0].shape

torch.Size([1, 64, 256])

## Decoder

La clase Decoder implementa un modelo de decodificación que utiliza embeddings y una LSTM para generar secuencias, produciendo logits que representan la probabilidad de cada palabra en el vocabulario en base a la entrada y al estado previo del modelo.

In [55]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=256, model_dim=512):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim,
                        hidden_size=model_dim,
                        num_layers=1,
                        batch_first=True)
        self.fc1 = nn.Linear(model_dim, vocab_size)

    def forward(self, x, state):
        x = self.embedding(x)
        x, (hidden, cell) = self.rnn(x, state)
        x = self.fc1(x)
        return x

In [56]:
decoder = Decoder(eng_vocab_size, emb_dim, model_dim)
output_batch = decoder(dec_batch, state_batch)
output_batch.shape, target_batch.shape

(torch.Size([64, 33, 12198]), torch.Size([64, 33]))

## seq2seq

La clase Seq2seq integra un encoder y un decoder en un único modelo, donde el encoder procesa la secuencia de entrada y produce un estado que el decoder utiliza para generar la secuencia de salida.

In [57]:
class Seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, inp, tar):
        state = self.encoder(inp)
        x = self.decoder(tar, state)
        return x

In [58]:
seq2seq = Seq2seq(encoder, decoder)
output_batch = seq2seq(enc_batch, dec_batch)
output_batch.shape

torch.Size([64, 33, 12198])

# Training

In [59]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

## train



In [60]:
def train(model, device, train_loader, optimizer, epoch):
    start = time.time()
    running_loss = 0.0
    model.train()
    for inp_enc, inp_dec, tar_dec in train_loader:
        tar_dec = tar_dec.reshape(-1)
        inp_enc = inp_enc.to(device)
        inp_dec = inp_dec.to(device)
        tar_dec = tar_dec.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inp_enc, inp_dec)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = loss_fn(outputs, tar_dec)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'\nTime for epoch {epoch} is {time.time()-start:.4f} sec Train loss: {running_loss / len(train_loader):.4f}')

## translate



In [61]:
def translate(model, sentence, device):
    with torch.no_grad():
        model.eval()
        eng_idx = torch.tensor([eng_vocab[token] for token in eng_tokenizer(sentence)],
                                    dtype=torch.long)
        eng_idx = eng_idx.reshape([1, -1])

        spa_idx = torch.tensor(BOS_IDX, dtype=torch.long)
        spa_idx = spa_idx.reshape([1, -1])

        while spa_idx[:, -1] != EOS_IDX:
            eng_idx = eng_idx.to(device)
            spa_idx = spa_idx.to(device)
            logits = model(eng_idx, spa_idx)[:, -1, :]
            probs = torch.softmax(logits, dim=-1)

            _, idx_next = torch.topk(probs, k=1, dim=-1)
            spa_idx = torch.cat((spa_idx, idx_next), dim=1)

        output = " ".join([eng_vocab.get_itos()[_] for _ in spa_idx[0]])
        output = output.replace("<bos>", "").replace("<eos>", "")
    print(f'Input: {sentence}')
    print(f'Output: {output}')

## Implementation

In [62]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [63]:
torch.cuda.is_available()

True

In [64]:
seq2seq.to(device)
optimizer = optim.Adam(seq2seq.parameters(), lr=0.001)

In [67]:
sentences = ['harry and hermione are dancing under the rain, when voldemort and dumbledore are']

In [None]:
epochs = 50

for epoch in range(epochs):
    train(seq2seq, device, train_loader, optimizer, epoch)
    for s in sentences:
        translate(seq2seq, s, device)


Time for epoch 0 is 129.3793 sec Train loss: 0.5708
Input: harry and hermione are dancing under the rain, when voldemort and dumbledore are
Output:  that you had been dreaming about the corridor . she seemed completely forgotten , she 

Time for epoch 1 is 129.2112 sec Train loss: 0.4703
Input: harry and hermione are dancing under the rain, when voldemort and dumbledore are
Output:  so that meant , listen to your whereabouts . i don ' t know 

Time for epoch 2 is 130.1787 sec Train loss: 0.3953
Input: harry and hermione are dancing under the rain, when voldemort and dumbledore are
Output:  that you had told the examiner about us just slip an excuse to meet 

Time for epoch 3 is 129.6139 sec Train loss: 0.3389
Input: harry and hermione are dancing under the rain, when voldemort and dumbledore are
Output:  that you might have been umbridge in professor snape discovered removed the note in his 

Time for epoch 4 is 129.8480 sec Train loss: 0.2969
Input: harry and hermione are dancing und