In [None]:
import keras
import torch
import re
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as Vocab
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Show the installed PyTorch and Keras versions as the cell output.
torch.__version__, keras.__version__


In [None]:
# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(17)

### Process Data

In [26]:
def _window_pairs(tokens, maxlen):
    """Pair every full window of ``maxlen`` tokens, and each of its suffixes, with the tokens that follow it."""
    pairs = []
    # Stop before the last window: nothing follows it, so it would only yield
    # pairs with an empty target (and empty inputs when the window is short).
    for start in range(0, len(tokens) - maxlen, maxlen):
        inp = tokens[start:start + maxlen]
        out = ' '.join(tokens[start + maxlen:start + maxlen * 2])
        # offset 0 is the full window; offsets 1..maxlen-1 are its suffixes.
        pairs.extend((' '.join(inp[offset:]), out) for offset in range(maxlen))
    return pairs


def process_text(paths, maxlen=15):
    """Read text files and build (context, continuation) training pairs.

    Each file is read as UTF-8, lower-cased and split into word and
    punctuation tokens. The tokens of every file are cut into consecutive
    windows of ``maxlen`` tokens. Each window, and each of its suffixes, is
    paired with the (up to ``maxlen``) tokens that follow the window. Pairs
    never span two files.

    Args:
        paths: Iterable of paths to UTF-8 encoded text files.
        maxlen: Number of tokens per window. Defaults to 15.

    Returns:
        A list of ``(input_text, output_text)`` tuples of space-joined tokens,
        in file order. The list is empty when ``paths`` is empty or when no
        file has more than ``maxlen`` tokens.

    Raises:
        ValueError: If ``maxlen`` is smaller than 1.
    """
    if maxlen < 1:
        raise ValueError(f'maxlen must be a positive integer, got {maxlen!r}')

    total_words = 0
    text_pairs = []

    for path in paths:
        # Read bytes and decode explicitly so the result does not depend on
        # the platform default encoding; `with` guarantees the file is closed.
        with open(path, 'rb') as handle:
            book = handle.read().decode(encoding='utf-8').lower()

        word_count = len(book.split())
        print(f'{path} - Words: {word_count}')
        total_words += word_count

        # Extract words and punctuation marks from this book and build its
        # pairs inside the loop, so every book contributes (not just the last).
        tokens = re.findall(r'\b\w+\b|[\.,;!?()"\']', book)
        text_pairs.extend(_window_pairs(tokens, maxlen))

    print(f'Total Words: {total_words}')

    # Preview the first pairs; slicing is safe when fewer than five exist.
    for pair in text_pairs[:5]:
        print(pair)

    return text_pairs

In [None]:
# TODO: fill in the paths of the UTF-8 text files to train on; with an empty
# list there is no text to build pairs from.
paths = []
text_pairs = process_text(paths)

### Pipeline

In [None]:
# !python -m spacy download en_core_web_sm
# Create an English spaCy tokenizer (uses the en_core_web_sm model named above).
eng_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [6]:
def build_vocab(text, tokenizers, min_freq=5):
    """Build a vocabulary from every string in a collection of text pairs.

    Token frequencies are counted over both sides of each pair, and tokens
    that do not reach ``min_freq`` are left out of the vocabulary.

    Args:
        text: Iterable of ``(input_text, output_text)`` string pairs.
        tokenizers: Callable that splits a string into tokens; it is applied
            to both sides of every pair.
        min_freq: Minimum frequency handed to the vocabulary factory.

    Returns:
        A ``(vocab, None)`` tuple. The second element is a placeholder kept
        so that existing ``vocab, _ = build_vocab(...)`` call sites still work.
    """
    tokenize = tokenizers
    token_counts = Counter()
    for prev_text, post_text in text:
        token_counts.update(tokenize(prev_text))
        token_counts.update(tokenize(post_text))

    eng_vocab = Vocab(token_counts, min_freq=min_freq,
                      specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    # Tokens dropped by min_freq (or never seen) resolve to <unk> on lookup
    # instead of raising.
    eng_vocab.set_default_index(eng_vocab['<unk>'])
    # Return an explicit placeholder rather than the undefined name `_`.
    return eng_vocab, None

In [None]:
# Build the vocabulary from all text pairs; min_freq=0 is passed so no token
# is filtered out by frequency. The second return value is discarded.
eng_vocab, _ = build_vocab(text_pairs, eng_tokenizer, min_freq=0)

In [None]:
# Vocabulary size (special tokens included).
len(eng_vocab)

In [8]:
def data_process(text, maxlen=15):
    """Convert text pairs into pairs of vocabulary-index tensors.

    Both sides of each pair are tokenized with ``eng_tokenizer`` and looked up
    in ``eng_vocab``. A pair is kept only when its input side has at most
    ``maxlen`` tokens; for every rejected pair a ``0`` is printed.

    Args:
        text: Iterable of ``(input_text, output_text)`` string pairs.
        maxlen: Largest number of input tokens a kept pair may have.

    Returns:
        List of ``(input_ids, output_ids)`` tuples of ``torch.long`` tensors.
    """
    def encode(sentence):
        # Token strings -> 1-D tensor of vocabulary indices.
        indices = [eng_vocab[token] for token in eng_tokenizer(sentence)]
        return torch.tensor(indices, dtype=torch.long)

    kept = []
    for source_text, target_text in text:
        source_ids = encode(source_text)
        target_ids = encode(target_text)

        # Only the input length is checked against the limit.
        if source_ids.shape[0] >= maxlen + 1:
            print(0)
            continue
        kept.append((source_ids, target_ids))
    return kept

In [None]:
# Encode every text pair as index tensors, keeping inputs of at most 15 tokens.
train_data = data_process(text_pairs, maxlen=15)

In [None]:
# Number of encoded training pairs that were kept.
len(train_data)

In [None]:
# Batch size and the vocabulary indices of the padding / start / end tokens.
batch_size = 64
PAD_IDX, BOS_IDX, EOS_IDX = (eng_vocab[token] for token in ('<pad>', '<bos>', '<eos>'))

In [None]:
def generate_batch(data_batch):
    """Collate a batch of (input, output) index tensors for training.

    Every output sequence is wrapped as ``<bos> ... <eos>``; inputs and
    wrapped outputs are then padded with ``PAD_IDX`` to a common length
    (batch dimension first).

    Args:
        data_batch: Iterable of ``(input_ids, output_ids)`` 1-D tensors.

    Returns:
        ``(inputs, decoder_inputs, targets)``, where ``decoder_inputs`` is the
        padded output without its last column and ``targets`` is the padded
        output without its first column.
    """
    samples = list(data_batch)
    bos = torch.tensor([BOS_IDX])
    eos = torch.tensor([EOS_IDX])

    sources = [source for source, _ in samples]
    framed = [torch.cat((bos, target, eos), dim=0) for _, target in samples]

    padded_sources = pad_sequence(sources, batch_first=True, padding_value=PAD_IDX)
    padded_targets = pad_sequence(framed, batch_first=True, padding_value=PAD_IDX)

    # Shift by one position: the decoder sees tokens [0, n-1) and predicts [1, n).
    decoder_inputs = padded_targets[:, :-1]
    shifted_targets = padded_targets[:, 1:]
    return padded_sources, decoder_inputs, shifted_targets

In [None]:
# Load the encoded training data in shuffled batches, collated by generate_batch.
# NOTE(review): num_workers=4 runs collation in worker processes; a collate
# function defined in a notebook may fail to load in workers on platforms that
# spawn processes — confirm, or fall back to num_workers=0.

train_loader = DataLoader(train_data, 
                        batch_size = batch_size, 
                        shuffle = True, 
                        collate_fn = generate_batch,
                        num_workers = 4, 
                        pin_memory = True)

In [None]:
# Pull one batch to inspect: inputs, decoder inputs and shifted targets,
# in the order returned by generate_batch.
enc_batch, dec_batch, target_batch = next(iter(train_loader))

In [None]:
# Show the shapes of the three tensors in the sampled batch.
enc_batch.shape, dec_batch.shape, target_batch.shape