# Fase 1: Importar las dependencias

**Paper original**: All you need is Attention https://arxiv.org/pdf/1706.03762.pdf

In [1]:
import os
import numpy as np
import pandas as pd
import re
import time

In [2]:
try:
    %tensorflow_version 2.x
except:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [3]:
from mlearner.nlp import Transformer
from mlearner.nlp import Processor_data

In [4]:
%load_ext autoreload
%autoreload 2

%matplotlib inline


Bad key savefig.frameon in file C:\Users\AUTIS\Anaconda3\envs\Tensorflow 2.0\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle, line 421 ('savefig.frameon : True')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.3.0/matplotlibrc.template
or from the matplotlib source distribution

Bad key verbose.level in file C:\Users\AUTIS\Anaconda3\envs\Tensorflow 2.0\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle, line 472 ('verbose.level  : silent      # one of silent, helpful, debug, debug-annoying')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.3.0/matplotlibrc.template
or from the matplotlib source distribution

Bad key verbose.fileo in file C:\Users\AUTIS\Anaconda3\envs\Tensorflow 2.0\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle, line 473 ('verbose.fileo  : sys.stdout  # a log filename, sys.stdout or sys.s

# Fase 2: Pre Procesado de Datos



## Carga de Ficheros

In [5]:
with open("data/europarl-v7.es-en.en", 
          mode = "r", encoding = "utf-8") as f:
    europarl_en = f.read()
with open("data/europarl-v7.es-en.es", 
          mode = "r", encoding = "utf-8") as f:
    europarl_es = f.read()
with open("data/P85-Non-Breaking-Prefix.en", 
          mode = "r", encoding = "utf-8") as f:
    non_breaking_prefix_en = f.read()
with open("data/P85-Non-Breaking-Prefix.en", 
          mode = "r", encoding = "utf-8") as f:
    non_breaking_prefix_es = f.read()

In [6]:
europarl_en[:230]

'Resumption of the session\nI declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive peri'

In [7]:
europarl_es[:225]

'Reanudación del período de sesiones\nDeclaro reanudado el período de sesiones del Parlamento Europeo, interrumpido el viernes 17 de diciembre pasado, y reitero a Sus Señorías mi deseo de que hayan tenido unas buenas vacaciones'

## Limpieza de datos

Definimos funcion de procesado de texto basada en expresiones regulares

In [8]:
def Function_clean(text):
    
    # Eliminamos la @ y su mención
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    # Eliminamos los links de las URLs
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    return text


Se procesan los textos para cada uno de los idiomas:

In [9]:
processor_en = Processor_data(target_vocab_size=2**13, 
                              language="en", 
                              function = Function_clean,
                              name="processor_en",
                             )
processor_es = Processor_data(target_vocab_size=2**13, 
                              language="es", 
                              function = Function_clean,
                              name="processor_es"
                              )

In [10]:
if not os.path.isfile("data/corpus_en.csv"):
    corpus_en = europarl_en
    corpus_en = processor_en.clean(corpus_en)
    pd.DataFrame(corpus_en).to_csv("data/corpus_en.csv", index=False)

if not os.path.isfile("data/corpus_es.csv"):
    corpus_es = europarl_es
    corpus_es = processor_es.clean(corpus_es)
    pd.DataFrame(corpus_es).to_csv("data/corpus_es.csv", index=False)

corpus_en = pd.read_csv("data/corpus_en.csv")    
corpus_es = pd.read_csv("data/corpus_es.csv")

Exploramos los textos para cada idioma:

In [11]:
corpus_en[0:2]

Unnamed: 0,0
0,Resumption of the session
1,I declare resumed the session of the European ...


In [12]:
corpus_es[0:2]

Unnamed: 0,0
0,Reanudación del período de sesiones
1,Declaro reanudado el período de sesiones del P...


In [13]:
len(corpus_en), len(corpus_es)

(1965735, 1965735)

## Tokenizar el Texto

Tokenizado del texto sin aplicar limpieza (aplicada en el apartado anterior) y sin padding:

In [14]:
tokens_en = processor_en.process_text(corpus_en, 
                                         isclean=True, 
                                         padding=False)

In [15]:
tokens_es = processor_es.process_text(corpus_es, 
                                         isclean=True, 
                                         padding=False)

Tamaño de Vocabulario para los dos idiomas.

In [16]:
VOCAB_SIZE_EN = processor_en.tokenizer.vocab_size + 2
VOCAB_SIZE_ES = processor_es.tokenizer.vocab_size + 2

print(VOCAB_SIZE_ES, VOCAB_SIZE_EN)

8225 8198


Sustituimos los valores NaN con valores vacios:

In [27]:
corpus_es = corpus_es.fillna(" ")
corpus_en = corpus_en.fillna(" ")

Preparación de las frases como inputs/outputs del Modelo:

> _**[ \INICIO ]**_ + frase + _**[ \FIN ]**_

- **[ \INICIO ]**: Carácter que determina el inicio de frase.
- **[ \FIN ]**: Carácter que determina el final de frase.

In [28]:
inputs = [[VOCAB_SIZE_EN-2] + \
          processor_en.tokenizer.encode(sentence[0]) + [VOCAB_SIZE_EN-1] \
            for sentence in corpus_en.values]

outputs = [[VOCAB_SIZE_ES-2] + \
           processor_es.tokenizer.encode(sentence[0]) + [VOCAB_SIZE_ES-1] 
            for sentence in corpus_es.values ]

In [29]:
len(inputs), len(outputs)

(1965735, 1965735)

## Eliminamos las frases demasiado largas

In [30]:
MAX_LENGTH = 20

idx_to_remove = [count for count, sent in enumerate(inputs)
                 if len(sent) > MAX_LENGTH]
if len(idx_to_remove) > 0:
    for idx in reversed(idx_to_remove):
        del inputs[idx]
        del outputs[idx]
    
idx_to_remove = [count for count, sent in enumerate(outputs)
                 if len(sent) > MAX_LENGTH]
if len(idx_to_remove) > 0:
    for idx in reversed(idx_to_remove):
        del inputs[idx]
        del outputs[idx]

In [31]:
len(inputs), len(outputs)

(411131, 411131)

In [32]:
import pandas as pd
pd.DataFrame(inputs).to_csv("data/inputs.csv", index=False)
pd.DataFrame(outputs).to_csv("data/outputs.csv", index=False)

## Creamos las entradas y las salidas

A medida que entrenamos con bloques, necesitaremos que cada entrada tenga la misma longitud. Rellenamos con el token apropiado, y nos aseguraremos de que este token de relleno no interfiera con nuestro entrenamiento más adelante.

In [46]:
inputs = pd.read_csv("data/inputs.csv").fillna(0).astype(int)   
outputs = pd.read_csv("data/outputs.csv").fillna(0).astype(int)

In [47]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs.values,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs.values,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

Se crea el daset generador para servir los inputs/outputs procesados.

In [48]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Modelo Transformer - Entrenamiento

In [49]:
tf.keras.backend.clear_session()

# Hiper Parámetros
D_MODEL = 128 # 512
NB_LAYERS = 4 # 6
FFN_UNITS = 512 # 2048
NB_PROJ = 8 # 8
DROPOUT_RATE = 0.1 # 0.1

transformer = Transformer(vocab_size_enc=VOCAB_SIZE_EN,
                          vocab_size_dec=VOCAB_SIZE_ES,
                          d_model=D_MODEL,
                          nb_layers=NB_LAYERS,
                          FFN_units=FFN_UNITS,
                          nb_proj=NB_PROJ,
                          dropout_rate=DROPOUT_RATE)

In [50]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction="none")

def loss_function(target, pred):
    """
    Función de cáculo de pérdidas.
    
    Se calcula a partir de elementos no nulos, por ello
    extraemos elementos del padding.
    """
    # Mascara para deshacerme de los "ceros" del padding
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    loss_ = loss_object(target, pred)
    
    # Equiparar el tipo de dato de la mascara al de la perdida
    mask = tf.cast(mask, dtype=loss_.dtype)
    
    # Multiplicamos elemento a elemento.
    loss_ *= mask
    
    # Media por bloques (media de los resultados no nulos)
    return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")

In [51]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    Custom Learning Rate Schedule:
    
        (5.3.) https://arxiv.org/pdf/1706.03762.pdf
    """
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

leaning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(leaning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)
        

In [56]:
checkpoint_path = "ckpt/"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Último checkpoint restaurado!!")

Último checkpoint restaurado!!


In [57]:
EPOCHS = 10
for epoch in range(EPOCHS):
    print("Inicio del epoch {}".format(epoch+1))
    start = time.time()
    
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)
        
        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        
        train_loss(loss)
        train_accuracy(dec_outputs_real, predictions)
        
        if batch % 50 == 0:
            print("Epoch {} Lote {} Pérdida {:.4f} Precisión {:.4f}".format(
                epoch+1, batch, train_loss.result(), train_accuracy.result()))
            
    ckpt_save_path = ckpt_manager.save()
    print("Guardando checkpoint para el epoch {} en {}".format(epoch+1,
                                                        ckpt_save_path))
    print("Tiempo que ha tardado 1 epoch: {} segs\n".format(time.time() - start))

Inicio del epoch 1
Epoch 1 Lote 0 Pérdida 1.2496 Precisión 0.4252


KeyboardInterrupt: 

# Evaluación

In [None]:
def evaluate(inp_sentence):
    inp_sentence = \
        [VOCAB_SIZE_EN-2] + tokenizer_en.encode(inp_sentence) + [VOCAB_SIZE_EN-1]
    enc_input = tf.expand_dims(inp_sentence, axis=0)
    
    output = tf.expand_dims([VOCAB_SIZE_ES-2], axis=0)
    
    for _ in range(MAX_LENGTH):
        predictions = transformer(enc_input, output, False) #(1, seq_length, VOCAB_SIZE_ES)
        
        prediction = predictions[:, -1:, :]
        
        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)
        
        if predicted_id == VOCAB_SIZE_ES-1:
            return tf.squeeze(output, axis=0)
        
        output = tf.concat([output, predicted_id], axis=-1)
        
    return tf.squeeze(output, axis=0)

In [None]:
def translate(sentence):
    output = evaluate(sentence).numpy()
    
    predicted_sentence = tokenizer_es.decode(
        [i for i in output if i < VOCAB_SIZE_ES-2]
    )
    
    print("Entrada: {}".format(sentence))
    print("Traducción predicha: {}".format(predicted_sentence))

In [None]:
translate("This is a problem we have to solve.")

In [None]:
translate("This is a really powerful tool!")

In [None]:
translate("This is an interesting course about Natural Language Processing")