In [49]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
import tensorflow as tf
from tensorflow.keras import layers
import pandas as pd

In [51]:
from preprocessing import load_non_breaking_prefixes, sentence_boundary_disambiguation
from model import Transformer, CustomSchedule, main_train
import config

In [52]:
# Read the tab-separated parallel corpus; only the first two columns are kept
# and they are labelled "eng" and "spa".
df = pd.read_csv(
    config.TRAIN_PATH,
    sep="\t",
    names=["eng", "spa"],
    usecols=[0, 1],
)

# Per-language non-breaking prefix lists, read from the paths set in `config`.
nonbreaking_prefixes_spa = load_non_breaking_prefixes(config.NONBREAKING_SPA_PATH)
nonbreaking_prefixes_eng = load_non_breaking_prefixes(config.NONBREAKING_ENG_PATH)

# Run sentence-boundary disambiguation over every sentence; each language gets
# its own prefix list as the second positional argument.
df["spa"] = df["spa"].apply(
    sentence_boundary_disambiguation, args=(nonbreaking_prefixes_spa,)
)
df["eng"] = df["eng"].apply(
    sentence_boundary_disambiguation, args=(nonbreaking_prefixes_eng,)
)

In [53]:
def vectorize_text(corpus, config, vocab=None):
    """Encode ``corpus`` as integer token ids with a Keras ``TextVectorization`` layer.

    Args:
        corpus: Collection of strings to encode (and to learn the vocabulary
            from when ``vocab`` is not supplied).
        config: Object exposing ``VOCAB_SIZE`` (passed as ``max_tokens``) and
            ``MAX_SEQUENCE_LENGTH`` (passed as ``output_sequence_length``).
        vocab: Optional existing vocabulary. When given, the layer is built
            with it and is not adapted to ``corpus``.

    Returns:
        A ``(token_ids, vocabulary)`` pair: the layer's output for ``corpus``
        and the vocabulary reported by the layer.
    """
    layer_options = {
        "max_tokens": config.VOCAB_SIZE,
        "output_mode": "int",
        "output_sequence_length": config.MAX_SEQUENCE_LENGTH,
        "vocabulary": vocab,
    }
    vectorizer = layers.TextVectorization(**layer_options)

    # Learn a vocabulary from the corpus only when the caller did not bring one.
    if vocab is None:
        vectorizer.adapt(corpus)

    # Read the vocabulary back from the layer, then encode the corpus.
    layer_vocab = vectorizer.get_vocabulary()
    token_ids = vectorizer(corpus)
    return token_ids, layer_vocab

# Encode each language column with its own freshly adapted vocabulary.
# NOTE(review): `input` shadows the Python builtin; the name is kept because
# later cells read `input` and `output`.
input, eng_vocab = vectorize_text(corpus=df["eng"], config=config)
output, spa_vocab = vectorize_text(corpus=df["spa"], config=config)

In [54]:
# Pair each encoded English row with its Spanish counterpart, shuffle with a
# buffer as large as the corpus (reshuffled on every iteration), group into
# fixed-size batches (the incomplete final batch is dropped) and prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((input, output))
    .shuffle(buffer_size=len(df), reshuffle_each_iteration=True)
    .batch(batch_size=config.BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

In [55]:
# Reset Keras' global state before the model is built in the next cell, so a
# re-run of the notebook does not accumulate state from an earlier model.
tf.keras.backend.clear_session()

In [56]:
# Build the Transformer from the hyperparameters in `config`. The encoder and
# the decoder are both given the same vocabulary size.
transformer = Transformer(
    vocab_size_enc=config.VOCAB_SIZE,
    vocab_size_dec=config.VOCAB_SIZE,
    d_model=config.D_MODEL,
    n_layers=config.N_LAYERS,
    FFN_units=config.FFN_DIM,
    n_heads=config.N_HEADS,
    dropout_rate=config.DROPOUT_RATE,
)

In [None]:
# Train the model on the batched dataset. `main_train` returns two values,
# unpacked here as `losses` and `accuracies` (presumably the recorded training
# history -- confirm in model.py). Its log output reports the running loss and
# accuracy every 100 batches.
losses, accuracies = main_train(dataset, transformer, config)

Las checkpoint restored.
Inicio del epoch 1
Epoch 1 Lote 0 Pérdida 1.0262 Precisión 0.0015
Epoch 1 Lote 100 Pérdida 0.6582 Precisión 0.0197
Epoch 1 Lote 200 Pérdida 0.6010 Precisión 0.0250
Epoch 1 Lote 300 Pérdida 0.5583 Precisión 0.0293
Epoch 1 Lote 400 Pérdida 0.5239 Precisión 0.0326
Epoch 1 Lote 500 Pérdida 0.4982 Precisión 0.0353
Epoch 1 Lote 600 Pérdida 0.4772 Precisión 0.0376
Epoch 1 Lote 700 Pérdida 0.4608 Precisión 0.0395
Epoch 1 Lote 800 Pérdida 0.4468 Precisión 0.0411
Epoch 1 Lote 900 Pérdida 0.4348 Precisión 0.0425
Epoch 1 Lote 1000 Pérdida 0.4244 Precisión 0.0438
Epoch 1 Lote 1100 Pérdida 0.4147 Precisión 0.0450
Epoch 1 Lote 1200 Pérdida 0.4064 Precisión 0.0460
Epoch 1 Lote 1300 Pérdida 0.3988 Precisión 0.0469
Epoch 1 Lote 1400 Pérdida 0.3919 Precisión 0.0478
Epoch 1 Lote 1500 Pérdida 0.3857 Precisión 0.0486
Epoch 1 Lote 1600 Pérdida 0.3800 Precisión 0.0494
Epoch 1 Lote 1700 Pérdida 0.3745 Precisión 0.0500
Epoch 1 Lote 1800 Pérdida 0.3696 Precisión 0.0507
Epoch 1 Lote 1900 

In [None]:
def predict(input_sentence, vocab_input, vocab_output, transformer, config):
    """Greedily decode a translation of ``input_sentence``, one token at a time.

    Args:
        input_sentence: Raw source-language sentence.
        vocab_input: Source vocabulary used to encode ``input_sentence``.
        vocab_output: Target vocabulary; the SOS and EOS ids are looked up in it.
        transformer: Model called as ``transformer(encoder_ids, decoder_ids, False)``
            and returning logits with one row per decoder position.
        config: Object exposing ``SOS_TOKEN``, ``EOS_TOKEN`` and
            ``MAX_SEQUENCE_LENGTH`` (plus whatever ``vectorize_text`` reads).

    Returns:
        1-D tensor of target-vocabulary token ids. It starts with the SOS id
        and stops before EOS, or after ``MAX_SEQUENCE_LENGTH`` generated tokens
        when EOS is never predicted.
    """
    # Encoder input: the sentence wrapped in the start/end markers.
    source_text = f"{config.SOS_TOKEN} {input_sentence} {config.EOS_TOKEN}"
    input_encoded, _ = vectorize_text([source_text], config, vocab=vocab_input)

    # Resolve the SOS/EOS ids through the *target* vocabulary, using the same
    # vectorizer settings as the rest of the notebook, instead of hard-coding them.
    # NOTE(review): if the standardized markers are missing from `vocab_output`
    # they will resolve to the out-of-vocabulary id -- confirm against training data.
    markers_encoded, _ = vectorize_text(
        [config.SOS_TOKEN, config.EOS_TOKEN], config, vocab=vocab_output
    )
    eos_id = int(markers_encoded[1, 0])

    # The decoder starts from SOS alone, shape (1, 1). Keeping the padded row
    # would make the "last position" below a padding slot.
    output_encoded = markers_encoded[:1, :1]

    # Generate at most MAX_SEQUENCE_LENGTH tokens.
    for _ in range(config.MAX_SEQUENCE_LENGTH):
        # Logits for every decoder position so far.
        predictions = transformer(input_encoded, output_encoded, False)
        # Only the last position predicts the next token.
        next_token_logits = predictions[:, -1:, :]
        # Greedy choice, kept in the decoder input's dtype so the concat is valid.
        predicted_id = tf.cast(
            tf.argmax(next_token_logits, axis=-1), output_encoded.dtype
        )
        # Stop as soon as the end-of-sentence marker is produced.
        if int(predicted_id[0, 0]) == eos_id:
            break
        # Feed the chosen token back in as decoder input for the next step.
        output_encoded = tf.concat([output_encoded, predicted_id], axis=-1)
    return tf.squeeze(output_encoded, axis=0)

In [None]:
# Try inference on a single English sentence. The value displayed is a 1-D
# tensor of token ids, not decoded text.
predict("Who are you?.", eng_vocab, spa_vocab, transformer, config)

In [23]:
# Display the preprocessed sentence pairs (pandas elides the middle rows of a long frame).
df

Unnamed: 0,eng,spa
0,Go. <EOS>,Ve. <EOS>
1,Go. <EOS>,Vete. <EOS>
2,Go. <EOS>,Vaya. <EOS>
3,Go. <EOS>,Váyase. <EOS>
4,Hi. <EOS>,Hola. <EOS>
...,...,...
140863,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
140864,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...
140865,"If you want to sound like a native speaker, yo...","Si quieres sonar como un hablante nativo, debe..."
140866,It may be impossible to get a completely error...,Puede que sea imposible obtener un corpus comp...
