In [1]:
%load_ext autoreload
%autoreload 2

In [79]:
import tensorflow as tf
from tensorflow.keras import layers
import pandas as pd
from matplotlib import pyplot as plt

In [3]:
from preprocessing import load_non_breaking_prefixes, sentence_boundary_disambiguation
from model import Transformer, CustomSchedule, main_train
import config

In [4]:
df = pd.read_csv(config.TRAIN_PATH, sep="\t", names=["eng", "spa"], usecols=[0, 1])

nonbreaking_prefixes_spa = load_non_breaking_prefixes(config.NONBREAKING_SPA_PATH)
nonbreaking_prefixes_eng = load_non_breaking_prefixes(config.NONBREAKING_ENG_PATH)

df["spa"] = df["spa"].apply(lambda x : sentence_boundary_disambiguation(x, nonbreaking_prefixes_spa))
df["eng"] = df["eng"].apply(lambda x : sentence_boundary_disambiguation(x, nonbreaking_prefixes_eng))

In [73]:
def tokenize_text(corpus, config, vocab=None):
    int_vectorize_layer = layers.TextVectorization(
        max_tokens=config.VOCAB_SIZE,
        output_mode='int',
        output_sequence_length=config.MAX_TOKENS + 1,
        vocabulary=vocab
    )
    if vocab is None:
        int_vectorize_layer.adapt(corpus)
    vocab = int_vectorize_layer.get_vocabulary()
    return int_vectorize_layer(corpus), vocab

eng, eng_vocab = vectorize_text(df["eng"], config)
spa, spa_vocab = vectorize_text(df["spa"], config)

In [74]:
def prepare_batch(eng, spa):
    eng_input = eng[:, :config.MAX_TOKENS]

    spa = spa[:, :config.MAX_TOKENS+1]
    spa_input = spa[:, :-1]
    spa_labels = spa[:, 1:]

    return (eng_input, spa_input), spa_labels

def make_batches(ds):
  return (
      ds
      .shuffle(config.BUFFER_SIZE)
      .batch(config.BATCH_SIZE)
      .map(prepare_batch, tf.data.AUTOTUNE)
      .prefetch(buffer_size=tf.data.AUTOTUNE))

In [75]:
dataset = tf.data.Dataset.from_tensor_slices((eng, spa))
dataset = make_batches(dataset)

In [76]:
for (eng, spa), spa_labels in dataset.take(1):
  break

print(eng.shape)
print(spa.shape)
print(spa_labels.shape)

(64, 64)
(64, 64)
(64, 64)


In [82]:
from model import PositionalEmbedding
embed_eng = PositionalEmbedding(vocab_size=config.VOCAB_SIZE, d_model=config.D_MODEL)
embed_spa = PositionalEmbedding(vocab_size=config.VOCAB_SIZE, d_model=config.D_MODEL)

eng_emb = embed_eng(eng)
spa_emb = embed_spa(spa)

In [9]:
tf.keras.backend.clear_session()


In [10]:
# Create the Transformer model
transformer = Transformer(vocab_size_enc=config.VOCAB_SIZE,
                          vocab_size_dec=config.VOCAB_SIZE,
                          d_model=config.D_MODEL,
                          n_layers=config.N_LAYERS,
                          FFN_units=config.FFN_DIM,
                          n_heads=config.N_HEADS,
                          dropout_rate=config.DROPOUT_RATE)

In [14]:
# Train the model
losses, accuracies = main_train(dataset, transformer, config)

Las checkpoint restored.
Inicio del epoch 1
Epoch 1 Lote 0 Pérdida 1.0381 Precisión 0.0015
Epoch 1 Lote 100 Pérdida 0.6676 Precisión 0.0199
Epoch 1 Lote 200 Pérdida 0.6064 Precisión 0.0251
Epoch 1 Lote 300 Pérdida 0.5621 Precisión 0.0292
Epoch 1 Lote 400 Pérdida 0.5283 Precisión 0.0325
Epoch 1 Lote 500 Pérdida 0.5025 Precisión 0.0353
Epoch 1 Lote 600 Pérdida 0.4830 Precisión 0.0377
Epoch 1 Lote 700 Pérdida 0.4652 Precisión 0.0395
Epoch 1 Lote 800 Pérdida 0.4504 Precisión 0.0411
Epoch 1 Lote 900 Pérdida 0.4379 Precisión 0.0426
Epoch 1 Lote 1000 Pérdida 0.4261 Precisión 0.0439
Epoch 1 Lote 1100 Pérdida 0.4168 Precisión 0.0450
Epoch 1 Lote 1200 Pérdida 0.4080 Precisión 0.0460
Epoch 1 Lote 1300 Pérdida 0.4001 Precisión 0.0470
Epoch 1 Lote 1400 Pérdida 0.3932 Precisión 0.0478
Epoch 1 Lote 1500 Pérdida 0.3869 Precisión 0.0487
Epoch 1 Lote 1600 Pérdida 0.3811 Precisión 0.0494
Epoch 1 Lote 1700 Pérdida 0.3758 Precisión 0.0501
Epoch 1 Lote 1800 Pérdida 0.3709 Precisión 0.0508
Epoch 1 Lote 1900 

In [22]:
def predict(input_sentence, vocab_input, vocab_output, transformer, config):
    input = f"{config.SOS_TOKEN} {input_sentence} {config.EOS_TOKEN}"
    input_encoded, _ = vectorize_text([input], config, vocab=vocab_input)

    # Set the initial output sentence to sos
    output = config.SOS_TOKEN
    output_encoded, _ = vectorize_text([output], config, vocab=vocab_input)

    # For max target len tokens
    for _ in range(config.MAX_SEQUENCE_LENGTH):
        # Call the transformer and get the logits 
        predictions = transformer(input_encoded, output_encoded, False) #(1, seq_length, VOCAB_SIZE_ES)
        # Extract the logists of the next word
        prediction = predictions[:, -1:, :]
        # The highest probability is taken
        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int64)
        print(predicted_id)
        # Check if it is the eos token
        if predicted_id == 3:
            return tf.squeeze(output_encoded, axis=0)
        # Concat the predicted word to the output sequence
        output = tf.concat([output_encoded, predicted_id], axis=-1)
    return tf.squeeze(output_encoded, axis=0)

In [23]:
predict("Who are you?.", eng_vocab, spa_vocab, transformer, config)

tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), dtype=int64)
tf.Tensor([[142]], shape=(1, 1), d

<tf.Tensor: shape=(64,), dtype=int64, numpy=
array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])>

In [17]:
df

Unnamed: 0,eng,spa
0,sos Go. eos,sos Ve. eos
1,sos Go. eos,sos Vete. eos
2,sos Go. eos,sos Vaya. eos
3,sos Go. eos,sos Váyase. eos
4,sos Hi. eos,sos Hola. eos
...,...,...
140863,sos A carbon footprint is the amount of carbon...,sos Una huella de carbono es la cantidad de co...
140864,sos Since there are usually multiple websites ...,sos Como suele haber varias páginas web sobre ...
140865,sos If you want to sound like a native speaker...,"sos Si quieres sonar como un hablante nativo, ..."
140866,sos It may be impossible to get a completely e...,sos Puede que sea imposible obtener un corpus ...
