In [None]:
# SageMaker
#%pip install tensorflow==2.15.1 keras==2.15.0

In [1]:
# NOTE(review): keras-nlp is installed without a version pin; consider pinning
# a release known to work with the TensorFlow/Keras versions printed below.
%pip install keras-nlp

Note: you may need to restart the kernel to use updated packages.


In [2]:
import tensorflow as tf
import keras
# Report the installed TensorFlow and Keras versions, one per line.
print(tf.__version__, keras.__version__, sep="\n")

2.15.1
2.15.0


In [3]:
# SageMaker workaround: `@keras.saving...` cannot be used there, so import `saving` directly.
from keras import saving

## Data

In [4]:
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib
import random
import re
import string

import tensorflow as tf
from tensorflow import keras


def download_data():
    """Download the Spanish-English archive and return the path to `spa.txt`.

    The zip archive is fetched and extracted with `keras.utils.get_file`. The
    archive is requested over HTTPS so that the extracted content cannot be
    tampered with in transit.

    Returns:
        A `pathlib.Path` pointing at `spa-eng/spa.txt`, located next to the
        downloaded archive.
    """
    text_file = keras.utils.get_file(
        fname="spa-eng.zip",
        origin=(
            "https://storage.googleapis.com/download.tensorflow.org/data/"
            + "spa-eng.zip"
        ),
        extract=True,
    )
    return pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"


def read_data(filepath):
    """Read tab-separated English/Spanish sentence pairs from `filepath`.

    The file is decoded as UTF-8 explicitly, because the Spanish text contains
    non-ASCII characters and the platform default encoding is not guaranteed
    to be UTF-8. Blank lines are skipped, so the last pair is kept whether or
    not the file ends with a newline.

    Args:
        filepath: path to a text file with one `english<TAB>spanish` pair per
            line.

    Returns:
        A list of `(english, spanish)` tuples where every Spanish sentence is
        wrapped as `"[start] ... [end]"`.

    Raises:
        ValueError: if a non-empty line does not contain exactly one tab.
    """
    text_pairs = []
    with open(filepath, encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line:
                continue
            eng, spa = line.split("\t")
            text_pairs.append((eng, "[start] " + spa + " [end]"))
    return text_pairs


def split_train_val_test(text_pairs, seed=None):
    """Shuffle the pairs and split them into train/validation/test subsets.

    Validation and test each receive 15% of the pairs (rounded down); the
    training subset receives the remainder.

    Args:
        text_pairs: sequence of `(english, spanish)` pairs. The sequence is
            copied before shuffling, so the caller's data is left untouched.
        seed: optional seed for a reproducible shuffle. When `None`, the
            module-level `random` generator is used.

    Returns:
        A `(train_pairs, val_pairs, test_pairs)` tuple of lists.
    """
    shuffled = list(text_pairs)
    # A dedicated generator keeps a seeded split from disturbing global state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    num_val_samples = int(0.15 * len(shuffled))
    num_train_samples = len(shuffled) - 2 * num_val_samples
    val_end_index = num_train_samples + num_val_samples

    train_pairs = shuffled[:num_train_samples]
    val_pairs = shuffled[num_train_samples:val_end_index]
    test_pairs = shuffled[val_end_index:]
    return train_pairs, val_pairs, test_pairs


# Characters removed by `custom_standardization`: ASCII punctuation plus "¿",
# minus the square brackets so the "[start]"/"[end]" markers are not stripped.
strip_chars = "".join(
    char for char in string.punctuation + "¿" if char not in "[]"
)


@saving.register_keras_serializable()
def custom_standardization(input_string):
    """Lowercase the text and strip every character in `strip_chars`.

    Lowercasing is done with `encoding="utf-8"`: the default encoding of
    `tf.strings.lower` only lowercases ASCII letters, which would leave
    accented Spanish capitals untouched.

    Args:
        input_string: string tensor to standardize.

    Returns:
        A string tensor, lowercased and with the `strip_chars` characters
        removed.
    """
    lowercase = tf.strings.lower(input_string, encoding="utf-8")
    return tf.strings.regex_replace(
        lowercase,
        "[%s]" % re.escape(strip_chars),
        "",
    )


def prepare_tokenizer(train_pairs, sequence_length, vocab_size):
    """Build the English and Spanish tokenizers and adapt them to the data.

    Both tokenizers are `TextVectorization` layers emitting integer ids with at
    most `vocab_size` tokens. The Spanish tokenizer produces one extra position
    (`sequence_length + 1`) and standardizes text with
    `custom_standardization`.

    Args:
        train_pairs: iterable of `(english, spanish)` training pairs.
        sequence_length: output length of the English tokenizer.
        vocab_size: maximum vocabulary size for both tokenizers.

    Returns:
        An `(eng_tokenizer, spa_tokenizer)` tuple of adapted layers.
    """
    shared_options = {"max_tokens": vocab_size, "output_mode": "int"}
    eng_tokenizer = keras.layers.TextVectorization(
        output_sequence_length=sequence_length,
        **shared_options,
    )
    spa_tokenizer = keras.layers.TextVectorization(
        output_sequence_length=sequence_length + 1,
        standardize=custom_standardization,
        **shared_options,
    )

    english_corpus, spanish_corpus = zip(*train_pairs)
    eng_tokenizer.adapt(english_corpus)
    spa_tokenizer.adapt(spanish_corpus)
    return eng_tokenizer, spa_tokenizer


def prepare_datasets(text_pairs, batch_size, eng_tokenizer, spa_tokenizer):
    """Turn raw text pairs into a batched, tokenized `tf.data.Dataset`.

    Args:
        text_pairs: non-empty sequence of `(english, spanish)` string pairs.
        batch_size: number of pairs per batch.
        eng_tokenizer: tokenizer applied to the English sentences.
        spa_tokenizer: tokenizer applied to the Spanish sentences.

    Returns:
        A dataset yielding `(inputs, targets, sample_weights)` batches; see
        `format_dataset` below for the layout.
    """
    eng_texts, spa_texts = zip(*text_pairs)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)

    def format_dataset(eng, spa):
        """Tokenize one batch and build the model inputs, targets and weights.

        The output format is:
            x: dict with the English tokens (`encoder_inputs`) and the Spanish
                tokens without their last position (`decoder_inputs`).
            y: the Spanish tokens without their first position, i.e. the next
                token to predict at every decoder position.
            sample_weight: 1.0 where `y` is non-zero and 0.0 elsewhere, so
                positions holding token id 0 do not contribute to the loss.
        """
        eng = eng_tokenizer(eng)
        spa = spa_tokenizer(spa)
        return (
            {
                "encoder_inputs": eng,
                "decoder_inputs": spa[:, :-1],
            },
            spa[:, 1:],
            tf.cast((spa[:, 1:] != 0), "float32"),  # mask as sample weights
        )

    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    # Cache the tokenized batches first so that shuffling happens after the
    # cache and yields a new order every epoch (caching a shuffled dataset
    # would freeze the first epoch's order). Prefetch goes last so it overlaps
    # input preparation with the consumer.
    return dataset.cache().shuffle(2048).prefetch(tf.data.AUTOTUNE)


def get_dataset_and_tokenizer(sequence_length, vocab_size, batch_size):
    """Download the data and return the formatted datasets plus tokenizers.

    The tokenizers are adapted on the training split only and then reused to
    build the train, validation and test datasets.

    Returns:
        `((train_ds, val_ds, test_ds), (eng_tokenizer, spa_tokenizer))`.
    """
    all_pairs = read_data(download_data())
    splits = split_train_val_test(all_pairs)

    eng_tokenizer, spa_tokenizer = prepare_tokenizer(
        splits[0], sequence_length, vocab_size
    )

    # Built in order: train, validation, test.
    train_ds, val_ds, test_ds = (
        prepare_datasets(pairs, batch_size, eng_tokenizer, spa_tokenizer)
        for pairs in splits
    )
    return (train_ds, val_ds, test_ds), (eng_tokenizer, spa_tokenizer)

## Model

In [5]:
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tensorflow as tf
from tensorflow import keras

from keras_nlp.layers import TransformerDecoder
from keras_nlp.layers import TransformerEncoder


class PositionalEmbedding(keras.layers.Layer):
    """Token embedding summed with a learned position embedding.

    Args:
        sequence_length: number of positions in the position embedding table.
        vocab_size: number of rows in the token embedding table.
        embed_dim: size of both embedding vectors.
        **kwargs: forwarded to `keras.layers.Layer`.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = keras.layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        """Return token embeddings plus the embeddings of positions 0..L-1.

        `L` is the size of the last axis of `inputs`.
        """
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is True where the token id is non-zero.

        The incoming `mask` argument is ignored.
        """
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Without this method the layer cannot be rebuilt from its config,
        because `__init__` takes required arguments that the base config does
        not record.
        """
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

@saving.register_keras_serializable()
class TranslationModel(keras.Model):
    """Encoder-decoder Transformer used for machine translation.

    The encoder is a stack of `keras_nlp` `TransformerEncoder` layers and the
    decoder a stack of `TransformerDecoder` layers. Both tokenizers are stored
    on the model so that they travel with it through save/load.
    """

    def __init__(
        self,
        encoder_tokenizer,
        decoder_tokenizer,
        num_encoders,
        num_decoders,
        num_heads,
        transformer_intermediate_dim,
        encoder_vocab_size,
        decoder_vocab_size,
        embed_dim,
        sequence_length,
    ):
        super().__init__()

        # Every Transformer block shares the same hyperparameters.
        block_options = {
            "num_heads": num_heads,
            "intermediate_dim": transformer_intermediate_dim,
        }
        self.encoders = [
            TransformerEncoder(**block_options) for _ in range(num_encoders)
        ]
        self.decoders = [
            TransformerDecoder(**block_options) for _ in range(num_decoders)
        ]

        self.encoder_tokenizer = encoder_tokenizer
        self.decoder_tokenizer = decoder_tokenizer

        # Source-side and target-side embeddings share length and width but
        # have their own vocabulary sizes.
        self.encoder_embedding = PositionalEmbedding(
            sequence_length=sequence_length,
            vocab_size=encoder_vocab_size,
            embed_dim=embed_dim,
        )
        self.decoder_embedding = PositionalEmbedding(
            sequence_length=sequence_length,
            vocab_size=decoder_vocab_size,
            embed_dim=embed_dim,
        )

        # Softmax over the decoder vocabulary at every position.
        self.dense = keras.layers.Dense(
            decoder_vocab_size,
            activation="softmax",
        )

    def call(self, inputs):
        """Run the encoder and decoder stacks and return token probabilities.

        Args:
            inputs: dict with `"encoder_inputs"` and `"decoder_inputs"`.

        Returns:
            The output of the final softmax `Dense` layer applied to the
            decoder stack's output.
        """
        source_tokens = inputs["encoder_inputs"]
        target_tokens = inputs["decoder_inputs"]

        encoder_state = self.encoder_embedding(source_tokens)
        for encoder_block in self.encoders:
            encoder_state = encoder_block(encoder_state)

        decoder_state = self.decoder_embedding(target_tokens)
        for decoder_block in self.decoders:
            decoder_state = decoder_block(
                decoder_state,
                encoder_state,
                use_causal_mask=True,
            )

        return self.dense(decoder_state)

Using TensorFlow backend


## Train

In [6]:
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import platform

import keras_nlp

# from absl import app
# from absl import flags
from tensorflow import keras

# FLAGS = flags.FLAGS

# flags.DEFINE_integer("num_epochs", 1, "Number of epochs to train.")
# flags.DEFINE_integer("steps_per_epoch", None, "Number of steps per epoch.")
# flags.DEFINE_integer("num_encoders", 2, "Number of Transformer encoder layers.")
# flags.DEFINE_integer("num_decoders", 2, "Number of Transformer decoder layers.")
# flags.DEFINE_integer("batch_size", 64, "The training batch size.")
# flags.DEFINE_float("learning_rate", 0.001, "The initial learning rate.")
# flags.DEFINE_integer("model_dim", 64, "Embedding size.")
# flags.DEFINE_integer(
#     "intermediate_dim",
#     128,
#     "Intermediate dimension (feedforward network) of transformer.",
# )
# flags.DEFINE_integer(
#     "num_heads",
#     8,
#     "Number of head of the multihead attention.",
# )
# flags.DEFINE_integer(
#     "sequence_length",
#     20,
#     "Input and output sequence length.",
# )
# flags.DEFINE_integer(
#     "vocab_size",
#     15000,
#     "Vocabulary size, required by tokenizer.",
# )

# flags.DEFINE_string(
#     "saved_model_path",
#     "saved_models/machine_translation_model",
#     "The path to saved model",
# )

# --- Training schedule ---
FLAGS_num_epochs = 1
FLAGS_steps_per_epoch = None
FLAGS_batch_size = 64
FLAGS_learning_rate = 0.001

# --- Data ---
FLAGS_sequence_length = 20
FLAGS_vocab_size = 15000

# --- Model size ---
FLAGS_model_dim = 64
FLAGS_intermediate_dim = 512
FLAGS_num_encoders = 8
FLAGS_num_decoders = 8
FLAGS_num_heads = 32

# Alternative configuration, kept for reference:
# FLAGS_learning_rate = 0.001
# FLAGS_num_epochs = 20
# FLAGS_steps_per_epoch = None
# FLAGS_sequence_length = 20
# FLAGS_vocab_size = 15000
# FLAGS_batch_size = 64
# FLAGS_num_encoders = 8
# FLAGS_num_decoders = 8
# FLAGS_num_heads = 16
# FLAGS_intermediate_dim = 3072
# FLAGS_model_dim = 64

# Darwin on an "arm" processor saves under a nested path; every other
# platform saves straight to the `.keras` file name.
FLAGS_saved_model_path = (
    'machine_translation_model.keras/machine_translation_model'
    if platform.system() == "Darwin" and platform.processor() == "arm"
    else 'machine_translation_model.keras'
)

def run_training(model, train_ds, val_ds):
    """Compile `model` and fit it on `train_ds`, validating on `val_ds`.

    Uses Adam with an exponentially decaying learning rate that starts at
    `FLAGS_learning_rate`, an unreduced sparse categorical cross-entropy loss
    and sparse categorical accuracy as the metric. On Darwin with an "arm"
    processor the legacy Adam optimizer is used instead of the default one.
    """
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=FLAGS_learning_rate,
        decay_steps=20,
        decay_rate=0.98,
    )

    on_arm_mac = platform.system() == "Darwin" and platform.processor() == "arm"
    adam_cls = (
        keras.optimizers.legacy.Adam if on_arm_mac else keras.optimizers.Adam
    )

    model.compile(
        optimizer=adam_cls(learning_rate=lr_schedule),
        loss=keras.losses.SparseCategoricalCrossentropy(
            reduction=keras.losses.Reduction.NONE
        ),
        metrics=[
            keras.metrics.SparseCategoricalAccuracy(),
            # keras_nlp.metrics.Bleu(),  # This cannot be used here
        ],
    )
    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=FLAGS_num_epochs,
        steps_per_epoch=FLAGS_steps_per_epoch,
    )


def main():
    """Build the datasets, train a `TranslationModel` and save it.

    Hyperparameters come from the module-level `FLAGS_*` values; the trained
    model is written to `FLAGS_saved_model_path`. The test split is created
    but not used here.
    """
    datasets, tokenizers = get_dataset_and_tokenizer(
        FLAGS_sequence_length, FLAGS_vocab_size, FLAGS_batch_size
    )
    train_ds, val_ds, _ = datasets
    eng_tokenizer, spa_tokenizer = tokenizers

    model = TranslationModel(
        encoder_tokenizer=eng_tokenizer,
        decoder_tokenizer=spa_tokenizer,
        num_encoders=FLAGS_num_encoders,
        num_decoders=FLAGS_num_decoders,
        num_heads=FLAGS_num_heads,
        transformer_intermediate_dim=FLAGS_intermediate_dim,
        # Size the embeddings from the adapted vocabularies.
        encoder_vocab_size=eng_tokenizer.vocabulary_size(),
        decoder_vocab_size=spa_tokenizer.vocabulary_size(),
        embed_dim=FLAGS_model_dim,
        sequence_length=FLAGS_sequence_length,
    )

    run_training(model, train_ds, val_ds)

    print(f"Saving to {FLAGS_saved_model_path}")
    model.save(FLAGS_saved_model_path)
    print(f"Successfully saved model to {FLAGS_saved_model_path}")


In [7]:
# Run the full pipeline: build the datasets, train the model and save it.
main()

2024-06-16 01:11:29.980823: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-06-16 01:11:29.980850: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-06-16 01:11:29.980853: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-06-16 01:11:29.980887: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-16 01:11:29.980903: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-06-16 01:11:30.306071: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Saving to machine_translation_model.keras/machine_translation_model
INFO:tensorflow:Assets written to: machine_translation_model.keras/machine_translation_model/assets


INFO:tensorflow:Assets written to: machine_translation_model.keras/machine_translation_model/assets










Successfully saved model to machine_translation_model.keras/machine_translation_model


***

In [8]:
# Copyright 2023 The KerasNLP Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import tensorflow as tf
# from absl import app
# from absl import flags
# from absl import logging
from tensorflow import keras

# Import data module to include the customized serializable, required for
# loading tokenizer.
# import examples.machine_translation.data  # noqa: F401.

# FLAGS = flags.FLAGS

# flags.DEFINE_integer(
#     "sequence_length",
#     20,
#     "Input and output sequence length.",
# )

# flags.DEFINE_string(
#     "saved_model_path",
#     "saved_models/machine_translation_model",
#     "The path to saved model",
# )

# flags.DEFINE_string("inputs", None, "The inputs to run machine translation on.")
# Sentence to translate. When left as None, `predict_main()` translates the
# sentences in EXAMPLES instead.
FLAGS_inputs = None

# (English sentence, reference Spanish translation) pairs. The Spanish side is
# wrapped in "[start]"/"[end]" markers and is printed as the "GOLDEN" output.
EXAMPLES = [
    (
        "Tom doesn't listen to anyone.",
        "[start] Tomás no escucha a nadie. [end]",
    ),
    ("I got soaked to the skin.", "[start] Estoy chorreando. [end]"),
    ("I imagined that.", "[start] Me imaginé eso. [end]"),
    ("The baby is crying.", "[start] El bebé está llorando. [end]"),
    (
        "I've never felt so exhilarated.",
        "[start] Nunca me he sentido tan animado. [end]",
    ),
    (
        "Please forgive me for not having written sooner.",
        "[start] Perdóname por no haberte escrito antes, por favor. [end]",
    ),
    ("I expected more from you.", "[start] Esperaba más de vos. [end]"),
    ("I have a computer.", "[start] Tengo un computador. [end]"),
    ("Dinner's ready!", "[start] ¡La cena está lista! [end]"),
    ("Let me finish.", "[start] Déjame terminar. [end]"),
]


def decode_sequence(input_sentence, model, max_sequence_length, lookup_table):
    """Greedily translate one sentence with `model`.

    Starting from the "[start]" token, the model is called once per step on
    the tokens generated so far (zero-padded to `max_sequence_length`) and the
    highest-scoring token at the current position is appended. Decoding stops
    at the "[end]" token or after `max_sequence_length` steps.

    Args:
        input_sentence: source sentence as a Python string.
        model: model exposing `encoder_tokenizer` and `decoder_tokenizer`.
        max_sequence_length: maximum number of decoding steps.
        lookup_table: mapping from token id to token string.

    Returns:
        The generated tokens (including "[start]") joined by single spaces.
    """
    source_tokens = model.encoder_tokenizer([input_sentence])
    target_tokenizer = model.decoder_tokenizer

    # The first id of each tokenized marker is the marker's own token id.
    start_token = target_tokenizer("[start]")[0].numpy()
    end_token = target_tokenizer("[end]")[0].numpy()

    generated = [start_token]
    for step in range(max_sequence_length):
        prefix = tf.convert_to_tensor(
            [generated],
            dtype="int64",
        )
        # Zero-pad so the decoder input always has `max_sequence_length` ids.
        padding = tf.zeros(
            [1, max_sequence_length - step - 1],
            dtype="int64",
        )
        model_inputs = {
            "encoder_inputs": source_tokens,
            "decoder_inputs": tf.concat([prefix, padding], axis=1),
        }
        scores = model(model_inputs)
        next_token = np.argmax(scores[0, step, :])
        generated.append(next_token)
        if next_token == end_token:
            break

    return " ".join(lookup_table[token] for token in generated)


def predict_main():
    """Load the saved model and print translations.

    When `FLAGS_inputs` is set, only that sentence is translated. Otherwise
    every sentence in `EXAMPLES` is translated first and then printed next to
    its reference ("GOLDEN") translation.
    """
    loaded_model = keras.models.load_model(FLAGS_saved_model_path)

    # Map each token id to its vocabulary string for detokenization.
    vocab = loaded_model.decoder_tokenizer.get_vocabulary()
    index_lookup_table = dict(enumerate(vocab))

    def translate(sentence):
        """Translate `sentence` with the loaded model."""
        return decode_sequence(
            sentence,
            loaded_model,
            FLAGS_sequence_length,
            index_lookup_table,
        )

    if FLAGS_inputs is not None:
        # Run inference on user-specified sentence.
        translated = translate(FLAGS_inputs)
        print(f"Translated results: {translated}")
        return

    translations = [translate(source) for source, _ in EXAMPLES]
    for (source, golden), output in zip(EXAMPLES, translations):
        print("ENGLISH SENTENCE: ", source)
        print("MACHINE TRANSLATED RESULT: ", output)
        print("GOLDEN: ", golden)


In [9]:
# Load the saved model and print its translations of the EXAMPLES sentences
# (or of FLAGS_inputs when that is set).
predict_main()

ENGLISH SENTENCE:  Tom doesn't listen to anyone.
MACHINE TRANSLATED RESULT:  [start] tom no se dijo a mary que mary no se dijo que mary [end]
GOLDEN:  [start] Tomás no escucha a nadie. [end]
ENGLISH SENTENCE:  I got soaked to the skin.
MACHINE TRANSLATED RESULT:  [start] tom no se dijo a mary que mary no se dijo que mary [end]
GOLDEN:  [start] Estoy chorreando. [end]
ENGLISH SENTENCE:  I imagined that.
MACHINE TRANSLATED RESULT:  [start] tom no se dijo a mary que mary no se dijo que mary [end]
GOLDEN:  [start] Me imaginé eso. [end]
ENGLISH SENTENCE:  The baby is crying.
MACHINE TRANSLATED RESULT:  [start] tom no se dijo a mary que mary no se dijo que mary [end]
GOLDEN:  [start] El bebé está llorando. [end]
ENGLISH SENTENCE:  I've never felt so exhilarated.
MACHINE TRANSLATED RESULT:  [start] tom no se dijo a mary que mary no se dijo que mary [end]
GOLDEN:  [start] Nunca me he sentido tan animado. [end]
ENGLISH SENTENCE:  Please forgive me for not having written sooner.
MACHINE TRANSLAT