In [1]:
!!pip install -q rouge-score
!!pip install -q git+https://github.com/keras-team/keras-nlp.git --upgrade

['  Installing build dependencies ... \x1b[?25l\x1b[?25hdone',
 '  Getting requirements to build wheel ... \x1b[?25l\x1b[?25hdone',
 '  Preparing metadata (pyproject.toml) ... \x1b[?25l\x1b[?25hdone',
 '\x1b[?25l     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.0/950.8 kB\x1b[0m \x1b[31m?\x1b[0m eta \x1b[36m-:--:--\x1b[0m',
 '\x1b[2K     \x1b[91m━━━━━\x1b[0m\x1b[90m╺\x1b[0m\x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m122.9/950.8 kB\x1b[0m \x1b[31m3.5 MB/s\x1b[0m eta \x1b[36m0:00:01\x1b[0m',
 '\x1b[2K     \x1b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m\x1b[91m╸\x1b[0m \x1b[32m942.1/950.8 kB\x1b[0m \x1b[31m13.6 MB/s\x1b[0m eta \x1b[36m0:00:01\x1b[0m',
 '\x1b[2K     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m950.8/950.8 kB\x1b[0m \x1b[31m11.0 MB/s\x1b[0m eta \x1b[36m0:00:00\x1b[0m',
 '\x1b[?25h\x1b[?25l     \x1b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\x1b[0m \x1b[32m0.0/6.5 MB\x1b[0m \x1b[31m?\x1b[0m eta \x1b[36m-:--:--\x1b

In [23]:
import keras_nlp
import pathlib
import random
import tensorflow as tf

from tensorflow import keras
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset as bert_vocab,
)

Let's also define our parameters/hyperparameters.

In [97]:
BATCH_SIZE = 32
EPOCHS = 10 # This should be at least 10 for convergence
MAX_SEQUENCE_LENGTH = 5
NUMBERS_SIZE = 1000
WORDS_SIZE = 1000

EMBED_DIM = 128
INTERMEDIATE_DIM = 1024
NUM_HEADS = 8

## Data

Creating the pairs

In [98]:
# lists for digit names
digit_names = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
tens_names = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
teen_names = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen"]

text_pairs = []

for num in range(1000): #0-999
    #for 100-999, hundreds word value = {floor of num/100 (ith value in digit array)} OR {"" if number is not 100-999}
    hundreds = (digit_names[num//100] + " hundred ") if (num >= 100) else ""
    hundredsdigit = str(num//100) + " " if num >= 100 else ""
    remainder = num % 100 #gets all other numbers in 0-99

    if remainder == 0: #taking care of 0 separately for easier if else conditions
        text_pairs.append(("0", "zero"))
    else:
        if remainder < 10: #1-9
            ones = digit_names[remainder] #ith value in digit array -> ones word value
            text_pairs.append((hundredsdigit + "0 " + str(remainder), hundreds + ones)) #adding tuple to text_pairs
        elif remainder < 20: #10-19 (the teens)
            teens = teen_names[remainder % 10] #gets remainder of number/10 to get the ith value in teens array -> teens word value
            text_pairs.append((hundredsdigit +str(remainder)[0] + " " + str(remainder)[1], hundreds + teens)) #adding tuple to text_pairs
        else: #20-99
            tens = tens_names[remainder // 10] #floor of number/10 = ith value in tens -> tens word value
            ones = (" " + digit_names[remainder % 10]) if (remainder % 10) > 0 else "" #remainder of number/10 = ith value in digit array -> ones word value
            text_pairs.append((hundredsdigit + str(remainder)[0] + " " + str(remainder)[1],hundreds +  tens + ones)) #adding tuple to text_pairs


Here's what our number pairs look like:

In [99]:
for _ in range(5):
    print(random.choice(text_pairs))

('2 5 5', 'two hundred fifty five')
('0 8', 'eight')
('9 4 6', 'nine hundred forty six')
('2 4 2', 'two hundred forty two')
('6 2 9', 'six hundred twenty nine')


Now, let's split the sentence pairs into a training set, a validation set,
and a test set.

In [51]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")


1000 total pairs
700 training pairs
150 validation pairs
150 test pairs


## Tokenizing the data


In [100]:
def train_word_piece(the_samples, the_size, reserved_tokens):
    tokens = tf.data.Dataset.from_tensor_slices(the_samples)
    number = keras_nlp.tokenizers.compute_word_piece_vocabulary(
        tokens.batch(1000).prefetch(2),
        vocabulary_size=the_size,
        reserved_tokens=reserved_tokens,
    )
    return number


In [85]:
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

num_samples = [text_pair[0] for text_pair in train_pairs]
num_lib = train_word_piece(num_samples, NUMBERS_SIZE, reserved_tokens)

words_samples = [text_pair[1] for text_pair in train_pairs]
words_lib = train_word_piece(words_samples, WORDS_SIZE, reserved_tokens)

In [101]:
num_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=num_lib, lowercase=False
)
word_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=words_lib, lowercase=False
)

Let's try and tokenize a sample from our dataset! To verify whether the text has
been tokenized correctly, we can also detokenize the list of tokens back to the
original text.

In [89]:
num_input_ex = text_pairs[3][0]
num_tokens_ex = num_tokenizer.tokenize(num_input_ex)
print("Number: ", num_input_ex)
print("Tokens: ", num_tokens_ex)
print()

words_input_ex = text_pairs[3][1]
words_tokens_ex = word_tokenizer.tokenize(words_input_ex)
print("Number in words: ", words_input_ex)
print("Tokens: ", words_tokens_ex)

Number:  4 5 9
Tokens:  tf.Tensor([ 8  9 13], shape=(3,), dtype=int32)

Number in words:  four hundred fifty nine
Tokens:  tf.Tensor([31 22 33 29], shape=(4,), dtype=int32)


## Format datasets

In [90]:

def preprocess_batch(digits, words):
    batch_size = tf.shape(words)[0]

    digits = num_tokenizer(digits)
    words = word_tokenizer(words)

    # Pad `digits` to `MAX_SEQUENCE_LENGTH`.
    digits_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=num_tokenizer.token_to_id("[PAD]"),
    )
    digits = digits_start_end_packer(digits)

    # Add special tokens (`"[START]"` and `"[END]"`) to `words` and pad it as well.
    words_start_end_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=word_tokenizer.token_to_id("[START]"),
        end_value=word_tokenizer.token_to_id("[END]"),
        pad_value=word_tokenizer.token_to_id("[PAD]"),
    )
    words = words_start_end_packer(words)

    return (
        {
            "encoder_inputs": digits,
            "decoder_inputs": words[:, :-1],
        },
        words[:, 1:],
    )


def make_dataset(pairs):
    digits_texts, words_texts = zip(*pairs)
    digits_texts = list(digits_texts)
    words_texts = list(words_texts)
    dataset = tf.data.Dataset.from_tensor_slices((digits_texts, words_texts))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

Let's take a quick look at the sequence shapes
(we have batches of 64 pairs, and all sequences are 40 steps long):

In [91]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")


inputs["encoder_inputs"].shape: (32, 5)
inputs["decoder_inputs"].shape: (32, 5)
targets.shape: (32, 5)


## Building the model

In [102]:
# Encoder
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=NUMBERS_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(encoder_inputs)

encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)


# Decoder
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=WORDS_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(decoder_inputs)

x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(WORDS_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)

## Training our model

In [105]:
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f210d3984c0>