In [1]:
import tensorflow as tf
import keras
import keras_nlp

# Configuration
# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# batch shuffling are repeatable on "Restart Kernel -> Run All".
# NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 42
keras.utils.set_random_seed(SEED)

EMBEDDING_DIM = 64       # embedding_dim of both token/position embeddings
NUM_HEADS = 2            # num_heads of the transformer encoder and decoder
INTERMEDIATE_DIM = 256   # intermediate_dim (feed-forward width) of both blocks
# The original, misspelled name is kept as an alias so existing references
# to it keep working.
INTERMIDIATE_DIM = INTERMEDIATE_DIM

# Toy training corpus. Each entry is a triple of
#   (source text,
#    decoder input  = summary prefixed with "<start>",
#    decoder target = the same summary suffixed with "<end>").
dataset = [
    (
        "Giant pig fell into the swimming pool at his home in Ringwood, Hampshire. It took the efforts of a team of firefighters to winch him out of the water. A wayward horse also had to be rescued from a swimming pool in Sussex.",
        "<start> Giant pig fell into the swimming pool.",
        "Giant pig fell into the swimming pool. <end>",
    ),
    (
        "There are two chickens in the garden.",
        "<start> There are chickens.",
        "There are chickens. <end>",
    ),
]

# Preprocessing
# Unzip the (source, decoder input, decoder target) triples into parallel tuples.
input_texts, target_texts, decoder_target_text = zip(*dataset)

# '<' and '>' are absent from the filter set, so the "<start>" / "<end>"
# markers are kept as ordinary tokens instead of being stripped.
tokenizer = keras.preprocessing.text.Tokenizer(
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
    split=' ',
)
tokenizer.fit_on_texts(input_texts + target_texts + decoder_target_text)
# +1 leaves room for id 0, which no word uses and which padding fills with.
vocab_size = 1 + len(tokenizer.word_index)


def _pad_post(sequences, length):
    """Post-pad every id sequence in `sequences` to `length` entries."""
    return keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=length,
        padding='post'
    )


# Raw (unpadded) id lists for the three text columns.
_input_ids = tokenizer.texts_to_sequences(input_texts)
_target_ids = tokenizer.texts_to_sequences(target_texts)
_decoder_target_ids = tokenizer.texts_to_sequences(decoder_target_text)

# The longest example in each column fixes the model's sequence lengths.
max_input_length = max(map(len, _input_ids))
max_target_length = max(map(len, _target_ids))

input_sequences = _pad_post(_input_ids, max_input_length)
target_sequences = _pad_post(_target_ids, max_target_length)
# The decoder targets get a trailing axis of size 1 added after padding.
decoder_target_sequences = tf.expand_dims(
    _pad_post(_decoder_target_ids, max_target_length),
    axis=-1
)

# Build model / Encoder & Decoder model
#
# Both embeddings use mask_zero=True: id 0 is the value the padded sequences
# are filled with, and flagging it as padding lets the attention layers skip
# padded positions and lets Keras leave padded target steps out of the loss
# and accuracy, so the model is not trained (or scored) on predicting padding.

# Encoder: padded source ids -> contextualised source representation.
encoder_inputs = keras.Input(
    shape=(max_input_length,),
    name="encoder_inputs"
)
encoder_embedding = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=vocab_size,
    sequence_length=max_input_length,
    embedding_dim=EMBEDDING_DIM,
    mask_zero=True,
)(encoder_inputs)
encoder_outputs = keras_nlp.layers.TransformerEncoder(
    num_heads=NUM_HEADS,
    intermediate_dim=INTERMIDIATE_DIM,
)(encoder_embedding)

# Decoder: "<start>"-prefixed summary ids, attending to the encoder output.
decoder_inputs = keras.Input(
    shape=(max_target_length,),
    name="decoder_inputs"
)
decoder_embedding = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=vocab_size,
    sequence_length=max_target_length,
    embedding_dim=EMBEDDING_DIM,
    mask_zero=True,
)(decoder_inputs)
decoder_outputs = keras_nlp.layers.TransformerDecoder(
    num_heads=NUM_HEADS,
    intermediate_dim=INTERMIDIATE_DIM,
)(decoder_embedding, encoder_outputs)

# Per-position probability distribution over the vocabulary.
outputs = keras.layers.Dense(
    vocab_size,
    activation="softmax"
)(decoder_outputs)

model = keras.Model(
    [encoder_inputs, decoder_inputs],
    outputs
)
# Targets are integer ids (not one-hot), hence the sparse cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

# Training
EPOCHS = 100  # number of passes over the training examples

# Inputs: [padded source ids, "<start>"-prefixed summary ids];
# targets: the "<end>"-suffixed summary ids.
# The returned History is bound to a name so the per-epoch loss/accuracy can be
# inspected afterwards, instead of only being echoed as an object repr.
history = model.fit(
    x=[input_sequences, target_sequences],
    y=decoder_target_sequences,
    epochs=EPOCHS,
)


Using TensorFlow backend


2024-06-04 10:13:41.865912: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-06-04 10:13:41.865935: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-06-04 10:13:41.865940: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-06-04 10:13:41.865971: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-04 10:13:41.865990: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/100


2024-06-04 10:13:44.807645: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x355bff550>

In [2]:
def summarize(text):
    """
    Summarize text with greedy, token-by-token decoding.

    The text is tokenized and post-padded to ``max_input_length`` like the
    training inputs. Decoding starts from the '<start>' token; at every step
    the most probable next token is appended, until '<end>' is predicted or
    ``max_target_length`` tokens have been produced.

    :param text: original text
    :return: summarized text (predicted words joined by single spaces)
    """
    pad_sequences = keras.preprocessing.sequence.pad_sequences

    input_sequence = pad_sequences(
        tokenizer.texts_to_sequences([text]),
        maxlen=max_input_length,
        padding='post'
    )

    # Ids fed to the decoder so far, seeded with the start marker.
    decoded_ids = [tokenizer.word_index['<start>']]

    summary = []
    for step in range(max_target_length):
        # Always feed a full-length, post-padded decoder sequence so the input
        # matches the (max_target_length,) shape the model was built with,
        # rather than a sequence that grows by one token per step.
        # This relies on the decoder's causal self-attention (the keras_nlp
        # TransformerDecoder default): position `step` cannot see the padding
        # that follows it.
        decoder_input_sequence = pad_sequences(
            [decoded_ids],
            maxlen=max_target_length,
            padding='post'
        )
        predictions = model.predict(
            [input_sequence, decoder_input_sequence],
            verbose=0
        )
        # Position `step` is the last real (non-padding) decoder position; its
        # distribution predicts the token that follows the ids decoded so far.
        next_token = int(predictions[0, step].argmax())
        next_word = tokenizer.index_word.get(next_token, '<unk>')
        if next_word == '<end>':
            break
        summary.append(next_word)
        decoded_ids.append(next_token)
    return ' '.join(summary)

# Sample
# The first two texts are the training inputs verbatim; the third recombines
# words from both of them.
sample_texts = (
    "Giant pig fell into the swimming pool at his home in Ringwood, Hampshire. It took the efforts of a team of firefighters to winch him out of the water. A wayward horse also had to be rescued from a swimming pool in Sussex.",
    "There are two chickens in the garden.",
    "Two chickens fell into the swimming pool in the garden.",
)
for sample_text in sample_texts:
    print("Original text:", sample_text)
    print("Summary:", summarize(sample_text))

Original text: Giant pig fell into the swimming pool at his home in Ringwood, Hampshire. It took the efforts of a team of firefighters to winch him out of the water. A wayward horse also had to be rescued from a swimming pool in Sussex.
Summary: giant pig fell into the swimming pool
Original text: There are two chickens in the garden.
Summary: there are chickens
Original text: Two chickens fell into the swimming pool in the garden.
Summary: there are chickens


In [3]:
# Inspect the fitted vocabulary: a dict mapping each lower-cased token
# (including the "<start>" / "<end>" markers) to its integer id, starting at 1.
tokenizer.word_index

{'the': 1,
 'swimming': 2,
 'pool': 3,
 'giant': 4,
 'pig': 5,
 'fell': 6,
 'into': 7,
 'in': 8,
 'of': 9,
 'a': 10,
 'there': 11,
 'are': 12,
 'chickens': 13,
 'to': 14,
 '<start>': 15,
 '<end>': 16,
 'at': 17,
 'his': 18,
 'home': 19,
 'ringwood': 20,
 'hampshire': 21,
 'it': 22,
 'took': 23,
 'efforts': 24,
 'team': 25,
 'firefighters': 26,
 'winch': 27,
 'him': 28,
 'out': 29,
 'water': 30,
 'wayward': 31,
 'horse': 32,
 'also': 33,
 'had': 34,
 'be': 35,
 'rescued': 36,
 'from': 37,
 'sussex': 38,
 'two': 39,
 'garden': 40}