In [1]:
# Install KerasNLP and the other third-party packages listed below into the
# kernel's environment.
# NOTE(review): versions are not pinned; consider pinning them so a fresh run
# reproduces the same environment.
%pip install keras-nlp rouge-score tensorflow-datasets datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
import platform
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds
import keras
import keras_nlp

# Hyperparameters
# Width of the token-and-position embeddings on both the encoder and decoder side.
EMBEDDING_DIM = 64
# Number of attention heads in every Transformer encoder/decoder layer.
NUM_HEADS = 8
# Passed as `intermediate_dim` to every Transformer encoder/decoder layer.
# (The name is misspelled, but later cells reference it under this spelling.)
INTERMIDIATE_DIM = 1024
# Vocabulary cap for text vectorization; also the width of the output softmax.
VOCAB_SIZE = 15000
BATCH_SIZE = 64 # Mini-batch size; the author notes that a value of 1 does not work.
NUM_EPOCHS = 10 # Training epochs; presumably 1 and 100 were other values tried.

Using TensorFlow backend


In [3]:
import pandas as pd

In [4]:
# CNN/DailyMail summarization CSVs, downloaded manually from Kaggle:
# https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail
# The three split files are read from the relative `data/` directory.
df_train, df_validation, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/validation.csv', 'data/test.csv')
)

In [5]:
# Bounds on the summary length. `summarized_text_size` fixes the padded token
# width of the decoder sequences built later in the notebook.
min_summarized_text_size = 128
summarized_text_size = 256  # 1437 is the longest summarized text in the dataset

# TODO: derive the following lengths programmatically instead of hard-coding them.
max_input_length = 2137
# Decoder input and decoder target share the same length: one more than the
# summary size.
max_target_length = max_decoder_target_length = summarized_text_size + 1

In [6]:
# Wrong
# def filter_data_frame(df):
#     return df[(df['highlights'].str.len() <= summarized_text_size) & (min_summarized_text_size <= df['highlights'].str.len())]
# df_train = filter_data_frame(df=df_train)
# df_validation = filter_data_frame(df=df_validation)
# df_test = filter_data_frame(df=df_test)

In [7]:
def add_start_end_token(df):
    """Wrap every summary in ``df['highlights']`` with ``[start]``/``[end]`` markers.

    The frame is modified in place. Rows that already carry both markers are
    left untouched, so re-running the calling cell does not stack a second
    pair of markers onto the text.

    Args:
        df: DataFrame with a string ``highlights`` column; modified in place.

    Returns:
        None.
    """
    start_marker, end_marker = '[start] ', ' [end]'
    highlights = df['highlights']
    # na=False: a missing summary counts as "not wrapped yet"; the string
    # concatenation below keeps it missing.
    already_wrapped = (
        highlights.str.startswith(start_marker, na=False)
        & highlights.str.endswith(end_marker, na=False)
    )
    df['highlights'] = highlights.where(
        already_wrapped,
        start_marker + highlights + end_marker,
    )
# Tag the summaries of all three splits; each frame is modified in place.
for _split_frame in (df_train, df_validation, df_test):
    add_start_end_token(df=_split_frame)

In [8]:
def split_input_target(df):
    """Split a dataset frame into model inputs and targets.

    Args:
        df: DataFrame with ``article`` and ``highlights`` columns.

    Returns:
        A ``(articles, summaries)`` pair of NumPy arrays taken from the
        ``article`` and ``highlights`` columns respectively.
    """
    articles = df['article'].to_numpy()
    summaries = df['highlights'].to_numpy()
    return articles, summaries
# Raw article/summary string arrays for each split.
train_article, train_highlights = split_input_target(df_train)
validation_article, validation_highlights = split_input_target(df_validation)
test_article, test_highlights = split_input_target(df_test)

In [9]:
# @see https://github.com/keras-team/keras-nlp/blob/50e041487b1d8b30b34c5fb738db3ed3406363bc/examples/machine_translation/data.py
import string
import re

# Punctuation to delete during standardization. Square brackets are kept so
# that markers such as "[start]" and "[end]" survive as distinct tokens.
strip_chars = string.punctuation.replace("[", "").replace("]", "")

def custom_standardization(input_string):
    """Lowercase the text and delete every character listed in ``strip_chars``.

    Args:
        input_string: String tensor to standardize.

    Returns:
        The lowercased string tensor with all ``strip_chars`` characters removed.
    """
    # Character class matching any single character from strip_chars.
    punctuation_class = "[%s]" % re.escape(strip_chars)
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, punctuation_class, "")

# Ragged (variable-length) vectorizer; its vocabulary is learned below and
# capped at VOCAB_SIZE entries.
vectorization_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    ragged=True,
)
# Warning: adapt() discards any previously learned vocabulary, so it must be
# called exactly once, on all text concatenated together.
# NOTE(review): validation and test text also contribute to the vocabulary
# here; confirm this is intended.
vectorization_layer.adapt(np.concatenate([
    train_article,
    train_highlights,
    validation_article,
    validation_highlights,
    test_article,
    test_highlights,
]))

# Fixed-length vectorizers that reuse the vocabulary learned by
# `vectorization_layer`. Their output lengths come from the named constants
# `max_input_length` / `max_target_length` (the same ones the model's Input
# layers use) so the two can never drift apart.
shared_vocabulary = vectorization_layer.get_vocabulary()

input_vectorization_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    # @TODO max_input_length itself should be programmatically obtained
    output_sequence_length=max_input_length,
)
target_vectorization_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=max_target_length,
)
input_vectorization_layer.set_vocabulary(shared_vocabulary)
target_vectorization_layer.set_vocabulary(shared_vocabulary)

2024-06-15 02:34:33.581964: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-06-15 02:34:33.581994: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-06-15 02:34:33.581998: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-06-15 02:34:33.582042: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-15 02:34:33.582068: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-06-15 02:34:34.372053: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [10]:
# Sanity check: display the learned vocabulary size and its first eight entries.
vectorization_layer.vocabulary_size(), vectorization_layer.get_vocabulary(include_special_tokens=True)[0:8]

(15000, ['', '[UNK]', 'the', 'to', 'a', 'and', 'of', 'in'])

In [11]:
# '[start]' must map to its own token id, distinct from the plain word 'start'
# (standardization keeps square brackets), so the equality below must be False.
assert not vectorization_layer(['[start]'])[0] == vectorization_layer(['start'])[0]

In [12]:
# def filter(article, highlights):
#     idx = []
#     for i, h in enumerate(reversed(highlights)):
#         if len(h) < min_summarized_text_size or summarized_text_size < len(h):
#             idx.append(i)
#     highlights = np.delete(highlights, idx)
#     article = np.delete(article, idx)
#     return article, highlights
# train_article, train_highlights = filter(
#     article=train_article,
#     highlights=train_highlights
# )
# validation_article, validation_highlights = filter(
#     article=validation_article,
#     highlights=validation_highlights
# )
# train_article, train_highlights = filter(article=train_article, highlights=train_highlights)

In [13]:
def convert_sequences(article, highlights):
    """Vectorize articles and summaries into encoder/decoder tensors.

    Args:
        article: Raw article strings.
        highlights: Raw summary strings.

    Returns:
        A 3-tuple ``(encoder_input, decoder_input, decoder_target)``. The two
        decoder tensors are views of the same padded summary ids shifted by
        one position against each other, each of width
        ``summarized_text_size + 1``.
    """
    ragged_summary_ids = vectorization_layer(highlights)
    num_summaries = ragged_summary_ids.row_lengths().shape[0]
    # One column wider than the decoder length, so that dropping either the
    # last or the first column leaves summarized_text_size + 1 columns.
    padded_width = summarized_text_size + 1 + 1
    summary_ids = ragged_summary_ids.to_tensor(shape=(num_summaries, padded_width))
    decoder_input = summary_ids[:, :-1]   # every column except the last
    decoder_target = summary_ids[:, 1:]   # every column except the first
    encoder_input = input_vectorization_layer(article)
    return encoder_input, decoder_input, decoder_target

# Vectorize each split into (encoder input, decoder input, decoder target).
(
    train_input_sequences,
    train_target_sequences,
    train_decoder_target_sequences,
) = convert_sequences(train_article, train_highlights)
(
    validation_input_sequences,
    validation_target_sequences,
    validation_decoder_target_sequences,
) = convert_sequences(validation_article, validation_highlights)
(
    test_input_sequences,
    test_target_sequences,
    test_decoder_target_sequences,
) = convert_sequences(test_article, test_highlights)

In [14]:
# Exponentially decaying learning rate: starts at 1e-3 and is scaled by
# 0.99 ** (step / 20) as training progresses.
learning_rate = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=20,
    decay_rate=0.99,
)
# Apple Silicon Macs show the following warning for the v2.11+ optimizers:
#   WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam`
#   runs slowly on M1/M2 Macs, please use the legacy Keras optimizer instead,
#   located at `tf.keras.optimizers.legacy.Adam`
# Therefore the legacy RMSprop implementation is used on that platform.
# The schedule above is passed to the optimizer explicitly; previously it was
# built but never used, so training ran at RMSprop's default learning rate.
if platform.system() == "Darwin" and platform.processor() == "arm":
    optimizer = keras.optimizers.legacy.RMSprop(learning_rate=learning_rate)
else:
    optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)

# Build the encoder-decoder Transformer model.
# The encoder turns the article into a sequence of feature vectors.
# The decoder's own attention heads arguably do similar work, so it is not
# certain that a separate encoder is necessary for this task; this is worth
# investigating.

# Encoder input: article token ids, padded/truncated to max_input_length.
encoder_inputs = keras.Input(
    shape=(max_input_length,),
    name="encoder_inputs"
)
# mask_zero=True treats token id 0 as padding.
encoder_embedding = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=max_input_length,
    embedding_dim=EMBEDDING_DIM,
    mask_zero=True,
)(encoder_inputs)
encoder_outputs = keras_nlp.layers.TransformerEncoder(
    num_heads=NUM_HEADS,
    intermediate_dim=INTERMIDIATE_DIM,
)(encoder_embedding)
### <temporary>
# Second encoder layer stacked on the first; marked temporary by the author.
encoder_outputs = keras_nlp.layers.TransformerEncoder(
    num_heads=NUM_HEADS,
    intermediate_dim=INTERMIDIATE_DIM,
)(encoder_outputs)
### </temporary>

# Decoder input: summary token ids of length max_target_length.
decoder_inputs = keras.Input(
    shape=(max_target_length,),
    name="decoder_inputs"
)
decoder_embedding = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=max_target_length,
    embedding_dim=EMBEDDING_DIM,
    mask_zero=True,
)(decoder_inputs)
# The decoder receives the encoder outputs as its second input and applies a
# causal mask to the decoder sequence.
decoder_outputs = keras_nlp.layers.TransformerDecoder(
    num_heads=NUM_HEADS,
    intermediate_dim=INTERMIDIATE_DIM,
)(decoder_embedding, encoder_outputs, use_causal_mask=True)
### <temporary>
# Second decoder layer stacked on the first; marked temporary by the author.
decoder_outputs = keras_nlp.layers.TransformerDecoder(
    num_heads=NUM_HEADS,
    intermediate_dim=INTERMIDIATE_DIM,
)(decoder_outputs, encoder_outputs, use_causal_mask=True)
### </temporary>

# Per-position probability distribution over the vocabulary.
outputs = keras.layers.Dense(
    VOCAB_SIZE,
    activation="softmax"
)(decoder_outputs)

model = keras.Model(
    [encoder_inputs, decoder_inputs],
    outputs,
    name="transformer_text_summarization_model",
)
# Note
# When the dataset is large and the model dimension is small,
# the learning rate of Adam needed to be smaller.
# The loss takes integer token ids as targets and the softmax probabilities
# produced by the final Dense layer as predictions.
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=[
        keras.metrics.SparseCategoricalAccuracy()
        # 'accuracy', # Should not be used here.
        # "loss", # Does not need to be specified.
        # keras_nlp.metrics.RougeL()
    ]
)
model.summary()

Model: "transformer_text_summarization_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, 2137)]               0         []                            
 )                                                                                                
                                                                                                  
 token_and_position_embeddi  (None, 2137, 64)             1096768   ['encoder_inputs[0][0]']      
 ng (TokenAndPositionEmbedd                                                                       
 ing)                                                                                             
                                                                                                  
 decoder_inputs (InputLayer  [(None, 257)]                0    

In [15]:
# Training
# batch_size is passed explicitly so that the BATCH_SIZE hyperparameter takes
# effect; without it Keras falls back to its default batch size of 32.
history = model.fit(
    x=(train_input_sequences, train_target_sequences),
    y=train_decoder_target_sequences,
    batch_size=BATCH_SIZE,
    validation_data=(
        (validation_input_sequences, validation_target_sequences),
        validation_decoder_target_sequences
    ),
    epochs=NUM_EPOCHS,
)

Epoch 1/10

KeyboardInterrupt: 