## Load the dataset

The following code can show the list of available huggingface dataset.

```python
import tensorflow_datasets

hf_datasets = tensorflow_datasets.core.lazy_imports.datasets
hf_names = hf_datasets.list_datasets()
for n in hf_names:
    if 'cnn_dailymail' in n:
        print(n)
```

`ccdv/cnn_dailymail` must be `ccdv__cnn_dailymail`.

```python
def from_hf_to_tfds(hf_name: str) -> str:
  """Converts Huggingface dataset name to a TFDS compatible name.

  Huggingface dataset names can contain characters that are not supported in
  TFDS. For example, in Huggingface a dataset name like `a/b` is supported,
  while in TFDS `b` would be parsed as the config.

  Examples:
  - `hf_name='codeparrot/github-code'` becomes `codeparrot__github_code`.

  Arguments:
    hf_name: the dataset name in Huggingface.

  Returns:
    the TFDS compatible dataset name.
  """
  return hf_name.replace("-", "_").replace(".", "_").replace("/", "__").lower()
```

### Reference

1. https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/core/load.py
2. https://www.tensorflow.org/datasets/api_docs/python/tfds/load

In [1]:
import tensorflow_datasets as tfds

# train_dataset, validation_dataset, test_dataset = tfds.load(
#     'huggingface:ccdv__cnn_dailymail/3.0.0',
#     split=['train', 'validation', 'test'],
#     with_info=False,
# )
args = {
    'trust_remote_code': False,
}
train_dataset, validation_dataset, test_dataset = tfds.load(
    'huggingface:ccdv__cnn_dailymail/3.0.0',
    split=['train', 'validation', 'test'],
    builder_kwargs=args,
)

  hf_names = hf_datasets.list_datasets()
2024-06-03 18:40:22.343176: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-06-03 18:40:22.343197: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-06-03 18:40:22.343202: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-06-03 18:40:22.343236: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-03 18:40:22.343251: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
for entry in train_dataset.take(1):
    print('article: ', entry['article'])
    print('highlights: ', entry['highlights'])
    print('id: ', entry['id'])

article:  tf.Tensor(b'It\'s official: U.S. President Barack Obama wants lawmakers to weigh in on whether to use military force in Syria. Obama sent a letter to the heads of the House and Senate on Saturday night, hours after announcing that he believes military action against Syrian targets is the right step to take over the alleged use of chemical weapons. The proposed legislation from Obama asks Congress to approve the use of military force "to deter, disrupt, prevent and degrade the potential for future uses of chemical weapons or other weapons of mass destruction." It\'s a step that is set to turn an international crisis into a fierce domestic political battle. There are key questions looming over the debate: What did U.N. weapons inspectors find in Syria? What happens if Congress votes no? And how will the Syrian government react? In a televised address from the White House Rose Garden earlier Saturday, the president said he would take his case to Congress, not because he has to -

***

In [3]:
def tuplize(x):
    """
    Transform a row from the dataset to learn.
    :param x: a single row of the dataset.
    :return: a tuple of the feature and the target.
    """
    return (
        x['article'],# x: feature
        x['highlights']# y: target
    )


In [4]:
train_dataset = train_dataset.map(tuplize, num_parallel_calls=4)
validation_dataset = validation_dataset.map(tuplize, num_parallel_calls=4)
test_dataset = test_dataset.map(tuplize, num_parallel_calls=4)

In [5]:
import platform
import numpy as np

import keras
import keras_nlp
import tensorflow as tf
import tensorflow_datasets as tfds

Using TensorFlow backend


In [6]:
SEQUENCE_LENGTH = 50
MAX_TOKENS = 15000
EMBEDDING_DIM = 256
INTERMIDIATE_DIM = 2048
NUM_HEADS = 2
LEARNING_RATE = 2e-6 #  changed from 2e-5
BATCH_SIZE = 2048

In [7]:
text_vectorization = keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH,
)
text_vectorization.adapt(train_dataset)
text_vectorization.adapt(validation_dataset) #  Not good
text_vectorization.adapt(test_dataset) #  Not good

2024-06-03 18:40:22.773708: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [17]:
if platform.system() == "Darwin" and platform.processor() == "arm":
    """
    Apple Silicon mac shows tht following warning.
    WARNING:absl:At this time, the v2.11+ optimizer `tf.keras.optimizers.Adam` runs slowly on M1/M2 Macs,
    please use the legacy Keras optimizer instead,
    located at `tf.keras.optimizers.legacy.Adam`
    Therefore, keras.optimizers.legacy.Adam is used.
    """
    optimizer = keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE)
else:
    optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
  
inputs = keras.Input(shape=(1,), dtype="int64")
x = text_vectorization(inputs)
x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=MAX_TOKENS,
    sequence_length=SEQUENCE_LENGTH,
    embedding_dim=EMBEDDING_DIM,
)(x)
x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMIDIATE_DIM,
    num_heads=NUM_HEADS
)(x, x)
outputs = keras.layers.Dense(
    SEQUENCE_LENGTH,
    activation="softmax"
)(x)
model = keras.Model(inputs, outputs)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
)
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 text_vectorization (TextVe  (None, 50)                   0         ['input_4[0][0]']             
 ctorization)                                                                                     
                                                                                                  
 token_and_position_embeddi  (None, 50, 256)              3852800   ['text_vectorization[0][0]']  
 ng_2 (TokenAndPositionEmbe                                                                       
 dding)                                                                                     

In [18]:
tokens_index = dict(enumerate(text_vectorization.get_vocabulary()))

def sample_next(predictions, temperature=1.0):
    predictions = np.asarray(predictions).astype("float64")
    predictions = np.log(predictions) / temperature
    exp_preds = np.exp(predictions)
    predictions = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, predictions, 1)
    return np.argmax(probas)

class TextGenerator(keras.callbacks.Callback):
    def __init__(
            self,
            prompt,
            generate_length,
            model_input_length,
            temperatures=(1.,),
            print_freq=1):
        self.prompt = prompt
        self.generate_length = generate_length
        self.model_input_length = model_input_length
        self.temperatures = temperatures
        self.print_freq = print_freq
  
    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.print_freq != 0:
            return
        for temperature in self.temperatures:
            sentence = self.prompt
            for i in range(self.generate_length):
                tokenized_sentence = text_vectorization([sentence])
                predictions = self.model(tokenized_sentence)
                next_token = sample_next(predictions[0, i, :])
                sampled_token = tokens_index[next_token]
                sentence += " " + sampled_token
            print(f"\nTemperature {temperature}: {sentence}")

prompt = "This movie" 
text_gen_callback = TextGenerator(
    prompt,
    generate_length=50,
    model_input_length=SEQUENCE_LENGTH,
    temperatures=(0., 0.2, 0.5, 0.7, 1., 1.5)
)

class EpochModelCheckpoint(tf.keras.callbacks.ModelCheckpoint):
    def __init__(
        self,
        filepath,
        frequency=1,
        monitor='val_loss',
        verbose=0,
        save_best_only=False,
        save_weights_only=False,
        mode='auto',
        options=None,
        **kwargs):
        super(EpochModelCheckpoint, self).__init__(
            filepath,
            monitor,
            verbose,
            save_best_only,
            save_weights_only,
            mode,
            "epoch",
            options
        )
        self.epochs_since_last_save = 0
        self.frequency = frequency

    def on_epoch_end(self, epoch, logs=None):
        self.epochs_since_last_save += 1
        if self.epochs_since_last_save % self.frequency == 0:
            self._save_model(epoch=epoch, batch=None, logs=logs)

    def on_train_batch_end(self, batch, logs=None):
        pass

model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath='example_transformer_next_word_prediction.keras',
    monitor='loss',
    mode='min',
    save_best_only=False,
)

In [19]:
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=200,
    batch_size=BATCH_SIZE,
)

Epoch 1/200


ValueError: in user code:

    File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras/src/engine/training.py", line 1126, in train_step
        y_pred = self(x, training=True)
    File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras_nlp/src/layers/modeling/transformer_decoder.py", line 265, in __call__
        return super().__call__(
    File "/var/folders/yb/ls4k026509ndv565p_y41swh0000gn/T/__autograph_generated_filetbz7sq0i.py", line 118, in tf__call
        attention_output = ag__.converted_call(ag__.ld(self)._self_attention_layer, (), dict(query=ag__.ld(x), value=ag__.ld(x), attention_mask=ag__.ld(self_attention_mask), cache=ag__.ld(self_attention_cache), cache_update_index=ag__.ld(self_attention_cache_update_index), training=ag__.ld(training)), fscope)
    File "/var/folders/yb/ls4k026509ndv565p_y41swh0000gn/T/__autograph_generated_file52tseyrw.py", line 39, in tf__call
        query = ag__.converted_call(ag__.ld(self)._query_dense, (ag__.ld(query),), None, fscope)

    ValueError: Exception encountered when calling layer 'transformer_decoder_2' (type TransformerDecoder).
    
    in user code:
    
        File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras_nlp/src/layers/modeling/transformer_decoder.py", line 386, in call  *
            attention_output = self._self_attention_layer(
        File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/var/folders/yb/ls4k026509ndv565p_y41swh0000gn/T/__autograph_generated_file52tseyrw.py", line 39, in tf__call
            query = ag__.converted_call(ag__.ld(self)._query_dense, (ag__.ld(query),), None, fscope)
    
        ValueError: Exception encountered when calling layer 'self_attention' (type CachedMultiHeadAttention).
        
        in user code:
        
            File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras_nlp/src/layers/modeling/cached_multi_head_attention.py", line 100, in call  *
                query = self._query_dense(query)
            File "/Users/mitsuaki.ishimoto/.pyenv/versions/3.10.11/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
                raise e.with_traceback(filtered_tb) from None
        
            ValueError: Exception encountered when calling layer 'query' (type EinsumDense).
            
            Shape must be rank 3 but is rank 2
            	 for 0th input and equation: abc,cde->abde for '{{node model_2/transformer_decoder_2/self_attention/query/einsum/Einsum}} = Einsum[N=2, T=DT_FLOAT, equation="abc,cde->abde"](model_2/token_and_position_embedding_2/add, model_2/transformer_decoder_2/self_attention/query/einsum/Einsum/ReadVariableOp)' with input shapes: [?,256], [256,2,128].
            
            Call arguments received by layer 'query' (type EinsumDense):
              • inputs=tf.Tensor(shape=(None, 256), dtype=float32)
        
        
        Call arguments received by layer 'self_attention' (type CachedMultiHeadAttention):
          • query=tf.Tensor(shape=(None, 256), dtype=float32)
          • value=tf.Tensor(shape=(None, 256), dtype=float32)
          • key=None
          • attention_mask=tf.Tensor(shape=(None, 256, 256), dtype=bool)
          • cache=None
          • cache_update_index=None
          • training=True
    
    
    Call arguments received by layer 'transformer_decoder_2' (type TransformerDecoder):
      • decoder_sequence=tf.Tensor(shape=(None, 256), dtype=float32)
      • encoder_sequence=tf.Tensor(shape=(None, 256), dtype=float32)
      • decoder_padding_mask=None
      • decoder_attention_mask=None
      • encoder_padding_mask=None
      • encoder_attention_mask=None
      • self_attention_cache=None
      • self_attention_cache_update_index=None
      • cross_attention_cache=None
      • cross_attention_cache_update_index=None
      • use_causal_mask=True
      • training=True
