<a href="https://colab.research.google.com/github/huangwenbing4github/keras_demo/blob/main/keras_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import keras_nlp 
import tensorflow as tf
from tensorflow import keras

In [2]:
!pip install keras_nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_nlp
  Downloading keras_nlp-0.4.1-py3-none-any.whl (466 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m466.8/466.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-text
  Downloading tensorflow_text-2.12.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-text, keras_nlp
Successfully installed keras_nlp-0.4.1 tensorflow-text-2.12.0


In [4]:
# --- Data ---
BATCH_SIZE = 64             # lines per batch in the raw datasets
SEQ_LEN = 128               # token sequence length for the tokenizer, packer and embedding
MIN_TRAINING_SEQ_LEN = 450  # lines whose string length is not above this are filtered out

# --- Model ---
EMBED_DIM = 256             # token/position embedding width
FEED_FORWARD_DIM = 256      # intermediate_dim of each transformer decoder layer
NUM_HEADS = 3               # attention heads per decoder layer
NUM_LAYERS = 2              # number of stacked decoder layers
VOCAB_SIZE = 5000           # WordPiece vocabulary size and output layer width

# --- Training ---
EPOCHS = 6

# --- Inference ---
NUM_TOKENS_TO_GENERATE = 80  # max_length passed to the search utilities

In [5]:
# Download and unpack the SimpleBooks archive; the text files are read from
# /content/simplebooks below.
keras.utils.get_file(
    origin='https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip',
    extract=True,
    cache_subdir='/content'
)

# Single base path for both splits. This replaces the previous unused `dir`
# variable, which also shadowed the builtin of the same name.
SIMPLEBOOKS_RAW_DIR = os.path.join('/content/simplebooks', 'simplebooks-92-raw')

# Training split: keep only lines longer than MIN_TRAINING_SEQ_LEN, batch them,
# then shuffle. Shuffle comes after batch, so whole batches are reordered
# rather than individual lines.
raw_train_ds = (
    tf.data.TextLineDataset(os.path.join(SIMPLEBOOKS_RAW_DIR, 'train.txt'))
    .filter(lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN)
    .batch(BATCH_SIZE)
    .shuffle(buffer_size=256)
)

# Validation split: same length filter and batching, no shuffling.
raw_val_ds = (
    tf.data.TextLineDataset(os.path.join(SIMPLEBOOKS_RAW_DIR, 'valid.txt'))
    .filter(lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN)
    .batch(BATCH_SIZE)
)

Downloading data from https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip


In [6]:
# Train a WordPiece vocabulary of VOCAB_SIZE entries on the (lower-cased)
# training text. The reserved tokens are listed in the order passed here;
# the model cell masks id 0 (mask_zero / mask_token_id=0).
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    reserved_tokens=['[PAD]', '[UNK]', '[BOS]'],
    lowercase=True,
    vocabulary_size=VOCAB_SIZE,
)

In [7]:
# Show the vocabulary size and a short sample instead of dumping every token.
print(f'{len(vocab)} tokens; first 20: {vocab[:20]}')



In [None]:
!ls -l /content/simplebooks/simplebooks-92-row/

ls: cannot access '/content/simplebooks/simplebooks-92-row/': No such file or directory


In [8]:
# WordPiece tokenizer built from the computed vocabulary; configured with
# sequence_length=SEQ_LEN and lower-casing.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab, sequence_length=SEQ_LEN, lowercase=True
)

# Packer that uses the [BOS] token id as its start value and emits sequences
# of length SEQ_LEN.
start_packer = keras_nlp.layers.StartEndPacker(
    start_value=tokenizer.token_to_id('[BOS]'), sequence_length=SEQ_LEN
)
def preprocess(inputs):
  """Turn a batch of raw text lines into (features, labels) for training.

  The text is tokenized once; the labels are the token ids themselves and the
  features are the same ids passed through `start_packer` (configured with
  the [BOS] id as its start value).

  Args:
    inputs: batch of strings taken from the raw text datasets.

  Returns:
    Tuple `(features, labels)`.
  """
  token_ids = tokenizer(inputs)
  return start_packer(token_ids), token_ids

# Tokenize both splits in parallel, then prefetch so input preparation can
# overlap with model execution.
train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)

In [9]:
# Variable-length sequences of integer token ids.
inputs = keras.layers.Input(shape=(None,), dtype=tf.int32)

# Token + position embedding; mask_zero=True marks id 0 as masked.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)

# Stack NUM_LAYERS TransformerDecoder layers, each fed only the running
# activations.
for _ in range(NUM_LAYERS):
  x = keras_nlp.layers.TransformerDecoder(
      num_heads=NUM_HEADS,
      intermediate_dim=FEED_FORWARD_DIM,
  )(x)

# Project every position to VOCAB_SIZE unnormalised scores (logits).
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)

# The loss and the metric both consume logits; perplexity skips token id 0.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer='adam', loss=loss_fn, metrics=[perplexity])


In [10]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddin  (None, None, 256)        1312768   
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_decoder (Transf  (None, None, 256)        394749    
 ormerDecoder)                                                   
                                                                 
 transformer_decoder_1 (Tran  (None, None, 256)        394749    
 sformerDecoder)                                                 
                                                                 
 dense (Dense)               (None, None, 5000)        128500

In [11]:
# Train for EPOCHS epochs, validating after each one. The returned History is
# kept so the per-epoch metrics stay available, and assigning it stops the
# object's repr from being shown as the cell output.
history = model.fit(train_ds, validation_data=val_ds, verbose=2, epochs=EPOCHS)

Epoch 1/6
3169/3169 - 385s - loss: 4.5747 - perplexity: 97.3710 - val_loss: 4.1101 - val_perplexity: 61.5153 - 385s/epoch - 122ms/step
Epoch 2/6
3169/3169 - 232s - loss: 4.0477 - perplexity: 57.4883 - val_loss: 3.9968 - val_perplexity: 55.0295 - 232s/epoch - 73ms/step
Epoch 3/6
3169/3169 - 231s - loss: 3.9374 - perplexity: 51.4791 - val_loss: 3.9345 - val_perplexity: 51.5681 - 231s/epoch - 73ms/step
Epoch 4/6
3169/3169 - 231s - loss: 3.8760 - perplexity: 48.4129 - val_loss: 3.8886 - val_perplexity: 49.2482 - 231s/epoch - 73ms/step
Epoch 5/6
3169/3169 - 240s - loss: 3.8351 - perplexity: 46.4743 - val_loss: 3.8484 - val_perplexity: 47.3795 - 240s/epoch - 76ms/step
Epoch 6/6
3169/3169 - 244s - loss: 3.8043 - perplexity: 45.0633 - val_loss: 3.8385 - val_perplexity: 46.8944 - 244s/epoch - 77ms/step


<keras.callbacks.History at 0x7ff779e77eb0>

In [12]:
# Generation starts from a prompt containing only the [BOS] token id.
bos_token_id = tokenizer.token_to_id('[BOS]')
prompt_tokens = tf.convert_to_tensor([bos_token_id])
def token_logits_fn(inputs):
  """Return the model's logits at the last position of `inputs`.

  Args:
    inputs: token ids indexed as (batch, position); the size of axis 1 is
      read from the static shape.

  Returns:
    The slice of the model output at position `inputs.shape[1] - 1`, for
    every batch entry.
  """
  last_position = inputs.shape[1] - 1
  return model(inputs)[:, last_position, :]


In [13]:
# Greedy search: generate up to NUM_TOKENS_TO_GENERATE tokens from the prompt.
output_tokens = keras_nlp.utils.greedy_search(
    token_logits_fn, prompt_tokens, max_length=NUM_TOKENS_TO_GENERATE
)

# Map the generated ids back to text and display it.
txt = tokenizer.detokenize(output_tokens)
print(f'Greedy search generated text:\n{txt}\n')

Greedy search generated text:
b'[BOS] " i am glad to have you , sir , " said the knight , " and i am glad that you have been a knight , and i have been so long before you have been able to do so . i have been thinking of your being a knight , and i have been so long before you have been able to do so , and i have been able to do so . i have been'



In [15]:
# Beam search with 10 beams; token_logits_fn returns logits, hence
# from_logits=True.
output_tokens = keras_nlp.utils.beam_search(
    token_logits_fn,
    prompt_tokens,
    num_beams=10,
    from_logits=True,
    max_length=NUM_TOKENS_TO_GENERATE,
)

# Map the generated ids back to text and display it.
txt = tokenizer.detokenize(output_tokens)
print(f'Beam search generated text:\n{txt}\n')

Beam search generated text:
b'[BOS] " i don \' t know , " he said , " but i don \' t know what i am going to do . i don \' t know what i have to do , but i don \' t know what i have to do . i don \' t know what i have to do , but i don \' t know what i have to do . i don \' t know whether i am'



In [18]:
# Random search samples each next token, so the text varies between runs.
# A fixed seed is passed so that a fresh run of the notebook can reproduce
# the sample.
# NOTE(review): confirm the installed keras_nlp release accepts `seed` here.
output_tokens = keras_nlp.utils.random_search(
    token_logits_fn,
    prompt_tokens,
    max_length=NUM_TOKENS_TO_GENERATE,
    seed=42,
    from_logits=True
)

# Map the generated ids back to text and display it.
txt = tokenizer.detokenize(output_tokens)
print(f'random search generated text\n{txt}\n')

random search generated text
b'[BOS] the little girls happened before landing , and during her pains was in the million . this was the artist of his two best friends she had known and sirny for her ; she had thought enough she could get something behind they had . etcermentors and clumsily was as worthy of all , the considered herself as healthy . at sunset and the afternoon sky beganness'

