! pip install tensorflow-text

! pip install keras_nlp

# Environment sanity check: import each library and print its version.
# Each import is immediately followed by its print, so a failing import
# still leaves the versions of the earlier libraries visible in the output.
import tensorflow
print(tensorflow.__version__)

import keras_nlp
print(keras_nlp.__version__)

import tensorflow_text
print(tensorflow_text.__version__)

In [7]:
import os

import keras_nlp
import tensorflow as tf
from tensorflow import keras

In [8]:
# Preprocessing params.
PRETRAINING_BATCH_SIZE = 128
FINETUNING_BATCH_SIZE = 32

# SEQ_LENGTH is the tokenizer's `sequence_length` below and the default
# input length of the model-building functions in this file.
SEQ_LENGTH = 512
# MASK_RATE and PREDICTIONS_PER_SEQ are passed to the MLM mask generator
# below as `mask_selection_rate` and `mask_selection_length` respectively.
MASK_RATE = 0.25
PREDICTIONS_PER_SEQ = 32

# Model params.
NUM_LAYERS = 3
MODEL_DIM = 256
INTERMEDIATE_DIM = 512
NUM_HEADS = 4
DROPOUT = 0.1
NORM_EPSILON = 1e-5

# Training params.
PRETRAINING_LEARNING_RATE = 5e-4
PRETRAINING_EPOCHS = 8
FINETUNING_LEARNING_RATE = 5e-5
FINETUNING_EPOCHS = 3

# Download vocabulary data.
# The file name in the URL indicates an uncased BERT vocabulary, which is
# consistent with `lowercase=True` in the tokenizer below.
vocab_file = keras.utils.get_file(
    origin="https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt",
)

# Setting sequence_length will trim or pad the token outputs to shape
# (batch_size, SEQ_LENGTH).
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_file,
    sequence_length=SEQ_LENGTH,
    lowercase=True,
    strip_accents=True,
)

# Setting mask_selection_length will trim or pad the mask outputs to shape
# (batch_size, PREDICTIONS_PER_SEQ).
# The vocabulary size and the "[MASK]" token id are read from `tokenizer`
# once, here, when the layer is constructed.
masker = keras_nlp.layers.MLMMaskGenerator(
    vocabulary_size=tokenizer.vocabulary_size(),
    mask_selection_rate=MASK_RATE,
    mask_selection_length=PREDICTIONS_PER_SEQ,
    mask_token_id=tokenizer.token_to_id("[MASK]"),
)

In [15]:
from transformers import AutoTokenizer

# NOTE(review): this rebinds the module-level name `tokenizer`, replacing the
# keras_nlp WordPieceTokenizer created earlier in this file. `masker` was
# already constructed from that earlier tokenizer's vocabulary size and
# "[MASK]" id, so it is not affected by this reassignment. Confirm that
# mixing the two vocabularies is intended ("bert-base-cased" here versus the
# uncased vocabulary file downloaded earlier).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Bare expression: in a notebook cell this displays the vocabulary size.
tokenizer.vocab_size

In [18]:
def transformer_encoder(
    seq_length=SEQ_LENGTH,
    num_layers=NUM_LAYERS,
):
    """Build a Transformer encoder model over integer token ids.

    Tokens are embedded together with their positions, passed through layer
    normalization and dropout, and then through a stack of `num_layers`
    `keras_nlp.layers.TransformerEncoder` blocks.

    The vocabulary size is read from the module-level `tokenizer` (its
    `vocab_size` attribute) at call time, so `tokenizer` must be bound to an
    object exposing that attribute before this function is called.

    Args:
        seq_length: Length of the input token-id sequences. Used for both the
            model's input shape and the positional embedding length.
        num_layers: Number of encoder blocks to stack. Defaults to
            `NUM_LAYERS`.

    Returns:
        A `keras.Model` that maps int32 token ids of shape
        `(batch, seq_length)` to the per-token output of the last encoder
        block.
    """
    inputs = keras.Input(shape=(seq_length,), dtype=tf.int32)

    # Embed our tokens with a positional embedding.
    embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
        vocabulary_size=tokenizer.vocab_size,
        sequence_length=seq_length,
        embedding_dim=MODEL_DIM,
    )
    outputs = embedding_layer(inputs)

    # Apply layer normalization and dropout to the embedding.
    outputs = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(outputs)
    outputs = keras.layers.Dropout(rate=DROPOUT)(outputs)

    # Add a number of encoder blocks. Each iteration creates a new layer
    # instance, so the blocks do not share weights.
    for _ in range(num_layers):
        outputs = keras_nlp.layers.TransformerEncoder(
            intermediate_dim=INTERMEDIATE_DIM,
            num_heads=NUM_HEADS,
            dropout=DROPOUT,
            layer_norm_epsilon=NORM_EPSILON,
        )(outputs)

    return keras.Model(inputs, outputs)

In [13]:
def transformer_text_single_label_classifier(
    num_label=2,
    seq_length=SEQ_LENGTH,
):
    """Build a single-label text classifier on top of `transformer_encoder`.

    The encoder's per-token outputs are averaged over the sequence axis and
    fed to a dense layer with a softmax activation, giving one probability
    distribution over `num_label` classes per example.

    Args:
        num_label: Number of output classes (units of the final dense layer).
        seq_length: Length of the input token-id sequences. Forwarded to
            `transformer_encoder` and used for this model's input shape.

    Returns:
        A `keras.Model` mapping int32 token ids of shape
        `(batch, seq_length)` to softmax outputs of shape
        `(batch, num_label)`.
    """
    encoder_model = transformer_encoder(seq_length)

    # Take as input the tokenized input. The input length must match the
    # encoder's, so it is built from `seq_length` rather than the global
    # SEQ_LENGTH; otherwise a non-default `seq_length` produces mismatched
    # shapes.
    inputs = keras.Input(shape=(seq_length,), dtype=tf.int32)

    # Encode and pool the tokens.
    encoded_tokens = encoder_model(inputs)
    pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens)

    # Predict an output label.
    outputs = keras.layers.Dense(num_label, activation="softmax")(pooled_tokens)

    return keras.Model(inputs, outputs)

In [14]:
def transformer_text_multi_label_classifier(
    num_label=2,
    seq_length=SEQ_LENGTH,
):
    """Build a multi-label text classifier on top of `transformer_encoder`.

    The encoder's per-token outputs are averaged over the sequence axis and
    fed to a dense layer with a sigmoid activation, so each of the
    `num_label` outputs is an independent value in (0, 1).

    Args:
        num_label: Number of labels (units of the final dense layer).
        seq_length: Length of the input token-id sequences. Forwarded to
            `transformer_encoder` and used for this model's input shape.

    Returns:
        A `keras.Model` mapping int32 token ids of shape
        `(batch, seq_length)` to sigmoid outputs of shape
        `(batch, num_label)`.
    """
    encoder_model = transformer_encoder(seq_length)

    # Take as input the tokenized input. The input length must match the
    # encoder's, so it is built from `seq_length` rather than the global
    # SEQ_LENGTH; otherwise a non-default `seq_length` produces mismatched
    # shapes.
    inputs = keras.Input(shape=(seq_length,), dtype=tf.int32)

    # Encode and pool the tokens.
    encoded_tokens = encoder_model(inputs)
    pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens)

    # Predict one independent score per label.
    outputs = keras.layers.Dense(num_label, activation="sigmoid")(pooled_tokens)

    return keras.Model(inputs, outputs)