<a href="https://colab.research.google.com/github/jbischof/keras-nlp/blob/classifier_preset/bert_tiny_en_uncased_sst2_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q git+https://github.com/mattdangerw/keras-nlp.git@bert-pipeline tensorflow==2.10 --upgrade

[K     |████████████████████████████████| 578.0 MB 16 kB/s 
[K     |████████████████████████████████| 5.8 MB 72.0 MB/s 
[K     |████████████████████████████████| 5.9 MB 68.7 MB/s 
[K     |████████████████████████████████| 438 kB 88.6 MB/s 
[K     |████████████████████████████████| 1.7 MB 80.4 MB/s 
[K     |████████████████████████████████| 5.9 MB 61.9 MB/s 
[?25h  Building wheel for keras-nlp (setup.py) ... [?25l[?25hdone


In [None]:
import keras_nlp
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

# Data

In [None]:
# Fetch the GLUE SST-2 train and validation splits, batched 16 examples at a time.
train_ds, valid_ds = tfds.load(
    name="glue/sst2", split=["train", "validation"], batch_size=16
)


def split_features(x):
    """Turn one GLUE example dictionary into a `(features, label)` pair.

    Args:
        x: Mapping that has at least the keys `"sentence"` and `"label"`.
            Any other keys are ignored.

    Returns:
        A 2-tuple `(x["sentence"], x["label"])`.
    """
    return x["sentence"], x["label"]


# Convert every batch of both splits to (sentence, label) form, then prefetch
# so that input preparation can overlap with model execution.
train_ds = train_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
valid_ds = valid_ds.map(split_features, num_parallel_calls=tf.data.AUTOTUNE)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)

# Look at the first training batch.
# The format is (string_tensor, label_tensor).
train_ds.take(1).get_single_element()

Downloading and preparing dataset 7.09 MiB (download: 7.09 MiB, generated: 7.22 MiB, total: 14.31 MiB) to ~/tensorflow_datasets/glue/sst2/2.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/67349 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/glue/sst2/2.0.0.incompleteQH5CX4/glue-train.tfrecord*...:   0%|          | 0/6…

Generating validation examples...:   0%|          | 0/872 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/glue/sst2/2.0.0.incompleteQH5CX4/glue-validation.tfrecord*...:   0%|          …

Generating test examples...:   0%|          | 0/1821 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/glue/sst2/2.0.0.incompleteQH5CX4/glue-test.tfrecord*...:   0%|          | 0/18…

Dataset glue downloaded and prepared to ~/tensorflow_datasets/glue/sst2/2.0.0. Subsequent calls will reuse this data.


(<tf.Tensor: shape=(16,), dtype=string, numpy=
 array([b'for the uninitiated plays better on video with the sound ',
        b'like a giant commercial for universal studios , where much of the action takes place ',
        b'company once again dazzle and delight us ',
        b"'s no surprise that as a director washington demands and receives excellent performances , from himself and from newcomer derek luke ",
        b', this cross-cultural soap opera is painfully formulaic and stilted . ',
        b", the film is n't nearly as downbeat as it sounds , but strikes a tone that 's alternately melancholic , hopeful and strangely funny . ",
        b'only masochistic moviegoers need apply . ',
        b'convince almost everyone that it was put on the screen , just for them ',
        b"like the english patient and the unbearable lightness of being , the hours is one of those reputedly `` unfilmable '' novels that has bucked the odds to emerge as an exquisite motion picture in its own righ

In [None]:
# Build a two-class classifier from the "bert_tiny_en_uncased" preset.
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_tiny_en_uncased", num_classes=2
)

# `from_logits=True` tells the loss to treat the classifier outputs as
# unnormalized logits; `jit_compile=True` requests XLA compilation.
classifier.compile(
    optimizer=keras.optimizers.experimental.AdamW(5e-5),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=keras.metrics.SparseCategoricalAccuracy(),
    jit_compile=True,
)

# Fine-tune for two passes over the training split, evaluating on the
# validation split after each epoch.
classifier.fit(train_ds, validation_data=valid_ds, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f6bbd0923d0>

In [None]:
# Write the fine-tuned weights to an HDF5 checkpoint file.
classifier.save_weights(filepath="/content/model.h5")

In [None]:
# Show the saved checkpoint's size and its MD5 checksum.
!ls -lh /content/model.h5
!md5sum /content/model.h5

-rw-r--r-- 1 root root 17M Nov 28 19:58 /content/model.h5
1f9c2d59f9e229e08f3fbd44239cfb0b  /content/model.h5


In [None]:
# Load the checkpoint written above back into the classifier.
classifier.load_weights(filepath="/content/model.h5")

In [None]:
config = classifier.get_config()
# Replace the tokenizer vocabulary with an empty list so the displayed
# config stays readable.
config["preprocessor"]["config"]["tokenizer"]["config"].update(vocabulary=[])
config

{'backbone': {'class_name': 'keras_nlp>Bert',
  'config': {'vocabulary_size': 30522,
   'hidden_dim': 128,
   'intermediate_dim': 512,
   'num_layers': 2,
   'num_heads': 2,
   'max_sequence_length': 512,
   'num_segments': 2,
   'dropout': 0.1,
   'name': 'backbone',
   'trainable': True}},
 'preprocessor': {'class_name': 'keras_nlp>BertPreprocessor',
  'config': {'name': 'bert_preprocessor_1',
   'trainable': True,
   'dtype': 'float32',
   'tokenizer': {'class_name': 'keras_nlp>BertTokenizer',
    'config': {'name': 'bert_tokenizer_1',
     'trainable': True,
     'dtype': 'int32',
     'vocabulary': [],
     'sequence_length': None,
     'lowercase': True,
     'strip_accents': False,
     'split': True,
     'suffix_indicator': '##',
     'oov_token': '[UNK]'}},
   'sequence_length': 512,
   'truncate': 'round_robin'}},
 'num_classes': 2,
 'name': 'bert_classifier_1',
 'trainable': True}

In [None]:
# Drop the preprocessor entry and show what remains of the config.
# A default is passed to `pop` so that re-running this cell (when the key has
# already been removed) does not raise KeyError.
config.pop("preprocessor", None)
config

{'backbone': {'class_name': 'keras_nlp>Bert',
  'config': {'vocabulary_size': 30522,
   'hidden_dim': 128,
   'intermediate_dim': 512,
   'num_layers': 2,
   'num_heads': 2,
   'max_sequence_length': 512,
   'num_segments': 2,
   'dropout': 0.1,
   'name': 'backbone',
   'trainable': True}},
 'num_classes': 2,
 'name': 'bert_classifier_1',
 'trainable': True}