26 changes: 13 additions & 13 deletions keras_nlp/models/bert/bert_backbone.py
@@ -33,17 +33,17 @@ def bert_kernel_initializer(stddev=0.02):

@keras_nlp_export("keras_nlp.models.BertBackbone")
class BertBackbone(Backbone):
"""BERT encoder network.
"""A BERT encoder network.

This class implements a bi-directional Transformer-based encoder as
described in ["BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding"](https://arxiv.org/abs/1810.04805). It includes the
embedding lookups and transformer layers, but not the masked language model
or next sentence prediction heads.

The default constructor gives a fully customizable, randomly initialized BERT
encoder with any number of layers, heads, and embedding dimensions. To load
preset architectures and weights, use the `from_preset` constructor.
The default constructor gives a fully customizable, randomly initialized
BERT encoder with any number of layers, heads, and embedding dimensions. To
load preset architectures and weights, use the `from_preset()` constructor.

Disclaimer: Pre-trained models are provided on an "as is" basis, without
warranties or conditions of any kind.
@@ -76,20 +76,20 @@ class BertBackbone(Backbone):
),
}

# Pretrained BERT encoder
# Pretrained BERT encoder.
model = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")
output = model(input_data)
model(input_data)

# Randomly initialized BERT encoder with a custom config
# Randomly initialized BERT encoder with a custom config.
model = keras_nlp.models.BertBackbone(
vocabulary_size=30552,
num_layers=12,
num_heads=12,
hidden_dim=768,
intermediate_dim=3072,
max_sequence_length=12,
num_layers=4,
num_heads=4,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128,
)
output = model(input_data)
model(input_data)
```
"""

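For review context, a minimal end-to-end sketch of calling the backbone, reconstructing the `input_data` dict that the hunk above elides; the key names mirror the docstring, while the preset name and the output keys (`sequence_output`, `pooled_output`) noted in the comments are assumptions based on the library's usual conventions rather than part of this diff.

```python
import tensorflow as tf
import keras_nlp

# Batch of two sequences, length 12, in the three-input format shown above.
input_data = {
    "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
    "segment_ids": tf.constant(
        [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
    ),
    "padding_mask": tf.constant(
        [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
    ),
}

# Load preset weights and run a forward pass.
model = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")
outputs = model(input_data)

# The backbone returns a dict of dense features (shapes shown for bert_base).
print(outputs["sequence_output"].shape)  # (2, 12, 768)
print(outputs["pooled_output"].shape)    # (2, 768)
```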
123 changes: 47 additions & 76 deletions keras_nlp/models/bert/bert_classifier.py
@@ -30,12 +30,12 @@

@keras_nlp_export("keras_nlp.models.BertClassifier")
class BertClassifier(Task):
"""An end-to-end BERT model for classification tasks
"""An end-to-end BERT model for classification tasks.

This model attaches a classification head to a `keras_nlp.model.BertBackbone`
backbone, mapping from the backbone outputs to logit output suitable for
a classification task. For usage of this model with pre-trained weights, see
the `from_preset()` method.
This model attaches a classification head to a
`keras_nlp.model.BertBackbone` instance, mapping from the backbone outputs
to logits suitable for a classification task. For usage of this model with
pre-trained weights, use the `from_preset()` constructor.

This model can optionally be configured with a `preprocessor` layer, in
which case it will automatically apply preprocessing to raw inputs during
@@ -56,90 +56,34 @@ class BertClassifier(Task):

Examples:

Example usage.
Raw string data.
```python
# Define the preprocessed inputs.
preprocessed_features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"segment_ids": tf.constant(
[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
}
labels = [0, 3]

# Randomly initialize a BERT backbone.
backbone = keras_nlp.models.BertBackbone(
vocabulary_size=30552,
num_layers=12,
num_heads=12,
hidden_dim=768,
intermediate_dim=3072,
max_sequence_length=12
)

# Create a BERT classifier and fit your data.
classifier = keras_nlp.models.BertClassifier(
backbone,
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)

# Access backbone programmatically (e.g., to change `trainable`)
classifier.backbone.trainable = False
```

Raw string inputs.
```python
# Create a dataset with raw string features in an `(x, y)` format.
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

# Create a BertClassifier and fit your data.
# Pretrained classifier.
classifier = keras_nlp.models.BertClassifier.from_preset(
"bert_base_en_uncased",
num_classes=4,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=features, y=labels, batch_size=2)
```

Raw string inputs with customized preprocessing.
```python
# Create a dataset with raw string features in an `(x, y)` format.
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

# Use a shorter sequence length.
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
"bert_base_en_uncased",
sequence_length=128,
)
classifier.predict(x=features, batch_size=2)

# Create a BertClassifier and fit your data.
classifier = keras_nlp.models.BertClassifier.from_preset(
"bert_base_en_uncased",
num_classes=4,
preprocessor=preprocessor,
)
# Re-compile (e.g., with a new learning rate).
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
# Access backbone programmatically (e.g., to change `trainable`).
classifier.backbone.trainable = False
# Fit again.
classifier.fit(x=features, y=labels, batch_size=2)
```

Preprocessed inputs.
Preprocessed integer data.
```python
# Create a dataset with preprocessed features in an `(x, y)` format.
preprocessed_features = {
features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"segment_ids": tf.constant(
[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
@@ -150,16 +94,43 @@ class BertClassifier(Task):
}
labels = [0, 3]

# Create a BERT classifier and fit your data.
# Pretrained classifier without preprocessing.
classifier = keras_nlp.models.BertClassifier.from_preset(
"bert_base_en_uncased",
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
classifier.fit(x=features, y=labels, batch_size=2)
```

Custom backbone and vocabulary.
```python
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
vocab += ["The", "quick", "brown", "fox", "jumped", "."]
tokenizer = keras_nlp.models.BertTokenizer(
vocabulary=vocab,
)
preprocessor = keras_nlp.models.BertPreprocessor(
tokenizer=tokenizer,
sequence_length=128,
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
backbone = keras_nlp.models.BertBackbone(
vocabulary_size=30552,
num_layers=4,
num_heads=4,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128,
)
classifier = keras_nlp.models.BertClassifier(
backbone=backbone,
preprocessor=preprocessor,
num_classes=4,
)
classifier.fit(x=features, y=labels, batch_size=2)
```
"""

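Complementing the docstring's training examples, a short inference sketch showing how to turn the classifier's logit output into class predictions; it assumes the preset attaches its default preprocessor, and the classification head here is untrained, so the predictions are only illustrative.

```python
import tensorflow as tf
import keras_nlp

features = ["The quick brown fox jumped.", "I forgot my homework."]

# Pretrained encoder with a freshly initialized 4-class head; fine-tune with
# `fit()` before relying on its outputs.
classifier = keras_nlp.models.BertClassifier.from_preset(
    "bert_base_en_uncased",
    num_classes=4,
)

# `predict()` returns logits with shape (num_samples, num_classes).
logits = classifier.predict(features, batch_size=2)

# Convert logits to probabilities and predicted class ids.
probs = tf.nn.softmax(logits, axis=-1)
class_ids = tf.argmax(probs, axis=-1)
```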
43 changes: 18 additions & 25 deletions keras_nlp/models/bert/bert_masked_lm.py
@@ -37,7 +37,7 @@ class BertMaskedLM(Task):
This model will train BERT on a masked language modeling task.
The model will predict labels for a number of masked tokens in the
input data. For usage of this model with pre-trained weights, see the
`from_preset()` method.
`from_preset()` constructor.

This model can optionally be configured with a `preprocessor` layer, in
which case inputs can be raw string features during `fit()`, `predict()`,
@@ -56,26 +56,32 @@ class BertMaskedLM(Task):

Example usage:

Raw string inputs and pretrained backbone.
Raw string data.
```python
# Create a dataset with raw string features. Labels are inferred.
features = ["The quick brown fox jumped.", "I forgot my homework."]

# Create a BertMaskedLM with a pretrained backbone and further train
# on an MLM task.
# Pretrained language model.
masked_lm = keras_nlp.models.BertMaskedLM.from_preset(
"bert_base_en",
"bert_base_en_uncased",
)
masked_lm.fit(x=features, batch_size=2)

# Re-compile (e.g., with a new learning rate).
masked_lm.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
# Access backbone programmatically (e.g., to change `trainable`).
masked_lm.backbone.trainable = False
# Fit again.
masked_lm.fit(x=features, batch_size=2)
```

Preprocessed inputs and custom backbone.
Preprocessed integer data.
```python
# Create a preprocessed dataset where 0 is the mask token.
preprocessed_features = {
# Create preprocessed batch where 0 is the mask token.
features = {
"token_ids": tf.constant(
[[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
),
@@ -88,24 +94,11 @@
# Labels are the original masked values.
labels = [[3, 5]] * 2

# Randomly initialize a BERT encoder
backbone = keras_nlp.models.BertBackbone(
vocabulary_size=50265,
num_layers=12,
num_heads=12,
hidden_dim=768,
intermediate_dim=3072,
max_sequence_length=12
)
# Create a BERT masked LM model and fit the data.
masked_lm = keras_nlp.models.BertMaskedLM(
backbone,
masked_lm = keras_nlp.models.BertMaskedLM.from_preset(
"bert_base_en_uncased",
preprocessor=None,
)
masked_lm.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
masked_lm.fit(x=preprocessed_features, y=labels, batch_size=2)
masked_lm.fit(x=features, y=labels, batch_size=2)
```
"""

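To round out the masked language modeling examples, a sketch of running inference on a preprocessed batch; the extra feature keys (`padding_mask`, `segment_ids`, `mask_positions`) and the logit shape noted in the comments are assumptions about the task's usual input/output format rather than part of this diff.

```python
import tensorflow as tf
import keras_nlp

# Preprocessed batch where 0 is the mask token, mirroring the format above.
features = {
    "token_ids": tf.constant([[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)),
    "padding_mask": tf.constant([[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8)),
    "segment_ids": tf.constant([[0, 0, 0, 0, 0, 0, 0, 0]] * 2, shape=(2, 8)),
    "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)),
}

masked_lm = keras_nlp.models.BertMaskedLM.from_preset(
    "bert_base_en_uncased",
    preprocessor=None,
)

# Logits over the vocabulary at each masked position, assumed shape
# (batch_size, num_mask_positions, vocabulary_size).
logits = masked_lm.predict(features)

# The most likely replacement token id for each masked position.
predicted_token_ids = tf.argmax(logits, axis=-1)
```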