17 changes: 9 additions & 8 deletions keras_nlp/models/albert/albert_backbone.py
@@ -33,11 +33,12 @@ def albert_kernel_initializer(stddev=0.02):

@keras_nlp_export("keras_nlp.models.AlbertBackbone")
class AlbertBackbone(Backbone):
"""ALBERT encoder network.
"""An ALBERT encoder network.

This class implements a bi-directional Transformer-based encoder as
described in
["ALBERT: A Lite BERT for Self-supervised Learning of Language Representations"](https://arxiv.org/abs/1909.11942).
["ALBERT: A Lite BERT for Self-supervised Learning of Language Representations"]
Contributor: Let's remove the linebreak here; it's okay to exceed the line-length limit if it's a hyperlink.

(https://arxiv.org/abs/1909.11942).
ALBERT is a more efficient variant of BERT, and uses parameter reduction
techniques such as cross-layer parameter sharing and factorized embedding
parameterization. This model class includes the embedding lookups and
@@ -46,7 +47,7 @@ class AlbertBackbone(Backbone):

The default constructor gives a fully customizable, randomly initialized
ALBERT encoder with any number of layers, heads, and embedding dimensions.
To load preset architectures and weights, use the `from_preset` constructor.
To load preset architectures and weights, use the `from_preset()` constructor.

Disclaimer: Pre-trained models are provided on an "as is" basis, without
warranties or conditions of any kind.
@@ -90,14 +91,14 @@ class AlbertBackbone(Backbone):
# Randomly initialized ALBERT encoder
model = keras_nlp.models.AlbertBackbone(
vocabulary_size=30000,
num_layers=12,
num_heads=12,
num_layers=4,
num_heads=4,
num_groups=1,
num_inner_repetitions=1,
embedding_dim=128,
hidden_dim=768,
intermediate_dim=3072,
max_sequence_length=12,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128,
)
output = model(input_data)
```
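As an aside, the `from_preset()` constructor mentioned in this docstring is not exercised in the excerpt above. A minimal sketch, assuming the `albert_base_en_uncased` preset name used in the classifier examples later in this PR (this snippet is not part of the diff):

```python
import keras_nlp

# Load a preset ALBERT architecture and weights (preset name assumed from the
# classifier examples later in this PR).
model = keras_nlp.models.AlbertBackbone.from_preset("albert_base_en_uncased")
```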
94 changes: 42 additions & 52 deletions keras_nlp/models/albert/albert_classifier.py
@@ -31,10 +31,10 @@
class AlbertClassifier(Task):
"""An end-to-end ALBERT model for classification tasks

This model attaches a classification head to a `keras_nlp.model.AlbertBackbone`
backbone, mapping from the backbone outputs to logit output suitable for
a classification task. For usage of this model with pre-trained weights, see
the `from_preset()` method.
This model attaches a classification head to a
`keras_nlp.model.AlbertBackbone` instance, mapping from the backbone outputs to logit output suitable for
a classification task. For usage of this model with pre-trained weights, use
the `from_preset()` constructor.

This model can optionally be configured with a `preprocessor` layer, in
which case it will automatically apply preprocessing to raw inputs during
@@ -55,49 +55,8 @@ class AlbertClassifier(Task):

Examples:

Example usage.
Raw string data.
```python
# Define the preprocessed inputs.
preprocessed_features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"segment_ids": tf.constant(
[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
}
labels = [0, 3]

# Randomly initialize a ALBERT backbone.
backbone = AlbertBackbone(
vocabulary_size=1000,
num_layers=2,
num_heads=2,
embedding_dim=8,
hidden_dim=64,
intermediate_dim=128,
max_sequence_length=128,
name="encoder",
)

# Create a ALBERT classifier and fit your data.
classifier = keras_nlp.models.AlbertClassifier(
backbone,
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)

# Access backbone programatically (e.g., to change `trainable`)
classifier.backbone.trainable = False

Raw string inputs with customized preprocessing.
```python
# Create a dataset with raw string features in an `(x, y)` format.
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

@@ -107,19 +66,25 @@ class AlbertClassifier(Task):
sequence_length=128,
)

# Create a AlbertClassifier and fit your data.
# Pretrained classifier.
classifier = keras_nlp.models.AlbertClassifier.from_preset(
"albert_base_en_uncased",
num_classes=4,
preprocessor=preprocessor,
)
classifier.fit(x=features, y=labels, batch_size=2)
classifier.predict(x=features, batch_size=2)

# Re-compile (e.g., with a new learning rate).
Member: I think you are missing the step from the BERT classifier example where we run `fit` before re-compiling.

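# Reviewer's suggestion, sketched here as an assumption (mirroring the BERT
# classifier example): run `fit` once with the default compilation before
# re-compiling. Not part of this PR's diff.
classifier.fit(x=features, y=labels, batch_size=2)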
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
classifier.fit(x=features, y=labels, batch_size=2)
```

Preprocessed inputs.
Preprocessed integer data.
```python
# Create a dataset with preprocessed features in an `(x, y)` format.
preprocessed_features = {
@@ -133,17 +98,42 @@ class AlbertClassifier(Task):
}
labels = [0, 3]

# Create a ALBERT classifier and fit your data.
# Pretrained classifier without preprocessing.
classifier = keras_nlp.models.AlbertClassifier.from_preset(
"albert_base_en_uncased",
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
```

Custom backbone and vocabulary.
```python
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]
vocab = ["[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"]
vocab += ["The", "quick", "brown", "fox", "jumped", "."]
tokenizer = keras_nlp.models.AlbertTokenizer(
vocabulary=vocab,
)
preprocessor = keras_nlp.models.AlbertPreprocessor(
tokenizer=tokenizer,
sequence_length=128,
)
backbone = keras_nlp.models.AlbertBackbone(
vocabulary_size=30552,
num_layers=4,
num_heads=4,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128,
)
classifier = keras_nlp.models.AlbertClassifier(
backbone=backbone,
preprocessor=preprocessor,
num_classes=4,
)
classifier.fit(x=features, y=labels, batch_size=2)
```
"""

def __init__(
42 changes: 16 additions & 26 deletions keras_nlp/models/albert/albert_masked_lm.py
@@ -38,7 +38,7 @@ class AlbertMaskedLM(Task):
This model will train ALBERT on a masked language modeling task.
The model will predict labels for a number of masked tokens in the
input data. For usage of this model with pre-trained weights, see the
`from_preset()` method.
`from_preset()` constructor.

This model can optionally be configured with a `preprocessor` layer, in
which case inputs can be raw string features during `fit()`, `predict()`,
@@ -57,26 +57,31 @@ class AlbertMaskedLM(Task):

Example usage:

Raw string inputs and pretrained backbone.
Raw string data.
```python
# Create a dataset with raw string features. Labels are inferred.
features = ["The quick brown fox jumped.", "I forgot my homework."]

# Create a AlbertMaskedLM with a pretrained backbone and further train
# on an MLM task.
# Pretrained language model.
masked_lm = keras_nlp.models.AlbertMaskedLM.from_preset(
"albert_base_en_uncased",
)
masked_lm.fit(x=features, batch_size=2)
# Re-compile (e.g., with a new learning rate).
masked_lm.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
# Access backbone programmatically (e.g., to change `trainable`).
masked_lm.backbone.trainable = False
# Fit again.
masked_lm.fit(x=features, batch_size=2)
```

Preprocessed inputs and custom backbone.
Preprocessed integer data.
```python
# Create a preprocessed dataset where 0 is the mask token.
preprocessed_features = {
# Create a preprocessed batch where 0 is the mask token.
features = {
"segment_ids": tf.constant(
[[1, 0, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
),
@@ -91,24 +96,9 @@
# Labels are the original masked values.
labels = [[3, 5]] * 2

# Randomly initialize a ALBERT encoder
backbone = keras_nlp.models.AlbertBackbone(
vocabulary_size=1000,
num_layers=2,
num_heads=2,
embedding_dim=64,
hidden_dim=64,
intermediate_dim=128,
max_sequence_length=128)

# Create a ALBERT masked LM and fit the data.
masked_lm = keras_nlp.models.AlbertMaskedLM(
backbone,
preprocessor=None,
)
masked_lm.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
jit_compile=True
masked_lm = keras_nlp.models.AlbertMaskedLM.from_preset(
"albert_base_en_uncased",
preprocessor=None,
)
masked_lm.fit(x=preprocessed_features, y=labels, batch_size=2)
```
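For context on the masked LM docstring above, a minimal inference sketch (not part of this diff; the preset name is taken from the examples above, and calling `predict()` on raw strings assumes the default preprocessor is attached):

```python
import keras_nlp

# Pretrained masked LM with its default preprocessor attached.
masked_lm = keras_nlp.models.AlbertMaskedLM.from_preset("albert_base_en_uncased")

# Raw strings are preprocessed (and randomly masked) automatically; the output
# is a batch of per-mask-position logits over the vocabulary.
preds = masked_lm.predict(["The quick brown fox jumped.", "I forgot my homework."])
```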
82 changes: 37 additions & 45 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor.py
@@ -31,14 +31,14 @@ class AlbertMaskedLMPreprocessor(AlbertPreprocessor):
`keras_nlp.models.AlbertMaskedLM` task model. Preprocessing will occur in
multiple steps.

- Tokenize any number of input segments using the `tokenizer`.
- Pack the inputs together with the appropriate `"<s>"`, `"</s>"` and
1. Tokenize any number of input segments using the `tokenizer`.
2. Pack the inputs together with the appropriate `"<s>"`, `"</s>"` and
`"<pad>"` tokens, i.e., adding a single `"<s>"` at the start of the
entire sequence, `"</s></s>"` between each segment,
and a `"</s>"` at the end of the entire sequence.
- Randomly select non-special tokens to mask, controlled by
3. Randomly select non-special tokens to mask, controlled by
`mask_selection_rate`.
- Construct a `(x, y, sample_weight)` tuple suitable for training with a
4. Construct a `(x, y, sample_weight)` tuple suitable for training with a
`keras_nlp.models.AlbertMaskedLM` task model.

Args:
@@ -68,6 +68,15 @@ class AlbertMaskedLMPreprocessor(AlbertPreprocessor):
left-to-right manner and fills up the buckets until we run
out of budget. It supports an arbitrary number of segments.

Call arguments:
Member: Remove the indent; `Call arguments:` should align with `Args:`.
Contributor: This comment has not been addressed, please fix it, thanks!

x: A tensor of single string sequences, or a tuple of multiple
tensor sequences to be packed together. Inputs may be batched or
unbatched. For single sequences, raw python inputs will be converted
to tensors. For multiple sequences, pass tensors directly.
y: Label data. Should always be `None` as the layer generates labels.
sample_weight: Label weights. Should always be `None` as the layer
generates label weights.

Examples:
```python
# Load the preprocessor from a preset.
@@ -76,54 +85,37 @@ class AlbertMaskedLMPreprocessor(AlbertPreprocessor):
)

# Tokenize and mask a single sentence.
sentence = tf.constant("The quick brown fox jumped.")
preprocessor(sentence)
preprocessor("The quick brown fox jumped.")

# Tokenize and mask a batch of sentences.
sentences = tf.constant(
["The quick brown fox jumped.", "Call me Ishmael."]
)
preprocessor(sentences)
preprocessor("The quick brown fox jumped.", "Call me Ishmael.")
Member: The inner args should be in a list.
Contributor: This one too: `preprocessor(["The quick brown fox jumped.", "Call me Ishmael."])`.

# Tokenize and mask a dataset of sentences.
features = tf.constant(
["The quick brown fox jumped.", "Call me Ishmael."]
)
ds = tf.data.Dataset.from_tensor_slices((features))
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

# Alternatively, you can create a preprocessor from your own vocabulary.
vocab_data = tf.data.Dataset.from_tensor_slices(
["the quick brown fox", "the earth is round"]
)
# Tokenize and mask sentence pairs.
# In this case, always convert input to tensors before calling the layer.
first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
second = tf.constant(["The fox tripped.", "Oh look, a whale."])
preprocessor((first, second))
```

# Creating sentencepiece tokenizer for ALBERT LM preprocessor
bytes_io = io.BytesIO()

sentencepiece.SentencePieceTrainer.train(
sentence_iterator=vocab_data.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]"
Mapping with `tf.data.Dataset`.
```python
preprocessor = keras_nlp.models.AlbertMaskedLMPreprocessor.from_preset(
"albert_base_en_uncased"
)

proto = bytes_io.getvalue()

tokenizer = keras_nlp.models.AlbertTokenizer(proto=proto)

preprocessor = keras_nlp.models.AlbertMaskedLMPreprocessor(
tokenizer=tokenizer
first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
second = tf.constant(["The fox tripped.", "Oh look, a whale."])
# Map single sentences.
Member: Take a look at the source example; you removed all the empty newlines, please add them back.

ds = tf.data.Dataset.from_tensor_slices(first)
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
# Map sentence pairs.
ds = tf.data.Dataset.from_tensor_slices((first, second))
# Watch out for tf.data's default unpacking of tuples here!
# Best to invoke the `preprocessor` directly in this case.
ds = ds.map(
lambda first, second: preprocessor(x=(first, second)),
num_parallel_calls=tf.data.AUTOTUNE,
)

```
"""
