38 changes: 18 additions & 20 deletions keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
@@ -24,17 +24,16 @@

@keras_nlp_export("keras_nlp.models.XLMRobertaBackbone")
class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
"""XLM-RoBERTa encoder.
"""An XLM-RoBERTa encoder network.

This network implements a bi-directional Transformer-based encoder as
described in
["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
It includes the embedding lookups and transformer layers, but does not
This class implements a bi-directional Transformer-based encoder as
described in ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
It includes the embedding lookups and transformer layers, but it does not
include the masked language modeling head used during pretraining.

The default constructor gives a fully customizable, randomly initialized
RoBERTa encoder with any number of layers, heads, and embedding
dimensions. To load preset architectures and weights, use the `from_preset`
RoBERTa encoder with any number of layers, heads, and embedding dimensions.
To load preset architectures and weights, use the `from_preset()`
constructor.

Disclaimer: Pre-trained models are provided on an "as is" basis, without
@@ -53,34 +52,33 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
dropout: float. Dropout probability for the Transformer encoder.
max_sequence_length: int. The maximum sequence length this encoder can
consume. The sequence length of the input must be less than
`max_sequence_length`.
`max_sequence_length`. This determines the variable
shape for positional embeddings.

Example usage:
Examples:
```python
input_data = {
"token_ids": tf.ones(shape=(1, 12), dtype=tf.int64),
"padding_mask": tf.constant(
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)),
}

# Pretrained XLM-R encoder
# Pretrained XLM-R encoder.
model = keras_nlp.models.XLMRobertaBackbone.from_preset(
"xlm_roberta_base_multi",
)
output = model(input_data)
model(input_data)

# Randomly initialized XLM-R model with custom config
# Randomly initialized XLM-R model with custom config.
model = keras_nlp.models.XLMRobertaBackbone(
vocabulary_size=250002,
num_layers=12,
num_heads=12,
hidden_dim=768,
intermediate_dim=3072,
max_sequence_length=12
num_layers=4,
num_heads=4,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128
)

# Call the model on the input data.
output = model(input_data)
model(input_data)
```
"""

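For readers who want to try the documented backbone API end to end, the snippets in the updated docstring can be combined roughly as follows. This is a minimal sketch assembled from the docstring above, assuming a `keras_nlp` release that ships the `xlm_roberta_base_multi` preset; it is illustrative and not part of the diff.

```python
import tensorflow as tf
import keras_nlp

# Token ids plus a padding mask, shaped as the docstring describes.
input_data = {
    "token_ids": tf.ones(shape=(1, 12), dtype=tf.int64),
    "padding_mask": tf.constant(
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
    ),
}

# Load the pretrained multilingual encoder and run a forward pass.
backbone = keras_nlp.models.XLMRobertaBackbone.from_preset(
    "xlm_roberta_base_multi",
)
sequence_output = backbone(input_data)  # per-token hidden states
```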
132 changes: 61 additions & 71 deletions keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
@@ -34,9 +34,9 @@ class XLMRobertaClassifier(Task):
"""An end-to-end XLM-RoBERTa model for classification tasks.

This model attaches a classification head to a
`keras_nlp.model.XLMRobertaBackbone` model, mapping from the backbone
outputs to logit output suitable for a classification task. For usage of
this model with pre-trained weights, see the `from_preset()` method.
`keras_nlp.models.XLMRobertaBackbone` instance, mapping from the backbone
outputs to logits suitable for a classification task. For usage of
this model with pre-trained weights, see the `from_preset()` constructor.

This model can optionally be configured with a `preprocessor` layer, in
which case it will automatically apply preprocessing to raw inputs during
@@ -49,7 +49,7 @@ class XLMRobertaClassifier(Task):
[here](https://github.com/facebookresearch/fairseq).

Args:
backbone: A `keras_nlp.models.XLMRoberta` instance.
backbone: A `keras_nlp.models.XLMRobertaBackbone` instance.
num_classes: int. Number of classes to predict.
hidden_dim: int. The size of the pooler layer.
dropout: float. The dropout probability value, applied to the pooled
@@ -60,102 +60,92 @@

Examples:

Example usage.
Raw string data.
```python
preprocessed_features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
}
labels = [0, 3]

# Randomly initialized XLM-RoBERTa encoder
backbone = keras_nlp.models.XLMRobertaBackbone(
vocabulary_size=250002,
num_layers=12,
num_heads=12,
hidden_dim=768,
intermediate_dim=3072,
max_sequence_length=12
)

# Create a XLM-RoBERTa classifier and fit your data.
classifier = keras_nlp.models.XLMRobertaClassifier(
backbone,
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)

# Access backbone programatically (e.g., to change `trainable`)
classifier.backbone.trainable = False
```

Raw string inputs.
```python
# Create a dataset with raw string features in an `(x, y)` format.
features = ["The quick brown fox jumped.", "I forgot my homework."]
features = ["The quick brown fox jumped.", "نسيت الواجب"]
labels = [0, 3]

# Create a XLMRobertaClassifier and fit your data.
# Pretrained classifier.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=features, y=labels, batch_size=2)
```
classifier.predict(x=features, batch_size=2)

Raw string inputs with customized preprocessing.
```python
# Create a dataset with raw string features in an `(x, y)` format.
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

# Use a shorter sequence length.
preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
"xlm_roberta_base_multi",
sequence_length=128,
)

# Create a XLMRobertaClassifier and fit your data.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
preprocessor=preprocessor,
)
# Re-compile (e.g., with a new learning rate).
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
# Access backbone programmatically (e.g., to change `trainable`).
classifier.backbone.trainable = False
# Fit again.
classifier.fit(x=features, y=labels, batch_size=2)
```

Preprocessed inputs.
Preprocessed integer data.
```python
# Create a dataset with preprocessed features in an `(x, y)` format.
preprocessed_features = {
features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
}
labels = [0, 3]

# Create a XLMRobertaClassifier and fit your data.
# Pretrained classifier without preprocessing.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
classifier.fit(x=features, y=labels, batch_size=2)
```

Custom backbone and vocabulary.
```python
features = ["The quick brown fox jumped.", "نسيت الواجب"]
labels = [0, 3]

def train_sentencepiece(ds, vocab_size):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=ds.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=vocab_size,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
)
return bytes_io.getvalue()
ds = tf.data.Dataset.from_tensor_slices(
["the quick brown fox", "the earth is round"]
)
proto = train_sentencepiece(ds, vocab_size=10)
tokenizer = keras_nlp.models.XLMRobertaTokenizer(
proto=proto
)
preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
tokenizer,
sequence_length=128,
)
backbone = keras_nlp.models.XLMRobertaBackbone(
vocabulary_size=250002,
num_layers=4,
num_heads=4,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128,
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
classifier = keras_nlp.models.XLMRobertaClassifier(
backbone=backbone,
preprocessor=preprocessor,
num_classes=4,
)
classifier.fit(x=features, y=labels, batch_size=2)
```
"""

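Putting the pieces of the updated classifier docstring together, a typical fine-tuning flow looks roughly like the sketch below. It is a minimal example assembled from the docstring snippets above (pretrained preset, raw string inputs, frozen backbone), not code from this PR.

```python
import keras_nlp

# Raw string features and integer labels, as in the docstring examples.
features = ["The quick brown fox jumped.", "نسيت الواجب"]
labels = [0, 3]

# Pretrained classifier; preprocessing is applied automatically to raw
# strings during fit(), predict(), and evaluate().
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
    "xlm_roberta_base_multi",
    num_classes=4,
)

# Optionally freeze the encoder so only the classification head is trained.
classifier.backbone.trainable = False

classifier.fit(x=features, y=labels, batch_size=2)
predictions = classifier.predict(x=features, batch_size=2)
```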