diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
index 6bef8e2c0a..db5967cb38 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
@@ -24,17 +24,16 @@
@keras_nlp_export("keras_nlp.models.XLMRobertaBackbone")
class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
- """XLM-RoBERTa encoder.
+ """An XLM-RoBERTa encoder network.
- This network implements a bi-directional Transformer-based encoder as
- described in
- ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
- It includes the embedding lookups and transformer layers, but does not
+ This class implements a bi-directional Transformer-based encoder as
+ described in ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
+ It includes the embedding lookups and transformer layers, but it does not
include the masked language modeling head used during pretraining.
The default constructor gives a fully customizable, randomly initialized
- RoBERTa encoder with any number of layers, heads, and embedding
- dimensions. To load preset architectures and weights, use the `from_preset`
+ RoBERTa encoder with any number of layers, heads, and embedding dimensions.
+ To load preset architectures and weights, use the `from_preset()`
constructor.
Disclaimer: Pre-trained models are provided on an "as is" basis, without
@@ -53,9 +52,10 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
dropout: float. Dropout probability for the Transformer encoder.
max_sequence_length: int. The maximum sequence length this encoder can
consume. The sequence length of the input must be less than
- `max_sequence_length`.
+ `max_sequence_length`. This determines the variable shape for
+ positional embeddings.
- Example usage:
+ Examples:
```python
input_data = {
"token_ids": tf.ones(shape=(1, 12), dtype=tf.int64),
@@ -63,24 +63,22 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)),
}
- # Pretrained XLM-R encoder
+ # Pretrained XLM-R encoder.
model = keras_nlp.models.XLMRobertaBackbone.from_preset(
"xlm_roberta_base_multi",
)
- output = model(input_data)
+ model(input_data)
- # Randomly initialized XLM-R model with custom config
+ # Randomly initialized XLM-R model with custom config.
model = keras_nlp.models.XLMRobertaBackbone(
vocabulary_size=250002,
- num_layers=12,
- num_heads=12,
- hidden_dim=768,
- intermediate_dim=3072,
- max_sequence_length=12
+ num_layers=4,
+ num_heads=4,
+ hidden_dim=256,
+ intermediate_dim=512,
+ max_sequence_length=128
)
-
- # Call the model on the input data.
- output = model(input_data)
+ model(input_data)
```
"""
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
index 3ff68b00b3..8096bb7433 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
@@ -34,9 +34,9 @@ class XLMRobertaClassifier(Task):
"""An end-to-end XLM-RoBERTa model for classification tasks.
This model attaches a classification head to a
- `keras_nlp.model.XLMRobertaBackbone` model, mapping from the backbone
- outputs to logit output suitable for a classification task. For usage of
- this model with pre-trained weights, see the `from_preset()` method.
+ `keras_nlp.models.XLMRobertaBackbone` instance, mapping from the backbone
+ outputs to logits suitable for a classification task. For usage of
+ this model with pre-trained weights, see the `from_preset()` constructor.
This model can optionally be configured with a `preprocessor` layer, in
which case it will automatically apply preprocessing to raw inputs during
@@ -49,7 +49,7 @@ class XLMRobertaClassifier(Task):
[here](https://github.com/facebookresearch/fairseq).
Args:
- backbone: A `keras_nlp.models.XLMRoberta` instance.
+ backbone: A `keras_nlp.models.XLMRobertaBackbone` instance.
num_classes: int. Number of classes to predict.
hidden_dim: int. The size of the pooler layer.
dropout: float. The dropout probability value, applied to the pooled
@@ -60,85 +60,34 @@ class XLMRobertaClassifier(Task):
Examples:
- Example usage.
+ Raw string data.
```python
- preprocessed_features = {
- "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
- "padding_mask": tf.constant(
- [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
- }
- labels = [0, 3]
-
- # Randomly initialized XLM-RoBERTa encoder
- backbone = keras_nlp.models.XLMRobertaBackbone(
- vocabulary_size=250002,
- num_layers=12,
- num_heads=12,
- hidden_dim=768,
- intermediate_dim=3072,
- max_sequence_length=12
- )
-
- # Create a XLM-RoBERTa classifier and fit your data.
- classifier = keras_nlp.models.XLMRobertaClassifier(
- backbone,
- num_classes=4,
- preprocessor=None,
- )
- classifier.compile(
- loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
- )
- classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
-
- # Access backbone programatically (e.g., to change `trainable`)
- classifier.backbone.trainable = False
- ```
-
- Raw string inputs.
- ```python
- # Create a dataset with raw string features in an `(x, y)` format.
- features = ["The quick brown fox jumped.", "I forgot my homework."]
+ features = ["The quick brown fox jumped.", "نسيت الواجب"]
labels = [0, 3]
- # Create a XLMRobertaClassifier and fit your data.
+ # Pretrained classifier.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
)
- classifier.compile(
- loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
- )
classifier.fit(x=features, y=labels, batch_size=2)
- ```
+ classifier.predict(x=features, batch_size=2)
- Raw string inputs with customized preprocessing.
- ```python
- # Create a dataset with raw string features in an `(x, y)` format.
- features = ["The quick brown fox jumped.", "I forgot my homework."]
- labels = [0, 3]
-
- # Use a shorter sequence length.
- preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
- "xlm_roberta_base_multi",
- sequence_length=128,
- )
-
- # Create a XLMRobertaClassifier and fit your data.
- classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
- "xlm_roberta_base_multi",
- num_classes=4,
- preprocessor=preprocessor,
- )
+ # Re-compile (e.g., with a new learning rate).
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+ optimizer=keras.optimizers.Adam(5e-5),
+ jit_compile=True,
)
+ # Access backbone programmatically (e.g., to change `trainable`).
+ classifier.backbone.trainable = False
+ # Fit again.
classifier.fit(x=features, y=labels, batch_size=2)
```
- Preprocessed inputs.
+ Preprocessed integer data.
```python
- # Create a dataset with preprocessed features in an `(x, y)` format.
- preprocessed_features = {
+ features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
@@ -146,16 +95,57 @@ class XLMRobertaClassifier(Task):
}
labels = [0, 3]
- # Create a XLMRobertaClassifier and fit your data.
+ # Pretrained classifier without preprocessing.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
preprocessor=None,
)
- classifier.compile(
- loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+ classifier.fit(x=features, y=labels, batch_size=2)
+ ```
+
+ Custom backbone and vocabulary.
+ ```python
+ features = ["The quick brown fox jumped.", "نسيت الواجب"]
+ labels = [0, 3]
+
+ def train_sentencepiece(ds, vocab_size):
+ bytes_io = io.BytesIO()
+ sentencepiece.SentencePieceTrainer.train(
+ sentence_iterator=ds.as_numpy_iterator(),
+ model_writer=bytes_io,
+ vocab_size=vocab_size,
+ model_type="WORD",
+ unk_id=0,
+ bos_id=1,
+ eos_id=2,
+ )
+ return bytes_io.getvalue()
+ ds = tf.data.Dataset.from_tensor_slices(
+ ["the quick brown fox", "the earth is round"]
+ )
+ proto = train_sentencepiece(ds, vocab_size=10)
+ tokenizer = keras_nlp.models.XLMRobertaTokenizer(
+ proto=proto
+ )
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
+ tokenizer,
+ sequence_length=128,
+ )
+ backbone = keras_nlp.models.XLMRobertaBackbone(
+ vocabulary_size=250002,
+ num_layers=4,
+ num_heads=4,
+ hidden_dim=256,
+ intermediate_dim=512,
+ max_sequence_length=128,
)
- classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
+ classifier = keras_nlp.models.XLMRobertaClassifier(
+ backbone=backbone,
+ preprocessor=preprocessor,
+ num_classes=4,
+ )
+ classifier.fit(x=features, y=labels, batch_size=2)
```
"""
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
index fa38f1e380..6ecb5016e7 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
@@ -34,35 +34,23 @@
@keras_nlp_export("keras_nlp.models.XLMRobertaPreprocessor")
class XLMRobertaPreprocessor(Preprocessor):
- """XLM-RoBERTa preprocessing layer.
+ """An XLM-RoBERTa preprocessing layer which tokenizes and packs inputs.
This preprocessing layer will do three things:
- - Tokenize any number of input segments using the `tokenizer`.
- - Pack the inputs together with the appropriate `"<s>"`, `"</s>"` and
- `"<pad>"` tokens, i.e., adding a single `"<s>"` at the start of the
- entire sequence, `"</s></s>"` at the end of each segment, save the last
- and a `"</s>"` at the end of the entire sequence.
- - Construct a dictionary with keys `"token_ids"` and `"padding_mask"`,
- that can be passed directly to a XLM-RoBERTa model.
+ 1. Tokenize any number of input segments using the `tokenizer`.
+ 2. Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`
+ with the appropriate `"<s>"`, `"</s>"` and `"<pad>"` tokens, i.e., adding
+ a single `"<s>"` at the start of the entire sequence, `"</s></s>"` at the
+ end of each segment, save the last, and a `"</s>"` at the end of the
+ entire sequence.
+ 3. Construct a dictionary with keys `"token_ids"` and `"padding_mask"`,
+ that can be passed directly to an XLM-RoBERTa model.
This layer can be used directly with `tf.data.Dataset.map` to preprocess
string data in the `(x, y, sample_weight)` format used by
`keras.Model.fit`.
- The call method of this layer accepts three arguments, `x`, `y`, and
- `sample_weight`. `x` can be a python string or tensor representing a single
- segment, a list of python strings representing a batch of single segments,
- or a list of tensors representing multiple segments to be packed together.
- `y` and `sample_weight` are both optional, can have any format, and will be
- passed through unaltered.
-
- Special care should be taken when using `tf.data` to map over an unlabeled
- tuple of string segments. `tf.data.Dataset.map` will unpack this tuple
- directly into the call arguments of this layer, rather than forward all
- argument to `x`. To handle this case, it is recommended to explicitly call
- the layer, e.g. `ds.map(lambda seg1, seg2: preprocessor(x=(seg1, seg2)))`.
-
Args:
tokenizer: A `keras_nlp.tokenizers.XLMRobertaTokenizer` instance.
sequence_length: The length of the packed inputs.
@@ -77,77 +65,84 @@ class XLMRobertaPreprocessor(Preprocessor):
left-to-right manner and fills up the buckets until we run
out of budget. It supports an arbitrary number of segments.
+ Call arguments:
+ x: A tensor of single string sequences, or a tuple of multiple
+ tensor sequences to be packed together. Inputs may be batched or
+ unbatched. For single sequences, raw python inputs will be converted
+ to tensors. For multiple sequences, pass tensors directly.
+ y: Any label data. Will be passed through unaltered.
+ sample_weight: Any label weight data. Will be passed through unaltered.
+
Examples:
```python
- # Load the preprocessor from a preset.
- preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset("xlm_roberta_base_multi")
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
+ "xlm_roberta_base_multi"
+ )
# Tokenize and pack a single sentence.
- sentence = tf.constant("The quick brown fox jumped.")
- preprocessor(sentence)
- # Same output.
preprocessor("The quick brown fox jumped.")
- # Tokenize and a batch of single sentences.
- sentences = tf.constant(
- ["The quick brown fox jumped.", "Call me Ishmael."]
+ # Tokenize a batch of single sentences.
+ preprocessor(["The quick brown fox jumped.", "اسمي اسماعيل"])
+
+ # Preprocess a batch of sentence pairs.
+ # When handling multiple sequences, always convert to tensors first!
+ first = tf.constant(["The quick brown fox jumped.", "اسمي اسماعيل"])
+ second = tf.constant(["The fox tripped.", "الأسد ملك الغابة"])
+ preprocessor((first, second))
+
+ # Custom vocabulary.
+ def train_sentencepiece(ds, vocab_size):
+ bytes_io = io.BytesIO()
+ sentencepiece.SentencePieceTrainer.train(
+ sentence_iterator=ds.as_numpy_iterator(),
+ model_writer=bytes_io,
+ vocab_size=vocab_size,
+ model_type="WORD",
+ unk_id=0,
+ bos_id=1,
+ eos_id=2,
+ )
+ return bytes_io.getvalue()
+ ds = tf.data.Dataset.from_tensor_slices(
+ ["the quick brown fox", "the earth is round"]
)
- preprocessor(sentences)
- # Same output.
- preprocessor(
- ["The quick brown fox jumped.", "Call me Ishmael."]
+ proto = train_sentencepiece(ds, vocab_size=10)
+ tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto=proto)
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor(tokenizer)
+ preprocessor("The quick brown fox jumped.")
+ ```
+
+ Mapping with `tf.data.Dataset`.
+ ```python
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
+ "xlm_roberta_base_multi"
)
- # Tokenize and pack a sentence pair.
- first_sentence = tf.constant("The quick brown fox jumped.")
- second_sentence = tf.constant("The fox tripped.")
- preprocessor((first_sentence, second_sentence))
+ first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+ second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+ label = tf.constant([1, 1])
- # Map a dataset to preprocess a single sentence.
- features = tf.constant(
- ["The quick brown fox jumped.", "Call me Ishmael."]
- )
- labels = tf.constant([0, 1])
- ds = tf.data.Dataset.from_tensor_slices((features, labels))
+ # Map labeled single sentences.
+ ds = tf.data.Dataset.from_tensor_slices((first, label))
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
- # Map a dataset to preprocess sentence pairs.
- first_sentences = tf.constant(
- ["The quick brown fox jumped.", "Call me Ishmael."]
- )
- second_sentences = tf.constant(
- ["The fox tripped.", "Oh look, a whale."]
- )
- labels = tf.constant([1, 1])
- ds = tf.data.Dataset.from_tensor_slices(
- (
- (first_sentences, second_sentences), labels
- )
- )
+ # Map unlabeled single sentences.
+ ds = tf.data.Dataset.from_tensor_slices(first)
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
- # Map a dataset to preprocess unlabeled sentence pairs.
- first_sentences = tf.constant(
- ["The quick brown fox jumped.", "Call me Ishmael."]
- )
- second_sentences = tf.constant(
- ["The fox tripped.", "Oh look, a whale."]
- )
- ds = tf.data.Dataset.from_tensor_slices((first_sentences, second_sentences))
+ # Map labeled sentence pairs.
+ ds = tf.data.Dataset.from_tensor_slices(((first, second), label))
+ ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+ # Map unlabeled sentence pairs.
+ ds = tf.data.Dataset.from_tensor_slices((first, second))
# Watch out for tf.data's default unpacking of tuples here!
# Best to invoke the `preprocessor` directly in this case.
ds = ds.map(
- lambda s1, s2: preprocessor(x=(s1, s2)),
+ lambda first, second: preprocessor(x=(first, second)),
num_parallel_calls=tf.data.AUTOTUNE,
)
-
- # Alternatively, you can create a preprocessor from your own vocabulary.
- # The usage is exactly the same as above.
- tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto="model.spm")
- preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
- tokenizer=tokenizer,
- sequence_length=10,
- )
```
"""
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
index e077a7643d..d679ab9ba3 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -27,19 +27,18 @@
@keras_nlp_export("keras_nlp.models.XLMRobertaTokenizer")
class XLMRobertaTokenizer(SentencePieceTokenizer):
- """XLM-RoBERTa tokenizer layer based on SentencePiece.
+ """An XLM-RoBERTa tokenizer using SentencePiece subword segmentation.
This tokenizer class will tokenize raw strings into integer sequences and
is based on `keras_nlp.tokenizers.SentencePieceTokenizer`. Unlike the
underlying tokenizer, it will check for all special tokens needed by
XLM-RoBERTa models and provides a `from_preset()` method to automatically
- download a matching vocabulary for a XLM-RoBERTa preset.
+ download a matching vocabulary for an XLM-RoBERTa preset.
- The original fairseq implementation of XLM-RoBERTa modifies the indices of
- the SentencePiece tokenizer output. To preserve compatibility, we make the
- same changes, i.e., `"<s>"`, `"<pad>"`, `"</s>"` and `"<unk>"` are mapped to
- 0, 1, 2, 3, respectively, and non-special token indices are shifted right
- by one.
+ Note: The original fairseq implementation of XLM-RoBERTa re-maps some token
+ indices from the underlying SentencePiece output. To preserve compatibility,
+ we do the same re-mapping here, including when you provide your own custom
+ SentencePiece model.
If input is a batch of strings (rank > 0), the layer will output a
`tf.RaggedTensor` where the last dimension of the output is ragged.
@@ -48,14 +47,27 @@ class XLMRobertaTokenizer(SentencePieceTokenizer):
`tf.Tensor` with static shape `[None]`.
Args:
- proto: Either a `string` path to a SentencePiece proto file, or a
+ proto: Either a `string` path to a SentencePiece proto file or a
`bytes` object with a serialized SentencePiece proto. See the
[SentencePiece repository](https://github.com/google/sentencepiece)
for more details on the format.
Examples:
-
```python
+ tokenizer = keras_nlp.models.XLMRobertaTokenizer.from_preset(
+ "xlm_roberta_base_multi",
+ )
+
+ # Unbatched inputs.
+ tokenizer("the quick brown fox")
+
+ # Batched inputs.
+ tokenizer(["the quick brown fox", "الأرض كروية"])
+
+ # Detokenization.
+ tokenizer.detokenize(tokenizer("the quick brown fox"))
+
+ # Custom vocabulary.
def train_sentencepiece(ds, vocab_size):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
@@ -72,18 +84,8 @@ def train_sentencepiece(ds, vocab_size):
ds = tf.data.Dataset.from_tensor_slices(
["the quick brown fox", "the earth is round"]
)
-
proto = train_sentencepiece(ds, vocab_size=10)
tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto=proto)
-
- # Batched inputs.
- tokenizer(["the quick brown fox", "the earth is round"])
-
- # Unbatched inputs.
- tokenizer("the quick brown fox")
-
- # Detokenization.
- tokenizer.detokenize(tf.constant([[0, 4, 9, 5, 7, 2]]))
```
"""