diff --git a/keras_nlp/models/deberta_v3/deberta_v3_backbone.py b/keras_nlp/models/deberta_v3/deberta_v3_backbone.py
index 9d3e36bb04..9aaa3bc83c 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_backbone.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_backbone.py
@@ -79,11 +79,11 @@ class DebertaV3Backbone(Backbone):
             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)),
     }

-    # Pretrained DeBERTa encoder
+    # Pretrained DeBERTa encoder.
     model = keras_nlp.models.DebertaV3Backbone.from_preset(
-        "deberta_base_en",
+        "deberta_v3_base_en",
     )
-    output = model(input_data)
+    model(input_data)

     # Randomly initialized DeBERTa encoder with custom config
     model = keras_nlp.models.DebertaV3Backbone(
@@ -96,7 +96,7 @@ class DebertaV3Backbone(Backbone):
         bucket_size=256,
     )
     # Call the model on the input data.
-    output = model(input_data)
+    model(input_data)
     ```
     """

diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier.py
index 7283a4c039..a179b248f1 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_classifier.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_classifier.py
@@ -68,84 +68,34 @@ class DebertaV3Classifier(Task):
     Examples:

-    Example usage.
+    Raw string data.
     ```python
-    # Define the preprocessed inputs.
-    preprocessed_features = {
-        "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(1, 12)),
-    }
-    labels = [0, 3]
-
-    # Randomly initialized DeBERTa encoder
-    backbone = keras_nlp.models.DebertaV3Backbone(
-        vocabulary_size=128100,
-        num_layers=12,
-        num_heads=12,
-        hidden_dim=768,
-        intermediate_dim=3072,
-        max_sequence_length=12,
-        bucket_size=6,
-    )
-
-    # Create a DeBERTa classifier and fit your data.
-    classifier = keras_nlp.models.DebertaV3Classifier(
-        backbone,
-        num_classes=4,
-        preprocessor=None,
-    )
-    classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
-
-    # Access backbone programatically (e.g., to change `trainable`)
-    classifier.backbone.trainable = False
-    ```
-
-    Raw string inputs.
-    ```python
-    # Create a dataset with raw string features in an `(x, y)` format.
     features = ["The quick brown fox jumped.", "I forgot my homework."]
     labels = [0, 3]

-    # Create a DebertaV3Classifier and fit your data.
+    # Pretrained classifier.
     classifier = keras_nlp.models.DebertaV3Classifier.from_preset(
         "deberta_v3_base_en",
         num_classes=4,
     )
-    classifier.compile(
-        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-    )
     classifier.fit(x=features, y=labels, batch_size=2)
-    ```
+    classifier.predict(x=features, batch_size=2)

-    Raw string inputs with customized preprocessing.
-    ```python
-    # Create a dataset with raw string features in an `(x, y)` format.
-    features = ["The quick brown fox jumped.", "I forgot my homework."]
-    labels = [0, 3]
-
-    # Use a shorter sequence length.
-    preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
-        "deberta_v3_base_en",
-        sequence_length=128,
-    )
-
-    # Create a DebertaV3Classifier and fit your data.
-    classifier = keras_nlp.models.DebertaV3Classifier.from_preset(
-        "deberta_v3_base_en",
-        num_classes=4,
-        preprocessor=preprocessor,
-    )
+    # Re-compile (e.g., with a new learning rate).
     classifier.compile(
         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+        optimizer=keras.optimizers.Adam(5e-5),
+        jit_compile=True,
     )
+    # Access backbone programmatically (e.g., to change `trainable`).
+    classifier.backbone.trainable = False
+    # Fit again.
     classifier.fit(x=features, y=labels, batch_size=2)
     ```

-    Preprocessed inputs.
+    Preprocessed integer data.
     ```python
-    # Create a dataset with preprocessed features in an `(x, y)` format.
-    preprocessed_features = {
+    features = {
         "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
         "padding_mask": tf.constant(
             [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
@@ -153,16 +103,57 @@ class DebertaV3Classifier(Task):
     }
     labels = [0, 3]

-    # Create a DebertaV3Classifier and fit your data.
+    # Pretrained classifier without preprocessing.
     classifier = keras_nlp.models.DebertaV3Classifier.from_preset(
         "deberta_v3_base_en",
         num_classes=4,
         preprocessor=None,
     )
-    classifier.compile(
-        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    classifier.fit(x=features, y=labels, batch_size=2)
+    ```
+
+    Custom backbone and vocabulary.
+    ```python
+    features = ["The quick brown fox jumped.", "I forgot my homework."]
+    labels = [0, 3]
+
+    bytes_io = io.BytesIO()
+    ds = tf.data.Dataset.from_tensor_slices(features)
+    sentencepiece.SentencePieceTrainer.train(
+        sentence_iterator=ds.as_numpy_iterator(),
+        model_writer=bytes_io,
+        vocab_size=10,
+        model_type="WORD",
+        pad_id=0,
+        bos_id=1,
+        eos_id=2,
+        unk_id=3,
+        pad_piece="[PAD]",
+        bos_piece="[CLS]",
+        eos_piece="[SEP]",
+        unk_piece="[UNK]",
+    )
+    tokenizer = keras_nlp.models.DebertaV3Tokenizer(
+        proto=bytes_io.getvalue(),
     )
-    classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
+    preprocessor = keras_nlp.models.DebertaV3Preprocessor(
+        tokenizer=tokenizer,
+        sequence_length=128,
+    )
+    backbone = keras_nlp.models.DebertaV3Backbone(
+        vocabulary_size=30552,
+        num_layers=4,
+        num_heads=4,
+        hidden_dim=256,
+        intermediate_dim=512,
+        max_sequence_length=128,
+    )
+    classifier = keras_nlp.models.DebertaV3Classifier(
+        backbone=backbone,
+        preprocessor=preprocessor,
+        num_classes=4,
+    )
+    classifier.fit(x=features, y=labels, batch_size=2)
     ```
     """

diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py
index aa4c58b646..a7ecc2192a 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py
@@ -60,55 +60,48 @@ class DebertaV3MaskedLM(Task):
     Example usage:

-    Raw string inputs and pretrained backbone.
+    Raw string data.
     ```python
-    # Create a dataset with raw string features. Labels are inferred.
     features = ["The quick brown fox jumped.", "I forgot my homework."]

-    # Create a DebertaV3MaskedLM with a pretrained backbone and further train
-    # on an MLM task.
+    # Pretrained language model.
     masked_lm = keras_nlp.models.DebertaV3MaskedLM.from_preset(
         "deberta_v3_base_en",
     )
+    masked_lm.fit(x=features, batch_size=2)
+
+    # Re-compile (e.g., with a new learning rate).
     masked_lm.compile(
         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+        optimizer=keras.optimizers.Adam(5e-5),
+        jit_compile=True,
     )
+    # Access backbone programmatically (e.g., to change `trainable`).
+    masked_lm.backbone.trainable = False
+    # Fit again.
     masked_lm.fit(x=features, batch_size=2)
     ```

-    Preprocessed inputs and custom backbone.
+    Preprocessed integer data.
     ```python
-    # Create a preprocessed dataset where 0 is the mask token.
-    preprocessed_features = {
+    # Create a preprocessed batch where 0 is the mask token.
+    features = {
         "token_ids": tf.constant(
             [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
         ),
         "padding_mask": tf.constant(
             [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8)
         ),
-        "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2))
+        "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)),
     }
     # Labels are the original masked values.
     labels = [[3, 5]] * 2

-    # Randomly initialize a DeBERTaV3 encoder
-    backbone = keras_nlp.models.DebertaV3Backbone(
-        vocabulary_size=50265,
-        num_layers=12,
-        num_heads=12,
-        hidden_dim=768,
-        intermediate_dim=3072,
-        max_sequence_length=12
-    )
-    # Create a DeBERTaV3 masked_lm and fit the data.
-    masked_lm = keras_nlp.models.DebertaV3MaskedLM(
-        backbone,
+    masked_lm = keras_nlp.models.DebertaV3MaskedLM.from_preset(
+        "deberta_v3_base_en",
         preprocessor=None,
     )
-    masked_lm.compile(
-        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-    )
-    masked_lm.fit(x=preprocessed_features, y=labels, batch_size=2)
+    masked_lm.fit(x=features, y=labels, batch_size=2)
     ```
     """

diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py
index 6f3a675545..59c8aeeb12 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py
@@ -71,78 +71,47 @@ class DebertaV3MaskedLMPreprocessor(DebertaV3Preprocessor):
     out of budget. It supports an arbitrary number of segments.

     Examples:
+
+    Directly calling the layer on data.
     ```python
-    # Load the preprocessor from a preset.
     preprocessor = keras_nlp.models.DebertaV3MaskedLMPreprocessor.from_preset(
         "deberta_v3_base_en"
     )

-    # Tokenize and pack a single sentence.
-    sentence = tf.constant("The quick brown fox jumped.")
-    preprocessor(sentence)
-    # Same output.
+    # Tokenize and mask a single sentence.
     preprocessor("The quick brown fox jumped.")

-    # Tokenize and a batch of single sentences.
-    sentences = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
-    )
-    preprocessor(sentences)
-    # Same output.
-    preprocessor(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
-    )
+    # Tokenize and mask a batch of single sentences.
+    preprocessor(["The quick brown fox jumped.", "Call me Ishmael."])

-    # Tokenize and pack a sentence pair.
-    first_sentence = tf.constant("The quick brown fox jumped.")
-    second_sentence = tf.constant("The fox tripped.")
-    preprocessor((first_sentence, second_sentence))
+    # Tokenize and mask sentence pairs.
+    # In this case, always convert input to tensors before calling the layer.
+    first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+    preprocessor((first, second))
+    ```

-    # Map a dataset to preprocess a single sentence.
-    features = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
+    Mapping with `tf.data.Dataset`.
+    ```python
+    preprocessor = keras_nlp.models.DebertaV3MaskedLMPreprocessor.from_preset(
+        "deberta_v3_base_en"
     )
-    labels = tf.constant([0, 1])
-    ds = tf.data.Dataset.from_tensor_slices((features, labels))
-    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

-    # Map a dataset to preprocess sentence pairs.
-    first_sentences = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
-    )
-    second_sentences = tf.constant(
-        ["The fox tripped.", "Oh look, a whale."]
-    )
-    labels = tf.constant([1, 1])
-    ds = tf.data.Dataset.from_tensor_slices(
-        (
-            (first_sentences, second_sentences), labels
-        )
-    )
+    first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+
+    # Map single sentences.
+    ds = tf.data.Dataset.from_tensor_slices(first)
     ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

-    # Map a dataset to preprocess unlabeled sentence pairs.
-    first_sentences = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
-    )
-    second_sentences = tf.constant(
-        ["The fox tripped.", "Oh look, a whale."]
-    )
-    ds = tf.data.Dataset.from_tensor_slices((first_sentences, second_sentences))
+    # Map sentence pairs.
+    ds = tf.data.Dataset.from_tensor_slices((first, second))
     # Watch out for tf.data's default unpacking of tuples here!
     # Best to invoke the `preprocessor` directly in this case.
     ds = ds.map(
-        lambda s1, s2: preprocessor(x=(s1, s2)),
+        lambda first, second: preprocessor(x=(first, second)),
         num_parallel_calls=tf.data.AUTOTUNE,
     )
-
-    # Alternatively, you can create a preprocessor from your own vocabulary.
-    # The usage is the exactly same as above.
-    tokenizer = keras_nlp.models.DebertaV3MaskedLMTokenizer(proto="model.spm")
-    preprocessor = keras_nlp.models.DebertaV3MaskedLMPreprocessor(
-        tokenizer=tokenizer,
-        sequence_length=10,
-    )
+    ```
     """

     def __init__(
diff --git a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py
index b87fb968ed..6b5e870c13 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor.py
@@ -71,76 +71,78 @@ class DebertaV3Preprocessor(Preprocessor):
     out of budget. It supports an arbitrary number of segments.

     Examples:
+
+    Directly calling the layer on data.
     ```python
-    # Load the preprocessor from a preset.
-    preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset("deberta_v3_base_en")
+    preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
+        "deberta_v3_base_en"
+    )

     # Tokenize and pack a single sentence.
-    sentence = tf.constant("The quick brown fox jumped.")
-    preprocessor(sentence)
-    # Same output.
     preprocessor("The quick brown fox jumped.")

-    # Tokenize and a batch of single sentences.
-    sentences = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
+    # Tokenize a batch of single sentences.
+    preprocessor(["The quick brown fox jumped.", "Call me Ishmael."])
+
+    # Preprocess a batch of sentence pairs.
+    # When handling multiple sequences, always convert to tensors first!
+    first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+    preprocessor((first, second))
+
+    # Custom vocabulary.
+    bytes_io = io.BytesIO()
+    ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
+    sentencepiece.SentencePieceTrainer.train(
+        sentence_iterator=ds.as_numpy_iterator(),
+        model_writer=bytes_io,
+        vocab_size=9,
+        model_type="WORD",
+        pad_id=0,
+        bos_id=1,
+        eos_id=2,
+        unk_id=3,
+        pad_piece="[PAD]",
+        bos_piece="[CLS]",
+        eos_piece="[SEP]",
+        unk_piece="[UNK]",
     )
-    preprocessor(sentences)
-    # Same output.
-    preprocessor(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
+    tokenizer = keras_nlp.models.DebertaV3Tokenizer(
+        proto=bytes_io.getvalue(),
     )
+    preprocessor = keras_nlp.models.DebertaV3Preprocessor(tokenizer)
+    preprocessor("The quick brown fox jumped.")
+    ```

-    # Tokenize and pack a sentence pair.
-    first_sentence = tf.constant("The quick brown fox jumped.")
-    second_sentence = tf.constant("The fox tripped.")
-    preprocessor((first_sentence, second_sentence))
-
-    # Map a dataset to preprocess a single sentence.
-    features = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
+    Mapping with `tf.data.Dataset`.
+    ```python
+    preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(
+        "deberta_v3_base_en"
     )
-    labels = tf.constant([0, 1])
-    ds = tf.data.Dataset.from_tensor_slices((features, labels))
+
+    first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+    label = tf.constant([1, 1])
+
+    # Map labeled single sentences.
+    ds = tf.data.Dataset.from_tensor_slices((first, label))
     ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

-    # Map a dataset to preprocess sentence pairs.
-    first_sentences = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
-    )
-    second_sentences = tf.constant(
-        ["The fox tripped.", "Oh look, a whale."]
-    )
-    labels = tf.constant([1, 1])
-    ds = tf.data.Dataset.from_tensor_slices(
-        (
-            (first_sentences, second_sentences), labels
-        )
-    )
+    # Map unlabeled single sentences.
+    ds = tf.data.Dataset.from_tensor_slices(first)
     ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

-    # Map a dataset to preprocess unlabeled sentence pairs.
-    first_sentences = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
-    )
-    second_sentences = tf.constant(
-        ["The fox tripped.", "Oh look, a whale."]
-    )
-    ds = tf.data.Dataset.from_tensor_slices((first_sentences, second_sentences))
+    # Map labeled sentence pairs.
+    ds = tf.data.Dataset.from_tensor_slices(((first, second), label))
+    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+    # Map unlabeled sentence pairs.
+    ds = tf.data.Dataset.from_tensor_slices((first, second))
     # Watch out for tf.data's default unpacking of tuples here!
     # Best to invoke the `preprocessor` directly in this case.
     ds = ds.map(
-        lambda s1, s2: preprocessor(x=(s1, s2)),
+        lambda first, second: preprocessor(x=(first, second)),
         num_parallel_calls=tf.data.AUTOTUNE,
     )
-
-    # Alternatively, you can create a preprocessor from your own vocabulary.
-    # The usage is the exactly same as above.
-    tokenizer = keras_nlp.models.DebertaV3Tokenizer(proto="model.spm")
-    preprocessor = keras_nlp.models.DebertaV3Preprocessor(
-        tokenizer=tokenizer,
-        sequence_length=10,
-    )
     ```
     """

diff --git a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py
index c5400e7a7f..f04b78aa0f 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py
@@ -58,16 +58,39 @@ class DebertaV3Tokenizer(SentencePieceTokenizer):
     Examples:

     ```python
-    tokenizer = keras_nlp.models.DebertaV3Tokenizer(proto="model.spm")
+    # Unbatched input.
+    tokenizer = keras_nlp.models.DebertaV3Tokenizer.from_preset(
+        "deberta_v3_base_en",
+    )
+    tokenizer("The quick brown fox jumped.")

     # Batched inputs.
     tokenizer(["the quick brown fox", "the earth is round"])

-    # Unbatched inputs.
-    tokenizer("the quick brown fox")
-
     # Detokenization.
-    tokenizer.detokenize(tf.constant([[1, 4, 9, 5, 7, 2]]))
+    tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
+
+    # Custom vocabulary.
+    bytes_io = io.BytesIO()
+    ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
+    sentencepiece.SentencePieceTrainer.train(
+        sentence_iterator=ds.as_numpy_iterator(),
+        model_writer=bytes_io,
+        vocab_size=9,
+        model_type="WORD",
+        pad_id=0,
+        bos_id=1,
+        eos_id=2,
+        unk_id=3,
+        pad_piece="[PAD]",
+        bos_piece="[CLS]",
+        eos_piece="[SEP]",
+        unk_piece="[UNK]",
+    )
+    tokenizer = keras_nlp.models.DebertaV3Tokenizer(
+        proto=bytes_io.getvalue(),
+    )
+    tokenizer("The quick brown fox jumped.")
     ```
     """

diff --git a/keras_nlp/models/deberta_v3/disentangled_attention_encoder.py b/keras_nlp/models/deberta_v3/disentangled_attention_encoder.py
index 005d482174..a605fdeb11 100644
--- a/keras_nlp/models/deberta_v3/disentangled_attention_encoder.py
+++ b/keras_nlp/models/deberta_v3/disentangled_attention_encoder.py
@@ -58,23 +58,6 @@ class DisentangledAttentionEncoder(keras.layers.Layer):
         bias_initializer: string or `keras.initializers` initializer,
             defaults to "zeros". The bias initializer for the dense and
             disentangled self-attention layers.
-
-    Examples:
-
-    ```python
-    # Create a single disentangled attention encoder layer.
-    encoder = keras_nlp.layers.DisentangledAttentionEncoder(
-        intermediate_dim=64, num_heads=8)
-
-    # Create a simple model containing the encoder.
-    input = keras.Input(shape=[10, 64])
-    output = encoder(input)
-    model = keras.Model(inputs=input, outputs=output)
-
-    # Call encoder on the inputs.
-    input_data = tf.random.uniform(shape=[2, 10, 64])
-    output = model(input_data)
-    ```
     """

     def __init__(
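The updated docstrings assume `io`, `sentencepiece`, `tensorflow`, and `keras_nlp` are in scope. For a quick local sanity check of the custom-vocabulary path that several of these docstrings now share (no preset download required), a minimal sketch along these lines should run as-is; it reuses the toy `vocab_size=9` SentencePiece setup trained in the docstrings:

```python
import io

import sentencepiece
import tensorflow as tf

import keras_nlp

# Train a toy SentencePiece model, mirroring the docstring examples above.
bytes_io = io.BytesIO()
ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=ds.as_numpy_iterator(),
    model_writer=bytes_io,
    vocab_size=9,
    model_type="WORD",
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3,
    pad_piece="[PAD]",
    bos_piece="[CLS]",
    eos_piece="[SEP]",
    unk_piece="[UNK]",
)

# Tokenize, then round-trip through detokenize.
tokenizer = keras_nlp.models.DebertaV3Tokenizer(proto=bytes_io.getvalue())
token_ids = tokenizer("The quick brown fox jumped.")
print(tokenizer.detokenize(token_ids))

# The preprocessor packs the ids into the dict of dense tensors that the
# backbone docstring shows as input.
preprocessor = keras_nlp.models.DebertaV3Preprocessor(tokenizer)
features = preprocessor("The quick brown fox jumped.")
print(sorted(features.keys()))  # ['padding_mask', 'token_ids']
```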