diff --git a/keras_nlp/models/distil_bert/distil_bert_backbone.py b/keras_nlp/models/distil_bert/distil_bert_backbone.py index 9d5138ece5..63df398583 100644 --- a/keras_nlp/models/distil_bert/distil_bert_backbone.py +++ b/keras_nlp/models/distil_bert/distil_bert_backbone.py @@ -35,7 +35,7 @@ def distilbert_kernel_initializer(stddev=0.02): @keras_nlp_export("keras_nlp.models.DistilBertBackbone") class DistilBertBackbone(Backbone): - """DistilBERT encoder network. + """A DistilBERT encoder network. This network implements a bi-directional Transformer-based encoder as described in ["DistilBERT, a distilled version of BERT: smaller, faster, @@ -45,8 +45,8 @@ class DistilBertBackbone(Backbone): The default constructor gives a fully customizable, randomly initialized DistilBERT encoder with any number of layers, heads, and embedding - dimensions. To load preset architectures and weights, use the `from_preset` - constructor. + dimensions. To load preset architectures and weights, use the + `from_preset()` constructor. Disclaimer: Pre-trained models are provided on an "as is" basis, without warranties or conditions of any kind. The underlying model is provided by a @@ -76,22 +76,22 @@ class DistilBertBackbone(Backbone): ), } - # Pretrained DistilBERT encoder + # Pretrained DistilBERT encoder. model = keras_nlp.models.DistilBertBackbone.from_preset( "distil_bert_base_en_uncased" ) - output = model(input_data) + model(input_data) - # Randomly initialized DistilBERT encoder with custom config + # Randomly initialized DistilBERT encoder with custom config. model = keras_nlp.models.DistilBertBackbone( vocabulary_size=30552, - num_layers=6, - num_heads=12, - hidden_dim=768, - intermediate_dim=3072, - max_sequence_length=12, + num_layers=4, + num_heads=4, + hidden_dim=256, + intermediate_dim=512, + max_sequence_length=128, ) - output = model(input_data) + model(input_data) ``` """ diff --git a/keras_nlp/models/distil_bert/distil_bert_classifier.py b/keras_nlp/models/distil_bert/distil_bert_classifier.py index 805a1d74bd..538abf7788 100644 --- a/keras_nlp/models/distil_bert/distil_bert_classifier.py +++ b/keras_nlp/models/distil_bert/distil_bert_classifier.py @@ -36,9 +36,9 @@ class DistilBertClassifier(Task): """An end-to-end DistilBERT model for classification tasks. This model attaches a classification head to a - `keras_nlp.model.DistilBertBackbone` model, mapping from the backbone - outputs to logit output suitable for a classification task. For usage of - this model with pre-trained weights, see the `from_preset()` method. + `keras_nlp.model.DistilBertBackbone` instance, mapping from the backbone + outputs to logits suitable for a classification task. For usage of + this model with pre-trained weights, see the `from_preset()` constructor. This model can optionally be configured with a `preprocessor` layer, in which case it will automatically apply preprocessing to raw inputs during @@ -62,60 +62,8 @@ class DistilBertClassifier(Task): Examples: - Example usage. + Raw string data. ```python - preprocessed_features = { - "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)), - } - labels = [0, 3] - - # Randomly initialized DistilBertBackbone - backbone = keras_nlp.models.DistilBertBackbone( - vocabulary_size=30552, - num_layers=6, - num_heads=12, - hidden_dim=768, - intermediate_dim=3072, - max_sequence_length=512 - ) - - # Create a DistilBertClassifier and fit your data. 
- classifier = keras_nlp.models.DistilBertClassifier( - backbone, - num_classes=4, - preprocessor=None, - ) - classifier.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), - ) - classifier.fit(x=preprocessed_features, y=labels, batch_size=2) - - # Access backbone programatically (e.g., to change `trainable`) - classifier.backbone.trainable = False - ``` - - Raw string inputs. - ```python - # Create a dataset with raw string features in an `(x, y)` format. - features = ["The quick brown fox jumped.", "I forgot my homework."] - labels = [0, 3] - - # Create a DistilBertClassifier and fit your data. - classifier = keras_nlp.models.DistilBertClassifier.from_preset( - "distil_bert_base_en_uncased", - num_classes=4, - ) - classifier.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), - ) - classifier.fit(x=features, y=labels, batch_size=2) - ``` - - Raw string inputs with customized preprocessing. - ```python - # Create a dataset with raw string features in an `(x, y)` format. features = ["The quick brown fox jumped.", "I forgot my homework."] labels = [0, 3] @@ -124,43 +72,72 @@ class DistilBertClassifier(Task): "distil_bert_base_en_uncased", sequence_length=128, ) - # Create a DistilBertClassifier and fit your data. + # Pretrained classifier. classifier = keras_nlp.models.DistilBertClassifier.from_preset( "distil_bert_base_en_uncased", num_classes=4, preprocessor=preprocessor, ) + classifier.fit(x=features, y=labels, batch_size=2) + + # Re-compile (e.g., with a new learning rate). classifier.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.Adam(5e-5), + jit_compile=True, ) + # Access backbone programmatically (e.g., to change `trainable`). + classifier.backbone.trainable = False + # Fit again. classifier.fit(x=features, y=labels, batch_size=2) ``` - Preprocessed inputs. + Preprocessed integer data. ```python - # Create a dataset with preprocessed features in an `(x, y)` format. - preprocessed_features = { + features = { "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64), - "segment_ids": tf.constant( - [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), "padding_mask": tf.constant( [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) ), } labels = [0, 3] - # Create a DistilBERT classifier and fit your data. + # Pretrained classifier without preprocessing. classifier = keras_nlp.models.DistilBertClassifier.from_preset( "distil_bert_base_en_uncased", num_classes=4, preprocessor=None, ) - classifier.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), - ) - classifier.fit(x=preprocessed_features, y=labels, batch_size=2) + classifier.fit(x=features, y=labels, batch_size=2) ``` + + Custom backbone and vocabulary.
+ ```python + features = ["The quick brown fox jumped.", "I forgot my homework."] + labels = [0, 3] + vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] + vocab += ["The", "quick", "brown", "fox", "jumped", "."] + tokenizer = keras_nlp.models.DistilBertTokenizer( + vocabulary=vocab, + ) + preprocessor = keras_nlp.models.DistilBertPreprocessor( + tokenizer=tokenizer, + sequence_length=128, + ) + backbone = keras_nlp.models.DistilBertBackbone( + vocabulary_size=30552, + num_layers=4, + num_heads=4, + hidden_dim=256, + intermediate_dim=512, + max_sequence_length=128, + ) + classifier = keras_nlp.models.DistilBertClassifier( + backbone=backbone, + preprocessor=preprocessor, + num_classes=4, + ) + classifier.fit(x=features, y=labels, batch_size=2) + ``` """ def __init__( diff --git a/keras_nlp/models/distil_bert/distil_bert_masked_lm.py b/keras_nlp/models/distil_bert/distil_bert_masked_lm.py index 37f3edc15c..86ac5b38ec 100644 --- a/keras_nlp/models/distil_bert/distil_bert_masked_lm.py +++ b/keras_nlp/models/distil_bert/distil_bert_masked_lm.py @@ -39,7 +39,7 @@ class DistilBertMaskedLM(Task): This model will train DistilBERT on a masked language modeling task. The model will predict labels for a number of masked tokens in the input data. For usage of this model with pre-trained weights, see the - `from_preset()` method. + `from_preset()` constructor. This model can optionally be configured with a `preprocessor` layer, in which case inputs can be raw string features during `fit()`, `predict()`, @@ -60,26 +60,32 @@ class DistilBertMaskedLM(Task): Example usage: - Raw string inputs and pretrained backbone. + Raw string data. ```python - # Create a dataset with raw string features. Labels are inferred. features = ["The quick brown fox jumped.", "I forgot my homework."] - # Create a DistilBertMaskedLM with a pretrained backbone and further train - # on an MLM task. + # Pretrained language model. masked_lm = keras_nlp.models.DistilBertMaskedLM.from_preset( - "distil_bert_base_en", + "distil_bert_base_en_uncased", ) + masked_lm.fit(x=features, batch_size=2) + + # Re-compile (e.g., with a new learning rate). masked_lm.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.Adam(5e-5), + jit_compile=True, ) + # Access backbone programmatically (e.g., to change `trainable`). + masked_lm.backbone.trainable = False + # Fit again. masked_lm.fit(x=features, batch_size=2) ``` - Preprocessed inputs and custom backbone. + Preprocessed integer data. ```python - # Create a preprocessed dataset where 0 is the mask token. - preprocessed_features = { + # Create preprocessed batch where 0 is the mask token. + features = { "token_ids": tf.constant( [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8) ), @@ -91,24 +97,11 @@ class DistilBertMaskedLM(Task): # Labels are the original masked values. labels = [[3, 5]] * 2 - # Randomly initialize a DistilBERT encoder - backbone = keras_nlp.models.DistilBertBackbone( - vocabulary_size=50265, - num_layers=12, - num_heads=12, - hidden_dim=768, - intermediate_dim=3072, - max_sequence_length=12 - ) - # Create a DistilBERT masked_lm and fit the data.
- masked_lm = keras_nlp.models.DistilBertMaskedLM( - backbone, + masked_lm = keras_nlp.models.DistilBertMaskedLM.from_preset( + "distil_bert_base_en_uncased", preprocessor=None, ) - masked_lm.compile( - loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), - ) - masked_lm.fit(x=preprocessed_features, y=labels, batch_size=2) + masked_lm.fit(x=features, y=labels, batch_size=2) ``` """ diff --git a/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor.py b/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor.py index 0fc647b88d..4ecc569d73 100644 --- a/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor.py +++ b/keras_nlp/models/distil_bert/distil_bert_masked_lm_preprocessor.py @@ -33,47 +33,89 @@ class DistilBertMaskedLMPreprocessor(DistilBertPreprocessor): `keras_nlp.models.DistilBertMaskedLM` task model. Preprocessing will occur in multiple steps. - - Tokenize any number of input segments using the `tokenizer`. - - Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`. + 1. Tokenize any number of input segments using the `tokenizer`. + 2. Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker` with the appropriate `"[CLS]"`, `"[SEP]"` and `"[PAD]"` tokens. - - Randomly select non-special tokens to mask, controlled by + 3. Randomly select non-special tokens to mask, controlled by `mask_selection_rate`. - - Construct a `(x, y, sample_weight)` tuple suitable for training with a + 4. Construct a `(x, y, sample_weight)` tuple suitable for training with a `keras_nlp.models.DistilBertMaskedLM` task model. + Args: + tokenizer: A `keras_nlp.models.DistilBertTokenizer` instance. + sequence_length: int. The length of the packed inputs. + truncate: string. The algorithm to truncate a list of batched segments + to fit within `sequence_length`. The value can be either + `round_robin` or `waterfall`: + - `"round_robin"`: Available space is assigned one token at a + time in a round-robin fashion to the inputs that still need + some, until the limit is reached. + - `"waterfall"`: The allocation of the budget is done using a + "waterfall" algorithm that allocates quota in a + left-to-right manner and fills up the buckets until we run + out of budget. It supports an arbitrary number of segments. + mask_selection_rate: float. The probability an input token will be + dynamically masked. + mask_selection_length: int. The maximum number of masked tokens + in a given sample. + mask_token_rate: float. The probability that a selected token will be + replaced with the mask token. + random_token_rate: float. The probability that a selected token will be + replaced with a random token from the vocabulary. A selected token + will be left as is with probability + `1 - mask_token_rate - random_token_rate`. + + Call arguments: + x: A tensor of single string sequences, or a tuple of multiple + tensor sequences to be packed together. Inputs may be batched or + unbatched. For single sequences, raw python inputs will be converted + to tensors. For multiple sequences, pass tensors directly. + y: Label data. Should always be `None` as the layer generates labels. + sample_weight: Label weights. Should always be `None` as the layer + generates label weights. + Examples: + + Directly calling the layer on data. ```python - # Load the preprocessor from a preset. preprocessor = keras_nlp.models.DistilBertMaskedLMPreprocessor.from_preset( - "distil_bert_base_en" + "distil_bert_base_en_uncased" ) # Tokenize and mask a single sentence.
- sentence = tf.constant("The quick brown fox jumped.") - preprocessor(sentence) + preprocessor("The quick brown fox jumped.") - # Tokenize and mask a batch of sentences. - sentences = tf.constant( - ["The quick brown fox jumped.", "Call me Ishmael."] - ) - preprocessor(sentences) + # Tokenize and mask a batch of single sentences. + preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) + + # Tokenize and mask sentence pairs. + # In this case, always convert input to tensors before calling the layer. + first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) + second = tf.constant(["The fox tripped.", "Oh look, a whale."]) + preprocessor((first, second)) + ``` - # Tokenize and mask a dataset of sentences. - features = tf.constant( - ["The quick brown fox jumped.", "Call me Ishmael."] + Mapping with `tf.data.Dataset`. + ```python + preprocessor = keras_nlp.models.DistilBertMaskedLMPreprocessor.from_preset( + "distil_bert_base_en_uncased" ) - ds = tf.data.Dataset.from_tensor_slices((features)) + + first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) + second = tf.constant(["The fox tripped.", "Oh look, a whale."]) + + # Map single sentences. + ds = tf.data.Dataset.from_tensor_slices(first) ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - # Alternatively, you can create a preprocessor from your own vocabulary. - # The usage is exactly the same as above. - vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] - vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] - vocab += ["Call", "me", "Ish", "##mael", "."] - vocab += ["Oh", "look", "a", "whale"] - vocab += ["I", "forgot", "my", "home", "##work"] - tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) - preprocessor = keras_nlp.models.DistilBertMaskedLMPreprocessor(tokenizer) + # Map sentence pairs. + ds = tf.data.Dataset.from_tensor_slices((first, second)) + # Watch out for tf.data's default unpacking of tuples here! + # Best to invoke the `preprocessor` directly in this case. + ds = ds.map( + lambda first, second: preprocessor(x=(first, second)), + num_parallel_calls=tf.data.AUTOTUNE, + ) ``` """ diff --git a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py index 725279288e..d65bf7582f 100644 --- a/keras_nlp/models/distil_bert/distil_bert_preprocessor.py +++ b/keras_nlp/models/distil_bert/distil_bert_preprocessor.py @@ -35,29 +35,16 @@ class DistilBertPreprocessor(Preprocessor): This preprocessing layer will do three things: - - Tokenize any number of input segments using the `tokenizer`. - - Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`. + 1. Tokenize any number of input segments using the `tokenizer`. + 2. Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker` with the appropriate `"[CLS]"`, `"[SEP]"` and `"[PAD]"` tokens. - - Construct a dictionary of with keys `"token_ids"` and `"padding_mask"`, + 3. Construct a dictionary with keys `"token_ids"` and `"padding_mask"`, that can be passed directly to a DistilBERT model. This layer can be used directly with `tf.data.Dataset.map` to preprocess string data in the `(x, y, sample_weight)` format used by `keras.Model.fit`. - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`.
`x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - Special care should be taken when using `tf.data` to map over an unlabeled - tuple of string segments. `tf.data.Dataset.map` will unpack this tuple - directly into the call arguments of this layer, rather than forward all - argument to `x`. To handle this case, it is recommended to explicitly call - the layer, e.g. `ds.map(lambda seg1, seg2: preprocessor(x=(seg1, seg2)))`. - Args: tokenizer: A `keras_nlp.models.DistilBertTokenizer` instance. sequence_length: The length of the packed inputs. @@ -72,79 +59,61 @@ class DistilBertPreprocessor(Preprocessor): left-to-right manner and fills up the buckets until we run out of budget. It supports an arbitrary number of segments. + Call arguments: + x: A tensor of single string sequences, or a tuple of multiple + tensor sequences to be packed together. Inputs may be batched or + unbatched. For single sequences, raw python inputs will be converted + to tensors. For multiple sequences, pass tensors directly. + y: Any label data. Will be passed through unaltered. + sample_weight: Any label weight data. Will be passed through unaltered. + Examples: + + Directly calling the layer on data. ```python - # Load the preprocessor from a preset. - preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset("distil_bert_base_en_uncased") + preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset( + "distil_bert_base_en_uncased" + ) + preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - # Tokenize and pack a single sentence. - sentence = tf.constant("The quick brown fox jumped.") - preprocessor(sentence) - # Same output. + # Custom vocabulary. + vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] + vocab += ["The", "quick", "brown", "fox", "jumped", "."] + tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) + preprocessor = keras_nlp.models.DistilBertPreprocessor(tokenizer) preprocessor("The quick brown fox jumped.") + ``` - # Tokenize and a batch of single sentences. - sentences = tf.constant( - ["The quick brown fox jumped.", "Call me Ishmael."] - ) - preprocessor(sentences) - # Same output. - preprocessor( - ["The quick brown fox jumped.", "Call me Ishmael."] + Mapping with `tf.data.Dataset`. + ```python + preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset( + "distil_bert_base_en_uncased" ) - # Tokenize and pack a sentence pair. - first_sentence = tf.constant("The quick brown fox jumped.") - second_sentence = tf.constant("The fox tripped.") - preprocessor((first_sentence, second_sentence)) + first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) + second = tf.constant(["The fox tripped.", "Oh look, a whale."]) + label = tf.constant([1, 1]) + # Map labeled single sentences. + ds = tf.data.Dataset.from_tensor_slices((first, label)) + ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) + - # Map a dataset to preprocess a single sentence. - features = tf.constant( - ["The quick brown fox jumped.", "Call me Ishmael."] - ) - labels = tf.constant([0, 1]) - ds = tf.data.Dataset.from_tensor_slices((features, labels)) + # Map unlabeled single sentences. 
+ ds = tf.data.Dataset.from_tensor_slices(first) ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - # Map a dataset to preprocess sentence pairs. - first_sentences = tf.constant( - ["The quick brown fox jumped.", "Call me Ishmael."] - ) - second_sentences = tf.constant( - ["The fox tripped.", "Oh look, a whale."] - ) - labels = tf.constant([1, 1]) - ds = tf.data.Dataset.from_tensor_slices( - ( - (first_sentences, second_sentences), labels - ) - ) + # Map labeled sentence pairs. + ds = tf.data.Dataset.from_tensor_slices(((first, second), label)) ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) + # Map unlabeled sentence pairs. + ds = tf.data.Dataset.from_tensor_slices((first, second)) - # Map a dataset to preprocess unlabeled sentence pairs. - first_sentences = tf.constant( - ["The quick brown fox jumped.", "Call me Ishmael."] - ) - second_sentences = tf.constant( - ["The fox tripped.", "Oh look, a whale."] - ) - ds = tf.data.Dataset.from_tensor_slices((first_sentences, second_sentences)) # Watch out for tf.data's default unpacking of tuples here! # Best to invoke the `preprocessor` directly in this case. ds = ds.map( - lambda s1, s2: preprocessor(x=(s1, s2)), + lambda first, second: preprocessor(x=(first, second)), num_parallel_calls=tf.data.AUTOTUNE, ) - - # Alternatively, you can create a preprocessor from your own vocabulary. - # The usage is exactly the same as above. - vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] - vocab += ["The", "qu", "##ick", "br", "##own", "fox", "tripped"] - vocab += ["Call", "me", "Ish", "##mael", "."] - vocab += ["Oh", "look", "a", "whale"] - vocab += ["I", "forgot", "my", "home", "##work"] - tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) - preprocessor = keras_nlp.models.DistilBertPreprocessor(tokenizer) ``` """ diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py index 6a706a7316..f954925ce3 100644 --- a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py +++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py @@ -50,29 +50,22 @@ class DistilBertTokenizer(WordPieceTokenizer): Examples: - Batched input. - >>> vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] - >>> vocab += ["The", "qu", "##ick", "brown", "fox", "."] - >>> inputs = ["The quick brown fox.", "The fox."] - >>> tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) - >>> tokenizer(inputs) - - - Unbatched input. - >>> vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] - >>> vocab += ["The", "qu", "##ick", "brown", "fox", "."] - >>> inputs = "The fox." - >>> tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) - >>> tokenizer(inputs) - - - Detokenization. - >>> vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] - >>> vocab += ["The", "qu", "##ick", "brown", "fox", "."] - >>> inputs = "The quick brown fox." - >>> tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) - >>> tokenizer.detokenize(tokenizer.tokenize(inputs)).numpy().decode('utf-8') - 'The quick brown fox .' + ```python + # Unbatched input. + tokenizer = keras_nlp.models.DistilBertTokenizer.from_preset( + "distil_bert_base_en_uncased", + ) + tokenizer("The quick brown fox jumped.") + # Batched input. + tokenizer(["The quick brown fox jumped.", "The fox slept."]) + # Detokenization. + tokenizer.detokenize(tokenizer("The quick brown fox jumped.")) + # Custom vocabulary. 
+ vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] + vocab += ["The", "quick", "brown", "fox", "jumped", "."] + tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab) + tokenizer("The quick brown fox jumped.") + ``` """ def __init__(