From 4ea0ff1526f12ee171852d85aade9cdbf718aa3d Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 12 Sep 2024 16:17:53 -0700
Subject: [PATCH] Remove preprocessing base classes we no longer use

For some reason, all our causal lm models had a preprocessing layer base
class that packed inputs without setting things up for the causal language
model loss. With our new base classes, `CausalLMPreprocessor` and
`Seq2SeqLMPreprocessor`, these are no longer used at all.

These are public, but I did a search for them on GitHub and the only one I
could find with any usage is `GPT2Preprocessor`. The others are completely
unused, and not generally very useful. Let's remove them before anyone
starts to depend on them, so that we can keep preprocessing one-to-one
paired with a task. E.g.

```
BertTextClassifier
BertTextClassifierPreprocessor
```
---
 keras_nlp/api/models/__init__.py | 17 --
 .../src/models/bart/bart_preprocessor.py | 264 ------------------
 .../src/models/bart/bart_preprocessor_test.py | 72 -----
 .../src/models/bloom/bloom_preprocessor.py | 178 ------------
 .../models/bloom/bloom_preprocessor_test.py | 80 ------
 .../models/electra/electra_preprocessor.py | 155 ----------
 .../electra/electra_preprocessor_test.py | 69 -----
 .../src/models/falcon/falcon_preprocessor.py | 180 ------------
 .../models/falcon/falcon_preprocessor_test.py | 80 ------
 .../src/models/gemma/gemma_preprocessor.py | 184 ------------
 .../models/gemma/gemma_preprocessor_test.py | 74 -----
 .../src/models/gpt2/gpt2_preprocessor.py | 85 +-----
 .../gpt_neo_x/gpt_neo_x_preprocessor.py | 138 ---------
 .../gpt_neo_x/gpt_neo_x_preprocessor_test.py | 71 -----
 .../src/models/llama/llama_preprocessor.py | 182 ------------
 .../models/llama/llama_preprocessor_test.py | 68 -----
 .../src/models/llama3/llama3_preprocessor.py | 23 --
 .../models/llama3/llama3_preprocessor_test.py | 84 ------
 .../models/mistral/mistral_preprocessor.py | 183 ------------
 .../mistral/mistral_preprocessor_test.py | 72 -----
 keras_nlp/src/models/opt/opt_preprocessor.py | 181 ------------
 .../src/models/opt/opt_preprocessor_test.py | 79 ------
 .../models/pali_gemma/pali_gemma_tokenizer.py | 2 +-
 .../src/models/phi3/phi3_preprocessor.py | 183 ------------
 .../src/models/phi3/phi3_preprocessor_test.py | 69 -----
 .../models/seq_2_seq_lm_preprocessor_test.py | 3 +-
 26 files changed, 9 insertions(+), 2767 deletions(-)
 delete mode 100644 keras_nlp/src/models/bart/bart_preprocessor.py
 delete mode 100644 keras_nlp/src/models/bart/bart_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/bloom/bloom_preprocessor.py
 delete mode 100644 keras_nlp/src/models/bloom/bloom_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/electra/electra_preprocessor.py
 delete mode 100644 keras_nlp/src/models/electra/electra_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/falcon/falcon_preprocessor.py
 delete mode 100644 keras_nlp/src/models/falcon/falcon_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/gemma/gemma_preprocessor.py
 delete mode 100644 keras_nlp/src/models/gemma/gemma_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py
 delete mode 100644 keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/llama/llama_preprocessor.py
 delete mode 100644 keras_nlp/src/models/llama/llama_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/llama3/llama3_preprocessor.py
 delete mode 100644 keras_nlp/src/models/llama3/llama3_preprocessor_test.py
 delete mode 100644
keras_nlp/src/models/mistral/mistral_preprocessor.py delete mode 100644 keras_nlp/src/models/mistral/mistral_preprocessor_test.py delete mode 100644 keras_nlp/src/models/opt/opt_preprocessor.py delete mode 100644 keras_nlp/src/models/opt/opt_preprocessor_test.py delete mode 100644 keras_nlp/src/models/phi3/phi3_preprocessor.py delete mode 100644 keras_nlp/src/models/phi3/phi3_preprocessor_test.py diff --git a/keras_nlp/api/models/__init__.py b/keras_nlp/api/models/__init__.py index 64368e4c45..1329399894 100644 --- a/keras_nlp/api/models/__init__.py +++ b/keras_nlp/api/models/__init__.py @@ -34,7 +34,6 @@ from keras_nlp.src.models.albert.albert_tokenizer import AlbertTokenizer from keras_nlp.src.models.backbone import Backbone from keras_nlp.src.models.bart.bart_backbone import BartBackbone -from keras_nlp.src.models.bart.bart_preprocessor import BartPreprocessor from keras_nlp.src.models.bart.bart_seq_2_seq_lm import BartSeq2SeqLM from keras_nlp.src.models.bart.bart_seq_2_seq_lm_preprocessor import ( BartSeq2SeqLMPreprocessor, @@ -58,7 +57,6 @@ from keras_nlp.src.models.bloom.bloom_causal_lm_preprocessor import ( BloomCausalLMPreprocessor, ) -from keras_nlp.src.models.bloom.bloom_preprocessor import BloomPreprocessor from keras_nlp.src.models.bloom.bloom_tokenizer import BloomTokenizer from keras_nlp.src.models.causal_lm import CausalLM from keras_nlp.src.models.causal_lm_preprocessor import CausalLMPreprocessor @@ -105,9 +103,6 @@ DistilBertTokenizer, ) from keras_nlp.src.models.electra.electra_backbone import ElectraBackbone -from keras_nlp.src.models.electra.electra_preprocessor import ( - ElectraPreprocessor, -) from keras_nlp.src.models.electra.electra_tokenizer import ElectraTokenizer from keras_nlp.src.models.f_net.f_net_backbone import FNetBackbone from keras_nlp.src.models.f_net.f_net_masked_lm import FNetMaskedLM @@ -127,14 +122,12 @@ from keras_nlp.src.models.falcon.falcon_causal_lm_preprocessor import ( FalconCausalLMPreprocessor, ) -from keras_nlp.src.models.falcon.falcon_preprocessor import FalconPreprocessor from keras_nlp.src.models.falcon.falcon_tokenizer import FalconTokenizer from keras_nlp.src.models.gemma.gemma_backbone import GemmaBackbone from keras_nlp.src.models.gemma.gemma_causal_lm import GemmaCausalLM from keras_nlp.src.models.gemma.gemma_causal_lm_preprocessor import ( GemmaCausalLMPreprocessor, ) -from keras_nlp.src.models.gemma.gemma_preprocessor import GemmaPreprocessor from keras_nlp.src.models.gemma.gemma_tokenizer import GemmaTokenizer from keras_nlp.src.models.gpt2.gpt2_backbone import GPT2Backbone from keras_nlp.src.models.gpt2.gpt2_causal_lm import GPT2CausalLM @@ -148,23 +141,18 @@ from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_causal_lm_preprocessor import ( GPTNeoXCausalLMPreprocessor, ) -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_preprocessor import ( - GPTNeoXPreprocessor, -) from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer from keras_nlp.src.models.llama3.llama3_backbone import Llama3Backbone from keras_nlp.src.models.llama3.llama3_causal_lm import Llama3CausalLM from keras_nlp.src.models.llama3.llama3_causal_lm_preprocessor import ( Llama3CausalLMPreprocessor, ) -from keras_nlp.src.models.llama3.llama3_preprocessor import Llama3Preprocessor from keras_nlp.src.models.llama3.llama3_tokenizer import Llama3Tokenizer from keras_nlp.src.models.llama.llama_backbone import LlamaBackbone from keras_nlp.src.models.llama.llama_causal_lm import LlamaCausalLM from keras_nlp.src.models.llama.llama_causal_lm_preprocessor 
import ( LlamaCausalLMPreprocessor, ) -from keras_nlp.src.models.llama.llama_preprocessor import LlamaPreprocessor from keras_nlp.src.models.llama.llama_tokenizer import LlamaTokenizer from keras_nlp.src.models.masked_lm import MaskedLM from keras_nlp.src.models.masked_lm_preprocessor import MaskedLMPreprocessor @@ -173,16 +161,12 @@ from keras_nlp.src.models.mistral.mistral_causal_lm_preprocessor import ( MistralCausalLMPreprocessor, ) -from keras_nlp.src.models.mistral.mistral_preprocessor import ( - MistralPreprocessor, -) from keras_nlp.src.models.mistral.mistral_tokenizer import MistralTokenizer from keras_nlp.src.models.opt.opt_backbone import OPTBackbone from keras_nlp.src.models.opt.opt_causal_lm import OPTCausalLM from keras_nlp.src.models.opt.opt_causal_lm_preprocessor import ( OPTCausalLMPreprocessor, ) -from keras_nlp.src.models.opt.opt_preprocessor import OPTPreprocessor from keras_nlp.src.models.opt.opt_tokenizer import OPTTokenizer from keras_nlp.src.models.pali_gemma.pali_gemma_backbone import ( PaliGemmaBackbone, @@ -201,7 +185,6 @@ from keras_nlp.src.models.phi3.phi3_causal_lm_preprocessor import ( Phi3CausalLMPreprocessor, ) -from keras_nlp.src.models.phi3.phi3_preprocessor import Phi3Preprocessor from keras_nlp.src.models.phi3.phi3_tokenizer import Phi3Tokenizer from keras_nlp.src.models.preprocessor import Preprocessor from keras_nlp.src.models.roberta.roberta_backbone import RobertaBackbone diff --git a/keras_nlp/src/models/bart/bart_preprocessor.py b/keras_nlp/src/models/bart/bart_preprocessor.py deleted file mode 100644 index 0f9de65fab..0000000000 --- a/keras_nlp/src/models/bart/bart_preprocessor.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.bart.bart_backbone import BartBackbone -from keras_nlp.src.models.bart.bart_tokenizer import BartTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.BartPreprocessor") -class BartPreprocessor(Preprocessor): - """A BART preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do three things: - - 1. Tokenize both encoder inputs and decoder inputs using the `tokenizer`. - Both inputs can contain only one segment. - 2. Add the appropriate special tokens - `""`, `""` and `""`. - 3. Construct a dictionary with keys `"encoder_token_ids"`, - `"encoder_padding_mask"`, `"decoder_token_ids"`, `"decoder_padding_mask"` - that can be passed directly to a BART model. - - Args: - tokenizer: A `keras_nlp.models.BartTokenizer` instance. - encoder_sequence_length: The length of the packed encoder inputs. - decoder_sequence_length: The length of the packed decoder inputs. 
- - Call arguments: - x: A dictionary with `encoder_text` and `decoder_text` as its keys. - Each value in the dictionary should be a tensor of single string - sequences. Inputs may be batched or unbatched. Raw python inputs - will be converted to tensors. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.BartPreprocessor.from_preset("bart_base_en") - - # Preprocess unbatched inputs. - inputs = { - "encoder_text": "The fox was sleeping.", - "decoder_text": "The fox was awake." - } - preprocessor(inputs) - - # Preprocess batched inputs. - inputs = { - "encoder_text": ["The fox was sleeping.", "The lion was quiet."], - "decoder_text": ["The fox was awake.", "The lion was roaring."] - } - preprocessor(inputs) - - # Custom vocabulary. - vocab = { - "": 0, - "": 1, - "": 2, - "Ġafter": 5, - "noon": 6, - "Ġsun": 7, - } - merges = ["Ġ a", "Ġ s", "Ġ n", "e r", "n o", "o n", "Ġs u", "Ġa f", "no on"] - merges += ["Ġsu n", "Ġaf t", "Ġaft er"] - - tokenizer = keras_nlp.models.BartTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.BartPreprocessor( - tokenizer=tokenizer, - encoder_sequence_length=20, - decoder_sequence_length=10, - ) - inputs = { - "encoder_text": "The fox was sleeping.", - "decoder_text": "The fox was awake." - } - preprocessor(inputs) - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.BartPreprocessor.from_preset("bart_base_en") - - # Map labeled single sentences. - features = { - "encoder_text": tf.constant( - ["The fox was sleeping.", "The lion was quiet."] - ), - "decoder_text": tf.constant( - ["The fox was awake.", "The lion was silent."] - ) - } - labels = tf.constant(["True", "False"]) - ds = tf.data.Dataset.from_tensor_slices((features, labels)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - features = { - "encoder_text": tf.constant( - ["The fox was sleeping.", "The lion was quiet."] - ), - "decoder_text": tf.constant( - ["The fox was awake.", "The lion was roaring."] - ) - } - ds = tf.data.Dataset.from_tensor_slices(features) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = BartBackbone - tokenizer_cls = BartTokenizer - - def __init__( - self, - tokenizer, - encoder_sequence_length=1024, - decoder_sequence_length=1024, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.encoder_packer = None - self.decoder_packer = None - self.encoder_sequence_length = encoder_sequence_length - self.decoder_sequence_length = decoder_sequence_length - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - - # TODO: Use `MultiSegmentPacker` instead of `StartEndPacker` once we - # want to move to multi-segment packing and have improved - # `MultiSegmentPacker`'s performance. - self.encoder_packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.encoder_sequence_length, - return_padding_mask=True, - ) - - # The decoder is packed a bit differently; the format is as follows: - # `[end_token_id, start_token_id, tokens..., end_token_id, padding...]`. 
- self.decoder_packer = StartEndPacker( - start_value=[ - self.tokenizer.end_token_id, - self.tokenizer.start_token_id, - ], - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.decoder_sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - *, - encoder_sequence_length=None, - decoder_sequence_length=None, - # `sequence_length` is an alias for `decoder_sequence_length` - sequence_length=None, - ): - if not ( - isinstance(x, dict) - and all(k in x for k in ("encoder_text", "decoder_text")) - ): - raise ValueError( - '`x` must be a dictionary, containing the keys `"encoder_text"`' - f' and `"decoder_text"`. Received x={x}.' - ) - - if encoder_sequence_length is None: - encoder_sequence_length = self.encoder_sequence_length - decoder_sequence_length = decoder_sequence_length or sequence_length - if decoder_sequence_length is None: - decoder_sequence_length = self.decoder_sequence_length - - encoder_inputs = self.tokenizer(x["encoder_text"]) - encoder_token_ids, encoder_padding_mask = self.encoder_packer( - encoder_inputs, - sequence_length=encoder_sequence_length, - ) - - decoder_inputs = self.tokenizer(x["decoder_text"]) - decoder_token_ids, decoder_padding_mask = self.decoder_packer( - decoder_inputs, - sequence_length=decoder_sequence_length, - ) - - x = { - "encoder_token_ids": encoder_token_ids, - "encoder_padding_mask": encoder_padding_mask, - "decoder_token_ids": decoder_token_ids, - "decoder_padding_mask": decoder_padding_mask, - } - - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "encoder_sequence_length": self.encoder_sequence_length, - "decoder_sequence_length": self.decoder_sequence_length, - } - ) - return config - - @property - def encoder_sequence_length(self): - """The padded length of encoder input sequences.""" - return self._encoder_sequence_length - - @encoder_sequence_length.setter - def encoder_sequence_length(self, value): - self._encoder_sequence_length = value - if self.encoder_packer is not None: - self.encoder_packer.sequence_length = value - - @property - def decoder_sequence_length(self): - """The padded length of decoder input sequences.""" - return self._decoder_sequence_length - - @decoder_sequence_length.setter - def decoder_sequence_length(self, value): - self._decoder_sequence_length = value - if self.decoder_packer is not None: - self.decoder_packer.sequence_length = value - - @property - def sequence_length(self): - """Alias for `decoder_sequence_length`.""" - return self.decoder_sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self.decoder_sequence_length = value diff --git a/keras_nlp/src/models/bart/bart_preprocessor_test.py b/keras_nlp/src/models/bart/bart_preprocessor_test.py deleted file mode 100644 index 22e547671b..0000000000 --- a/keras_nlp/src/models/bart/bart_preprocessor_test.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from keras_nlp.src.models.bart.bart_preprocessor import BartPreprocessor -from keras_nlp.src.models.bart.bart_tokenizer import BartTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class BartPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["", "", "", "air", "Ġair", "plane", "Ġat"] - self.vocab += ["port", ""] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = BartTokenizer( - vocabulary=self.vocab, merges=self.merges - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "encoder_sequence_length": 5, - "decoder_sequence_length": 8, - } - self.input_data = ( - { - "encoder_text": [" airplane at airport"], - "decoder_text": [" airplane airport"], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ) - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=BartPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "encoder_token_ids": [[0, 4, 5, 6, 2]], - "encoder_padding_mask": [[1, 1, 1, 1, 1]], - "decoder_token_ids": [[2, 0, 4, 5, 4, 7, 2, 1]], - "decoder_padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ), - token_id_key="decoder_token_ids", - ) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in BartPreprocessor.presets: - self.run_preset_test( - cls=BartPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/bloom/bloom_preprocessor.py b/keras_nlp/src/models/bloom/bloom_preprocessor.py deleted file mode 100644 index 8b2d7b2ba0..0000000000 --- a/keras_nlp/src/models/bloom/bloom_preprocessor.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.bloom.bloom_backbone import BloomBackbone -from keras_nlp.src.models.bloom.bloom_tokenizer import BloomTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.BloomPreprocessor") -class BloomPreprocessor(Preprocessor): - """BLOOM preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.BloomBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - Args: - tokenizer: A `keras_nlp.models.BloomTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.BloomPreprocessor.from_preset( - "bloom_560m_multi" - ) - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - features = ["a quick fox.", "a fox quick."] - vocab = {"": 0, "":1, "":2, "a": 3, "Ġquick": 4, "Ġfox": 5} - merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] - merges += ["Ġ f", "o x", "Ġf ox"] - tokenizer = keras_nlp.models.BloomTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.BloomPreprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.BloomPreprocessor.from_preset( - "bloom_560m_multi" - ) - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. 
- ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = BloomBackbone - tokenizer_cls = BloomTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/bloom/bloom_preprocessor_test.py b/keras_nlp/src/models/bloom/bloom_preprocessor_test.py deleted file mode 100644 index 9eca885715..0000000000 --- a/keras_nlp/src/models/bloom/bloom_preprocessor_test.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from keras_nlp.src.models.bloom.bloom_preprocessor import BloomPreprocessor -from keras_nlp.src.models.bloom.bloom_tokenizer import BloomTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class BloomPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["", "", ""] - self.vocab += ["!", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = BloomTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["airplane at airport"] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=BloomPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[1, 4, 6, 7, 5, 8, 2, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = BloomPreprocessor( - tokenizer=BloomTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[4, 6, 7, 5, 8, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = BloomPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [1, 4, 6, 2]) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in BloomPreprocessor.presets: - self.run_preset_test( - cls=BloomPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/electra/electra_preprocessor.py b/keras_nlp/src/models/electra/electra_preprocessor.py deleted file mode 100644 index 82dbb3310c..0000000000 --- a/keras_nlp/src/models/electra/electra_preprocessor.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.multi_segment_packer import ( - MultiSegmentPacker, -) -from keras_nlp.src.models.electra.electra_backbone import ElectraBackbone -from keras_nlp.src.models.electra.electra_tokenizer import ElectraTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.ElectraPreprocessor") -class ElectraPreprocessor(Preprocessor): - """A ELECTRA preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do three things: - - 1. 
Tokenize any number of input segments using the `tokenizer`. - 2. Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`. - with the appropriate `"[CLS]"`, `"[SEP]"` and `"[PAD]"` tokens. - 3. Construct a dictionary of with keys `"token_ids"` and `"padding_mask"`, - that can be passed directly to a ELECTRA model. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - Args: - tokenizer: A `keras_nlp.models.ElectraTokenizer` instance. - sequence_length: The length of the packed inputs. - truncate: string. The algorithm to truncate a list of batched segments - to fit within `sequence_length`. The value can be either - `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at a - time in a round-robin fashion to the inputs that still need - some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using a - "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we run - out of budget. It supports an arbitrary number of segments. - - Call arguments: - x: A tensor of single string sequences, or a tuple of multiple - tensor sequences to be packed together. Inputs may be batched or - unbatched. For single sequences, raw python inputs will be converted - to tensors. For multiple sequences, pass tensors directly. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.ElectraPreprocessor.from_preset( - "electra_base_discriminator_en" - ) - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] - vocab += ["The", "quick", "brown", "fox", "jumped", "."] - tokenizer = keras_nlp.models.ElectraTokenizer(vocabulary=vocab) - preprocessor = keras_nlp.models.ElectraPreprocessor(tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.ElectraPreprocessor.from_preset( - "electra_base_discriminator_en" - ) - - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - label = tf.constant([1, 1]) - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((first, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(first) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map labeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices(((first, second), label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - # Map unlabeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices((first, second)) - - # Watch out for tf.data's default unpacking of tuples here! - # Best to invoke the `preprocessor` directly in this case. 
- ds = ds.map( - lambda first, second: preprocessor(x=(first, second)), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ``` - """ - - backbone_cls = ElectraBackbone - tokenizer_cls = ElectraTokenizer - - def __init__( - self, - tokenizer, - sequence_length=512, - truncate="round_robin", - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = MultiSegmentPacker( - start_value=self.tokenizer.cls_token_id, - end_value=self.tokenizer.sep_token_id, - pad_value=self.tokenizer.pad_token_id, - truncate=truncate, - sequence_length=sequence_length, - ) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.packer.sequence_length, - "truncate": self.packer.truncate, - } - ) - return config - - @preprocessing_function - def call(self, x, y=None, sample_weight=None): - x = x if isinstance(x, tuple) else (x,) - x = tuple(self.tokenizer(segment) for segment in x) - token_ids, segment_ids = self.packer(x) - x = { - "token_ids": token_ids, - "segment_ids": segment_ids, - "padding_mask": token_ids != self.tokenizer.pad_token_id, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) diff --git a/keras_nlp/src/models/electra/electra_preprocessor_test.py b/keras_nlp/src/models/electra/electra_preprocessor_test.py deleted file mode 100644 index 62dbaf3fdd..0000000000 --- a/keras_nlp/src/models/electra/electra_preprocessor_test.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from keras_nlp.src.models.electra.electra_preprocessor import ( - ElectraPreprocessor, -) -from keras_nlp.src.models.electra.electra_tokenizer import ElectraTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class ElectraPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] - self.vocab += ["THE", "QUICK", "BROWN", "FOX"] - self.vocab += ["the", "quick", "brown", "fox"] - self.tokenizer = ElectraTokenizer(vocabulary=self.vocab) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ( - ["THE QUICK BROWN FOX."], - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ) - - def test_preprocessor_basics(self): - self.run_preprocessing_layer_test( - cls=ElectraPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[2, 5, 6, 7, 8, 1, 3, 0]], - "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. 
- ), - ) - - def test_errors_for_2d_list_input(self): - preprocessor = ElectraPreprocessor(**self.init_kwargs) - ambiguous_input = [["one", "two"], ["three", "four"]] - with self.assertRaises(ValueError): - preprocessor(ambiguous_input) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in ElectraPreprocessor.presets: - self.run_preset_test( - cls=ElectraPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/falcon/falcon_preprocessor.py b/keras_nlp/src/models/falcon/falcon_preprocessor.py deleted file mode 100644 index 491f6e5fe2..0000000000 --- a/keras_nlp/src/models/falcon/falcon_preprocessor.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.falcon.falcon_backbone import FalconBackbone -from keras_nlp.src.models.falcon.falcon_tokenizer import FalconTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.FalconPreprocessor") -class FalconPreprocessor(Preprocessor): - """Falcon preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.FalconBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `FalconPreprocessor` forces the input to have only one segment, as Falcon is - mainly used for generation tasks. For tasks having multi-segment inputs - like "glue/mnli", please use a model designed for classification purposes - such as BERT or RoBERTa. - - Args: - tokenizer: A `keras_nlp.models.FalconTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. 
- sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.FalconPreprocessor.from_preset("falcon_rw_1b") - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - features = ["a quick fox.", "a fox quick."] - vocab = {"<|endoftext|>": 0, "a": 4, "Ġquick": 5, "Ġfox": 6} - merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] - merges += ["Ġ f", "o x", "Ġf ox"] - tokenizer = keras_nlp.models.FalconTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.FalconPreprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.FalconPreprocessor.from_preset("falcon_rw_1b") - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = FalconBackbone - tokenizer_cls = FalconTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. 
- self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/falcon/falcon_preprocessor_test.py b/keras_nlp/src/models/falcon/falcon_preprocessor_test.py deleted file mode 100644 index 075710b23b..0000000000 --- a/keras_nlp/src/models/falcon/falcon_preprocessor_test.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from keras_nlp.src.models.falcon.falcon_preprocessor import FalconPreprocessor -from keras_nlp.src.models.falcon.falcon_tokenizer import FalconTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class FalconPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab += ["<|endoftext|>"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = FalconTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["airplane at airport"] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=FalconPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = FalconPreprocessor( - tokenizer=FalconTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = FalconPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [6, 1, 3, 6]) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in FalconPreprocessor.presets: - self.run_preset_test( - cls=FalconPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/gemma/gemma_preprocessor.py b/keras_nlp/src/models/gemma/gemma_preprocessor.py deleted file mode 100644 index dcbe531b56..0000000000 --- a/keras_nlp/src/models/gemma/gemma_preprocessor.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.gemma.gemma_backbone import GemmaBackbone -from keras_nlp.src.models.gemma.gemma_tokenizer import GemmaTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.GemmaPreprocessor") -class GemmaPreprocessor(Preprocessor): - """Gemma preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. 
- - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.GemmaBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `GemmaPreprocessor` expects the input to have only one segment, as Gemma is - mainly used for generation tasks. For tasks having multi-segment inputs - please combine inputs into a single string input before passing to the - preprocessor layer. - - Args: - tokenizer: A `keras_nlp.models.GemmaTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.GemmaPreprocessor.from_preset( - "gemma_2b_en" - ) - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - bytes_io = io.BytesIO() - ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."]) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=ds.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=8, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="", - bos_piece="", - eos_piece="", - unk_piece="", - ) - tokenizer = keras_nlp.models.GemmaTokenizer( - proto=bytes_io.getvalue(), - ) - preprocessor = keras_nlp.models.GemmaPreprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Apply preprocessing to a `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.GemmaPreprocessor.from_preset( - "gemma_2b_en" - ) - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. 
- ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = GemmaBackbone - tokenizer_cls = GemmaTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.tokenizer = tokenizer - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config diff --git a/keras_nlp/src/models/gemma/gemma_preprocessor_test.py b/keras_nlp/src/models/gemma/gemma_preprocessor_test.py deleted file mode 100644 index 66264edb27..0000000000 --- a/keras_nlp/src/models/gemma/gemma_preprocessor_test.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import pytest - -from keras_nlp.src.models.gemma.gemma_preprocessor import GemmaPreprocessor -from keras_nlp.src.models.gemma.gemma_tokenizer import GemmaTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class GemmaPreprocessorTest(TestCase): - def setUp(self): - self.tokenizer = GemmaTokenizer( - proto=os.path.join( - self.get_test_data_dir(), "gemma_test_vocab.spm" - ), - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["the quick brown fox"] - - def test_preprocessor_basics(self): - self.run_preprocessing_layer_test( - cls=GemmaPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[1, 4, 9, 5, 7, 2, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["the quick brown fox"] * 4 - preprocessor = GemmaPreprocessor( - tokenizer=self.tokenizer, - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[4, 9, 5, 7, 0, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 0, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "the quick brown fox" - preprocessor = GemmaPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [1, 4, 9, 2]) - - @pytest.mark.kaggle_key_required - @pytest.mark.extra_large - def test_all_presets(self): - for preset in GemmaPreprocessor.presets: - self.run_preset_test( - cls=GemmaPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/gpt2/gpt2_preprocessor.py b/keras_nlp/src/models/gpt2/gpt2_preprocessor.py index c9af92fcbf..720b052568 100644 --- a/keras_nlp/src/models/gpt2/gpt2_preprocessor.py +++ b/keras_nlp/src/models/gpt2/gpt2_preprocessor.py @@ -25,85 +25,12 @@ @keras_nlp_export("keras_nlp.models.GPT2Preprocessor") class GPT2Preprocessor(Preprocessor): - """GPT2 preprocessing layer which tokenizes and packs inputs. + """Legacy preprocessing layer for GPT2. - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.GPT2Backbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is - mainly used for generation tasks. For tasks having multi-segment inputs - like "glue/mnli", please use a model designed for classification purposes - such as BERT or RoBERTa. - - Args: - tokenizer: A `keras_nlp.models.GPT2Tokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. 
- - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.GPT2Preprocessor.from_preset("gpt2_base_en") - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - features = ["a quick fox.", "a fox quick."] - vocab = {"<|endoftext|>": 0, "a": 4, "Ġquick": 5, "Ġfox": 6} - merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] - merges += ["Ġ f", "o x", "Ġf ox"] - tokenizer = keras_nlp.models.GPT2Tokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.GPT2Preprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.GPT2Preprocessor.from_preset("gpt2_base_en") - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` + This layer should not be used in new code! All preprocessing layers pair + directly with a task. E.g. `BertClassifier` and + `BertClassifierPreprocessor`. Either use `GPT2CausalLMPreprocessor` or + wrap `GPT2Tokenizer` into a custom preprocessing layer or function. """ backbone_cls = GPT2Backbone @@ -117,6 +44,8 @@ def __init__( add_end_token=True, **kwargs, ): + # TODO: this class has some usage, but barely any, and is no longer + # documented. We should consider dropping it. super().__init__(**kwargs) self.tokenizer = tokenizer self.packer = None diff --git a/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py b/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py deleted file mode 100644 index 06a46ce470..0000000000 --- a/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_backbone import GPTNeoXBackbone -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.GPTNeoXPreprocessor") -class GPTNeoXPreprocessor(Preprocessor): - """GPTNeoX preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.GPTNeoXBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `GPTNeoXPreprocessor` forces the input to have only one segment, as GPTNeoX is - mainly used for generation tasks. For tasks having multi-segment inputs - like "glue/mnli", please use a model designed for classification purposes - such as BERT or RoBERTa. - - Args: - tokenizer: A `keras_nlp.models.GPTNeoXTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - """ - - backbone_cls = GPTNeoXBackbone - tokenizer_cls = GPTNeoXTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. 
- self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py b/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py deleted file mode 100644 index aad68bebc6..0000000000 --- a/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_preprocessor import ( - GPTNeoXPreprocessor, -) -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class GPTNeoXPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab += ["<|endoftext|>"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = GPTNeoXTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["airplane at airport"] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=GPTNeoXPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = GPTNeoXPreprocessor( - tokenizer=GPTNeoXTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = GPTNeoXPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [6, 1, 3, 6]) diff --git a/keras_nlp/src/models/llama/llama_preprocessor.py b/keras_nlp/src/models/llama/llama_preprocessor.py deleted file mode 100644 index 75122856c6..0000000000 --- a/keras_nlp/src/models/llama/llama_preprocessor.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.llama.llama_backbone import LlamaBackbone -from keras_nlp.src.models.llama.llama_tokenizer import LlamaTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.LlamaPreprocessor") -class LlamaPreprocessor(Preprocessor): - """A Llama preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do three things: - - 1. Tokenize any number of input segments using the `tokenizer`. - 2. Pack the inputs together using a `keras_nlp.layers.StartEndPacker`. - with the appropriate tokens. - 3. 
Construct a dictionary with keys `"token_ids"`, and `"padding_mask"` - that can be passed directly to `keras_nlp.models.LlamaBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - Args: - tokenizer: A `keras_nlp.models.LlamaTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. Default is `True`. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. Default is `False`. - - Call arguments: - x: A tensor of single string sequences, or a tuple of multiple - tensor sequences to be packed together. Inputs may be batched or - unbatched. For single sequences, raw python inputs will be converted - to tensors. For multiple sequences, pass tensors directly. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the from_preset(). - ```python - preprocessor = keras_nlp.models.LlamaPreprocessor.from_preset( - "llama_base_en" - ) - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize and a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Preprocess a batch of sentence pairs. - # When handling multiple sequences, always convert to tensors first! - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - preprocessor((first, second)) - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.LlamaPreprocessor.from_preset( - "llama_base_en" - ) - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((first, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(first) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map labeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices(((first, second), label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices((first, second)) - - # Watch out for tf.data's default unpacking of tuples here! - # Best to invoke the `preprocessor` directly in this case. - ds = ds.map( - lambda first, second: preprocessor(x=(first, second)), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ``` - """ - - backbone_cls = LlamaBackbone - tokenizer_cls = LlamaTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=False, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.add_start_token = add_start_token - self.add_end_token = add_end_token - self.sequence_length = sequence_length - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. 
- self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/llama/llama_preprocessor_test.py b/keras_nlp/src/models/llama/llama_preprocessor_test.py deleted file mode 100644 index eca0af66a0..0000000000 --- a/keras_nlp/src/models/llama/llama_preprocessor_test.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytest - -from keras_nlp.src.models.llama.llama_preprocessor import LlamaPreprocessor -from keras_nlp.src.models.llama.llama_tokenizer import LlamaTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class LlamaPreprocessorTest(TestCase): - def setUp(self): - self.tokenizer = LlamaTokenizer( - # Generated using create_llama_test_proto.py - proto=os.path.join(self.get_test_data_dir(), "llama_test_vocab.spm") - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ( - ["the quick brown fox"], - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ) - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=LlamaPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[1, 3, 8, 4, 6, 0, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 0, 0, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. 
- ), - ) - - def test_errors_for_2d_list_input(self): - preprocessor = LlamaPreprocessor(**self.init_kwargs) - ambiguous_input = [["one", "two"], ["three", "four"]] - with self.assertRaises(ValueError): - preprocessor(ambiguous_input) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in LlamaPreprocessor.presets: - self.run_preset_test( - cls=LlamaPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/llama3/llama3_preprocessor.py b/keras_nlp/src/models/llama3/llama3_preprocessor.py deleted file mode 100644 index b1fc5769ab..0000000000 --- a/keras_nlp/src/models/llama3/llama3_preprocessor.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.models.llama3.llama3_backbone import Llama3Backbone -from keras_nlp.src.models.llama3.llama3_tokenizer import Llama3Tokenizer -from keras_nlp.src.models.llama.llama_preprocessor import LlamaPreprocessor - - -@keras_nlp_export("keras_nlp.models.Llama3Preprocessor") -class Llama3Preprocessor(LlamaPreprocessor): - backbone_cls = Llama3Backbone - tokenizer_cls = Llama3Tokenizer diff --git a/keras_nlp/src/models/llama3/llama3_preprocessor_test.py b/keras_nlp/src/models/llama3/llama3_preprocessor_test.py deleted file mode 100644 index 13d4f0208c..0000000000 --- a/keras_nlp/src/models/llama3/llama3_preprocessor_test.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from keras_nlp.src.models.llama3.llama3_preprocessor import Llama3Preprocessor -from keras_nlp.src.models.llama3.llama3_tokenizer import Llama3Tokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class Llama3PreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab += ["<|end_of_text|>", "<|begin_of_text|>"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = Llama3Tokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = [ - "airplane at airport", - ] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=Llama3Preprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[7, 1, 3, 4, 2, 5, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]], - } - ), - ) - - def test_with_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = Llama3Preprocessor( - tokenizer=Llama3Tokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=True, - add_end_token=True, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[7, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = Llama3Preprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [7, 1, 3, 4]) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in Llama3Preprocessor.presets: - self.run_preset_test( - cls=Llama3Preprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/mistral/mistral_preprocessor.py b/keras_nlp/src/models/mistral/mistral_preprocessor.py deleted file mode 100644 index c6d7731722..0000000000 --- a/keras_nlp/src/models/mistral/mistral_preprocessor.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.mistral.mistral_backbone import MistralBackbone -from keras_nlp.src.models.mistral.mistral_tokenizer import MistralTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.MistralPreprocessor") -class MistralPreprocessor(Preprocessor): - """A Mistral preprocessing layer which tokenizes and packs inputs. 
- - This preprocessing layer will do three things: - - 1. Tokenize any number of input segments using the `tokenizer`. - 2. Pack the inputs together using a `keras_nlp.layers.StartEndPacker`. - with the appropriate tokens. - 3. Construct a dictionary with keys `"token_ids"`, and `"padding_mask"` - that can be passed directly to `keras_nlp.models.MistralBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - Args: - tokenizer: A `keras_nlp.models.MistralTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. Default is `True`. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. Default is `False`. - - Call arguments: - x: A tensor of single string sequences, or a tuple of multiple - tensor sequences to be packed together. Inputs may be batched or - unbatched. For single sequences, raw python inputs will be converted - to tensors. For multiple sequences, pass tensors directly. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the from_preset(). - ```python - preprocessor = keras_nlp.models.MistralPreprocessor.from_preset( - "mistral_base_en" - ) - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize and a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Preprocess a batch of sentence pairs. - # When handling multiple sequences, always convert to tensors first! - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - preprocessor((first, second)) - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.MistralPreprocessor.from_preset( - "mistral_base_en" - ) - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((first, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(first) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map labeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices(((first, second), label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices((first, second)) - - # Watch out for tf.data's default unpacking of tuples here! - # Best to invoke the `preprocessor` directly in this case. 
- ds = ds.map( - lambda first, second: preprocessor(x=(first, second)), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ``` - """ - - backbone_cls = MistralBackbone - tokenizer_cls = MistralTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=False, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.add_start_token = add_start_token - self.add_end_token = add_end_token - self.sequence_length = sequence_length - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/mistral/mistral_preprocessor_test.py b/keras_nlp/src/models/mistral/mistral_preprocessor_test.py deleted file mode 100644 index e38e498b00..0000000000 --- a/keras_nlp/src/models/mistral/mistral_preprocessor_test.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytest - -from keras_nlp.src.models.mistral.mistral_preprocessor import ( - MistralPreprocessor, -) -from keras_nlp.src.models.mistral.mistral_tokenizer import MistralTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class MistralPreprocessorTest(TestCase): - def setUp(self): - self.tokenizer = MistralTokenizer( - # Generated using create_mistral_test_proto.py - proto=os.path.join( - self.get_test_data_dir(), "mistral_test_vocab.spm" - ) - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ( - ["the quick brown fox"], - [1], # Pass through labels. - [1.0], # Pass through sample_weights. 
- ) - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=MistralPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[1, 3, 8, 4, 6, 0, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 0, 0, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ), - ) - - def test_errors_for_2d_list_input(self): - preprocessor = MistralPreprocessor(**self.init_kwargs) - ambiguous_input = [["one", "two"], ["three", "four"]] - with self.assertRaises(ValueError): - preprocessor(ambiguous_input) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in MistralPreprocessor.presets: - self.run_preset_test( - cls=MistralPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/opt/opt_preprocessor.py b/keras_nlp/src/models/opt/opt_preprocessor.py deleted file mode 100644 index 0cafaaec5e..0000000000 --- a/keras_nlp/src/models/opt/opt_preprocessor.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.opt.opt_backbone import OPTBackbone -from keras_nlp.src.models.opt.opt_tokenizer import OPTTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.OPTPreprocessor") -class OPTPreprocessor(Preprocessor): - """OPT preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the input using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.OPTBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `OPTPreprocessor` forces the input to have only one segment, as OPT is - mainly used for generation tasks. For tasks having multi-segment inputs - like "glue/mnli", please use a model designed for classification purposes - such as BERT or RoBERTa. - - Args: - tokenizer: A `keras_nlp.models.OPTTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will append the tokenizer - start token to each input sequence. 
- add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.OPTPreprocessor.from_preset("opt_125m_en") - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - features = ["a quick fox.", "a fox quick."] - vocab = {"<|endoftext|>": 0, "a": 4, "Ġquick": 5, "Ġfox": 6} - merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] - merges += ["Ġ f", "o x", "Ġf ox"] - tokenizer = keras_nlp.models.OPTTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.OPTPreprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.OPTPreprocessor.from_preset("opt_125m_en") - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = OPTBackbone - tokenizer_cls = OPTTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.tokenizer = tokenizer - self.packer = None - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. 
- self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/opt/opt_preprocessor_test.py b/keras_nlp/src/models/opt/opt_preprocessor_test.py deleted file mode 100644 index 614ff82ebe..0000000000 --- a/keras_nlp/src/models/opt/opt_preprocessor_test.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from keras_nlp.src.models.opt.opt_preprocessor import OPTPreprocessor -from keras_nlp.src.models.opt.opt_tokenizer import OPTTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class OPTPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["", "", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = OPTTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["airplane at airport"] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=OPTPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[1, 2, 4, 5, 3, 6, 1, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = OPTPreprocessor( - tokenizer=OPTTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[2, 4, 5, 3, 6, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = OPTPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [1, 2, 4, 1]) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in OPTPreprocessor.presets: - self.run_preset_test( - cls=OPTPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/pali_gemma/pali_gemma_tokenizer.py b/keras_nlp/src/models/pali_gemma/pali_gemma_tokenizer.py index fbdb0693d0..d52c1de4d4 100644 --- a/keras_nlp/src/models/pali_gemma/pali_gemma_tokenizer.py +++ b/keras_nlp/src/models/pali_gemma/pali_gemma_tokenizer.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.models.gemma.gemma_preprocessor import GemmaTokenizer +from keras_nlp.src.models.gemma.gemma_tokenizer import GemmaTokenizer from keras_nlp.src.models.pali_gemma.pali_gemma_backbone import ( PaliGemmaBackbone, ) diff --git a/keras_nlp/src/models/phi3/phi3_preprocessor.py b/keras_nlp/src/models/phi3/phi3_preprocessor.py deleted file mode 100644 index ce392b5088..0000000000 --- a/keras_nlp/src/models/phi3/phi3_preprocessor.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.phi3.phi3_backbone import Phi3Backbone -from keras_nlp.src.models.phi3.phi3_tokenizer import Phi3Tokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.Phi3Preprocessor") -class Phi3Preprocessor(Preprocessor): - """A Phi3 preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do three things: - - 1. Tokenize any number of input segments using the `tokenizer`. - 2. Pack the inputs together using a `keras_nlp.layers.StartEndPacker`. - with the appropriate tokens. - 3. Construct a dictionary with keys `"token_ids"`, and `"padding_mask"` - that can be passed directly to `keras_nlp.models.Phi3Backbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - Args: - tokenizer: A `keras_nlp.models.Phi3Tokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. Default is `True`. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. Default is `False`. - - Call arguments: - x: A tensor of single string sequences, or a tuple of multiple - tensor sequences to be packed together. Inputs may be batched or - unbatched. For single sequences, raw python inputs will be converted - to tensors. For multiple sequences, pass tensors directly. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the from_preset(). - ```python - preprocessor = keras_nlp.models.Phi3Preprocessor.from_preset( - "" - ) - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize and a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Preprocess a batch of sentence pairs. - # When handling multiple sequences, always convert to tensors first! - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - preprocessor((first, second)) - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.Phi3Preprocessor.from_preset( - "phi3_mini_4k_instruct_en" - ) - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((first, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(first) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map labeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices(((first, second), label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled sentence pairs. 
- ds = tf.data.Dataset.from_tensor_slices((first, second)) - - # Watch out for tf.data's default unpacking of tuples here! - # Best to invoke the `preprocessor` directly in this case. - ds = ds.map( - lambda first, second: preprocessor(x=(first, second)), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ``` - """ - - backbone_cls = Phi3Backbone - tokenizer_cls = Phi3Tokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=False, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.add_start_token = add_start_token - self.add_end_token = add_end_token - self.sequence_length = sequence_length - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/phi3/phi3_preprocessor_test.py b/keras_nlp/src/models/phi3/phi3_preprocessor_test.py deleted file mode 100644 index 406fc4eb17..0000000000 --- a/keras_nlp/src/models/phi3/phi3_preprocessor_test.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import pytest - -from keras_nlp.src.models.phi3.phi3_preprocessor import Phi3Preprocessor -from keras_nlp.src.models.phi3.phi3_tokenizer import Phi3Tokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class Phi3PreprocessorTest(TestCase): - def setUp(self): - self.tokenizer = Phi3Tokenizer( - # Generated using create_phi3_test_proto.py - proto=os.path.join(self.get_test_data_dir(), "phi3_test_vocab.spm") - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 12, - } - self.input_data = ( - # Encoded to [3, 5, 6, 4, 3, 9, 7, 11, 3, 15] - ["the fox <|endoftext|>"], - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ) - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=Phi3Preprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[1, 3, 5, 6, 4, 3, 9, 7, 11, 3, 15, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ), - ) - - def test_errors_for_2d_list_input(self): - preprocessor = Phi3Preprocessor(**self.init_kwargs) - ambiguous_input = [["one", "two"], ["three", "four"]] - with self.assertRaises(ValueError): - preprocessor(ambiguous_input) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in Phi3Preprocessor.presets: - self.run_preset_test( - cls=Phi3Preprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/seq_2_seq_lm_preprocessor_test.py b/keras_nlp/src/models/seq_2_seq_lm_preprocessor_test.py index b1d353ab9d..f784b571fe 100644 --- a/keras_nlp/src/models/seq_2_seq_lm_preprocessor_test.py +++ b/keras_nlp/src/models/seq_2_seq_lm_preprocessor_test.py @@ -13,7 +13,6 @@ # limitations under the License. import pytest -from keras_nlp.src.models.bart.bart_preprocessor import BartPreprocessor from keras_nlp.src.models.bart.bart_seq_2_seq_lm_preprocessor import ( BartSeq2SeqLMPreprocessor, ) @@ -25,7 +24,7 @@ class TestSeq2SeqLMPreprocessor(TestCase): def test_preset_accessors(self): bert_presets = set(BertTokenizer.presets.keys()) - bart_presets = set(BartPreprocessor.presets.keys()) + bart_presets = set(BartSeq2SeqLMPreprocessor.presets.keys()) all_presets = set(Seq2SeqLMPreprocessor.presets.keys()) self.assertTrue(bert_presets.isdisjoint(all_presets)) self.assertTrue(bart_presets.issubset(all_presets))
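The removed classes above all follow the same recipe: tokenize, pack with `keras_nlp.layers.StartEndPacker`, and return a `"token_ids"` / `"padding_mask"` dict. For anyone depending on one of them, the path the new `GPT2Preprocessor` docstring points to is either the task-paired `CausalLMPreprocessor` class or a small custom function. Below is a minimal sketch of the latter, assuming the `gpt2_base_en` preset and the tokenizer's `start_token_id`/`end_token_id`/`pad_token_id` attributes; it is an illustration of the pattern, not code from this patch.

```python
# Minimal sketch of a replacement for a removed *Preprocessor class.
# Assumes the "gpt2_base_en" preset and the tokenizer's start/end/pad token id
# attributes; adapt the tokenizer and sequence_length for other models.
import keras_nlp

tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset("gpt2_base_en")
packer = keras_nlp.layers.StartEndPacker(
    start_value=tokenizer.start_token_id,
    end_value=tokenizer.end_token_id,
    pad_value=tokenizer.pad_token_id,
    sequence_length=1024,
    return_padding_mask=True,
)

def preprocess(x):
    # Tokenize, add start/end tokens, pad to sequence_length, and build the
    # dict of dense inputs expected by the backbone.
    token_ids, padding_mask = packer(tokenizer(x))
    return {"token_ids": token_ids, "padding_mask": padding_mask}

# Usage: preprocess(["The quick brown fox jumped."]) or ds.map(preprocess).
# For causal LM training or generation, prefer the task-paired class instead:
# keras_nlp.models.GPT2CausalLMPreprocessor.from_preset("gpt2_base_en")
```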