From 4ea0ff1526f12ee171852d85aade9cdbf718aa3d Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 12 Sep 2024 16:17:53 -0700
Subject: [PATCH] Remove preprocessing base classes we no longer use

For some reason, all our causal lm models had a preprocessing layer base
class that packed inputs without setting things up for the causal language
model loss. With our new base classes, `CausalLMPreprocessor` and
`Seq2SeqLMPreprocessor`, these are no longer used at all.

These are public, but I did a search for them on GitHub and the only one I
could find with any usage is `GPT2Preprocessor`. The others are completely
unused, and not generally very useful. Let's remove them before anyone
starts to depend on them, so that we can keep preprocessing one-to-one
paired with a task. E.g.

```
BertTextClassifier
BertTextClassifierPreprocessor
```
---
 keras_nlp/api/models/__init__.py | 17 --
 .../src/models/bart/bart_preprocessor.py | 264 ------------------
 .../src/models/bart/bart_preprocessor_test.py | 72 -----
 .../src/models/bloom/bloom_preprocessor.py | 178 ------------
 .../models/bloom/bloom_preprocessor_test.py | 80 ------
 .../models/electra/electra_preprocessor.py | 155 ----------
 .../electra/electra_preprocessor_test.py | 69 -----
 .../src/models/falcon/falcon_preprocessor.py | 180 ------------
 .../models/falcon/falcon_preprocessor_test.py | 80 ------
 .../src/models/gemma/gemma_preprocessor.py | 184 ------------
 .../models/gemma/gemma_preprocessor_test.py | 74 -----
 .../src/models/gpt2/gpt2_preprocessor.py | 85 +-----
 .../gpt_neo_x/gpt_neo_x_preprocessor.py | 138 ---------
 .../gpt_neo_x/gpt_neo_x_preprocessor_test.py | 71 -----
 .../src/models/llama/llama_preprocessor.py | 182 ------------
 .../models/llama/llama_preprocessor_test.py | 68 -----
 .../src/models/llama3/llama3_preprocessor.py | 23 --
 .../models/llama3/llama3_preprocessor_test.py | 84 ------
 .../models/mistral/mistral_preprocessor.py | 183 ------------
 .../mistral/mistral_preprocessor_test.py | 72 -----
 keras_nlp/src/models/opt/opt_preprocessor.py | 181 ------------
 .../src/models/opt/opt_preprocessor_test.py | 79 ------
 .../models/pali_gemma/pali_gemma_tokenizer.py | 2 +-
 .../src/models/phi3/phi3_preprocessor.py | 183 ------------
 .../src/models/phi3/phi3_preprocessor_test.py | 69 -----
 .../models/seq_2_seq_lm_preprocessor_test.py | 3 +-
 26 files changed, 9 insertions(+), 2767 deletions(-)
 delete mode 100644 keras_nlp/src/models/bart/bart_preprocessor.py
 delete mode 100644 keras_nlp/src/models/bart/bart_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/bloom/bloom_preprocessor.py
 delete mode 100644 keras_nlp/src/models/bloom/bloom_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/electra/electra_preprocessor.py
 delete mode 100644 keras_nlp/src/models/electra/electra_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/falcon/falcon_preprocessor.py
 delete mode 100644 keras_nlp/src/models/falcon/falcon_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/gemma/gemma_preprocessor.py
 delete mode 100644 keras_nlp/src/models/gemma/gemma_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py
 delete mode 100644 keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/llama/llama_preprocessor.py
 delete mode 100644 keras_nlp/src/models/llama/llama_preprocessor_test.py
 delete mode 100644 keras_nlp/src/models/llama3/llama3_preprocessor.py
 delete mode 100644 keras_nlp/src/models/llama3/llama3_preprocessor_test.py
 delete mode 100644
keras_nlp/src/models/mistral/mistral_preprocessor.py delete mode 100644 keras_nlp/src/models/mistral/mistral_preprocessor_test.py delete mode 100644 keras_nlp/src/models/opt/opt_preprocessor.py delete mode 100644 keras_nlp/src/models/opt/opt_preprocessor_test.py delete mode 100644 keras_nlp/src/models/phi3/phi3_preprocessor.py delete mode 100644 keras_nlp/src/models/phi3/phi3_preprocessor_test.py diff --git a/keras_nlp/api/models/__init__.py b/keras_nlp/api/models/__init__.py index 64368e4c45..1329399894 100644 --- a/keras_nlp/api/models/__init__.py +++ b/keras_nlp/api/models/__init__.py @@ -34,7 +34,6 @@ from keras_nlp.src.models.albert.albert_tokenizer import AlbertTokenizer from keras_nlp.src.models.backbone import Backbone from keras_nlp.src.models.bart.bart_backbone import BartBackbone -from keras_nlp.src.models.bart.bart_preprocessor import BartPreprocessor from keras_nlp.src.models.bart.bart_seq_2_seq_lm import BartSeq2SeqLM from keras_nlp.src.models.bart.bart_seq_2_seq_lm_preprocessor import ( BartSeq2SeqLMPreprocessor, @@ -58,7 +57,6 @@ from keras_nlp.src.models.bloom.bloom_causal_lm_preprocessor import ( BloomCausalLMPreprocessor, ) -from keras_nlp.src.models.bloom.bloom_preprocessor import BloomPreprocessor from keras_nlp.src.models.bloom.bloom_tokenizer import BloomTokenizer from keras_nlp.src.models.causal_lm import CausalLM from keras_nlp.src.models.causal_lm_preprocessor import CausalLMPreprocessor @@ -105,9 +103,6 @@ DistilBertTokenizer, ) from keras_nlp.src.models.electra.electra_backbone import ElectraBackbone -from keras_nlp.src.models.electra.electra_preprocessor import ( - ElectraPreprocessor, -) from keras_nlp.src.models.electra.electra_tokenizer import ElectraTokenizer from keras_nlp.src.models.f_net.f_net_backbone import FNetBackbone from keras_nlp.src.models.f_net.f_net_masked_lm import FNetMaskedLM @@ -127,14 +122,12 @@ from keras_nlp.src.models.falcon.falcon_causal_lm_preprocessor import ( FalconCausalLMPreprocessor, ) -from keras_nlp.src.models.falcon.falcon_preprocessor import FalconPreprocessor from keras_nlp.src.models.falcon.falcon_tokenizer import FalconTokenizer from keras_nlp.src.models.gemma.gemma_backbone import GemmaBackbone from keras_nlp.src.models.gemma.gemma_causal_lm import GemmaCausalLM from keras_nlp.src.models.gemma.gemma_causal_lm_preprocessor import ( GemmaCausalLMPreprocessor, ) -from keras_nlp.src.models.gemma.gemma_preprocessor import GemmaPreprocessor from keras_nlp.src.models.gemma.gemma_tokenizer import GemmaTokenizer from keras_nlp.src.models.gpt2.gpt2_backbone import GPT2Backbone from keras_nlp.src.models.gpt2.gpt2_causal_lm import GPT2CausalLM @@ -148,23 +141,18 @@ from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_causal_lm_preprocessor import ( GPTNeoXCausalLMPreprocessor, ) -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_preprocessor import ( - GPTNeoXPreprocessor, -) from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer from keras_nlp.src.models.llama3.llama3_backbone import Llama3Backbone from keras_nlp.src.models.llama3.llama3_causal_lm import Llama3CausalLM from keras_nlp.src.models.llama3.llama3_causal_lm_preprocessor import ( Llama3CausalLMPreprocessor, ) -from keras_nlp.src.models.llama3.llama3_preprocessor import Llama3Preprocessor from keras_nlp.src.models.llama3.llama3_tokenizer import Llama3Tokenizer from keras_nlp.src.models.llama.llama_backbone import LlamaBackbone from keras_nlp.src.models.llama.llama_causal_lm import LlamaCausalLM from keras_nlp.src.models.llama.llama_causal_lm_preprocessor 
import ( LlamaCausalLMPreprocessor, ) -from keras_nlp.src.models.llama.llama_preprocessor import LlamaPreprocessor from keras_nlp.src.models.llama.llama_tokenizer import LlamaTokenizer from keras_nlp.src.models.masked_lm import MaskedLM from keras_nlp.src.models.masked_lm_preprocessor import MaskedLMPreprocessor @@ -173,16 +161,12 @@ from keras_nlp.src.models.mistral.mistral_causal_lm_preprocessor import ( MistralCausalLMPreprocessor, ) -from keras_nlp.src.models.mistral.mistral_preprocessor import ( - MistralPreprocessor, -) from keras_nlp.src.models.mistral.mistral_tokenizer import MistralTokenizer from keras_nlp.src.models.opt.opt_backbone import OPTBackbone from keras_nlp.src.models.opt.opt_causal_lm import OPTCausalLM from keras_nlp.src.models.opt.opt_causal_lm_preprocessor import ( OPTCausalLMPreprocessor, ) -from keras_nlp.src.models.opt.opt_preprocessor import OPTPreprocessor from keras_nlp.src.models.opt.opt_tokenizer import OPTTokenizer from keras_nlp.src.models.pali_gemma.pali_gemma_backbone import ( PaliGemmaBackbone, @@ -201,7 +185,6 @@ from keras_nlp.src.models.phi3.phi3_causal_lm_preprocessor import ( Phi3CausalLMPreprocessor, ) -from keras_nlp.src.models.phi3.phi3_preprocessor import Phi3Preprocessor from keras_nlp.src.models.phi3.phi3_tokenizer import Phi3Tokenizer from keras_nlp.src.models.preprocessor import Preprocessor from keras_nlp.src.models.roberta.roberta_backbone import RobertaBackbone diff --git a/keras_nlp/src/models/bart/bart_preprocessor.py b/keras_nlp/src/models/bart/bart_preprocessor.py deleted file mode 100644 index 0f9de65fab..0000000000 --- a/keras_nlp/src/models/bart/bart_preprocessor.py +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.bart.bart_backbone import BartBackbone -from keras_nlp.src.models.bart.bart_tokenizer import BartTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.BartPreprocessor") -class BartPreprocessor(Preprocessor): - """A BART preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do three things: - - 1. Tokenize both encoder inputs and decoder inputs using the `tokenizer`. - Both inputs can contain only one segment. - 2. Add the appropriate special tokens - `""`, `""` and `""`. - 3. Construct a dictionary with keys `"encoder_token_ids"`, - `"encoder_padding_mask"`, `"decoder_token_ids"`, `"decoder_padding_mask"` - that can be passed directly to a BART model. - - Args: - tokenizer: A `keras_nlp.models.BartTokenizer` instance. - encoder_sequence_length: The length of the packed encoder inputs. - decoder_sequence_length: The length of the packed decoder inputs. 
- - Call arguments: - x: A dictionary with `encoder_text` and `decoder_text` as its keys. - Each value in the dictionary should be a tensor of single string - sequences. Inputs may be batched or unbatched. Raw python inputs - will be converted to tensors. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.BartPreprocessor.from_preset("bart_base_en") - - # Preprocess unbatched inputs. - inputs = { - "encoder_text": "The fox was sleeping.", - "decoder_text": "The fox was awake." - } - preprocessor(inputs) - - # Preprocess batched inputs. - inputs = { - "encoder_text": ["The fox was sleeping.", "The lion was quiet."], - "decoder_text": ["The fox was awake.", "The lion was roaring."] - } - preprocessor(inputs) - - # Custom vocabulary. - vocab = { - "": 0, - "": 1, - "": 2, - "Ġafter": 5, - "noon": 6, - "Ġsun": 7, - } - merges = ["Ġ a", "Ġ s", "Ġ n", "e r", "n o", "o n", "Ġs u", "Ġa f", "no on"] - merges += ["Ġsu n", "Ġaf t", "Ġaft er"] - - tokenizer = keras_nlp.models.BartTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.BartPreprocessor( - tokenizer=tokenizer, - encoder_sequence_length=20, - decoder_sequence_length=10, - ) - inputs = { - "encoder_text": "The fox was sleeping.", - "decoder_text": "The fox was awake." - } - preprocessor(inputs) - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.BartPreprocessor.from_preset("bart_base_en") - - # Map labeled single sentences. - features = { - "encoder_text": tf.constant( - ["The fox was sleeping.", "The lion was quiet."] - ), - "decoder_text": tf.constant( - ["The fox was awake.", "The lion was silent."] - ) - } - labels = tf.constant(["True", "False"]) - ds = tf.data.Dataset.from_tensor_slices((features, labels)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - features = { - "encoder_text": tf.constant( - ["The fox was sleeping.", "The lion was quiet."] - ), - "decoder_text": tf.constant( - ["The fox was awake.", "The lion was roaring."] - ) - } - ds = tf.data.Dataset.from_tensor_slices(features) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = BartBackbone - tokenizer_cls = BartTokenizer - - def __init__( - self, - tokenizer, - encoder_sequence_length=1024, - decoder_sequence_length=1024, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.encoder_packer = None - self.decoder_packer = None - self.encoder_sequence_length = encoder_sequence_length - self.decoder_sequence_length = decoder_sequence_length - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - - # TODO: Use `MultiSegmentPacker` instead of `StartEndPacker` once we - # want to move to multi-segment packing and have improved - # `MultiSegmentPacker`'s performance. - self.encoder_packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.encoder_sequence_length, - return_padding_mask=True, - ) - - # The decoder is packed a bit differently; the format is as follows: - # `[end_token_id, start_token_id, tokens..., end_token_id, padding...]`. 
- self.decoder_packer = StartEndPacker( - start_value=[ - self.tokenizer.end_token_id, - self.tokenizer.start_token_id, - ], - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.decoder_sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - *, - encoder_sequence_length=None, - decoder_sequence_length=None, - # `sequence_length` is an alias for `decoder_sequence_length` - sequence_length=None, - ): - if not ( - isinstance(x, dict) - and all(k in x for k in ("encoder_text", "decoder_text")) - ): - raise ValueError( - '`x` must be a dictionary, containing the keys `"encoder_text"`' - f' and `"decoder_text"`. Received x={x}.' - ) - - if encoder_sequence_length is None: - encoder_sequence_length = self.encoder_sequence_length - decoder_sequence_length = decoder_sequence_length or sequence_length - if decoder_sequence_length is None: - decoder_sequence_length = self.decoder_sequence_length - - encoder_inputs = self.tokenizer(x["encoder_text"]) - encoder_token_ids, encoder_padding_mask = self.encoder_packer( - encoder_inputs, - sequence_length=encoder_sequence_length, - ) - - decoder_inputs = self.tokenizer(x["decoder_text"]) - decoder_token_ids, decoder_padding_mask = self.decoder_packer( - decoder_inputs, - sequence_length=decoder_sequence_length, - ) - - x = { - "encoder_token_ids": encoder_token_ids, - "encoder_padding_mask": encoder_padding_mask, - "decoder_token_ids": decoder_token_ids, - "decoder_padding_mask": decoder_padding_mask, - } - - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "encoder_sequence_length": self.encoder_sequence_length, - "decoder_sequence_length": self.decoder_sequence_length, - } - ) - return config - - @property - def encoder_sequence_length(self): - """The padded length of encoder input sequences.""" - return self._encoder_sequence_length - - @encoder_sequence_length.setter - def encoder_sequence_length(self, value): - self._encoder_sequence_length = value - if self.encoder_packer is not None: - self.encoder_packer.sequence_length = value - - @property - def decoder_sequence_length(self): - """The padded length of decoder input sequences.""" - return self._decoder_sequence_length - - @decoder_sequence_length.setter - def decoder_sequence_length(self, value): - self._decoder_sequence_length = value - if self.decoder_packer is not None: - self.decoder_packer.sequence_length = value - - @property - def sequence_length(self): - """Alias for `decoder_sequence_length`.""" - return self.decoder_sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self.decoder_sequence_length = value diff --git a/keras_nlp/src/models/bart/bart_preprocessor_test.py b/keras_nlp/src/models/bart/bart_preprocessor_test.py deleted file mode 100644 index 22e547671b..0000000000 --- a/keras_nlp/src/models/bart/bart_preprocessor_test.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from keras_nlp.src.models.bart.bart_preprocessor import BartPreprocessor -from keras_nlp.src.models.bart.bart_tokenizer import BartTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class BartPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["", "", "", "air", "Ġair", "plane", "Ġat"] - self.vocab += ["port", ""] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = BartTokenizer( - vocabulary=self.vocab, merges=self.merges - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "encoder_sequence_length": 5, - "decoder_sequence_length": 8, - } - self.input_data = ( - { - "encoder_text": [" airplane at airport"], - "decoder_text": [" airplane airport"], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ) - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=BartPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "encoder_token_ids": [[0, 4, 5, 6, 2]], - "encoder_padding_mask": [[1, 1, 1, 1, 1]], - "decoder_token_ids": [[2, 0, 4, 5, 4, 7, 2, 1]], - "decoder_padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ), - token_id_key="decoder_token_ids", - ) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in BartPreprocessor.presets: - self.run_preset_test( - cls=BartPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/bloom/bloom_preprocessor.py b/keras_nlp/src/models/bloom/bloom_preprocessor.py deleted file mode 100644 index 8b2d7b2ba0..0000000000 --- a/keras_nlp/src/models/bloom/bloom_preprocessor.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.bloom.bloom_backbone import BloomBackbone -from keras_nlp.src.models.bloom.bloom_tokenizer import BloomTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.BloomPreprocessor") -class BloomPreprocessor(Preprocessor): - """BLOOM preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.BloomBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - Args: - tokenizer: A `keras_nlp.models.BloomTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.BloomPreprocessor.from_preset( - "bloom_560m_multi" - ) - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - features = ["a quick fox.", "a fox quick."] - vocab = {"": 0, "":1, "":2, "a": 3, "Ġquick": 4, "Ġfox": 5} - merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] - merges += ["Ġ f", "o x", "Ġf ox"] - tokenizer = keras_nlp.models.BloomTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.BloomPreprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.BloomPreprocessor.from_preset( - "bloom_560m_multi" - ) - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. 
- ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = BloomBackbone - tokenizer_cls = BloomTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/bloom/bloom_preprocessor_test.py b/keras_nlp/src/models/bloom/bloom_preprocessor_test.py deleted file mode 100644 index 9eca885715..0000000000 --- a/keras_nlp/src/models/bloom/bloom_preprocessor_test.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from keras_nlp.src.models.bloom.bloom_preprocessor import BloomPreprocessor -from keras_nlp.src.models.bloom.bloom_tokenizer import BloomTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class BloomPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["", "", ""] - self.vocab += ["!", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = BloomTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["airplane at airport"] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=BloomPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[1, 4, 6, 7, 5, 8, 2, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = BloomPreprocessor( - tokenizer=BloomTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[4, 6, 7, 5, 8, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = BloomPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [1, 4, 6, 2]) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in BloomPreprocessor.presets: - self.run_preset_test( - cls=BloomPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/electra/electra_preprocessor.py b/keras_nlp/src/models/electra/electra_preprocessor.py deleted file mode 100644 index 82dbb3310c..0000000000 --- a/keras_nlp/src/models/electra/electra_preprocessor.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.multi_segment_packer import ( - MultiSegmentPacker, -) -from keras_nlp.src.models.electra.electra_backbone import ElectraBackbone -from keras_nlp.src.models.electra.electra_tokenizer import ElectraTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.ElectraPreprocessor") -class ElectraPreprocessor(Preprocessor): - """A ELECTRA preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do three things: - - 1. 
Tokenize any number of input segments using the `tokenizer`. - 2. Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`. - with the appropriate `"[CLS]"`, `"[SEP]"` and `"[PAD]"` tokens. - 3. Construct a dictionary of with keys `"token_ids"` and `"padding_mask"`, - that can be passed directly to a ELECTRA model. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - Args: - tokenizer: A `keras_nlp.models.ElectraTokenizer` instance. - sequence_length: The length of the packed inputs. - truncate: string. The algorithm to truncate a list of batched segments - to fit within `sequence_length`. The value can be either - `round_robin` or `waterfall`: - - `"round_robin"`: Available space is assigned one token at a - time in a round-robin fashion to the inputs that still need - some, until the limit is reached. - - `"waterfall"`: The allocation of the budget is done using a - "waterfall" algorithm that allocates quota in a - left-to-right manner and fills up the buckets until we run - out of budget. It supports an arbitrary number of segments. - - Call arguments: - x: A tensor of single string sequences, or a tuple of multiple - tensor sequences to be packed together. Inputs may be batched or - unbatched. For single sequences, raw python inputs will be converted - to tensors. For multiple sequences, pass tensors directly. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.ElectraPreprocessor.from_preset( - "electra_base_discriminator_en" - ) - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"] - vocab += ["The", "quick", "brown", "fox", "jumped", "."] - tokenizer = keras_nlp.models.ElectraTokenizer(vocabulary=vocab) - preprocessor = keras_nlp.models.ElectraPreprocessor(tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.ElectraPreprocessor.from_preset( - "electra_base_discriminator_en" - ) - - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - label = tf.constant([1, 1]) - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((first, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(first) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map labeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices(((first, second), label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - # Map unlabeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices((first, second)) - - # Watch out for tf.data's default unpacking of tuples here! - # Best to invoke the `preprocessor` directly in this case. 
- ds = ds.map( - lambda first, second: preprocessor(x=(first, second)), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ``` - """ - - backbone_cls = ElectraBackbone - tokenizer_cls = ElectraTokenizer - - def __init__( - self, - tokenizer, - sequence_length=512, - truncate="round_robin", - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = MultiSegmentPacker( - start_value=self.tokenizer.cls_token_id, - end_value=self.tokenizer.sep_token_id, - pad_value=self.tokenizer.pad_token_id, - truncate=truncate, - sequence_length=sequence_length, - ) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.packer.sequence_length, - "truncate": self.packer.truncate, - } - ) - return config - - @preprocessing_function - def call(self, x, y=None, sample_weight=None): - x = x if isinstance(x, tuple) else (x,) - x = tuple(self.tokenizer(segment) for segment in x) - token_ids, segment_ids = self.packer(x) - x = { - "token_ids": token_ids, - "segment_ids": segment_ids, - "padding_mask": token_ids != self.tokenizer.pad_token_id, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) diff --git a/keras_nlp/src/models/electra/electra_preprocessor_test.py b/keras_nlp/src/models/electra/electra_preprocessor_test.py deleted file mode 100644 index 62dbaf3fdd..0000000000 --- a/keras_nlp/src/models/electra/electra_preprocessor_test.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from keras_nlp.src.models.electra.electra_preprocessor import ( - ElectraPreprocessor, -) -from keras_nlp.src.models.electra.electra_tokenizer import ElectraTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class ElectraPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] - self.vocab += ["THE", "QUICK", "BROWN", "FOX"] - self.vocab += ["the", "quick", "brown", "fox"] - self.tokenizer = ElectraTokenizer(vocabulary=self.vocab) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ( - ["THE QUICK BROWN FOX."], - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ) - - def test_preprocessor_basics(self): - self.run_preprocessing_layer_test( - cls=ElectraPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[2, 5, 6, 7, 8, 1, 3, 0]], - "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. 
- ), - ) - - def test_errors_for_2d_list_input(self): - preprocessor = ElectraPreprocessor(**self.init_kwargs) - ambiguous_input = [["one", "two"], ["three", "four"]] - with self.assertRaises(ValueError): - preprocessor(ambiguous_input) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in ElectraPreprocessor.presets: - self.run_preset_test( - cls=ElectraPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/falcon/falcon_preprocessor.py b/keras_nlp/src/models/falcon/falcon_preprocessor.py deleted file mode 100644 index 491f6e5fe2..0000000000 --- a/keras_nlp/src/models/falcon/falcon_preprocessor.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.falcon.falcon_backbone import FalconBackbone -from keras_nlp.src.models.falcon.falcon_tokenizer import FalconTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.FalconPreprocessor") -class FalconPreprocessor(Preprocessor): - """Falcon preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.FalconBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `FalconPreprocessor` forces the input to have only one segment, as Falcon is - mainly used for generation tasks. For tasks having multi-segment inputs - like "glue/mnli", please use a model designed for classification purposes - such as BERT or RoBERTa. - - Args: - tokenizer: A `keras_nlp.models.FalconTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. 
- sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.FalconPreprocessor.from_preset("falcon_rw_1b") - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - features = ["a quick fox.", "a fox quick."] - vocab = {"<|endoftext|>": 0, "a": 4, "Ġquick": 5, "Ġfox": 6} - merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] - merges += ["Ġ f", "o x", "Ġf ox"] - tokenizer = keras_nlp.models.FalconTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.FalconPreprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.FalconPreprocessor.from_preset("falcon_rw_1b") - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = FalconBackbone - tokenizer_cls = FalconTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. 
- self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/falcon/falcon_preprocessor_test.py b/keras_nlp/src/models/falcon/falcon_preprocessor_test.py deleted file mode 100644 index 075710b23b..0000000000 --- a/keras_nlp/src/models/falcon/falcon_preprocessor_test.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from keras_nlp.src.models.falcon.falcon_preprocessor import FalconPreprocessor -from keras_nlp.src.models.falcon.falcon_tokenizer import FalconTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class FalconPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab += ["<|endoftext|>"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = FalconTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["airplane at airport"] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=FalconPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = FalconPreprocessor( - tokenizer=FalconTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = FalconPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [6, 1, 3, 6]) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in FalconPreprocessor.presets: - self.run_preset_test( - cls=FalconPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/gemma/gemma_preprocessor.py b/keras_nlp/src/models/gemma/gemma_preprocessor.py deleted file mode 100644 index dcbe531b56..0000000000 --- a/keras_nlp/src/models/gemma/gemma_preprocessor.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.gemma.gemma_backbone import GemmaBackbone -from keras_nlp.src.models.gemma.gemma_tokenizer import GemmaTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.GemmaPreprocessor") -class GemmaPreprocessor(Preprocessor): - """Gemma preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. 
- - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.GemmaBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `GemmaPreprocessor` expects the input to have only one segment, as Gemma is - mainly used for generation tasks. For tasks having multi-segment inputs - please combine inputs into a single string input before passing to the - preprocessor layer. - - Args: - tokenizer: A `keras_nlp.models.GemmaTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.GemmaPreprocessor.from_preset( - "gemma_2b_en" - ) - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - bytes_io = io.BytesIO() - ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."]) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=ds.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=8, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="", - bos_piece="", - eos_piece="", - unk_piece="", - ) - tokenizer = keras_nlp.models.GemmaTokenizer( - proto=bytes_io.getvalue(), - ) - preprocessor = keras_nlp.models.GemmaPreprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Apply preprocessing to a `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.GemmaPreprocessor.from_preset( - "gemma_2b_en" - ) - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. 
- ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = GemmaBackbone - tokenizer_cls = GemmaTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.tokenizer = tokenizer - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config diff --git a/keras_nlp/src/models/gemma/gemma_preprocessor_test.py b/keras_nlp/src/models/gemma/gemma_preprocessor_test.py deleted file mode 100644 index 66264edb27..0000000000 --- a/keras_nlp/src/models/gemma/gemma_preprocessor_test.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import pytest - -from keras_nlp.src.models.gemma.gemma_preprocessor import GemmaPreprocessor -from keras_nlp.src.models.gemma.gemma_tokenizer import GemmaTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class GemmaPreprocessorTest(TestCase): - def setUp(self): - self.tokenizer = GemmaTokenizer( - proto=os.path.join( - self.get_test_data_dir(), "gemma_test_vocab.spm" - ), - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["the quick brown fox"] - - def test_preprocessor_basics(self): - self.run_preprocessing_layer_test( - cls=GemmaPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[1, 4, 9, 5, 7, 2, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["the quick brown fox"] * 4 - preprocessor = GemmaPreprocessor( - tokenizer=self.tokenizer, - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[4, 9, 5, 7, 0, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 0, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "the quick brown fox" - preprocessor = GemmaPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [1, 4, 9, 2]) - - @pytest.mark.kaggle_key_required - @pytest.mark.extra_large - def test_all_presets(self): - for preset in GemmaPreprocessor.presets: - self.run_preset_test( - cls=GemmaPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/gpt2/gpt2_preprocessor.py b/keras_nlp/src/models/gpt2/gpt2_preprocessor.py index c9af92fcbf..720b052568 100644 --- a/keras_nlp/src/models/gpt2/gpt2_preprocessor.py +++ b/keras_nlp/src/models/gpt2/gpt2_preprocessor.py @@ -25,85 +25,12 @@ @keras_nlp_export("keras_nlp.models.GPT2Preprocessor") class GPT2Preprocessor(Preprocessor): - """GPT2 preprocessing layer which tokenizes and packs inputs. + """Legacy preprocessing layer for GPT2. - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.GPT2Backbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `GPT2Preprocessor` forces the input to have only one segment, as GPT2 is - mainly used for generation tasks. For tasks having multi-segment inputs - like "glue/mnli", please use a model designed for classification purposes - such as BERT or RoBERTa. - - Args: - tokenizer: A `keras_nlp.models.GPT2Tokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. 
- - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.GPT2Preprocessor.from_preset("gpt2_base_en") - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - features = ["a quick fox.", "a fox quick."] - vocab = {"<|endoftext|>": 0, "a": 4, "Ġquick": 5, "Ġfox": 6} - merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] - merges += ["Ġ f", "o x", "Ġf ox"] - tokenizer = keras_nlp.models.GPT2Tokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.GPT2Preprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.GPT2Preprocessor.from_preset("gpt2_base_en") - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` + This layer should not be used in new code! All preprocessing layers pair + directly with a task. E.g. `BertClassifier` and + `BertClassifierPreprocessor`. Either use `GPT2CausalLMPreprocessor` or + wrap `GPT2Tokenizer` into a custom preprocessing layer or function. """ backbone_cls = GPT2Backbone @@ -117,6 +44,8 @@ def __init__( add_end_token=True, **kwargs, ): + # TODO: this class has some usage, but barely any, and is no longer + # documented. We should consider dropping it. super().__init__(**kwargs) self.tokenizer = tokenizer self.packer = None diff --git a/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py b/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py deleted file mode 100644 index 06a46ce470..0000000000 --- a/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_backbone import GPTNeoXBackbone -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.GPTNeoXPreprocessor") -class GPTNeoXPreprocessor(Preprocessor): - """GPTNeoX preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the inputs using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.GPTNeoXBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `GPTNeoXPreprocessor` forces the input to have only one segment, as GPTNeoX is - mainly used for generation tasks. For tasks having multi-segment inputs - like "glue/mnli", please use a model designed for classification purposes - such as BERT or RoBERTa. - - Args: - tokenizer: A `keras_nlp.models.GPTNeoXTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - """ - - backbone_cls = GPTNeoXBackbone - tokenizer_cls = GPTNeoXTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. 
- self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py b/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py deleted file mode 100644 index aad68bebc6..0000000000 --- a/keras_nlp/src/models/gpt_neo_x/gpt_neo_x_preprocessor_test.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_preprocessor import ( - GPTNeoXPreprocessor, -) -from keras_nlp.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class GPTNeoXPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab += ["<|endoftext|>"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = GPTNeoXTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["airplane at airport"] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=GPTNeoXPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = GPTNeoXPreprocessor( - tokenizer=GPTNeoXTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = GPTNeoXPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [6, 1, 3, 6]) diff --git a/keras_nlp/src/models/llama/llama_preprocessor.py b/keras_nlp/src/models/llama/llama_preprocessor.py deleted file mode 100644 index 75122856c6..0000000000 --- a/keras_nlp/src/models/llama/llama_preprocessor.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.llama.llama_backbone import LlamaBackbone -from keras_nlp.src.models.llama.llama_tokenizer import LlamaTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.LlamaPreprocessor") -class LlamaPreprocessor(Preprocessor): - """A Llama preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do three things: - - 1. Tokenize any number of input segments using the `tokenizer`. - 2. Pack the inputs together using a `keras_nlp.layers.StartEndPacker`. - with the appropriate tokens. - 3. 
Construct a dictionary with keys `"token_ids"`, and `"padding_mask"` - that can be passed directly to `keras_nlp.models.LlamaBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - Args: - tokenizer: A `keras_nlp.models.LlamaTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. Default is `True`. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. Default is `False`. - - Call arguments: - x: A tensor of single string sequences, or a tuple of multiple - tensor sequences to be packed together. Inputs may be batched or - unbatched. For single sequences, raw python inputs will be converted - to tensors. For multiple sequences, pass tensors directly. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the from_preset(). - ```python - preprocessor = keras_nlp.models.LlamaPreprocessor.from_preset( - "llama_base_en" - ) - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize and a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Preprocess a batch of sentence pairs. - # When handling multiple sequences, always convert to tensors first! - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - preprocessor((first, second)) - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.LlamaPreprocessor.from_preset( - "llama_base_en" - ) - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((first, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(first) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map labeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices(((first, second), label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices((first, second)) - - # Watch out for tf.data's default unpacking of tuples here! - # Best to invoke the `preprocessor` directly in this case. - ds = ds.map( - lambda first, second: preprocessor(x=(first, second)), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ``` - """ - - backbone_cls = LlamaBackbone - tokenizer_cls = LlamaTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=False, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.add_start_token = add_start_token - self.add_end_token = add_end_token - self.sequence_length = sequence_length - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. 
- self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/llama/llama_preprocessor_test.py b/keras_nlp/src/models/llama/llama_preprocessor_test.py deleted file mode 100644 index eca0af66a0..0000000000 --- a/keras_nlp/src/models/llama/llama_preprocessor_test.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytest - -from keras_nlp.src.models.llama.llama_preprocessor import LlamaPreprocessor -from keras_nlp.src.models.llama.llama_tokenizer import LlamaTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class LlamaPreprocessorTest(TestCase): - def setUp(self): - self.tokenizer = LlamaTokenizer( - # Generated using create_llama_test_proto.py - proto=os.path.join(self.get_test_data_dir(), "llama_test_vocab.spm") - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ( - ["the quick brown fox"], - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ) - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=LlamaPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[1, 3, 8, 4, 6, 0, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 0, 0, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. 
- ), - ) - - def test_errors_for_2d_list_input(self): - preprocessor = LlamaPreprocessor(**self.init_kwargs) - ambiguous_input = [["one", "two"], ["three", "four"]] - with self.assertRaises(ValueError): - preprocessor(ambiguous_input) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in LlamaPreprocessor.presets: - self.run_preset_test( - cls=LlamaPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/llama3/llama3_preprocessor.py b/keras_nlp/src/models/llama3/llama3_preprocessor.py deleted file mode 100644 index b1fc5769ab..0000000000 --- a/keras_nlp/src/models/llama3/llama3_preprocessor.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.models.llama3.llama3_backbone import Llama3Backbone -from keras_nlp.src.models.llama3.llama3_tokenizer import Llama3Tokenizer -from keras_nlp.src.models.llama.llama_preprocessor import LlamaPreprocessor - - -@keras_nlp_export("keras_nlp.models.Llama3Preprocessor") -class Llama3Preprocessor(LlamaPreprocessor): - backbone_cls = Llama3Backbone - tokenizer_cls = Llama3Tokenizer diff --git a/keras_nlp/src/models/llama3/llama3_preprocessor_test.py b/keras_nlp/src/models/llama3/llama3_preprocessor_test.py deleted file mode 100644 index 13d4f0208c..0000000000 --- a/keras_nlp/src/models/llama3/llama3_preprocessor_test.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from keras_nlp.src.models.llama3.llama3_preprocessor import Llama3Preprocessor -from keras_nlp.src.models.llama3.llama3_tokenizer import Llama3Tokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class Llama3PreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab += ["<|end_of_text|>", "<|begin_of_text|>"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = Llama3Tokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = [ - "airplane at airport", - ] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=Llama3Preprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[7, 1, 3, 4, 2, 5, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]], - } - ), - ) - - def test_with_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = Llama3Preprocessor( - tokenizer=Llama3Tokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=True, - add_end_token=True, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[7, 1, 3, 4, 2, 5, 6, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = Llama3Preprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [7, 1, 3, 4]) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in Llama3Preprocessor.presets: - self.run_preset_test( - cls=Llama3Preprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/mistral/mistral_preprocessor.py b/keras_nlp/src/models/mistral/mistral_preprocessor.py deleted file mode 100644 index c6d7731722..0000000000 --- a/keras_nlp/src/models/mistral/mistral_preprocessor.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.mistral.mistral_backbone import MistralBackbone -from keras_nlp.src.models.mistral.mistral_tokenizer import MistralTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.MistralPreprocessor") -class MistralPreprocessor(Preprocessor): - """A Mistral preprocessing layer which tokenizes and packs inputs. 
- - This preprocessing layer will do three things: - - 1. Tokenize any number of input segments using the `tokenizer`. - 2. Pack the inputs together using a `keras_nlp.layers.StartEndPacker`. - with the appropriate tokens. - 3. Construct a dictionary with keys `"token_ids"`, and `"padding_mask"` - that can be passed directly to `keras_nlp.models.MistralBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - Args: - tokenizer: A `keras_nlp.models.MistralTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. Default is `True`. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. Default is `False`. - - Call arguments: - x: A tensor of single string sequences, or a tuple of multiple - tensor sequences to be packed together. Inputs may be batched or - unbatched. For single sequences, raw python inputs will be converted - to tensors. For multiple sequences, pass tensors directly. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the from_preset(). - ```python - preprocessor = keras_nlp.models.MistralPreprocessor.from_preset( - "mistral_base_en" - ) - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize and a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Preprocess a batch of sentence pairs. - # When handling multiple sequences, always convert to tensors first! - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - preprocessor((first, second)) - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.MistralPreprocessor.from_preset( - "mistral_base_en" - ) - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((first, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(first) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map labeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices(((first, second), label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices((first, second)) - - # Watch out for tf.data's default unpacking of tuples here! - # Best to invoke the `preprocessor` directly in this case. 
- ds = ds.map( - lambda first, second: preprocessor(x=(first, second)), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ``` - """ - - backbone_cls = MistralBackbone - tokenizer_cls = MistralTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=False, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.add_start_token = add_start_token - self.add_end_token = add_end_token - self.sequence_length = sequence_length - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/mistral/mistral_preprocessor_test.py b/keras_nlp/src/models/mistral/mistral_preprocessor_test.py deleted file mode 100644 index e38e498b00..0000000000 --- a/keras_nlp/src/models/mistral/mistral_preprocessor_test.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import pytest - -from keras_nlp.src.models.mistral.mistral_preprocessor import ( - MistralPreprocessor, -) -from keras_nlp.src.models.mistral.mistral_tokenizer import MistralTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class MistralPreprocessorTest(TestCase): - def setUp(self): - self.tokenizer = MistralTokenizer( - # Generated using create_mistral_test_proto.py - proto=os.path.join( - self.get_test_data_dir(), "mistral_test_vocab.spm" - ) - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ( - ["the quick brown fox"], - [1], # Pass through labels. - [1.0], # Pass through sample_weights. 
- ) - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=MistralPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[1, 3, 8, 4, 6, 0, 0, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 0, 0, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ), - ) - - def test_errors_for_2d_list_input(self): - preprocessor = MistralPreprocessor(**self.init_kwargs) - ambiguous_input = [["one", "two"], ["three", "four"]] - with self.assertRaises(ValueError): - preprocessor(ambiguous_input) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in MistralPreprocessor.presets: - self.run_preset_test( - cls=MistralPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/opt/opt_preprocessor.py b/keras_nlp/src/models/opt/opt_preprocessor.py deleted file mode 100644 index 0cafaaec5e..0000000000 --- a/keras_nlp/src/models/opt/opt_preprocessor.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.opt.opt_backbone import OPTBackbone -from keras_nlp.src.models.opt.opt_tokenizer import OPTTokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.OPTPreprocessor") -class OPTPreprocessor(Preprocessor): - """OPT preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do 2 things: - - - Tokenize the input using the `tokenizer`. - - Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can - be passed directly to a `keras_nlp.models.OPTBackbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - The call method of this layer accepts three arguments, `x`, `y`, and - `sample_weight`. `x` can be a python string or tensor representing a single - segment, a list of python strings representing a batch of single segments, - or a list of tensors representing multiple segments to be packed together. - `y` and `sample_weight` are both optional, can have any format, and will be - passed through unaltered. - - `OPTPreprocessor` forces the input to have only one segment, as OPT is - mainly used for generation tasks. For tasks having multi-segment inputs - like "glue/mnli", please use a model designed for classification purposes - such as BERT or RoBERTa. - - Args: - tokenizer: A `keras_nlp.models.OPTTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will append the tokenizer - start token to each input sequence. 
- add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - - Call arguments: - x: A string, `tf.Tensor` or list of python strings. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the layer on data. - ```python - preprocessor = keras_nlp.models.OPTPreprocessor.from_preset("opt_125m_en") - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Custom vocabulary. - features = ["a quick fox.", "a fox quick."] - vocab = {"<|endoftext|>": 0, "a": 4, "Ġquick": 5, "Ġfox": 6} - merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"] - merges += ["Ġ f", "o x", "Ġf ox"] - tokenizer = keras_nlp.models.OPTTokenizer( - vocabulary=vocab, - merges=merges, - ) - preprocessor = keras_nlp.models.OPTPreprocessor(tokenizer=tokenizer) - preprocessor("The quick brown fox jumped.") - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.OPTPreprocessor.from_preset("opt_125m_en") - - text = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((text, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(text) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - ``` - """ - - backbone_cls = OPTBackbone - tokenizer_cls = OPTTokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=True, - **kwargs, - ): - super().__init__(**kwargs) - - self.tokenizer = tokenizer - self.packer = None - self.sequence_length = sequence_length - self.add_start_token = add_start_token - self.add_end_token = add_end_token - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. 
- self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/opt/opt_preprocessor_test.py b/keras_nlp/src/models/opt/opt_preprocessor_test.py deleted file mode 100644 index 614ff82ebe..0000000000 --- a/keras_nlp/src/models/opt/opt_preprocessor_test.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from keras_nlp.src.models.opt.opt_preprocessor import OPTPreprocessor -from keras_nlp.src.models.opt.opt_tokenizer import OPTTokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class OPTPreprocessorTest(TestCase): - def setUp(self): - self.vocab = ["", "", "air", "Ġair", "plane", "Ġat", "port"] - self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)]) - self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"] - self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"] - self.merges += ["Ġai r", "Ġa i", "pla ne"] - self.tokenizer = OPTTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 8, - } - self.input_data = ["airplane at airport"] - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=OPTPreprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output={ - "token_ids": [[1, 2, 4, 5, 3, 6, 1, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]], - }, - ) - - def test_no_start_end_token(self): - input_data = ["airplane at airport"] * 4 - - preprocessor = OPTPreprocessor( - tokenizer=OPTTokenizer( - vocabulary=self.vocab, - merges=self.merges, - ), - sequence_length=8, - add_start_token=False, - add_end_token=False, - ) - x = preprocessor(input_data) - self.assertAllEqual(x["token_ids"], [[2, 4, 5, 3, 6, 0, 0, 0]] * 4) - self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4) - - def test_sequence_length_override(self): - input_data = "airplane at airport" - preprocessor = OPTPreprocessor(**self.init_kwargs) - x = preprocessor(input_data, sequence_length=4) - self.assertAllEqual(x["token_ids"], [1, 2, 4, 1]) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in OPTPreprocessor.presets: - self.run_preset_test( - cls=OPTPreprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/pali_gemma/pali_gemma_tokenizer.py b/keras_nlp/src/models/pali_gemma/pali_gemma_tokenizer.py index fbdb0693d0..d52c1de4d4 100644 --- a/keras_nlp/src/models/pali_gemma/pali_gemma_tokenizer.py +++ b/keras_nlp/src/models/pali_gemma/pali_gemma_tokenizer.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.models.gemma.gemma_preprocessor import GemmaTokenizer +from keras_nlp.src.models.gemma.gemma_tokenizer import GemmaTokenizer from keras_nlp.src.models.pali_gemma.pali_gemma_backbone import ( PaliGemmaBackbone, ) diff --git a/keras_nlp/src/models/phi3/phi3_preprocessor.py b/keras_nlp/src/models/phi3/phi3_preprocessor.py deleted file mode 100644 index ce392b5088..0000000000 --- a/keras_nlp/src/models/phi3/phi3_preprocessor.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import keras - -from keras_nlp.src.api_export import keras_nlp_export -from keras_nlp.src.layers.preprocessing.start_end_packer import StartEndPacker -from keras_nlp.src.models.phi3.phi3_backbone import Phi3Backbone -from keras_nlp.src.models.phi3.phi3_tokenizer import Phi3Tokenizer -from keras_nlp.src.models.preprocessor import Preprocessor -from keras_nlp.src.utils.tensor_utils import preprocessing_function - - -@keras_nlp_export("keras_nlp.models.Phi3Preprocessor") -class Phi3Preprocessor(Preprocessor): - """A Phi3 preprocessing layer which tokenizes and packs inputs. - - This preprocessing layer will do three things: - - 1. Tokenize any number of input segments using the `tokenizer`. - 2. Pack the inputs together using a `keras_nlp.layers.StartEndPacker`. - with the appropriate tokens. - 3. Construct a dictionary with keys `"token_ids"`, and `"padding_mask"` - that can be passed directly to `keras_nlp.models.Phi3Backbone`. - - This layer can be used directly with `tf.data.Dataset.map` to preprocess - string data in the `(x, y, sample_weight)` format used by - `keras.Model.fit`. - - Args: - tokenizer: A `keras_nlp.models.Phi3Tokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. Default is `True`. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. Default is `False`. - - Call arguments: - x: A tensor of single string sequences, or a tuple of multiple - tensor sequences to be packed together. Inputs may be batched or - unbatched. For single sequences, raw python inputs will be converted - to tensors. For multiple sequences, pass tensors directly. - y: Any label data. Will be passed through unaltered. - sample_weight: Any label weight data. Will be passed through unaltered. - sequence_length: Pass to override the configured `sequence_length` of - the layer. - - Examples: - - Directly calling the from_preset(). - ```python - preprocessor = keras_nlp.models.Phi3Preprocessor.from_preset( - "" - ) - - # Tokenize and pack a single sentence. - preprocessor("The quick brown fox jumped.") - - # Tokenize and a batch of single sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Preprocess a batch of sentence pairs. - # When handling multiple sequences, always convert to tensors first! - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - preprocessor((first, second)) - ``` - - Mapping with `tf.data.Dataset`. - ```python - preprocessor = keras_nlp.models.Phi3Preprocessor.from_preset( - "phi3_mini_4k_instruct_en" - ) - first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."]) - second = tf.constant(["The fox tripped.", "Oh look, a whale."]) - label = tf.constant([1, 1]) - - # Map labeled single sentences. - ds = tf.data.Dataset.from_tensor_slices((first, label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled single sentences. - ds = tf.data.Dataset.from_tensor_slices(first) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map labeled sentence pairs. - ds = tf.data.Dataset.from_tensor_slices(((first, second), label)) - ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) - - # Map unlabeled sentence pairs. 
- ds = tf.data.Dataset.from_tensor_slices((first, second)) - - # Watch out for tf.data's default unpacking of tuples here! - # Best to invoke the `preprocessor` directly in this case. - ds = ds.map( - lambda first, second: preprocessor(x=(first, second)), - num_parallel_calls=tf.data.AUTOTUNE, - ) - ``` - """ - - backbone_cls = Phi3Backbone - tokenizer_cls = Phi3Tokenizer - - def __init__( - self, - tokenizer, - sequence_length=1024, - add_start_token=True, - add_end_token=False, - **kwargs, - ): - super().__init__(**kwargs) - self.tokenizer = tokenizer - self.packer = None - self.add_start_token = add_start_token - self.add_end_token = add_end_token - self.sequence_length = sequence_length - - def build(self, input_shape): - # Defer packer creation to `build()` so that we can be sure tokenizer - # assets have loaded when restoring a saved model. - self.packer = StartEndPacker( - start_value=self.tokenizer.start_token_id, - end_value=self.tokenizer.end_token_id, - pad_value=self.tokenizer.pad_token_id, - sequence_length=self.sequence_length, - return_padding_mask=True, - ) - self.built = True - - def get_config(self): - config = super().get_config() - config.update( - { - "sequence_length": self.sequence_length, - "add_start_token": self.add_start_token, - "add_end_token": self.add_end_token, - } - ) - return config - - @preprocessing_function - def call( - self, - x, - y=None, - sample_weight=None, - sequence_length=None, - ): - sequence_length = sequence_length or self.sequence_length - token_ids, padding_mask = self.packer( - self.tokenizer(x), - sequence_length=sequence_length, - add_start_value=self.add_start_token, - add_end_value=self.add_end_token, - ) - x = { - "token_ids": token_ids, - "padding_mask": padding_mask, - } - return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) - - @property - def sequence_length(self): - """The padded length of model input sequences.""" - return self._sequence_length - - @sequence_length.setter - def sequence_length(self, value): - self._sequence_length = value - if self.packer is not None: - self.packer.sequence_length = value diff --git a/keras_nlp/src/models/phi3/phi3_preprocessor_test.py b/keras_nlp/src/models/phi3/phi3_preprocessor_test.py deleted file mode 100644 index 406fc4eb17..0000000000 --- a/keras_nlp/src/models/phi3/phi3_preprocessor_test.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright 2024 The KerasNLP Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os - -import pytest - -from keras_nlp.src.models.phi3.phi3_preprocessor import Phi3Preprocessor -from keras_nlp.src.models.phi3.phi3_tokenizer import Phi3Tokenizer -from keras_nlp.src.tests.test_case import TestCase - - -class Phi3PreprocessorTest(TestCase): - def setUp(self): - self.tokenizer = Phi3Tokenizer( - # Generated using create_phi3_test_proto.py - proto=os.path.join(self.get_test_data_dir(), "phi3_test_vocab.spm") - ) - self.init_kwargs = { - "tokenizer": self.tokenizer, - "sequence_length": 12, - } - self.input_data = ( - # Encoded to [3, 5, 6, 4, 3, 9, 7, 11, 3, 15] - ["the fox <|endoftext|>"], - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ) - - def test_preprocessor_basics(self): - self.run_preprocessor_test( - cls=Phi3Preprocessor, - init_kwargs=self.init_kwargs, - input_data=self.input_data, - expected_output=( - { - "token_ids": [[1, 3, 5, 6, 4, 3, 9, 7, 11, 3, 15, 0]], - "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]], - }, - [1], # Pass through labels. - [1.0], # Pass through sample_weights. - ), - ) - - def test_errors_for_2d_list_input(self): - preprocessor = Phi3Preprocessor(**self.init_kwargs) - ambiguous_input = [["one", "two"], ["three", "four"]] - with self.assertRaises(ValueError): - preprocessor(ambiguous_input) - - @pytest.mark.extra_large - def test_all_presets(self): - for preset in Phi3Preprocessor.presets: - self.run_preset_test( - cls=Phi3Preprocessor, - preset=preset, - input_data=self.input_data, - ) diff --git a/keras_nlp/src/models/seq_2_seq_lm_preprocessor_test.py b/keras_nlp/src/models/seq_2_seq_lm_preprocessor_test.py index b1d353ab9d..f784b571fe 100644 --- a/keras_nlp/src/models/seq_2_seq_lm_preprocessor_test.py +++ b/keras_nlp/src/models/seq_2_seq_lm_preprocessor_test.py @@ -13,7 +13,6 @@ # limitations under the License. import pytest -from keras_nlp.src.models.bart.bart_preprocessor import BartPreprocessor from keras_nlp.src.models.bart.bart_seq_2_seq_lm_preprocessor import ( BartSeq2SeqLMPreprocessor, ) @@ -25,7 +24,7 @@ class TestSeq2SeqLMPreprocessor(TestCase): def test_preset_accessors(self): bert_presets = set(BertTokenizer.presets.keys()) - bart_presets = set(BartPreprocessor.presets.keys()) + bart_presets = set(BartSeq2SeqLMPreprocessor.presets.keys()) all_presets = set(Seq2SeqLMPreprocessor.presets.keys()) self.assertTrue(bert_presets.isdisjoint(all_presets)) self.assertTrue(bart_presets.issubset(all_presets))
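The removed classes above all follow the same recipe: tokenize, pack with `keras_nlp.layers.StartEndPacker`, and return a `"token_ids"` / `"padding_mask"` dict. For anyone depending on one of them, the path the new `GPT2Preprocessor` docstring points to is either the task-paired `CausalLMPreprocessor` class or a small custom function. Below is a minimal sketch of the latter, assuming the `gpt2_base_en` preset and the tokenizer's `start_token_id`/`end_token_id`/`pad_token_id` attributes; it is an illustration of the pattern, not code from this patch.

```python
# Minimal sketch of a replacement for a removed *Preprocessor class.
# Assumes the "gpt2_base_en" preset and the tokenizer's start/end/pad token id
# attributes; adapt the tokenizer and sequence_length for other models.
import keras_nlp

tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset("gpt2_base_en")
packer = keras_nlp.layers.StartEndPacker(
    start_value=tokenizer.start_token_id,
    end_value=tokenizer.end_token_id,
    pad_value=tokenizer.pad_token_id,
    sequence_length=1024,
    return_padding_mask=True,
)

def preprocess(x):
    # Tokenize, add start/end tokens, pad to sequence_length, and build the
    # dict of dense inputs expected by the backbone.
    token_ids, padding_mask = packer(tokenizer(x))
    return {"token_ids": token_ids, "padding_mask": padding_mask}

# Usage: preprocess(["The quick brown fox jumped."]) or ds.map(preprocess).
# For causal LM training or generation, prefer the task-paired class instead:
# keras_nlp.models.GPT2CausalLMPreprocessor.from_preset("gpt2_base_en")
```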