9 changes: 4 additions & 5 deletions CONTRIBUTING.md
@@ -216,12 +216,11 @@ so you aren't waiting around forever!

## Formatting Code

We use `flake8`, `isort` and `black` for code formatting. You can run
the following commands manually every time you want to format your code:
KerasHub uses [Ruff](https://docs.astral.sh/ruff/) to format the code. You can
run the following commands manually every time you want to format your code:

- Run `shell/format.sh` to format your code
- Run `shell/lint.sh` to check the result.

If after running these the CI flow is still failing, try updating `flake8`,
`isort` and `black`. This can be done by running `pip install --upgrade black`,
`pip install --upgrade flake8`, and `pip install --upgrade isort`.
If after running these the CI flow is still failing, try updating `ruff`
with `pip install --upgrade ruff`.
12 changes: 6 additions & 6 deletions keras_hub/api/models/__init__.py
@@ -200,18 +200,18 @@
)
from keras_hub.src.models.image_to_image import ImageToImage
from keras_hub.src.models.inpaint import Inpaint
from keras_hub.src.models.llama3.llama3_backbone import Llama3Backbone
from keras_hub.src.models.llama3.llama3_causal_lm import Llama3CausalLM
from keras_hub.src.models.llama3.llama3_causal_lm_preprocessor import (
Llama3CausalLMPreprocessor,
)
from keras_hub.src.models.llama3.llama3_tokenizer import Llama3Tokenizer
from keras_hub.src.models.llama.llama_backbone import LlamaBackbone
from keras_hub.src.models.llama.llama_causal_lm import LlamaCausalLM
from keras_hub.src.models.llama.llama_causal_lm_preprocessor import (
LlamaCausalLMPreprocessor,
)
from keras_hub.src.models.llama.llama_tokenizer import LlamaTokenizer
from keras_hub.src.models.llama3.llama3_backbone import Llama3Backbone
from keras_hub.src.models.llama3.llama3_causal_lm import Llama3CausalLM
from keras_hub.src.models.llama3.llama3_causal_lm_preprocessor import (
Llama3CausalLMPreprocessor,
)
from keras_hub.src.models.llama3.llama3_tokenizer import Llama3Tokenizer
from keras_hub.src.models.masked_lm import MaskedLM
from keras_hub.src.models.masked_lm_preprocessor import MaskedLMPreprocessor
from keras_hub.src.models.mistral.mistral_backbone import MistralBackbone
2 changes: 1 addition & 1 deletion keras_hub/api/tokenizers/__init__.py
@@ -21,8 +21,8 @@
from keras_hub.src.models.gemma.gemma_tokenizer import GemmaTokenizer
from keras_hub.src.models.gpt2.gpt2_tokenizer import GPT2Tokenizer
from keras_hub.src.models.gpt_neo_x.gpt_neo_x_tokenizer import GPTNeoXTokenizer
from keras_hub.src.models.llama3.llama3_tokenizer import Llama3Tokenizer
from keras_hub.src.models.llama.llama_tokenizer import LlamaTokenizer
from keras_hub.src.models.llama3.llama3_tokenizer import Llama3Tokenizer
from keras_hub.src.models.mistral.mistral_tokenizer import MistralTokenizer
from keras_hub.src.models.opt.opt_tokenizer import OPTTokenizer
from keras_hub.src.models.pali_gemma.pali_gemma_tokenizer import (
4 changes: 2 additions & 2 deletions keras_hub/src/bounding_box/converters.py
@@ -73,8 +73,8 @@ def encode_box_to_deltas(

if encoding_format not in ["center_xywh", "center_yxhw"]:
raise ValueError(
"`encoding_format` should be one of 'center_xywh' or 'center_yxhw', "
f"got {encoding_format}"
"`encoding_format` should be one of 'center_xywh' or "
f"'center_yxhw', got {encoding_format}"
)

encoded_anchors = convert_format(
10 changes: 6 additions & 4 deletions keras_hub/src/bounding_box/utils_test.py
@@ -58,10 +58,12 @@ def test_clip_to_image_filters_fully_out_bounding_boxes(self):
bounding_boxes, bounding_box_format="xyxy", images=image
)

self.assertAllEqual(
bounding_boxes["boxes"],
np.array([[-1, -1, -1, -1], [100, 100, 256, 256]]),
),
(
self.assertAllEqual(
bounding_boxes["boxes"],
np.array([[-1, -1, -1, -1], [100, 100, 256, 256]]),
),
)
self.assertAllEqual(
bounding_boxes["classes"],
np.array([-1, 0]),
2 changes: 1 addition & 1 deletion keras_hub/src/layers/modeling/f_net_encoder.py
@@ -66,7 +66,7 @@ def __init__(
layer_norm_epsilon=1e-5,
kernel_initializer="glorot_uniform",
bias_initializer="zeros",
**kwargs
**kwargs,
):
super().__init__(**kwargs)
self.intermediate_dim = intermediate_dim
3 changes: 2 additions & 1 deletion keras_hub/src/layers/modeling/masked_lm_head.py
@@ -34,7 +34,8 @@ class MaskedLMHead(keras.layers.Layer):
token_embedding: Optional. A `keras_hub.layers.ReversibleEmbedding`
instance. If passed, the layer will be used to project from the
`hidden_dim` of the model to the output `vocabulary_size`.
intermediate_activation: The activation function of intermediate dense layer.
intermediate_activation: The activation function of intermediate dense
layer.
activation: The activation function for the outputs of the layer.
Usually either `None` (return logits), or `"softmax"`
(return probabilities).
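As an aside on the `MaskedLMHead` docstring rewrapped above, here is a minimal usage sketch. The vocabulary size, shapes, and intermediate activation are hypothetical, and the call pattern is my reading of the layer's documented interface, not a quote from it.

```python
import numpy as np
import keras_hub

# Hypothetical sizes, for illustration only.
batch_size, seq_len, hidden_dim, vocab_size = 2, 10, 16, 100

# Project encoder outputs at the masked positions back onto the vocabulary.
lm_head = keras_hub.layers.MaskedLMHead(
    vocabulary_size=vocab_size,
    intermediate_activation="relu",
    activation="softmax",
)
encoded_tokens = np.random.uniform(size=(batch_size, seq_len, hidden_dim))
encoded_tokens = encoded_tokens.astype("float32")
mask_positions = np.array([[1, 3], [2, 5]])  # two masked tokens per example
probs = lm_head(encoded_tokens, mask_positions=mask_positions)
print(probs.shape)  # expected (batch_size, num_mask_positions, vocab_size)
```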
14 changes: 8 additions & 6 deletions keras_hub/src/layers/modeling/rms_normalization.py
@@ -6,10 +6,11 @@

@keras_hub_export("keras_hub.layers.RMSNormalization")
class RMSNormalization(keras.layers.Layer):
"""
Root Mean Square (RMS) Normalization layer.
"""Root Mean Square (RMS) Normalization layer.

This layer normalizes the input tensor based on its RMS value and applies
a learned scaling factor.

Args:
input_dim: int. The dimensionality of the input tensor.
"""
@@ -21,12 +22,13 @@ def __init__(self, input_dim):
)

def call(self, x):
"""
Applies RMS normalization to the input tensor.
"""Applies RMS normalization to the input tensor.

Args:
x: KerasTensor. Input tensor of shape (batch_size, input_dim).
x: Input tensor of shape (batch_size, input_dim).

Returns:
KerasTensor: The RMS-normalized tensor of the same shape (batch_size, input_dim),
Member commented: Did Ruff automatically remove KerasTensor from here, or did you remove it manually? 🤔

Member Author replied: I removed it manually while fixing line length issues. We can return either a KerasTensor (symbolic tensor) or a torch, JAX, or TF tensor, so it is better, and shorter for line length, to just say tensor.

The RMS-normalized tensor of the same shape (batch_size, input_dim),
scaled by the learned `scale` parameter.
"""
x = ops.cast(x, float)
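For reference alongside the `RMSNormalization` docstring above, a minimal NumPy sketch of what RMS normalization computes: divide by the root mean square of the features, then apply a learned per-feature scale. The layer's exact epsilon and dtype handling may differ.

```python
import numpy as np

def rms_normalize(x, scale, epsilon=1e-6):
    # x: (batch_size, input_dim); scale: learned (input_dim,) parameter.
    rms = np.sqrt(np.mean(np.square(x), axis=-1, keepdims=True) + epsilon)
    return (x / rms) * scale

x = np.random.uniform(size=(2, 8)).astype("float32")
out = rms_normalize(x, scale=np.ones(8, dtype="float32"))
print(out.shape)  # (2, 8) -- the RMS-normalized tensor keeps the input shape
```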
5 changes: 3 additions & 2 deletions keras_hub/src/layers/modeling/rotary_embedding.py
@@ -11,7 +11,8 @@ class RotaryEmbedding(keras.layers.Layer):
This layer encodes absolute positional information with a rotation
matrix. It calculates the rotary encoding with a mix of sine and
cosine functions with geometrically increasing wavelengths.
Defined and formulated in [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864v4).
Defined and formulated in
[RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864v4).
The input must be a tensor with shape a sequence dimension and a feature
dimension. Typically, this will either an input with shape
`(batch_size, sequence_length, feature_length)` or
@@ -65,7 +66,7 @@ def __init__(
scaling_factor=1.0,
sequence_axis=1,
feature_axis=-1,
**kwargs
**kwargs,
):
super().__init__(**kwargs)
self.max_wavelength = max_wavelength
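As a companion to the RoFormer paper linked in the `RotaryEmbedding` docstring above, a minimal NumPy sketch of the rotary idea: each position rotates feature pairs by geometrically spaced angles. This uses the half-split pairing convention and a wavelength of 10000 as assumptions; the layer's actual axis, scaling, and pairing choices follow its own arguments.

```python
import numpy as np

def rotary_embed(x, max_wavelength=10000):
    # x: (sequence_length, feature_dim), feature_dim assumed even.
    seq_len, dim = x.shape
    positions = np.arange(seq_len)[:, None]                          # (seq, 1)
    inv_freq = 1.0 / max_wavelength ** (np.arange(0, dim, 2) / dim)  # (dim/2,)
    angles = positions * inv_freq                                    # (seq, dim/2)
    cos, sin = np.cos(angles), np.sin(angles)
    x1, x2 = x[:, : dim // 2], x[:, dim // 2 :]
    # Rotate each (x1, x2) feature pair by its position-dependent angle.
    return np.concatenate([x1 * cos - x2 * sin, x2 * cos + x1 * sin], axis=-1)

print(rotary_embed(np.random.uniform(size=(4, 8))).shape)  # (4, 8)
```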
@@ -58,7 +58,7 @@ def __init__(
tie_weights=True,
embeddings_initializer="uniform",
mask_zero=False,
**kwargs
**kwargs,
):
super().__init__(**kwargs)
if vocabulary_size is None:
14 changes: 8 additions & 6 deletions keras_hub/src/layers/modeling/transformer_decoder.py
@@ -5,12 +5,13 @@
from keras_hub.src.layers.modeling.cached_multi_head_attention import (
CachedMultiHeadAttention,
)
from keras_hub.src.utils.keras_utils import clone_initializer

from keras_hub.src.layers.modeling.transformer_layer_utils import ( # isort:skip
from keras_hub.src.layers.modeling.transformer_layer_utils import (
compute_causal_mask,
)
from keras_hub.src.layers.modeling.transformer_layer_utils import (
merge_padding_and_attention_mask,
)
from keras_hub.src.utils.keras_utils import clone_initializer


@keras_hub_export("keras_hub.layers.TransformerDecoder")
@@ -265,13 +266,13 @@ def call(
`[batch_size, decoder_sequence_length]`.
decoder_attention_mask: a boolean Tensor. Customized decoder
sequence mask, must be of shape
`[batch_size, decoder_sequence_length, decoder_sequence_length]`.
`[batch_size, decoder_sequence_length, decoder_sequence_length]`
encoder_padding_mask: a boolean Tensor, the padding mask of encoder
sequence, must be of shape
`[batch_size, encoder_sequence_length]`.
encoder_attention_mask: a boolean Tensor. Customized encoder
sequence mask, must be of shape
`[batch_size, encoder_sequence_length, encoder_sequence_length]`.
`[batch_size, encoder_sequence_length, encoder_sequence_length]`
self_attention_cache: a dense float Tensor. The cache of key/values
pairs in the self-attention layer. Has shape
`[batch_size, 2, max_seq_len, num_heads, key_dims]`.
@@ -435,7 +436,8 @@ def _compute_self_attention_mask(
input_length = output_length = ops.shape(decoder_sequence)[1]
# We need to handle a rectangular causal mask when doing cached
# decoding. For generative inference, `decoder_sequence` will
# generally be length 1, and `cache` will be the full generation length.
# generally be length 1, and `cache` will be the full generation
# length.
if self_attention_cache is not None:
input_length = ops.shape(self_attention_cache)[2]

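To illustrate the rectangular-mask comment rewrapped above, a small NumPy sketch of a causal mask when the new query block is shorter than the cached key sequence. This is only a sketch of the idea, not the signature of the library's `compute_causal_mask` helper.

```python
import numpy as np

def rectangular_causal_mask(output_length, input_length, cache_index=0):
    # Rows are the new (query) positions, offset by how many tokens are
    # already in the cache; columns are the cached key positions.
    query_positions = np.arange(output_length)[:, None] + cache_index
    key_positions = np.arange(input_length)[None, :]
    # A query may attend to any key at or before its own absolute position.
    return (key_positions <= query_positions).astype("int32")

# Generating the token at position 5 against a cache of length 8: a 1 x 8 mask.
print(rectangular_causal_mask(output_length=1, input_length=8, cache_index=5))
# [[1 1 1 1 1 1 0 0]]
```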
4 changes: 3 additions & 1 deletion keras_hub/src/layers/modeling/transformer_encoder.py
@@ -190,7 +190,9 @@ def call(
[batch_size, sequence_length, sequence_length].
training: a boolean indicating whether the layer should behave in
training mode or in inference mode.
return_attention_scores: a boolean indicating whether the output should be `(attention_output, attention_scores)` if `True` or `attention_output` if `False`. Defaults to `False`.
return_attention_scores: a boolean indicating whether the output
should be `(attention_output, attention_scores)` if `True` or
`attention_output` if `False`. Defaults to `False`.

Returns:
A Tensor of the same shape as the `inputs`.
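A short sketch of the `return_attention_scores` behavior documented above, with hypothetical sizes chosen to match the attention-scores test in the next file; the constructor arguments shown are illustrative, not exhaustive.

```python
import numpy as np
import keras_hub

# Hypothetical sizes: batch 1, sequence length 4, model dim 6, 2 heads.
encoder = keras_hub.layers.TransformerEncoder(intermediate_dim=4, num_heads=2)
inputs = np.random.uniform(size=(1, 4, 6)).astype("float32")

outputs, scores = encoder(inputs, return_attention_scores=True)
print(outputs.shape)  # (1, 4, 6) -- same shape as the inputs
print(scores.shape)   # (1, 2, 4, 4): (batch, heads, seq_length, seq_length)
```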
3 changes: 2 additions & 1 deletion keras_hub/src/layers/modeling/transformer_encoder_test.py
@@ -104,5 +104,6 @@ def test_attention_scores(self):
)
self.assertAllEqual(outputs.shape, inputs.shape)

# attention scores shape (batch_size, num_of_attn_heads, seq_length, seq_length)
# attention scores shape
# (batch_size, num_of_attn_heads, seq_length, seq_length)
self.assertAllEqual(attention_scores.shape, [1, 2, 4, 4])
2 changes: 1 addition & 1 deletion keras_hub/src/metrics/bleu.py
@@ -164,7 +164,7 @@ def _tokenizer(self, inputs):
return inputs

def _get_ngrams(self, segment, max_order):
"""Extracts all n-grams up to a given maximum order from an input segment.
"""Extracts all n-grams up to a given maximum order from an input.

Uses Python ops. Inspired from
https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py.
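For context on the shortened `_get_ngrams` docstring above, this is the kind of pure-Python n-gram counting the cited tensorflow/nmt BLEU script performs; a sketch, not the exact body of the method.

```python
import collections

def get_ngrams(segment, max_order):
    # Count every n-gram of order 1..max_order in a list of tokens.
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(len(segment) - order + 1):
            ngram_counts[tuple(segment[i : i + order])] += 1
    return ngram_counts

print(get_ngrams(["the", "quick", "brown", "fox"], max_order=2))
# Counts 4 unigrams and 3 bigrams.
```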
1 change: 0 additions & 1 deletion keras_hub/src/models/albert/albert_backbone_test.py
@@ -13,7 +13,6 @@ def setUp(self):
"num_heads": 2,
"num_groups": 1,
"num_inner_repetitions": 1,
"num_inner_repetitions": 1,
"embedding_dim": 16,
"hidden_dim": 2,
"intermediate_dim": 4,
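A quick note on why the duplicated `"num_inner_repetitions"` entry removed above was safe to delete: Python dict literals keep only the last occurrence of a repeated key, so the duplicate line had no effect on the test config.

```python
config = {"num_inner_repetitions": 1, "num_inner_repetitions": 1}
print(config)       # {'num_inner_repetitions': 1}
print(len(config))  # 1 -- duplicate keys silently collapse to the last value
```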
14 changes: 7 additions & 7 deletions keras_hub/src/models/albert/albert_text_classifier.py
@@ -20,10 +20,10 @@
class AlbertTextClassifier(TextClassifier):
"""An end-to-end ALBERT model for classification tasks

This model attaches a classification head to a `keras_hub.model.AlbertBackbone`
backbone, mapping from the backbone outputs to logit output suitable for
a classification task. For usage of this model with pre-trained weights, see
the `from_preset()` method.
This model attaches a classification head to a
`keras_hub.model.AlbertBackbone` backbone, mapping from the backbone outputs
to logit output suitable for a classification task. For usage of this model
with pre-trained weights, see the `from_preset()` method.

This model can optionally be configured with a `preprocessor` layer, in
which case it will automatically apply preprocessing to raw inputs during
@@ -36,9 +36,9 @@ class AlbertTextClassifier(TextClassifier):
Args:
backbone: A `keras_hub.models.AlertBackbone` instance.
num_classes: int. Number of classes to predict.
preprocessor: A `keras_hub.models.AlbertTextClassifierPreprocessor` or `None`. If
`None`, this model will not apply preprocessing, and inputs should
be preprocessed before calling the model.
preprocessor: A `keras_hub.models.AlbertTextClassifierPreprocessor` or
`None`. If `None`, this model will not apply preprocessing, and
inputs should be preprocessed before calling the model.
activation: Optional `str` or callable. The
activation function to use on the model outputs. Set
`activation="softmax"` to return output probabilities.
8 changes: 4 additions & 4 deletions keras_hub/src/models/bart/bart_backbone.py
@@ -22,9 +22,9 @@ class BartBackbone(Backbone):
described in
["BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension"](https://arxiv.org/abs/1910.13461).

The default constructor gives a fully customizable, randomly initialized BART
model with any number of layers, heads, and embedding dimensions. To load
preset architectures and weights, use the `from_preset` constructor.
The default constructor gives a fully customizable, randomly initialized
BART model with any number of layers, heads, and embedding dimensions. To
load preset architectures and weights, use the `from_preset` constructor.

Disclaimer: Pre-trained models are provided on an "as is" basis, without
warranties or conditions of any kind. The underlying model is provided by a
@@ -78,7 +78,7 @@ class BartBackbone(Backbone):
)
output = model(input_data)
```
"""
""" # noqa: E501

def __init__(
self,
17 changes: 9 additions & 8 deletions keras_hub/src/models/bart/bart_seq_2_seq_lm.py
@@ -60,7 +60,8 @@ class BartSeq2SeqLM(Seq2SeqLM):
bart_lm.generate("The quick brown fox", max_length=30)
```

Use `generate()` with encoder inputs and an incomplete decoder input (prompt).
Use `generate()` with encoder inputs and an incomplete decoder input
(prompt).
```python
bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset("bart_base_en")
bart_lm.generate(
@@ -79,10 +80,10 @@ class BartSeq2SeqLM(Seq2SeqLM):
prompt = {
"encoder_token_ids": np.array([[0, 133, 2119, 6219, 23602, 2, 1, 1]]),
"encoder_padding_mask": np.array(
[[True, True, True, True, True, True, False, False]]
[[1, 1, 1, 1, 1, 1, 0, 0]]
),
"decoder_token_ids": np.array([[2, 0, 133, 1769, 2, 1, 1]]),
"decoder_padding_mask": np.array([[True, True, True, True, False, False]])
"decoder_padding_mask": np.array([[1, 1, 1, 1, 0, 0]])
}

bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset(
@@ -95,7 +96,7 @@ class BartSeq2SeqLM(Seq2SeqLM):
Call `fit()` on a single batch.
```python
features = {
"encoder_text": ["The quick brown fox jumped.", "I forgot my homework."],
"encoder_text": ["The quick fox jumped.", "I forgot my homework."],
"decoder_text": ["The fast hazel fox leapt.", "I forgot my assignment."]
}
bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset("bart_base_en")
@@ -195,7 +196,7 @@ def call_decoder_with_cache(
cross_attention_cache=None,
cross_attention_cache_update_index=None,
):
"""Forward pass with a key/value caches for generative decoding..
"""Forward pass with a key/value caches for generative decoding.

`call_decoder_with_cache` adds an additional inference-time forward pass
for the model for seq2seq text generation. Unlike calling the model
@@ -241,7 +242,7 @@ def call_decoder_with_cache(
key/value cache in the decoder's self-attention layer and
`cross_attention_cache` is the key/value cache in the decoder's
cross-attention layer.
"""
""" # noqa: E501
# Embedding layers.
tokens = self.backbone.token_embedding(decoder_token_ids)
positions = self.backbone.decoder_position_embedding(
@@ -331,7 +332,7 @@ def _initialize_cache(self, encoder_token_ids, decoder_token_ids):
def _build_cache(
self, encoder_token_ids, encoder_padding_mask, decoder_token_ids
):
"""Builds the self-attention cache and the cross-attention cache (key/value pairs)."""
"""Builds the self-attention cache and the cross-attention cache."""
encoder_hidden_states = self.call_encoder(
token_ids=encoder_token_ids, padding_mask=encoder_padding_mask
)
@@ -417,7 +418,7 @@ def next(prompt, cache, index):
prompt = ops.slice(prompt, [0, cache_index], [num_samples, 1])

def repeat_tensor(x):
"""Repeats tensors along batch axis to match dim for beam search."""
"""Repeats along batch axis to match dim for beam search."""
if ops.shape(x)[0] == num_samples:
return x
return ops.repeat(x, repeats=num_samples // batch_size, axis=0)
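On the `repeat_tensor` docstring shortened above: beam search runs `num_samples = batch_size * num_beams` rows at once, so encoder-side tensors computed per batch row are tiled along the batch axis to match. A minimal sketch with hypothetical sizes:

```python
import numpy as np

batch_size, num_samples = 2, 6          # hypothetical: 3 beams per example
x = np.array([[10], [20]])              # one encoder-side value per batch row
tiled = np.repeat(x, repeats=num_samples // batch_size, axis=0)
print(tiled.ravel())                    # [10 10 10 20 20 20]
```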
6 changes: 4 additions & 2 deletions keras_hub/src/models/bert/bert_presets.py
@@ -69,7 +69,8 @@
"bert_base_multi": {
"metadata": {
"description": (
"12-layer BERT model where case is maintained. Trained on trained on Wikipedias of 104 languages"
"12-layer BERT model where case is maintained. Trained on "
"trained on Wikipedias of 104 languages"
),
"params": 177853440,
"path": "bert",
@@ -101,7 +102,8 @@
"bert_tiny_en_uncased_sst2": {
"metadata": {
"description": (
"The bert_tiny_en_uncased backbone model fine-tuned on the SST-2 sentiment analysis dataset."
"The bert_tiny_en_uncased backbone model fine-tuned on the "
"SST-2 sentiment analysis dataset."
),
"params": 4385920,
"path": "bert",
6 changes: 3 additions & 3 deletions keras_hub/src/models/bert/bert_text_classifier.py
@@ -34,9 +34,9 @@ class BertTextClassifier(TextClassifier):
Args:
backbone: A `keras_hub.models.BertBackbone` instance.
num_classes: int. Number of classes to predict.
preprocessor: A `keras_hub.models.BertTextClassifierPreprocessor` or `None`. If
`None`, this model will not apply preprocessing, and inputs should
be preprocessed before calling the model.
preprocessor: A `keras_hub.models.BertTextClassifierPreprocessor` or
`None`. If `None`, this model will not apply preprocessing, and
inputs should be preprocessed before calling the model.
activation: Optional `str` or callable. The
activation function to use on the model outputs. Set
`activation="softmax"` to return output probabilities.