2 changes: 1 addition & 1 deletion src/transformers/generation/flax_utils.py
@@ -221,7 +221,7 @@ def _expand_to_num_beams(tensor, num_beams):
  def _adapt_logits_for_beam_search(self, logits):
  """
  This function can be overwritten in the specific modeling_flax_<model-name>.py classes to allow for custom beam
- search behavior. Note that the only model that overwrites this method is [`~transformes.FlaxMarianMTModel`].
+ search behavior. Note that the only model that overwrites this method is [`~transformers.FlaxMarianMTModel`].
  """
  return logits
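For context on the hook documented above: `_adapt_logits_for_beam_search` lets a model tweak the logits seen by the Flax beam-search loop. Below is a minimal sketch of what such an adaptation might look like; the function name, the assumed logits shape, and the choice to mask the pad token are illustrative assumptions, not the actual `FlaxMarianMTModel` implementation.

```python
import jax.numpy as jnp

def adapt_logits_for_beam_search(logits: jnp.ndarray, pad_token_id: int) -> jnp.ndarray:
    """Hypothetical adaptation: keep beam search from ever selecting `pad_token_id`
    by pushing its logit to -inf. Assumes logits of shape (batch, num_beams, vocab_size)."""
    return logits.at[:, :, pad_token_id].set(-jnp.inf)
```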
10 changes: 5 additions & 5 deletions src/transformers/kernels/yoso/fast_lsh_cumulation_cuda.cu
@@ -779,12 +779,12 @@ __global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel(

  __syncthreads();

- int num_distint_query = query_counter[0];
+ int num_distinct_query = query_counter[0];

- if (num_distint_query > 0) {
- for (int idx_base = 0; idx_base < num_distint_query; idx_base = idx_base + num_warps) {
+ if (num_distinct_query > 0) {
+ for (int idx_base = 0; idx_base < num_distinct_query; idx_base = idx_base + num_warps) {
  int idx = idx_base + warp_idx;
- if (idx < num_distint_query) {
+ if (idx < num_distinct_query) {
  int query_idx = inserted_query[idx];
  int batch_idx__query_idx = batch_idx * num_query + query_idx;

@@ -813,7 +813,7 @@ __global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel(
  }
  } else {

- // all computation is completed if num_distint_query == 0
+ // all computation is completed if num_distinct_query == 0
  break;

  }
2 changes: 1 addition & 1 deletion src/transformers/models/clap/modeling_clap.py
@@ -1717,7 +1717,7 @@ def forward(
  >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
  >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

- >>> input_text = ["Sound of a dog", "Sound of vaccum cleaner"]
+ >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

  >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)
2 changes: 1 addition & 1 deletion src/transformers/models/clipseg/modeling_clipseg.py
@@ -625,7 +625,7 @@ def forward(
  input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
  ]
  else:
- # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible)
+ # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
  pooled_output = last_hidden_state[
  torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
  # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
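The comment in the hunk above about taking "the first position of `eos_token_id`" relies on a small trick: comparing the ids against `eos_token_id` and calling `argmax` on the resulting mask returns the index of the first match, because `argmax` reports the earliest occurrence of the maximum value. A standalone illustration with made-up ids:

```python
import torch

input_ids = torch.tensor([[5, 7, 2, 0, 2]])  # hypothetical ids; suppose eos_token_id == 2
eos_token_id = 2

# The mask is 1 at every eos position; argmax returns the first index holding the maximum,
# i.e. the first eos token, even when eos appears more than once or pad shares its id.
first_eos_position = (input_ids == eos_token_id).int().argmax(dim=-1)
print(first_eos_position)  # tensor([2])
```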
@@ -355,7 +355,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.
  data_format (`str` or `ChannelDimension`, *optional*):
  The channel dimension format for the output image. Can be one of:
  - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
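To make the `background_color` behavior described in the docstring above concrete, here is a small self-contained sketch, not the library's implementation: when an integer is passed for a multi-channel image, only the first channel takes that value and the remaining channels fall back to 0. The function name, the channels-last layout, and the top-left placement of the image are assumptions for illustration.

```python
import numpy as np

def pad_to_square_sketch(image: np.ndarray, background_color=0) -> np.ndarray:
    """Pad an (height, width, channels) image to a square canvas."""
    height, width, num_channels = image.shape
    if isinstance(background_color, int):
        # Integer in multi-channel mode: the first channel gets the value, the rest default to 0.
        background_color = [background_color] + [0] * (num_channels - 1)
    size = max(height, width)
    canvas = np.empty((size, size, num_channels), dtype=image.dtype)
    for channel, value in enumerate(background_color):
        canvas[:, :, channel] = value
    canvas[:height, :width, :] = image  # place the original image (top-left here, for simplicity)
    return canvas

padded = pad_to_square_sketch(np.zeros((3, 5, 3), dtype=np.uint8), background_color=255)
print(padded.shape)  # (5, 5, 3)
```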
@@ -113,7 +113,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.

  Returns:
  `torch.Tensor`: The padded images.
@@ -428,7 +428,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.
  data_format (`str` or `ChannelDimension`, *optional*):
  The channel dimension format for the output image. Can be one of:
  - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
@@ -147,7 +147,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.

  Returns:
  `torch.Tensor`: The padded images.
@@ -75,9 +75,9 @@ def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.T

  Args:
  router_probs (`torch.Tensor`):
- Probability assigned to each expert per token. Shape: [batch_size, seqeunce_length, num_experts].
+ Probability assigned to each expert per token. Shape: [batch_size, sequence_length, num_experts].
  expert_indices (`torch.Tensor`):
- Indices tensor of shape [batch_size, seqeunce_length] identifying the selected expert for a given token.
+ Indices tensor of shape [batch_size, sequence_length] identifying the selected expert for a given token.

  Returns:
  The auxiliary loss.
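To make the shapes in the `load_balancing_loss_func` docstring concrete, the sketch below shows how a Switch-Transformers-style load-balancing auxiliary loss is typically computed from `router_probs` and `expert_indices`. It illustrates the general technique only and is not a copy of the function being edited.

```python
import torch

def load_balancing_loss_sketch(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> torch.Tensor:
    """router_probs: [batch_size, sequence_length, num_experts]; expert_indices: [batch_size, sequence_length]."""
    num_experts = router_probs.shape[-1]
    # Fraction of tokens actually routed to each expert.
    expert_mask = torch.nn.functional.one_hot(expert_indices.long(), num_experts).float()
    tokens_per_expert = expert_mask.mean(dim=(0, 1))
    # Average router probability assigned to each expert.
    router_prob_per_expert = router_probs.mean(dim=(0, 1))
    # Minimized when both distributions are uniform across experts.
    return num_experts * torch.sum(tokens_per_expert * router_prob_per_expert)

probs = torch.softmax(torch.randn(2, 4, 8), dim=-1)
print(load_balancing_loss_sketch(probs, probs.argmax(dim=-1)))
```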
4 changes: 2 additions & 2 deletions src/transformers/models/dpt/image_processing_dpt_fast.py
@@ -62,7 +62,7 @@
  class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
  """
  ensure_multiple_of (`int`, *optional*, defaults to 1):
- If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden
+ If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
  by `ensure_multiple_of` in `preprocess`.
  do_pad (`bool`, *optional*, defaults to `False`):
  Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
@@ -72,7 +72,7 @@ class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
  DINOv2 paper, which uses the model in combination with DPT.
  keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
  If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
- be overidden by `keep_aspect_ratio` in `preprocess`.
+ be overridden by `keep_aspect_ratio` in `preprocess`.
  do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
  Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
  is used for background, and background itself is not included in all classes of a dataset (e.g.
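The two kwargs documented above control how DPT-style resizing behaves. A hedged usage sketch follows; the checkpoint name and the local image path are placeholders, and passing these kwargs at call time is assumed from the "can be overridden in `preprocess`" note.

```python
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")  # placeholder checkpoint
image = Image.open("example.jpg")  # placeholder local image

# Resize while keeping the aspect ratio, snapping height/width to multiples of 32.
inputs = processor(
    images=image,
    do_resize=True,
    keep_aspect_ratio=True,
    ensure_multiple_of=32,
    return_tensors="pt",
)
print(inputs["pixel_values"].shape)
```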
4 changes: 2 additions & 2 deletions src/transformers/models/dpt/modular_dpt.py
@@ -92,7 +92,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
  class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
  """
  ensure_multiple_of (`int`, *optional*, defaults to 1):
- If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden
+ If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
  by `ensure_multiple_of` in `preprocess`.
  do_pad (`bool`, *optional*, defaults to `False`):
  Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
@@ -102,7 +102,7 @@ class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
  DINOv2 paper, which uses the model in combination with DPT.
  keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
  If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
- be overidden by `keep_aspect_ratio` in `preprocess`.
+ be overridden by `keep_aspect_ratio` in `preprocess`.
  do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
  Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
  is used for background, and background itself is not included in all classes of a dataset (e.g.
@@ -19,7 +19,7 @@

  class EfficientLoFTRConfig(PretrainedConfig):
  r"""
- This is the configuration class to store the configuration of a [`EffientLoFTRFromKeypointMatching`].
+ This is the configuration class to store the configuration of a [`EfficientLoFTRFromKeypointMatching`].
  It is used to instantiate a EfficientLoFTR model according to the specified arguments, defining the model
  architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
  EfficientLoFTR [zju-community/efficientloftr](https://huggingface.co/zju-community/efficientloftr) architecture.
@@ -417,7 +417,7 @@ def merge_tp_weights(model_path, output_path, vllm_config_path=None):
  )
  layer_i += 1

- # Embedd Model, LM Head, and Norm
+ # Embedded Model, LM Head, and Norm
  embed_tokens = merge_tensors(
  tp_sd=mgt_sd[0],
  keys=["model", "embedding.word_embeddings.weight"],
2 changes: 1 addition & 1 deletion src/transformers/models/groupvit/modeling_groupvit.py
@@ -999,7 +999,7 @@ def forward(
  input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
  ]
  else:
- # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible)
+ # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
  pooled_output = last_hidden_state[
  torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
  # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
2 changes: 1 addition & 1 deletion src/transformers/models/janus/image_processing_janus.py
@@ -352,7 +352,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.
  data_format (`str` or `ChannelDimension`, *optional*):
  The channel dimension format for the output image. Can be one of:
  - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
@@ -119,7 +119,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.

  Returns:
  `torch.Tensor`: The padded images.
2 changes: 1 addition & 1 deletion src/transformers/models/janus/modular_janus.py
@@ -1368,7 +1368,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.
  data_format (`str` or `ChannelDimension`, *optional*):
  The channel dimension format for the output image. Can be one of:
  - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
2 changes: 1 addition & 1 deletion src/transformers/models/mimi/configuration_mimi.py
@@ -120,7 +120,7 @@ class MimiConfig(PretrainedConfig):
  attention_dropout (`float`, *optional*, defaults to 0.0):
  The dropout ratio for the attention probabilities.
  layer_scale_initial_scale (`float`, *optional*, defaults to 0.01):
- Initiale scale of the residual rescaling operation done in the Transformer models.
+ Initial scale of the residual rescaling operation done in the Transformer models.
  attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
  Whether to use a bias in the query, key, value and output projection layers during self-attention.
  Example:
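For readers unfamiliar with the `layer_scale_initial_scale` option documented above, a layer-scale module typically multiplies a residual branch by a small learnable per-channel factor initialized to a value such as 0.01. The sketch below shows the general pattern; it is not Mimi's exact module.

```python
import torch
from torch import nn

class LayerScaleSketch(nn.Module):
    """Rescale a residual branch by a learnable per-channel factor (illustrative only)."""

    def __init__(self, hidden_size: int, initial_scale: float = 0.01):
        super().__init__()
        self.scale = nn.Parameter(initial_scale * torch.ones(hidden_size))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.scale * hidden_states

residual = torch.randn(2, 10, 512)
print(LayerScaleSketch(512)(residual).shape)  # torch.Size([2, 10, 512])
```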
2 changes: 1 addition & 1 deletion src/transformers/models/mvp/modeling_mvp.py
@@ -246,7 +246,7 @@ def forward(
  attn_output = attn_output.transpose(1, 2)

  # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
- # partitioned aross GPUs when using tensor-parallelism.
+ # partitioned across GPUs when using tensor-parallelism.
  attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

  attn_output = self.out_proj(attn_output)
2 changes: 1 addition & 1 deletion src/transformers/models/owlv2/modeling_owlv2.py
@@ -461,7 +461,7 @@ def forward(
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)

  if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
  # make sure that attn_weights keeps its gradient.
  # In order to do so, attn_weights have to reshaped
  # twice and have to be reused in the following
2 changes: 1 addition & 1 deletion src/transformers/models/owlvit/modeling_owlvit.py
@@ -449,7 +449,7 @@ def forward(
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)

  if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
  # make sure that attn_weights keeps its gradient.
  # In order to do so, attn_weights have to reshaped
  # twice and have to be reused in the following
2 changes: 1 addition & 1 deletion src/transformers/models/speecht5/modeling_speecht5.py
@@ -1000,7 +1000,7 @@ def forward(
  attn_output = attn_output.transpose(1, 2)

  # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
- # partitioned aross GPUs when using tensor-parallelism.
+ # partitioned across GPUs when using tensor-parallelism.
  attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

  attn_output = self.out_proj(attn_output)
2 changes: 1 addition & 1 deletion src/transformers/models/vits/modeling_vits.py
@@ -962,7 +962,7 @@ def forward(
  attn_output = attn_output.transpose(1, 2)

  # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
- # partitioned aross GPUs when using tensor-parallelism.
+ # partitioned across GPUs when using tensor-parallelism.
  attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

  attn_output = self.out_proj(attn_output)
2 changes: 1 addition & 1 deletion src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -384,7 +384,7 @@ def setup(self):
  for i in range(self.config.num_feat_extract_layers)
  ]
  elif self.config.feat_extract_norm == "group":
- raise NotImplementedError("At the moment only ``config.feat_extact_norm == 'layer'`` is supported")
+ raise NotImplementedError("At the moment only ``config.feat_extract_norm == 'layer'`` is supported")
  else:
  raise ValueError(
  f"`config.feat_extract_norm` is {self.config.feat_extract_norm}, but has to be one of ['group',"
2 changes: 1 addition & 1 deletion src/transformers/pipelines/image_feature_extraction.py
@@ -32,7 +32,7 @@ class ImageFeatureExtractionPipeline(Pipeline):

  >>> extractor = pipeline(model="google/vit-base-patch16-224", task="image-feature-extraction")
  >>> result = extractor("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", return_tensors=True)
- >>> result.shape # This is a tensor of shape [1, sequence_lenth, hidden_dimension] representing the input image.
+ >>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input image.
  torch.Size([1, 197, 768])
  ```
8 changes: 4 additions & 4 deletions src/transformers/pipelines/text_generation.py
@@ -483,16 +483,16 @@ def postprocess(
  generated_sequence = generated_sequence.numpy().tolist()
  records = []
  other_outputs = model_outputs.get("additional_outputs", {})
- splitted_keys = {}
+ split_keys = {}
  if other_outputs:
  if self.framework == "pt":
  for k, v in other_outputs.items():
  if isinstance(v, torch.Tensor) and v.shape[0] == len(generated_sequence):
- splitted_keys[k] = v.numpy().tolist()
+ split_keys[k] = v.numpy().tolist()
  elif self.framework == "tf":
  for k, v in other_outputs.items():
  if isinstance(v, tf.Tensor) and v.shape[0] == len(generated_sequence):
- splitted_keys[k] = v.numpy().tolist()
+ split_keys[k] = v.numpy().tolist()

  skip_special_tokens = skip_special_tokens if skip_special_tokens is not None else True
  for idx, sequence in enumerate(generated_sequence):
@@ -539,7 +539,7 @@ def postprocess(
  # When we're not starting from a prefill, the output is a new assistant message
  all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}]
  record = {"generated_text": all_text}
- for key, values in splitted_keys.items():
+ for key, values in split_keys.items():
  record[key] = values[idx]
  records.append(record)
@@ -50,7 +50,7 @@ class ZeroShotAudioClassificationPipeline(Pipeline):
  >>> audio = next(iter(dataset["train"]["audio"]))["array"]
  >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
  >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
- [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vaccum cleaner'}]
+ [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vacuum cleaner'}]
  ```
6 changes: 3 additions & 3 deletions src/transformers/utils/fx.py
@@ -1345,7 +1345,7 @@ def to_meta(value):

  return self.graph

- def _stateless_mod_instanciation_depends_on_proxies(self, mod: nn.Module) -> bool:
+ def _stateless_mod_instantiation_depends_on_proxies(self, mod: nn.Module) -> bool:
  """
  Whether the module was instantiated with Proxies. If that is the case, such module cannot be a leaf module
  because its attributes are input-dependent.
@@ -1358,7 +1358,7 @@ def _insert_module_as_submodule(self, mod: nn.Module) -> str:
  """
  # If one of the module attributes is a Proxy, it means that its instantiation is input-dependent.
  # It is not possible to insert such modules, those should be traced through.
- if self._stateless_mod_instanciation_depends_on_proxies(mod):
+ if self._stateless_mod_instantiation_depends_on_proxies(mod):
  return ""
  idx = 0
  mod_name = mod.__class__.__name__.lower()
@@ -1394,7 +1394,7 @@ def path_of_module(self, mod: nn.Module) -> str:
  raise e

  def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
- return (not self._stateless_mod_instanciation_depends_on_proxies(m)) and super().is_leaf_module(
+ return (not self._stateless_mod_instantiation_depends_on_proxies(m)) and super().is_leaf_module(
  m, module_qualified_name
  )