2 changes: 1 addition & 1 deletion src/transformers/generation/flax_utils.py
@@ -221,7 +221,7 @@ def _expand_to_num_beams(tensor, num_beams):
  def _adapt_logits_for_beam_search(self, logits):
  """
  This function can be overwritten in the specific modeling_flax_<model-name>.py classes to allow for custom beam
- search behavior. Note that the only model that overwrites this method is [`~transformes.FlaxMarianMTModel`].
+ search behavior. Note that the only model that overwrites this method is [`~transformers.FlaxMarianMTModel`].
  """
  return logits
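For context on the hook documented above: `_adapt_logits_for_beam_search` lets a model tweak the logits seen by the Flax beam-search loop. Below is a minimal sketch of what such an adaptation might look like; the function name, the assumed logits shape, and the choice to mask the pad token are illustrative assumptions, not the actual `FlaxMarianMTModel` implementation.

```python
import jax.numpy as jnp

def adapt_logits_for_beam_search(logits: jnp.ndarray, pad_token_id: int) -> jnp.ndarray:
    """Hypothetical adaptation: keep beam search from ever selecting `pad_token_id`
    by pushing its logit to -inf. Assumes logits of shape (batch, num_beams, vocab_size)."""
    return logits.at[:, :, pad_token_id].set(-jnp.inf)
```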
10 changes: 5 additions & 5 deletions src/transformers/kernels/yoso/fast_lsh_cumulation_cuda.cu
@@ -779,12 +779,12 @@ __global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel(

  __syncthreads();

- int num_distint_query = query_counter[0];
+ int num_distinct_query = query_counter[0];

- if (num_distint_query > 0) {
- for (int idx_base = 0; idx_base < num_distint_query; idx_base = idx_base + num_warps) {
+ if (num_distinct_query > 0) {
+ for (int idx_base = 0; idx_base < num_distinct_query; idx_base = idx_base + num_warps) {
  int idx = idx_base + warp_idx;
- if (idx < num_distint_query) {
+ if (idx < num_distinct_query) {
  int query_idx = inserted_query[idx];
  int batch_idx__query_idx = batch_idx * num_query + query_idx;

@@ -813,7 +813,7 @@ __global__ void lsh_weighted_cumulation_ver4_step2_cuda_kernel(
  }
  } else {

- // all computation is completed if num_distint_query == 0
+ // all computation is completed if num_distinct_query == 0
  break;

  }
2 changes: 1 addition & 1 deletion src/transformers/models/clap/modeling_clap.py
@@ -1717,7 +1717,7 @@ def forward(
  >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
  >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

- >>> input_text = ["Sound of a dog", "Sound of vaccum cleaner"]
+ >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

  >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)
2 changes: 1 addition & 1 deletion src/transformers/models/clipseg/modeling_clipseg.py
@@ -625,7 +625,7 @@ def forward(
  input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
  ]
  else:
- # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible)
+ # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
  pooled_output = last_hidden_state[
  torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
  # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
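The comment in the hunk above about taking "the first position of `eos_token_id`" relies on a small trick: comparing the ids against `eos_token_id` and calling `argmax` on the resulting mask returns the index of the first match, because `argmax` reports the earliest occurrence of the maximum value. A standalone illustration with made-up ids:

```python
import torch

input_ids = torch.tensor([[5, 7, 2, 0, 2]])  # hypothetical ids; suppose eos_token_id == 2
eos_token_id = 2

# The mask is 1 at every eos position; argmax returns the first index holding the maximum,
# i.e. the first eos token, even when eos appears more than once or pad shares its id.
first_eos_position = (input_ids == eos_token_id).int().argmax(dim=-1)
print(first_eos_position)  # tensor([2])
```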
@@ -355,7 +355,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.
  data_format (`str` or `ChannelDimension`, *optional*):
  The channel dimension format for the output image. Can be one of:
  - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
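To make the `background_color` behavior described in the docstring above concrete, here is a small self-contained sketch, not the library's implementation: when an integer is passed for a multi-channel image, only the first channel takes that value and the remaining channels fall back to 0. The function name, the channels-last layout, and the top-left placement of the image are assumptions for illustration.

```python
import numpy as np

def pad_to_square_sketch(image: np.ndarray, background_color=0) -> np.ndarray:
    """Pad an (height, width, channels) image to a square canvas."""
    height, width, num_channels = image.shape
    if isinstance(background_color, int):
        # Integer in multi-channel mode: the first channel gets the value, the rest default to 0.
        background_color = [background_color] + [0] * (num_channels - 1)
    size = max(height, width)
    canvas = np.empty((size, size, num_channels), dtype=image.dtype)
    for channel, value in enumerate(background_color):
        canvas[:, :, channel] = value
    canvas[:height, :width, :] = image  # place the original image (top-left here, for simplicity)
    return canvas

padded = pad_to_square_sketch(np.zeros((3, 5, 3), dtype=np.uint8), background_color=255)
print(padded.shape)  # (5, 5, 3)
```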
@@ -113,7 +113,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.

  Returns:
  `torch.Tensor`: The padded images.
@@ -428,7 +428,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.
  data_format (`str` or `ChannelDimension`, *optional*):
  The channel dimension format for the output image. Can be one of:
  - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
@@ -147,7 +147,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.

  Returns:
  `torch.Tensor`: The padded images.
@@ -75,9 +75,9 @@ def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.T

  Args:
  router_probs (`torch.Tensor`):
- Probability assigned to each expert per token. Shape: [batch_size, seqeunce_length, num_experts].
+ Probability assigned to each expert per token. Shape: [batch_size, sequence_length, num_experts].
  expert_indices (`torch.Tensor`):
- Indices tensor of shape [batch_size, seqeunce_length] identifying the selected expert for a given token.
+ Indices tensor of shape [batch_size, sequence_length] identifying the selected expert for a given token.

  Returns:
  The auxiliary loss.
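To make the shapes in the `load_balancing_loss_func` docstring concrete, the sketch below shows how a Switch-Transformers-style load-balancing auxiliary loss is typically computed from `router_probs` and `expert_indices`. It illustrates the general technique only and is not a copy of the function being edited.

```python
import torch

def load_balancing_loss_sketch(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> torch.Tensor:
    """router_probs: [batch_size, sequence_length, num_experts]; expert_indices: [batch_size, sequence_length]."""
    num_experts = router_probs.shape[-1]
    # Fraction of tokens actually routed to each expert.
    expert_mask = torch.nn.functional.one_hot(expert_indices.long(), num_experts).float()
    tokens_per_expert = expert_mask.mean(dim=(0, 1))
    # Average router probability assigned to each expert.
    router_prob_per_expert = router_probs.mean(dim=(0, 1))
    # Minimized when both distributions are uniform across experts.
    return num_experts * torch.sum(tokens_per_expert * router_prob_per_expert)

probs = torch.softmax(torch.randn(2, 4, 8), dim=-1)
print(load_balancing_loss_sketch(probs, probs.argmax(dim=-1)))
```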
4 changes: 2 additions & 2 deletions src/transformers/models/dpt/image_processing_dpt_fast.py
@@ -62,7 +62,7 @@
  class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
  """
  ensure_multiple_of (`int`, *optional*, defaults to 1):
- If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden
+ If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
  by `ensure_multiple_of` in `preprocess`.
  do_pad (`bool`, *optional*, defaults to `False`):
  Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
@@ -72,7 +72,7 @@ class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
  DINOv2 paper, which uses the model in combination with DPT.
  keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
  If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
- be overidden by `keep_aspect_ratio` in `preprocess`.
+ be overridden by `keep_aspect_ratio` in `preprocess`.
  do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
  Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
  is used for background, and background itself is not included in all classes of a dataset (e.g.
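The two kwargs documented above control how DPT-style resizing behaves. A hedged usage sketch follows; the checkpoint name and the local image path are placeholders, and passing these kwargs at call time is assumed from the "can be overridden in `preprocess`" note.

```python
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")  # placeholder checkpoint
image = Image.open("example.jpg")  # placeholder local image

# Resize while keeping the aspect ratio, snapping height/width to multiples of 32.
inputs = processor(
    images=image,
    do_resize=True,
    keep_aspect_ratio=True,
    ensure_multiple_of=32,
    return_tensors="pt",
)
print(inputs["pixel_values"].shape)
```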
4 changes: 2 additions & 2 deletions src/transformers/models/dpt/modular_dpt.py
@@ -92,7 +92,7 @@ def constrain_to_multiple_of(val, multiple, min_val=0, max_val=None):
  class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
  """
  ensure_multiple_of (`int`, *optional*, defaults to 1):
- If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden
+ If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overridden
  by `ensure_multiple_of` in `preprocess`.
  do_pad (`bool`, *optional*, defaults to `False`):
  Whether to apply center padding. This was introduced in the DINOv2 paper, which uses the model in
@@ -102,7 +102,7 @@ class DPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
  DINOv2 paper, which uses the model in combination with DPT.
  keep_aspect_ratio (`bool`, *optional*, defaults to `False`):
  If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can
- be overidden by `keep_aspect_ratio` in `preprocess`.
+ be overridden by `keep_aspect_ratio` in `preprocess`.
  do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
  Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
  is used for background, and background itself is not included in all classes of a dataset (e.g.
@@ -19,7 +19,7 @@

  class EfficientLoFTRConfig(PretrainedConfig):
  r"""
- This is the configuration class to store the configuration of a [`EffientLoFTRFromKeypointMatching`].
+ This is the configuration class to store the configuration of a [`EfficientLoFTRFromKeypointMatching`].
  It is used to instantiate a EfficientLoFTR model according to the specified arguments, defining the model
  architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
  EfficientLoFTR [zju-community/efficientloftr](https://huggingface.co/zju-community/efficientloftr) architecture.
@@ -417,7 +417,7 @@ def merge_tp_weights(model_path, output_path, vllm_config_path=None):
  )
  layer_i += 1

- # Embedd Model, LM Head, and Norm
+ # Embedded Model, LM Head, and Norm
  embed_tokens = merge_tensors(
  tp_sd=mgt_sd[0],
  keys=["model", "embedding.word_embeddings.weight"],
2 changes: 1 addition & 1 deletion src/transformers/models/groupvit/modeling_groupvit.py
@@ -999,7 +999,7 @@ def forward(
  input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
  ]
  else:
- # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible)
+ # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
  pooled_output = last_hidden_state[
  torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
  # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
2 changes: 1 addition & 1 deletion src/transformers/models/janus/image_processing_janus.py
@@ -352,7 +352,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.
  data_format (`str` or `ChannelDimension`, *optional*):
  The channel dimension format for the output image. Can be one of:
  - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
@@ -119,7 +119,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.

  Returns:
  `torch.Tensor`: The padded images.
2 changes: 1 addition & 1 deletion src/transformers/models/janus/modular_janus.py
@@ -1368,7 +1368,7 @@ def pad_to_square(
  background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
  The color to use for the padding. Can be an integer for single channel or a
  tuple of integers representing for multi-channel images. If passed as integer
- in mutli-channel mode, it will default to `0` in subsequent channels.
+ in multi-channel mode, it will default to `0` in subsequent channels.
  data_format (`str` or `ChannelDimension`, *optional*):
  The channel dimension format for the output image. Can be one of:
  - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
2 changes: 1 addition & 1 deletion src/transformers/models/mimi/configuration_mimi.py
@@ -120,7 +120,7 @@ class MimiConfig(PretrainedConfig):
  attention_dropout (`float`, *optional*, defaults to 0.0):
  The dropout ratio for the attention probabilities.
  layer_scale_initial_scale (`float`, *optional*, defaults to 0.01):
- Initiale scale of the residual rescaling operation done in the Transformer models.
+ Initial scale of the residual rescaling operation done in the Transformer models.
  attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
  Whether to use a bias in the query, key, value and output projection layers during self-attention.
  Example:
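For readers unfamiliar with the `layer_scale_initial_scale` option documented above, a layer-scale module typically multiplies a residual branch by a small learnable per-channel factor initialized to a value such as 0.01. The sketch below shows the general pattern; it is not Mimi's exact module.

```python
import torch
from torch import nn

class LayerScaleSketch(nn.Module):
    """Rescale a residual branch by a learnable per-channel factor (illustrative only)."""

    def __init__(self, hidden_size: int, initial_scale: float = 0.01):
        super().__init__()
        self.scale = nn.Parameter(initial_scale * torch.ones(hidden_size))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.scale * hidden_states

residual = torch.randn(2, 10, 512)
print(LayerScaleSketch(512)(residual).shape)  # torch.Size([2, 10, 512])
```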
2 changes: 1 addition & 1 deletion src/transformers/models/mvp/modeling_mvp.py
@@ -246,7 +246,7 @@ def forward(
  attn_output = attn_output.transpose(1, 2)

  # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
- # partitioned aross GPUs when using tensor-parallelism.
+ # partitioned across GPUs when using tensor-parallelism.
  attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

  attn_output = self.out_proj(attn_output)
2 changes: 1 addition & 1 deletion src/transformers/models/owlv2/modeling_owlv2.py
@@ -461,7 +461,7 @@ def forward(
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)

  if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
  # make sure that attn_weights keeps its gradient.
  # In order to do so, attn_weights have to reshaped
  # twice and have to be reused in the following
2 changes: 1 addition & 1 deletion src/transformers/models/owlvit/modeling_owlvit.py
@@ -449,7 +449,7 @@ def forward(
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)

  if output_attentions:
- # this operation is a bit akward, but it's required to
+ # this operation is a bit awkward, but it's required to
  # make sure that attn_weights keeps its gradient.
  # In order to do so, attn_weights have to reshaped
  # twice and have to be reused in the following
2 changes: 1 addition & 1 deletion src/transformers/models/speecht5/modeling_speecht5.py
@@ -1000,7 +1000,7 @@ def forward(
  attn_output = attn_output.transpose(1, 2)

  # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
- # partitioned aross GPUs when using tensor-parallelism.
+ # partitioned across GPUs when using tensor-parallelism.
  attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

  attn_output = self.out_proj(attn_output)
2 changes: 1 addition & 1 deletion src/transformers/models/vits/modeling_vits.py
@@ -962,7 +962,7 @@ def forward(
  attn_output = attn_output.transpose(1, 2)

  # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
- # partitioned aross GPUs when using tensor-parallelism.
+ # partitioned across GPUs when using tensor-parallelism.
  attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

  attn_output = self.out_proj(attn_output)
2 changes: 1 addition & 1 deletion src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -384,7 +384,7 @@ def setup(self):
  for i in range(self.config.num_feat_extract_layers)
  ]
  elif self.config.feat_extract_norm == "group":
- raise NotImplementedError("At the moment only ``config.feat_extact_norm == 'layer'`` is supported")
+ raise NotImplementedError("At the moment only ``config.feat_extract_norm == 'layer'`` is supported")
  else:
  raise ValueError(
  f"`config.feat_extract_norm` is {self.config.feat_extract_norm}, but has to be one of ['group',"
2 changes: 1 addition & 1 deletion src/transformers/pipelines/image_feature_extraction.py
@@ -32,7 +32,7 @@ class ImageFeatureExtractionPipeline(Pipeline):

  >>> extractor = pipeline(model="google/vit-base-patch16-224", task="image-feature-extraction")
  >>> result = extractor("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", return_tensors=True)
- >>> result.shape # This is a tensor of shape [1, sequence_lenth, hidden_dimension] representing the input image.
+ >>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input image.
  torch.Size([1, 197, 768])
  ```
8 changes: 4 additions & 4 deletions src/transformers/pipelines/text_generation.py
@@ -483,16 +483,16 @@ def postprocess(
  generated_sequence = generated_sequence.numpy().tolist()
  records = []
  other_outputs = model_outputs.get("additional_outputs", {})
- splitted_keys = {}
+ split_keys = {}
  if other_outputs:
  if self.framework == "pt":
  for k, v in other_outputs.items():
  if isinstance(v, torch.Tensor) and v.shape[0] == len(generated_sequence):
- splitted_keys[k] = v.numpy().tolist()
+ split_keys[k] = v.numpy().tolist()
  elif self.framework == "tf":
  for k, v in other_outputs.items():
  if isinstance(v, tf.Tensor) and v.shape[0] == len(generated_sequence):
- splitted_keys[k] = v.numpy().tolist()
+ split_keys[k] = v.numpy().tolist()

  skip_special_tokens = skip_special_tokens if skip_special_tokens is not None else True
  for idx, sequence in enumerate(generated_sequence):
@@ -539,7 +539,7 @@ def postprocess(
  # When we're not starting from a prefill, the output is a new assistant message
  all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}]
  record = {"generated_text": all_text}
- for key, values in splitted_keys.items():
+ for key, values in split_keys.items():
  record[key] = values[idx]
  records.append(record)
@@ -50,7 +50,7 @@ class ZeroShotAudioClassificationPipeline(Pipeline):
  >>> audio = next(iter(dataset["train"]["audio"]))["array"]
  >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
  >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
- [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vaccum cleaner'}]
+ [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vacuum cleaner'}]
  ```
6 changes: 3 additions & 3 deletions src/transformers/utils/fx.py
@@ -1345,7 +1345,7 @@ def to_meta(value):

  return self.graph

- def _stateless_mod_instanciation_depends_on_proxies(self, mod: nn.Module) -> bool:
+ def _stateless_mod_instantiation_depends_on_proxies(self, mod: nn.Module) -> bool:
  """
  Whether the module was instantiated with Proxies. If that is the case, such module cannot be a leaf module
  because its attributes are input-dependent.
@@ -1358,7 +1358,7 @@ def _insert_module_as_submodule(self, mod: nn.Module) -> str:
  """
  # If one of the module attributes is a Proxy, it means that its instantiation is input-dependent.
  # It is not possible to insert such modules, those should be traced through.
- if self._stateless_mod_instanciation_depends_on_proxies(mod):
+ if self._stateless_mod_instantiation_depends_on_proxies(mod):
  return ""
  idx = 0
  mod_name = mod.__class__.__name__.lower()
@@ -1394,7 +1394,7 @@ def path_of_module(self, mod: nn.Module) -> str:
  raise e

  def is_leaf_module(self, m: torch.nn.Module, module_qualified_name: str) -> bool:
- return (not self._stateless_mod_instanciation_depends_on_proxies(m)) and super().is_leaf_module(
+ return (not self._stateless_mod_instantiation_depends_on_proxies(m)) and super().is_leaf_module(
  m, module_qualified_name
  )