Merged
16 changes: 8 additions & 8 deletions src/transformers/generation/utils.py
@@ -166,7 +166,7 @@ class GenerateDecoderOnlyOutput(ModelOutput):
hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
`torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
past_key_values (`Cache`, *optional*, returned when `use_cache=True`):
Returns the model cache, used to speed up decoding. Different models have a different cache format, check
the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
"""
@@ -176,7 +176,7 @@ class GenerateDecoderOnlyOutput(ModelOutput):
logits: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
past_key_values: Optional[Cache] = None


@dataclass
@@ -211,7 +211,7 @@ class GenerateEncoderDecoderOutput(ModelOutput):
decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
`torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
past_key_values (`Cache`, *optional*, returned when `use_cache=True`):
Returns the model cache, used to speed up decoding. Different models have a different cache format, check
the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
"""
@@ -224,7 +224,7 @@ class GenerateEncoderDecoderOutput(ModelOutput):
decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
cross_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
past_key_values: Optional[Cache] = None


@dataclass
@@ -256,7 +256,7 @@ class GenerateBeamDecoderOnlyOutput(ModelOutput):
hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
`torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
past_key_values (`Cache`, *optional*, returned when `use_cache=True`):
Returns the model cache, used to speed up decoding. Different models have a different cache format, check
the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
"""
@@ -268,7 +268,7 @@ class GenerateBeamDecoderOnlyOutput(ModelOutput):
beam_indices: Optional[torch.LongTensor] = None
attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
past_key_values: Optional[Cache] = None


@dataclass
@@ -310,7 +310,7 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput):
decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
`torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
past_key_values (`tuple(tuple(torch.FloatTensor)))`, *optional*, returned when `use_cache=True`):
past_key_values (`Cache`, *optional*, returned when `use_cache=True`):
Returns the model cache, used to speed up decoding. Different models have a different cache format, check
the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
"""
@@ -325,7 +325,7 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput):
decoder_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
cross_attentions: Optional[tuple[tuple[torch.FloatTensor]]] = None
decoder_hidden_states: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[tuple[tuple[tuple[torch.FloatTensor]]]] = None
past_key_values: Optional[Cache] = None


# TODO (joao): remove the equivalent classes and typing shortcuts below in v5
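To make the type change above concrete, here is a minimal sketch of how the returned cache now surfaces from `generate()`. The checkpoint name and the exact `Cache` subclass are illustrative assumptions, not part of this diff:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hello", return_tensors="pt")
out = model.generate(
    **inputs, max_new_tokens=5, use_cache=True, return_dict_in_generate=True
)

# past_key_values is now annotated as Optional[Cache] rather than nested tuples;
# for most decoder-only models it is a DynamicCache at runtime.
print(type(out.past_key_values))
print(isinstance(out.past_key_values, DynamicCache))
```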
8 changes: 3 additions & 5 deletions src/transformers/models/aria/modeling_aria.py
@@ -873,8 +873,7 @@ class AriaCausalLMOutputWithPast(ModelOutput):
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
Member:
Thanks, my prev bulk update logic was incorrect and left out a lot apparently 😆


Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
@@ -885,7 +884,7 @@ class AriaCausalLMOutputWithPast(ModelOutput):

loss: Optional[torch.FloatTensor] = None
logits: Optional[torch.FloatTensor] = None
past_key_values: Optional[list[torch.FloatTensor]] = None
past_key_values: Optional[Cache] = None
hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[tuple[torch.FloatTensor]] = None
image_hidden_states: Optional[torch.FloatTensor] = None
@@ -900,8 +899,7 @@ class AriaModelOutputWithPast(BaseModelOutputWithPast):
class AriaModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
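Since these VLM outputs now carry a real `Cache`, a quick sketch of per-layer access; GPT-2 stands in for an Aria checkpoint here (the cache API is shared), and the indexing semantics assume `DynamicCache`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2")  # stand-in for Aria
tokenizer = AutoTokenizer.from_pretrained("gpt2")

out = model(tokenizer("hello", return_tensors="pt").input_ids, use_cache=True)
cache = out.past_key_values

# The cache supports per-layer indexing: each item is a (key_states, value_states) pair
keys, values = cache[0]
print(len(cache), keys.shape)  # num layers, (batch, num_heads, seq_len, head_dim)
```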
31 changes: 12 additions & 19 deletions src/transformers/models/autoformer/modeling_autoformer.py
@@ -62,11 +62,8 @@ class AutoFormerDecoderOutput(ModelOutput):
hidden_size)` is output.
trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Trend tensor for each time series.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
`config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
encoder_sequence_length, embed_size_per_head)`.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
`config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
@@ -81,7 +78,7 @@ class AutoFormerDecoderOutput(ModelOutput):

last_hidden_state: Optional[torch.FloatTensor] = None
trend: Optional[torch.FloatTensor] = None
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Cache] = None
hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[tuple[torch.FloatTensor]] = None
cross_attentions: Optional[tuple[torch.FloatTensor]] = None
@@ -102,10 +99,8 @@ class AutoformerModelOutput(ModelOutput):
hidden_size)` is output.
trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Trend tensor for each time series.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
@@ -121,7 +116,7 @@ class AutoformerModelOutput(ModelOutput):

last_hidden_state: Optional[torch.FloatTensor] = None
trend: Optional[torch.FloatTensor] = None
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
past_key_values: Optional[Cache] = None
decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
cross_attentions: Optional[tuple[torch.FloatTensor]] = None
@@ -781,7 +776,7 @@ def forward(
`(encoder_attention_heads,)`.
cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
size `(decoder_attention_heads,)`.
past_key_values (`Tuple(torch.FloatTensor)`): cached past key and value projection states
past_key_values (`Cache`): cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
@@ -1064,7 +1059,7 @@ def forward(
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[Cache] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
@@ -1107,10 +1102,8 @@ def forward(
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.

past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
@@ -1440,7 +1433,7 @@ def forward(
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[Cache] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
use_cache: Optional[bool] = None,
@@ -1708,7 +1701,7 @@ def forward(
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[Cache] = None,
output_hidden_states: Optional[bool] = None,
output_attentions: Optional[bool] = None,
use_cache: Optional[bool] = None,
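The rewritten docstrings all say the cache exists "to speed up sequential decoding". A minimal sketch of that pattern, using an assumed decoder-only checkpoint rather than Autoformer itself (whose forward takes time-series features):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

input_ids = tokenizer("The quick brown", return_tensors="pt").input_ids
out = model(input_ids, use_cache=True)
cache = out.past_key_values  # a Cache holding key/value states for the prefix

# Later steps feed only the newly generated token plus the cache,
# instead of re-encoding the whole prefix each time.
next_token = out.logits[:, -1:].argmax(-1)
out = model(next_token, past_key_values=cache, use_cache=True)
```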
8 changes: 3 additions & 5 deletions src/transformers/models/aya_vision/modeling_aya_vision.py
@@ -118,8 +118,7 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
@@ -130,7 +129,7 @@ class AyaVisionCausalLMOutputWithPast(ModelOutput):

loss: Optional[torch.FloatTensor] = None
logits: Optional[torch.FloatTensor] = None
past_key_values: Optional[list[torch.FloatTensor]] = None
past_key_values: Optional[Cache] = None
hidden_states: Optional[tuple[torch.FloatTensor]] = None
attentions: Optional[tuple[torch.FloatTensor]] = None
image_hidden_states: Optional[torch.FloatTensor] = None
@@ -145,8 +144,7 @@ class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
class AyaVisionModelOutputWithPast(BaseModelOutputWithPast):
r"""
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
4 changes: 2 additions & 2 deletions src/transformers/models/bark/modeling_bark.py
@@ -23,7 +23,7 @@
from torch import nn
from torch.nn import functional as F

from ...cache_utils import DynamicCache
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...generation.logits_process import (
AlternatingCodebooksLogitsProcessor,
@@ -437,7 +437,7 @@ def prepare_inputs_for_generation(
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
past_key_values: Optional[tuple[torch.FloatTensor]] = None,
past_key_values: Optional[Cache] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
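This hunk widens the import so the signature can name `Cache` directly. A hedged sketch of passing an explicitly constructed cache through `forward()`; GPT-2 is used as a stand-in because Bark's sub-models take extra arguments:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
input_ids = tokenizer("Hi", return_tensors="pt").input_ids

cache = DynamicCache()          # empty cache, filled in place by forward()
out = model(input_ids, past_key_values=cache, use_cache=True)
print(cache.get_seq_length())   # number of cached positions
```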
6 changes: 2 additions & 4 deletions src/transformers/models/bart/modeling_bart.py
@@ -405,7 +405,7 @@ def forward(
`(encoder_attention_heads,)`.
cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
size `(decoder_attention_heads,)`.
past_key_values (`Tuple(torch.FloatTensor)`): cached past key and value projection states
past_key_values (`Cache`): cached past key and value projection states
Member:
I realized the description is so random from model to model. Would be nice to consolidate all with the "auto-docstring" decorator. I like the cache docs in there, it has more details about correct usage.

Member Author:
Agreed!

I also noticed that we don't have perfect inheritance on some output classes (e.g. VLM output classes), which leads to redundant docstrings. I started by changing them, but got errors -- decided not to do them in this PR, to keep the two different changes isolated. More autodocstrings is something we can definitely work on.

output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
@@ -988,9 +988,7 @@ def forward(
- 0 indicates the head is **masked**.

past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
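For encoder-decoder models like BART, the docstring notes that the cache covers both the self-attention and cross-attention blocks. A hedged sketch of what comes back at runtime (the checkpoint name and the concrete cache class are assumptions):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

batch = tokenizer("hello world", return_tensors="pt")
out = model(**batch, decoder_input_ids=batch.input_ids[:, :1], use_cache=True)

# Typically an EncoderDecoderCache wrapping a self-attention cache and a
# cross-attention cache; both sides are Cache instances.
print(type(out.past_key_values))
```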
6 changes: 3 additions & 3 deletions src/transformers/models/bert/modeling_bert.py
@@ -613,7 +613,7 @@ def forward(
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
@@ -887,7 +887,7 @@ def forward(
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[list[torch.FloatTensor]] = None,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
@@ -1171,7 +1171,7 @@ def forward(
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
past_key_values: Optional[list[torch.Tensor]] = None,
past_key_values: Optional[Cache] = None,
Member:
Small note on some pretrained model classes: we technically support the old cache format until 4.58 on them and then convert to the new cache format in the base model. Though it will be painful to revert only these classes, so I guess we can keep as is.

Member Author:
yeah, I thought it would be best to only document the non-deprecated case, to save us work 🤗

use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
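The thread above mentions that some pretrained classes still accept the old tuple format until 4.58 and convert internally. A hedged sketch of that conversion using the documented helper; the tensors here are dummies with assumed GPT-2-like shapes:

```python
import torch
from transformers import DynamicCache

# Legacy format: one (key, value) pair per layer, each of shape
# (batch_size, num_heads, seq_len, head_dim).
legacy = tuple(
    (torch.zeros(1, 12, 4, 64), torch.zeros(1, 12, 4, 64)) for _ in range(12)
)

cache = DynamicCache.from_legacy_cache(legacy)
print(cache.get_seq_length())         # 4 cached positions
round_trip = cache.to_legacy_cache()  # back to nested tuples if needed
assert round_trip[0][0].shape == (1, 12, 4, 64)
```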