diff --git a/src/transformers/models/florence2/modeling_florence2.py b/src/transformers/models/florence2/modeling_florence2.py
index 57a00843e32e..afa05e8e3c91 100644
--- a/src/transformers/models/florence2/modeling_florence2.py
+++ b/src/transformers/models/florence2/modeling_florence2.py
@@ -25,7 +25,6 @@
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import Seq2SeqLMOutput, Seq2SeqModelOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
@@ -726,7 +725,6 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
     ) -> Union[tuple, Florence2Seq2SeqModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -777,7 +775,6 @@ def forward(
             output_hidden_states=output_hidden_states,
             cache_position=cache_position,
             return_dict=True,
-            **kwargs,
         )
 
         return Florence2Seq2SeqModelOutput(
@@ -922,7 +919,7 @@ def forward(
             output_hidden_states=output_hidden_states,
             return_dict=True,
             cache_position=cache_position,
-            **kwargs,
+            # **kwargs,  ## TODO: add back when Bart attention is refactored and takes kwargs
         )
 
         hidden_states = outputs[0]
diff --git a/src/transformers/models/florence2/modular_florence2.py b/src/transformers/models/florence2/modular_florence2.py
index 03e0fd0535cf..12bf00ca253d 100644
--- a/src/transformers/models/florence2/modular_florence2.py
+++ b/src/transformers/models/florence2/modular_florence2.py
@@ -24,7 +24,6 @@
 from ...configuration_utils import PretrainedConfig
 from ...feature_extraction_utils import BatchFeature
 from ...image_utils import ImageInput
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_outputs import Seq2SeqLMOutput, Seq2SeqModelOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import MultiModalData, ProcessorMixin, Unpack
@@ -1569,7 +1568,6 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
     ) -> Union[tuple, Florence2Seq2SeqModelOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1620,7 +1618,6 @@ def forward(
             output_hidden_states=output_hidden_states,
             cache_position=cache_position,
             return_dict=True,
-            **kwargs,
         )
 
         return Florence2Seq2SeqModelOutput(
@@ -1731,7 +1728,7 @@ def forward(
             output_hidden_states=output_hidden_states,
             return_dict=True,
             cache_position=cache_position,
-            **kwargs,
+            # **kwargs,  ## TODO: add back when Bart attention is refactored and takes kwargs
         )
 
         hidden_states = outputs[0]