diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py
index c720b379551f..c99133f257a5 100644
--- a/src/diffusers/models/attention.py
+++ b/src/diffusers/models/attention.py
@@ -674,7 +674,7 @@ def forward(
         encoder_hidden_states: torch.FloatTensor,
         temb: torch.FloatTensor,
         joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         joint_attention_kwargs = joint_attention_kwargs or {}
         if self.use_dual_attention:
             norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
diff --git a/src/diffusers/models/transformers/auraflow_transformer_2d.py b/src/diffusers/models/transformers/auraflow_transformer_2d.py
index a8d275d14214..4d7d1ba40e92 100644
--- a/src/diffusers/models/transformers/auraflow_transformer_2d.py
+++ b/src/diffusers/models/transformers/auraflow_transformer_2d.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -92,7 +92,7 @@ def pe_selection_index_based_on_dim(self, h, w):
 
         return selected_indices
 
-    def forward(self, latent):
+    def forward(self, latent) -> torch.Tensor:
         batch_size, num_channels, height, width = latent.size()
         latent = latent.view(
             batch_size,
@@ -173,7 +173,7 @@ def forward(
         hidden_states: torch.FloatTensor,
         temb: torch.FloatTensor,
         attention_kwargs: Optional[Dict[str, Any]] = None,
-    ):
+    ) -> torch.Tensor:
         residual = hidden_states
         attention_kwargs = attention_kwargs or {}
 
@@ -242,7 +242,7 @@ def forward(
         encoder_hidden_states: torch.FloatTensor,
         temb: torch.FloatTensor,
         attention_kwargs: Optional[Dict[str, Any]] = None,
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         residual = hidden_states
         residual_context = encoder_hidden_states
         attention_kwargs = attention_kwargs or {}
@@ -472,7 +472,7 @@ def forward(
         timestep: torch.LongTensor = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
-    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         if attention_kwargs is not None:
             attention_kwargs = attention_kwargs.copy()
             lora_scale = attention_kwargs.pop("scale", 1.0)
diff --git a/src/diffusers/models/transformers/cogvideox_transformer_3d.py b/src/diffusers/models/transformers/cogvideox_transformer_3d.py
index a8c98bccb86c..50381096903c 100644
--- a/src/diffusers/models/transformers/cogvideox_transformer_3d.py
+++ b/src/diffusers/models/transformers/cogvideox_transformer_3d.py
@@ -122,7 +122,7 @@ def forward(
         temb: torch.Tensor,
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         text_seq_length = encoder_hidden_states.size(1)
         attention_kwargs = attention_kwargs or {}
 
@@ -441,7 +441,7 @@ def forward(
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
-    ):
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         if attention_kwargs is not None:
             attention_kwargs = attention_kwargs.copy()
             lora_scale = attention_kwargs.pop("scale", 1.0)
diff --git a/src/diffusers/models/transformers/consisid_transformer_3d.py b/src/diffusers/models/transformers/consisid_transformer_3d.py
index 41632dbd4751..91fe811f0013 100644
--- a/src/diffusers/models/transformers/consisid_transformer_3d.py
+++ b/src/diffusers/models/transformers/consisid_transformer_3d.py
@@ -315,7 +315,7 @@ def forward(
         encoder_hidden_states: torch.Tensor,
         temb: torch.Tensor,
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         text_seq_length = encoder_hidden_states.size(1)
 
         # norm & modulate
@@ -691,7 +691,7 @@ def forward(
         id_cond: Optional[torch.Tensor] = None,
         id_vit_hidden: Optional[torch.Tensor] = None,
         return_dict: bool = True,
-    ):
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         if attention_kwargs is not None:
             attention_kwargs = attention_kwargs.copy()
             lora_scale = attention_kwargs.pop("scale", 1.0)
diff --git a/src/diffusers/models/transformers/lumina_nextdit2d.py b/src/diffusers/models/transformers/lumina_nextdit2d.py
index 84b1175386b0..bed5e69c2d36 100644
--- a/src/diffusers/models/transformers/lumina_nextdit2d.py
+++ b/src/diffusers/models/transformers/lumina_nextdit2d.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -124,7 +124,7 @@ def forward(
         encoder_mask: torch.Tensor,
         temb: torch.Tensor,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-    ):
+    ) -> torch.Tensor:
         """
         Perform a forward pass through the LuminaNextDiTBlock.
 
@@ -297,7 +297,7 @@ def forward(
         image_rotary_emb: torch.Tensor,
         cross_attention_kwargs: Dict[str, Any] = None,
         return_dict=True,
-    ) -> torch.Tensor:
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         """
         Forward pass of LuminaNextDiT.
 
diff --git a/src/diffusers/models/transformers/transformer_bria.py b/src/diffusers/models/transformers/transformer_bria.py
index 27a9941501a1..04a9c5645c81 100644
--- a/src/diffusers/models/transformers/transformer_bria.py
+++ b/src/diffusers/models/transformers/transformer_bria.py
@@ -472,7 +472,7 @@ def forward(
         temb: torch.Tensor,
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         text_seq_len = encoder_hidden_states.shape[1]
         hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
 
@@ -588,7 +588,7 @@ def forward(
         return_dict: bool = True,
         controlnet_block_samples=None,
         controlnet_single_block_samples=None,
-    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         """
         The [`BriaTransformer2DModel`] forward method.
 
diff --git a/src/diffusers/models/transformers/transformer_cogview3plus.py b/src/diffusers/models/transformers/transformer_cogview3plus.py
index 77f15f6ca6f1..7356f4a606bb 100644
--- a/src/diffusers/models/transformers/transformer_cogview3plus.py
+++ b/src/diffusers/models/transformers/transformer_cogview3plus.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 
-from typing import Dict, Union
+from typing import Dict, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -79,7 +79,7 @@ def forward(
         hidden_states: torch.Tensor,
         encoder_hidden_states: torch.Tensor,
         emb: torch.Tensor,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         text_seq_length = encoder_hidden_states.size(1)
 
         # norm & modulate
@@ -293,7 +293,7 @@ def forward(
         target_size: torch.Tensor,
         crop_coords: torch.Tensor,
         return_dict: bool = True,
-    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         """
         The [`CogView3PlusTransformer2DModel`] forward method.
 
diff --git a/src/diffusers/models/transformers/transformer_cogview4.py b/src/diffusers/models/transformers/transformer_cogview4.py
index 25dcfa14cc0b..64e9a538a7c2 100644
--- a/src/diffusers/models/transformers/transformer_cogview4.py
+++ b/src/diffusers/models/transformers/transformer_cogview4.py
@@ -494,7 +494,7 @@ def forward(
         ] = None,
         attention_mask: Optional[Dict[str, torch.Tensor]] = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         # 1. Timestep conditioning
         (
             norm_hidden_states,
@@ -717,7 +717,7 @@ def forward(
         image_rotary_emb: Optional[
             Union[Tuple[torch.Tensor, torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]]
         ] = None,
-    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         if attention_kwargs is not None:
             attention_kwargs = attention_kwargs.copy()
             lora_scale = attention_kwargs.pop("scale", 1.0)
diff --git a/src/diffusers/models/transformers/transformer_hidream_image.py b/src/diffusers/models/transformers/transformer_hidream_image.py
index 77902dcf5852..4a5aee29abc4 100644
--- a/src/diffusers/models/transformers/transformer_hidream_image.py
+++ b/src/diffusers/models/transformers/transformer_hidream_image.py
@@ -55,7 +55,7 @@ def __init__(self, hidden_size, frequency_embedding_size=256):
         self.time_proj = Timesteps(num_channels=frequency_embedding_size, flip_sin_to_cos=True, downscale_freq_shift=0)
         self.timestep_embedder = TimestepEmbedding(in_channels=frequency_embedding_size, time_embed_dim=hidden_size)
 
-    def forward(self, timesteps: torch.Tensor, wdtype: Optional[torch.dtype] = None):
+    def forward(self, timesteps: torch.Tensor, wdtype: Optional[torch.dtype] = None) -> torch.Tensor:
         t_emb = self.time_proj(timesteps).to(dtype=wdtype)
         t_emb = self.timestep_embedder(t_emb)
         return t_emb
@@ -87,7 +87,7 @@ def __init__(
         self.out_channels = out_channels
         self.proj = nn.Linear(in_channels * patch_size * patch_size, out_channels, bias=True)
 
-    def forward(self, latent):
+    def forward(self, latent) -> torch.Tensor:
         latent = self.proj(latent)
         return latent
 
@@ -534,7 +534,7 @@ def forward(
         encoder_hidden_states: Optional[torch.Tensor] = None,
         temb: Optional[torch.Tensor] = None,
         image_rotary_emb: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         wtype = hidden_states.dtype
         (
             shift_msa_i,
@@ -592,7 +592,7 @@ def forward(
         encoder_hidden_states: Optional[torch.Tensor] = None,
         temb: Optional[torch.Tensor] = None,
         image_rotary_emb: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
         return self.block(
             hidden_states=hidden_states,
             hidden_states_masks=hidden_states_masks,
@@ -786,7 +786,7 @@ def forward(
         attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
         **kwargs,
-    ):
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         encoder_hidden_states = kwargs.get("encoder_hidden_states", None)
 
         if encoder_hidden_states is not None:
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video.py b/src/diffusers/models/transformers/transformer_hunyuan_video.py
index 6944a6c536b5..bc857ccab463 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video.py
@@ -529,7 +529,7 @@ def forward(
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         *args,
         **kwargs,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         text_seq_length = encoder_hidden_states.shape[1]
         hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
 
@@ -684,7 +684,7 @@ def forward(
         image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
         token_replace_emb: torch.Tensor = None,
         num_tokens: int = None,
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         text_seq_length = encoder_hidden_states.shape[1]
         hidden_states = torch.cat([hidden_states, encoder_hidden_states], dim=1)
 
@@ -1038,7 +1038,7 @@ def forward(
         guidance: torch.Tensor = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
-    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         if attention_kwargs is not None:
             attention_kwargs = attention_kwargs.copy()
             lora_scale = attention_kwargs.pop("scale", 1.0)
diff --git a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
index c2eb7fd2a705..60b40fff3cb8 100644
--- a/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
+++ b/src/diffusers/models/transformers/transformer_hunyuan_video_framepack.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -216,7 +216,7 @@ def forward(
         indices_latents_history_4x: Optional[torch.Tensor] = None,
         attention_kwargs: Optional[Dict[str, Any]] = None,
         return_dict: bool = True,
-    ):
+    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
         if attention_kwargs is not None:
             attention_kwargs = attention_kwargs.copy()
             lora_scale = attention_kwargs.pop("scale", 1.0)