diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py
index 399f5b14506d..16dcecd7b22a 100644
--- a/examples/community/clip_guided_images_mixing_stable_diffusion.py
+++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py
@@ -12,12 +12,12 @@
from diffusers import (
AutoencoderKL,
DDIMScheduler,
- DiffusionPipeline,
DPMSolverMultistepScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
UNet2DConditionModel,
)
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
from diffusers.utils import PIL_INTERPOLATION
from diffusers.utils.torch_utils import randn_tensor
@@ -77,7 +77,7 @@ def set_requires_grad(model, value):
param.requires_grad = value
-class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline):
+class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
def __init__(
self,
vae: AutoencoderKL,
@@ -113,16 +113,6 @@ def __init__(
set_requires_grad(self.text_encoder, False)
set_requires_grad(self.clip_model, False)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- self.enable_attention_slicing(None)
-
def freeze_vae(self):
set_requires_grad(self.vae, False)
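
The same refactor repeats across every community pipeline in this diff: hand-rolled copies of the memory and quality helpers are deleted, `enable_attention_slicing`/`disable_attention_slicing` fall back to the `DiffusionPipeline` base class, and the rest come from the new `StableDiffusionMixin`. A minimal sketch of that mixin, reconstructed from the method bodies deleted below (the canonical implementation lives in `diffusers/pipelines/pipeline_utils.py` and also carries the experimental QKV-fusion toggles shown later in this diff):

```python
# Sketch only, reconstructed from the duplicated helpers this diff deletes;
# the canonical class is diffusers.pipelines.pipeline_utils.StableDiffusionMixin.
class StableDiffusionMixin:
    def enable_vae_slicing(self):
        # Decode the latent batch slice by slice to save memory.
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
        # Decode/encode in spatial tiles so large images fit in memory.
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
        self.vae.disable_tiling()

    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
        # FreeU (https://arxiv.org/abs/2309.11497): reweight skip/backbone features.
        if not hasattr(self, "unet"):
            raise ValueError("The pipeline must have `unet` for using FreeU.")
        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)

    def disable_freeu(self):
        self.unet.disable_freeu()
```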
diff --git a/examples/community/clip_guided_stable_diffusion.py b/examples/community/clip_guided_stable_diffusion.py
index 3f4ab2ab9f4a..4205718802de 100644
--- a/examples/community/clip_guided_stable_diffusion.py
+++ b/examples/community/clip_guided_stable_diffusion.py
@@ -10,12 +10,12 @@
from diffusers import (
AutoencoderKL,
DDIMScheduler,
- DiffusionPipeline,
DPMSolverMultistepScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
UNet2DConditionModel,
)
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
@@ -51,7 +51,7 @@ def set_requires_grad(model, value):
param.requires_grad = value
-class CLIPGuidedStableDiffusion(DiffusionPipeline):
+class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
"""CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000
- https://github.com/Jack000/glid-3-xl
- https://github.dev/crowsonkb/k-diffusion
@@ -89,16 +89,6 @@ def __init__(
set_requires_grad(self.text_encoder, False)
set_requires_grad(self.clip_model, False)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- self.enable_attention_slicing(None)
-
def freeze_vae(self):
set_requires_grad(self.vae, False)
diff --git a/examples/community/clip_guided_stable_diffusion_img2img.py b/examples/community/clip_guided_stable_diffusion_img2img.py
index 2dbc9bef9ffe..434d5253679a 100644
--- a/examples/community/clip_guided_stable_diffusion_img2img.py
+++ b/examples/community/clip_guided_stable_diffusion_img2img.py
@@ -12,12 +12,12 @@
from diffusers import (
AutoencoderKL,
DDIMScheduler,
- DiffusionPipeline,
DPMSolverMultistepScheduler,
LMSDiscreteScheduler,
PNDMScheduler,
UNet2DConditionModel,
)
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
from diffusers.utils import PIL_INTERPOLATION, deprecate
from diffusers.utils.torch_utils import randn_tensor
@@ -125,7 +125,7 @@ def set_requires_grad(model, value):
param.requires_grad = value
-class CLIPGuidedStableDiffusion(DiffusionPipeline):
+class CLIPGuidedStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
"""CLIP guided stable diffusion based on the amazing repo by @crowsonkb and @Jack000
- https://github.com/Jack000/glid-3-xl
- https://github.dev/crowsonkb/k-diffusion
@@ -163,16 +163,6 @@ def __init__(
set_requires_grad(self.text_encoder, False)
set_requires_grad(self.clip_model, False)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- self.enable_attention_slicing(None)
-
def freeze_vae(self):
set_requires_grad(self.vae, False)
diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py
index 2693ae45afac..3153bd30e479 100644
--- a/examples/community/composable_stable_diffusion.py
+++ b/examples/community/composable_stable_diffusion.py
@@ -22,6 +22,7 @@
from diffusers import DiffusionPipeline
from diffusers.configuration_utils import FrozenDict
from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import (
@@ -32,13 +33,13 @@
LMSDiscreteScheduler,
PNDMScheduler,
)
-from diffusers.utils import deprecate, is_accelerate_available, logging
+from diffusers.utils import deprecate, logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class ComposableStableDiffusionPipeline(DiffusionPipeline):
+class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -164,62 +165,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
- if cpu_offloaded_model is not None:
- cpu_offload(cpu_offloaded_model, device)
-
- if self.safety_checker is not None:
- # TODO(Patrick) - there is currently a bug with cpu offload of nn.Parameter in accelerate
- # fix by only offloading self.safety_checker for now
- cpu_offload(self.safety_checker.vision_model, device)
-
- @property
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
r"""
Encodes the prompt into text encoder hidden states.
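
`composable_stable_diffusion` additionally loses its early copies of `enable_sequential_cpu_offload` and the `_execution_device` property; equivalents live on the `DiffusionPipeline` base class, so callers keep the same API. An illustrative load (model id and prompt are placeholders; the `|`-separated prompt is this pipeline's own composition syntax):

```python
import torch

from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # placeholder model id
    custom_pipeline="composable_stable_diffusion",
    torch_dtype=torch.float16,
)
# Submodule-level offload, now inherited from DiffusionPipeline (needs `accelerate`).
pipe.enable_sequential_cpu_offload()
image = pipe("a red sphere | a green cube", num_inference_steps=50).images[0]
```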
diff --git a/examples/community/gluegen.py b/examples/community/gluegen.py
index ecfe91eb9483..b8f147000229 100644
--- a/examples/community/gluegen.py
+++ b/examples/community/gluegen.py
@@ -10,6 +10,7 @@
from diffusers.loaders import LoraLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.lora import adjust_lora_scale_text_encoder
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -193,7 +194,7 @@ def retrieve_timesteps(
return timesteps, num_inference_steps
-class GlueGenStableDiffusionPipeline(DiffusionPipeline, LoraLoaderMixin):
+class GlueGenStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin, LoraLoaderMixin):
def __init__(
self,
vae: AutoencoderKL,
@@ -241,35 +242,6 @@ def load_language_adapter(
)
self.language_adapter.load_state_dict(torch.load(model_path))
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def _adapt_language(self, prompt_embeds: torch.FloatTensor):
prompt_embeds = prompt_embeds / 3
prompt_embeds = self.language_adapter(prompt_embeds) * (self.tensor_norm / 2)
@@ -544,32 +516,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
latents = latents * self.scheduler.init_noise_sigma
return latents
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py
index 98e06ceb21ac..25048e946fe0 100644
--- a/examples/community/imagic_stable_diffusion.py
+++ b/examples/community/imagic_stable_diffusion.py
@@ -19,6 +19,7 @@
from diffusers import DiffusionPipeline
from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -56,7 +57,7 @@ def preprocess(image):
return 2.0 * image - 1.0
-class ImagicStableDiffusionPipeline(DiffusionPipeline):
+class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for imagic image editing.
See paper here: https://arxiv.org/pdf/2210.09276.pdf
@@ -105,31 +106,6 @@ def __init__(
feature_extractor=feature_extractor,
)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
def train(
self,
prompt: Union[str, List[str]],
diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py
index 8ee8355d49a6..71dc3cf712ed 100644
--- a/examples/community/img2img_inpainting.py
+++ b/examples/community/img2img_inpainting.py
@@ -129,33 +129,6 @@ def __init__(
feature_extractor=feature_extractor,
)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
-
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
@torch.no_grad()
def __call__(
self,
diff --git a/examples/community/instaflow_one_step.py b/examples/community/instaflow_one_step.py
index 065abfe13d23..b07d85f8fcdf 100644
--- a/examples/community/instaflow_one_step.py
+++ b/examples/community/instaflow_one_step.py
@@ -24,7 +24,7 @@
from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -52,7 +52,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
return noise_cfg
-class InstaFlowPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin):
+class InstaFlowPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+):
r"""
Pipeline for text-to-image generation using Rectified Flow and Euler discretization.
This customized pipeline is based on StableDiffusionPipeline from the official Diffusers library (0.21.4)
@@ -180,35 +182,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def _encode_prompt(
self,
prompt,
diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py
index 70e4d025a037..1b859c35f174 100644
--- a/examples/community/interpolate_stable_diffusion.py
+++ b/examples/community/interpolate_stable_diffusion.py
@@ -7,9 +7,9 @@
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
-from diffusers import DiffusionPipeline
from diffusers.configuration_utils import FrozenDict
from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -46,7 +46,7 @@ def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
return v2
-class StableDiffusionWalkPipeline(DiffusionPipeline):
+class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -120,33 +120,6 @@ def __init__(
feature_extractor=feature_extractor,
)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
-
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
@torch.no_grad()
def __call__(
self,
diff --git a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py
index dfd6a9df6eb1..b4d2446b5ce9 100644
--- a/examples/community/ip_adapter_face_id.py
+++ b/examples/community/ip_adapter_face_id.py
@@ -26,9 +26,8 @@
from diffusers.image_processor import VaeImageProcessor
from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.models.attention_processor import FusedAttnProcessor2_0
from diffusers.models.lora import LoRALinearLayer, adjust_lora_scale_text_encoder
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -415,7 +414,12 @@ def retrieve_timesteps(
class IPAdapterFaceIDStableDiffusionPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ LoraLoaderMixin,
+ IPAdapterMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -727,35 +731,6 @@ def set_ip_adapter_scale(self, scale):
if isinstance(attn_processor, (LoRAIPAdapterAttnProcessor, LoRAIPAdapterAttnProcessor2_0)):
attn_processor.scale = scale
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def _encode_prompt(
self,
prompt,
@@ -1080,93 +1055,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
latents = latents * self.scheduler.init_noise_sigma
return latents
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/examples/community/latent_consistency_interpolate.py b/examples/community/latent_consistency_interpolate.py
index 7b9e4806bf44..0c14a55bd30f 100644
--- a/examples/community/latent_consistency_interpolate.py
+++ b/examples/community/latent_consistency_interpolate.py
@@ -9,7 +9,7 @@
from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from diffusers.schedulers import LCMScheduler
from diffusers.utils import (
@@ -190,7 +190,7 @@ def slerp(
class LatentConsistencyModelWalkPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for text-to-image generation using a latent consistency model.
@@ -273,67 +273,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
def encode_prompt(
self,
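
The deleted blocks above carry `# Copied from ...` markers: diffusers' `make fix-copies` check (`utils/check_copies.py`) keeps such duplicates byte-identical to the referenced source, which is precisely the synchronization burden this refactor removes. The convention, for reference:

```python
# The marker binds a local copy to its canonical definition; `make fix-copies`
# rewrites the body below whenever the referenced source changes.
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
def enable_vae_slicing(self):
    r"""
    Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
    compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
    """
    self.vae.enable_slicing()
```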
diff --git a/examples/community/llm_grounded_diffusion.py b/examples/community/llm_grounded_diffusion.py
index d815b4ea8e42..5db144a9a23a 100644
--- a/examples/community/llm_grounded_diffusion.py
+++ b/examples/community/llm_grounded_diffusion.py
@@ -35,6 +35,7 @@
from diffusers.models.attention_processor import AttnProcessor2_0
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.pipelines import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -267,7 +268,12 @@ def __call__(
class LLMGroundedDiffusionPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ LoraLoaderMixin,
+ IPAdapterMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for layout-grounded text-to-image generation using LLM-grounded Diffusion (LMD+): https://arxiv.org/pdf/2305.13655.pdf.
@@ -1180,39 +1186,6 @@ def latent_lmd_guidance(
# Below are methods copied from StableDiffusionPipeline
# The design choice of not inheriting from StableDiffusionPipeline is discussed here: https://github.com/huggingface/diffusers/pull/5993#issuecomment-1834258517
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -1522,34 +1495,6 @@ def prepare_latents(
latents = latents * self.scheduler.init_noise_sigma
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
index 7249e033186f..78d93bfb7081 100644
--- a/examples/community/lpw_stable_diffusion.py
+++ b/examples/community/lpw_stable_diffusion.py
@@ -13,13 +13,12 @@
from diffusers.image_processor import VaeImageProcessor
from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
PIL_INTERPOLATION,
deprecate,
- is_accelerate_available,
- is_accelerate_version,
logging,
)
from diffusers.utils.torch_utils import randn_tensor
@@ -410,7 +409,7 @@ def preprocess_mask(mask, batch_size, scale_factor=8):
class StableDiffusionLongPromptWeightingPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing
@@ -534,112 +533,6 @@ def __init__(
requires_safety_checker=requires_safety_checker,
)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
- several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
- Note that offloading happens on a submodule basis. Memory savings are higher than with
- `enable_model_cpu_offload`, but performance is lower.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
- from accelerate import cpu_offload
- else:
- raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- if self.device.type != "cpu":
- self.to("cpu", silence_dtype_warnings=True)
- torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
- cpu_offload(cpu_offloaded_model, device)
-
- if self.safety_checker is not None:
- cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
- def enable_model_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
- to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
- method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
- `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
- from accelerate import cpu_offload_with_hook
- else:
- raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- if self.device.type != "cpu":
- self.to("cpu", silence_dtype_warnings=True)
- torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
-
- hook = None
- for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
- _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
- if self.safety_checker is not None:
- _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
- # We'll offload the last model manually.
- self.final_offload_hook = hook
-
- @property
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
def _encode_prompt(
self,
prompt,
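
`lpw_stable_diffusion` drops its copies of both offload strategies plus `_execution_device`; the base-class versions behave the same. Illustrative usage (the `((word))` weighting syntax and the `text2img` entry point are this community pipeline's own API; model id is a placeholder):

```python
import torch

from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # placeholder model id
    custom_pipeline="lpw_stable_diffusion",
    torch_dtype=torch.float16,
)
# Model-level offload: whole submodels hop to the GPU only while they run.
pipe.enable_model_cpu_offload()  # needs accelerate>=0.17.0

image = pipe.text2img(
    "a ((highly detailed)) matte painting of a castle", num_inference_steps=30
).images[0]
```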
diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py
index 4b63fa7019a7..b0abbba9a32b 100644
--- a/examples/community/lpw_stable_diffusion_xl.py
+++ b/examples/community/lpw_stable_diffusion_xl.py
@@ -26,11 +26,11 @@
from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from diffusers.models.attention_processor import (
AttnProcessor2_0,
- FusedAttnProcessor2_0,
LoRAAttnProcessor2_0,
LoRAXFormersAttnProcessor,
XFormersAttnProcessor,
)
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
@@ -545,7 +545,12 @@ def retrieve_timesteps(
class SDXLLongPromptWeightingPipeline(
- DiffusionPipeline, FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ FromSingleFileMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ TextualInversionLoaderMixin,
):
r"""
Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -649,39 +654,6 @@ def __init__(
else:
self.watermark = None
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def enable_model_cpu_offload(self, gpu_id=0):
r"""
Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -1030,95 +1002,6 @@ def check_inputs(
"If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None):
# get the original timestep using init_timestep
if denoising_start is None:
diff --git a/examples/community/mixture_canvas.py b/examples/community/mixture_canvas.py
index 3737183e5513..2083c7acad38 100644
--- a/examples/community/mixture_canvas.py
+++ b/examples/community/mixture_canvas.py
@@ -12,7 +12,7 @@
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -264,7 +264,7 @@ def _quartic_weights(self, region: DiffusionRegion) -> torch.tensor:
return torch.tile(torch.tensor(weights), (self.nbatch, self.latent_space_dim, 1, 1))
-class StableDiffusionCanvasPipeline(DiffusionPipeline):
+class StableDiffusionCanvasPipeline(DiffusionPipeline, StableDiffusionMixin):
"""Stable Diffusion pipeline that mixes several diffusers in the same canvas"""
def __init__(
diff --git a/examples/community/multilingual_stable_diffusion.py b/examples/community/multilingual_stable_diffusion.py
index 7597efd215af..f3b0540cf4d3 100644
--- a/examples/community/multilingual_stable_diffusion.py
+++ b/examples/community/multilingual_stable_diffusion.py
@@ -11,9 +11,9 @@
pipeline,
)
-from diffusers import DiffusionPipeline
from diffusers.configuration_utils import FrozenDict
from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -48,7 +48,7 @@ def translate_prompt(prompt, translation_tokenizer, translation_model, device):
return en_trans[0]
-class MultilingualStableDiffusion(DiffusionPipeline):
+class MultilingualStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion in different languages.
@@ -135,33 +135,6 @@ def __init__(
feature_extractor=feature_extractor,
)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
-
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
@torch.no_grad()
def __call__(
self,
diff --git a/examples/community/pipeline_animatediff_controlnet.py b/examples/community/pipeline_animatediff_controlnet.py
index 5873ceaa8d70..44fc1bae6f7f 100644
--- a/examples/community/pipeline_animatediff_controlnet.py
+++ b/examples/community/pipeline_animatediff_controlnet.py
@@ -28,7 +28,7 @@
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.models.unets.unet_motion_model import MotionAdapter
from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.schedulers import (
DDIMScheduler,
DPMSolverMultistepScheduler,
@@ -111,7 +111,9 @@ class AnimateDiffControlNetPipelineOutput(BaseOutput):
frames: Union[torch.Tensor, np.ndarray]
-class AnimateDiffControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
+class AnimateDiffControlNetPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin
+):
r"""
Pipeline for text-to-video generation.
@@ -441,67 +443,6 @@ def decode_latents(self, latents):
video = video.float()
return video
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
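FreeU keeps the same signature on the mixin: `s1`/`s2` attenuate the skip features while `b1`/`b2` amplify the backbone features at stages 1 and 2. A minimal sketch; the values are ones commonly reported for Stable Diffusion v1.5 and should be treated as a starting point, not a constant:

    # assuming `pipe` is a pipeline whose `unet` supports FreeU
    pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
    output = pipe(prompt="a rocket lifting off at dawn")
    pipe.disable_freeu()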
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
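The comment above describes a signature check: `eta` only applies to DDIM, and not every scheduler's `step()` accepts a `generator`, so both are forwarded conditionally. A sketch of that filtering, with `pipe`, `eta`, and `generator` assumed to be in scope:

    import inspect

    params = inspect.signature(pipe.scheduler.step).parameters
    extra_step_kwargs = {}
    if "eta" in params:
        extra_step_kwargs["eta"] = eta  # DDIM-only knob
    if "generator" in params:
        extra_step_kwargs["generator"] = generator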
diff --git a/examples/community/pipeline_animatediff_img2video.py b/examples/community/pipeline_animatediff_img2video.py
index e77e26592d3e..982bed08140b 100644
--- a/examples/community/pipeline_animatediff_img2video.py
+++ b/examples/community/pipeline_animatediff_img2video.py
@@ -30,9 +30,9 @@
from diffusers.loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel
from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.models.unets.unet_motion_model import MotionAdapter
+from diffusers.models.unet_motion_model import MotionAdapter
from diffusers.pipelines.animatediff.pipeline_output import AnimateDiffPipelineOutput
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.schedulers import (
DDIMScheduler,
DPMSolverMultistepScheduler,
@@ -232,7 +232,9 @@ def retrieve_timesteps(
return timesteps, num_inference_steps
-class AnimateDiffImgToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
+class AnimateDiffImgToVideoPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin
+):
r"""
Pipeline for image-to-video generation.
@@ -564,67 +566,6 @@ def decode_latents(self, latents):
video = video.float()
return video
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
diff --git a/examples/community/pipeline_demofusion_sdxl.py b/examples/community/pipeline_demofusion_sdxl.py
index ab0d3cf9dd29..e29678b55922 100644
--- a/examples/community/pipeline_demofusion_sdxl.py
+++ b/examples/community/pipeline_demofusion_sdxl.py
@@ -23,7 +23,7 @@
XFormersAttnProcessor,
)
from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
is_accelerate_available,
@@ -93,7 +93,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
return noise_cfg
-class DemoFusionSDXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin):
+class DemoFusionSDXLPipeline(
+ DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+):
r"""
Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -176,39 +178,6 @@ def __init__(
else:
self.watermark = None
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def encode_prompt(
self,
prompt: str,
diff --git a/examples/community/pipeline_sdxl_style_aligned.py b/examples/community/pipeline_sdxl_style_aligned.py
index a9509346dae6..ec4aa3791557 100644
--- a/examples/community/pipeline_sdxl_style_aligned.py
+++ b/examples/community/pipeline_sdxl_style_aligned.py
@@ -51,7 +51,7 @@
XFormersAttnProcessor,
)
from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
@@ -389,6 +389,7 @@ def retrieve_latents(
class StyleAlignedSDXLPipeline(
DiffusionPipeline,
+ StableDiffusionMixin,
FromSingleFileMixin,
StableDiffusionXLLoraLoaderMixin,
TextualInversionLoaderMixin,
@@ -504,39 +505,6 @@ def __init__(
else:
self.watermark = None
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def encode_prompt(
self,
prompt: str,
@@ -1187,34 +1155,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
def _enable_shared_attention_processors(
self,
share_attention: bool,
@@ -1361,65 +1301,6 @@ def disable_style_aligned(self):
self._style_aligned_norm_layers = None
self._disable_shared_attention_processors()
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
- This API is 🧪 experimental.
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
- This API is 🧪 experimental.
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
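Fusion concatenates the query, key, and value projections of self-attention into a single matmul (cross-attention fuses only key and value, since the query comes from a different stream), which is why the removed code swaps in `FusedAttnProcessor2_0`. A minimal sketch with the same flags as the removed methods; the API is flagged experimental, so it is worth gating behind a version you have tested:

    pipe.fuse_qkv_projections(unet=True, vae=True)
    images = pipe(prompt="a watercolor city skyline").images
    pipe.unfuse_qkv_projections()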
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
index e1437bee7a15..fe94646a4436 100644
--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py
@@ -33,7 +33,7 @@
)
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
@@ -158,7 +158,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
class StableDiffusionXLControlNetAdapterPipeline(
- DiffusionPipeline, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ FromSingleFileMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ TextualInversionLoaderMixin,
):
r"""
Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
@@ -234,39 +238,6 @@ def __init__(
)
self.default_sample_size = self.unet.config.sample_size
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -863,34 +834,6 @@ def _default_height_width(self, height, width, image):
return height, width
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
def prepare_control_image(
self,
image,
diff --git a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
index c7967bbc12a2..ac645e0802d2 100644
--- a/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
+++ b/examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py
@@ -52,6 +52,7 @@
)
from diffusers.models.lora import adjust_lora_scale_text_encoder
from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
@@ -303,7 +304,9 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
return noise_cfg
-class StableDiffusionXLControlNetAdapterInpaintPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin):
+class StableDiffusionXLControlNetAdapterInpaintPipeline(
+ DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin, LoraLoaderMixin
+):
r"""
Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
https://arxiv.org/abs/2302.08453
@@ -383,39 +386,6 @@ def __init__(
)
self.default_sample_size = self.unet.config.sample_size
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -1207,34 +1177,6 @@ def _default_height_width(self, height, width, image):
return height, width
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
def prepare_control_image(
self,
image,
diff --git a/examples/community/pipeline_stable_diffusion_xl_ipex.py b/examples/community/pipeline_stable_diffusion_xl_ipex.py
index c57d58bb58ba..68ad5dbec77d 100644
--- a/examples/community/pipeline_stable_diffusion_xl_ipex.py
+++ b/examples/community/pipeline_stable_diffusion_xl_ipex.py
@@ -267,39 +267,6 @@ def __init__(
else:
self.watermark = None
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def encode_prompt(
self,
prompt: str,
@@ -701,34 +668,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/examples/community/pipeline_zero1to3.py b/examples/community/pipeline_zero1to3.py
index 600cf2dc1b63..133aa694c18c 100644
--- a/examples/community/pipeline_zero1to3.py
+++ b/examples/community/pipeline_zero1to3.py
@@ -22,18 +22,16 @@
# randn_tensor,
# replace_example_docstring,
# )
-# from ..pipeline_utils import DiffusionPipeline
+# from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
# from . import StableDiffusionPipelineOutput
# from .safety_checker import StableDiffusionSafetyChecker
-from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
+from diffusers import AutoencoderKL, DiffusionPipeline, StableDiffusionMixin, UNet2DConditionModel
from diffusers.configuration_utils import ConfigMixin, FrozenDict
from diffusers.models.modeling_utils import ModelMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
deprecate,
- is_accelerate_available,
- is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -68,7 +66,7 @@ def forward(self, x):
return self.projection(x)
-class Zero1to3StableDiffusionPipeline(DiffusionPipeline):
+class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for single view conditioned novel view generation using Zero1to3.
@@ -187,109 +185,6 @@ def __init__(
self.register_to_config(requires_safety_checker=requires_safety_checker)
# self.model_mode = None
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
- several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- Note that offloading happens on a submodule basis. Memory savings are higher than with
- `enable_model_cpu_offload`, but performance is lower.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
- from accelerate import cpu_offload
- else:
- raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- if self.device.type != "cpu":
- self.to("cpu", silence_dtype_warnings=True)
- torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
- cpu_offload(cpu_offloaded_model, device)
-
- if self.safety_checker is not None:
- cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
- def enable_model_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
- to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
- method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
- `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
- from accelerate import cpu_offload_with_hook
- else:
- raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- if self.device.type != "cpu":
- self.to("cpu", silence_dtype_warnings=True)
- torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
-
- hook = None
- for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
- _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
- if self.safety_checker is not None:
- _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
- # We'll offload the last model manually.
- self.final_offload_hook = hook
-
- @property
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
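The two deleted offload modes trade memory against latency: the sequential variant hooks every submodule (largest savings, slowest), while the model variant moves one whole model at a time (small slowdown). Once either is active, placement is tracked by accelerate hooks, which is exactly why the removed `_execution_device` property walked the unet's modules. A minimal sketch using the inherited base-class methods:

    import torch

    pipe.enable_model_cpu_offload()         # whole-model hops, mild slowdown
    # pipe.enable_sequential_cpu_offload()  # per-submodule hops, max savings

    # with offload active, key off the resolved execution device rather
    # than pipe.device (which may report "cpu" or "meta")
    device = pipe._execution_device
    generator = torch.Generator(device).manual_seed(0)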
def _encode_prompt(
self,
prompt,
diff --git a/examples/community/sd_text2img_k_diffusion.py b/examples/community/sd_text2img_k_diffusion.py
index c6a4bf2ce613..3299a7605257 100755
--- a/examples/community/sd_text2img_k_diffusion.py
+++ b/examples/community/sd_text2img_k_diffusion.py
@@ -19,9 +19,9 @@
import torch
from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
-from diffusers import DiffusionPipeline, LMSDiscreteScheduler
+from diffusers import DiffusionPipeline, LMSDiscreteScheduler, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
-from diffusers.utils import is_accelerate_available, logging
+from diffusers.utils import logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -41,7 +41,7 @@ def apply_model(self, *args, **kwargs):
return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample
-class StableDiffusionPipeline(DiffusionPipeline):
+class StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -120,68 +120,6 @@ def set_scheduler(self, scheduler_type: str):
sampling = getattr(library, "sampling")
self.sampler = getattr(sampling, scheduler_type)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
-
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
- if cpu_offloaded_model is not None:
- cpu_offload(cpu_offloaded_model, device)
-
- @property
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
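This community pipeline keeps its `set_scheduler` hook, which, as the context above shows, resolves the name against `k_diffusion.sampling`. A minimal sketch of loading the pipeline and switching samplers; the checkpoint id is illustrative, and `sample_euler` is assumed here to be one of the sampler functions `k_diffusion.sampling` exposes:

    from diffusers import DiffusionPipeline

    pipe = DiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        custom_pipeline="sd_text2img_k_diffusion",
    )
    pipe.set_scheduler("sample_euler")
    image = pipe("a cabin in a snowy forest").images[0]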
def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt):
r"""
Encodes the prompt into text encoder hidden states.
diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py
index 9318277b8f01..20f972f049b3 100644
--- a/examples/community/seed_resize_stable_diffusion.py
+++ b/examples/community/seed_resize_stable_diffusion.py
@@ -9,6 +9,7 @@
from diffusers import DiffusionPipeline
from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -18,7 +19,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class SeedResizeStableDiffusionPipeline(DiffusionPipeline):
+class SeedResizeStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -67,33 +68,6 @@ def __init__(
feature_extractor=feature_extractor,
)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
-
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
@torch.no_grad()
def __call__(
self,
diff --git a/examples/community/speech_to_image_diffusion.py b/examples/community/speech_to_image_diffusion.py
index 63bcfb662517..3537ef89e1a1 100644
--- a/examples/community/speech_to_image_diffusion.py
+++ b/examples/community/speech_to_image_diffusion.py
@@ -18,6 +18,7 @@
PNDMScheduler,
UNet2DConditionModel,
)
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.utils import logging
@@ -26,7 +27,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class SpeechToImagePipeline(DiffusionPipeline):
+class SpeechToImagePipeline(DiffusionPipeline, StableDiffusionMixin):
def __init__(
self,
speech_model: WhisperForConditionalGeneration,
@@ -62,14 +63,6 @@ def __init__(
feature_extractor=feature_extractor,
)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- if slice_size == "auto":
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- self.enable_attention_slicing(None)
-
@torch.no_grad()
def __call__(
self,
diff --git a/examples/community/stable_diffusion_comparison.py b/examples/community/stable_diffusion_comparison.py
index 7997a0cc0186..dab5705b3370 100644
--- a/examples/community/stable_diffusion_comparison.py
+++ b/examples/community/stable_diffusion_comparison.py
@@ -12,6 +12,7 @@
StableDiffusionPipeline,
UNet2DConditionModel,
)
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -22,7 +23,7 @@
pipe4_model_id = "CompVis/stable-diffusion-v1-4"
-class StableDiffusionComparisonPipeline(DiffusionPipeline):
+class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for parallel comparison of Stable Diffusion v1-v4
This pipeline inherits from DiffusionPipeline and depends on the use of an Auth Token for
@@ -83,31 +84,6 @@ def __init__(
def layers(self) -> Dict[str, Any]:
return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
@torch.no_grad()
def text2img_sd1_1(
self,
diff --git a/examples/community/stable_diffusion_controlnet_img2img.py b/examples/community/stable_diffusion_controlnet_img2img.py
index a2b92fff0fb5..5f9083616a84 100644
--- a/examples/community/stable_diffusion_controlnet_img2img.py
+++ b/examples/community/stable_diffusion_controlnet_img2img.py
@@ -8,14 +8,13 @@
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
-from diffusers import AutoencoderKL, ControlNetModel, DiffusionPipeline, UNet2DConditionModel, logging
+from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, logging
from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
PIL_INTERPOLATION,
- is_accelerate_available,
- is_accelerate_version,
replace_example_docstring,
)
from diffusers.utils.torch_utils import randn_tensor
@@ -130,7 +129,7 @@ def prepare_controlnet_conditioning_image(
return controlnet_conditioning_image
-class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline):
+class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin):
"""
Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
"""
@@ -183,89 +182,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- Note that offloading happens on a submodule basis. Memory savings are higher than with
- `enable_model_cpu_offload`, but performance is lower.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
- cpu_offload(cpu_offloaded_model, device)
-
- if self.safety_checker is not None:
- cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
- def enable_model_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
- to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
- method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
- `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
- from accelerate import cpu_offload_with_hook
- else:
- raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- hook = None
- for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
- _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
- if self.safety_checker is not None:
- # the safety checker can offload the vae again
- _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
- # control net hook has to be manually offloaded as it alternates with unet
- cpu_offload_with_hook(self.controlnet, device)
-
- # We'll offload the last model manually.
- self.final_offload_hook = hook
-
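The chaining in the method above is the important detail: each `cpu_offload_with_hook` call returns a hook that is passed as `prev_module_hook` to the next call, so a model is offloaded as soon as its successor starts running. The controlnet cannot join that chain because it alternates with the unet inside every denoising step, so it gets a standalone hook. A condensed sketch of the same wiring:

    from accelerate import cpu_offload_with_hook

    hook = None
    for model in (pipe.text_encoder, pipe.unet, pipe.vae):
        _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)

    # interleaved with the unet, so offloaded outside the chain
    cpu_offload_with_hook(pipe.controlnet, device)

    pipe.final_offload_hook = hook  # the last model is offloaded manually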
- @property
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
def _encode_prompt(
self,
prompt,
diff --git a/examples/community/stable_diffusion_controlnet_inpaint.py b/examples/community/stable_diffusion_controlnet_inpaint.py
index b87973366418..0173ed41bee6 100644
--- a/examples/community/stable_diffusion_controlnet_inpaint.py
+++ b/examples/community/stable_diffusion_controlnet_inpaint.py
@@ -9,14 +9,13 @@
import torch.nn.functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
-from diffusers import AutoencoderKL, ControlNetModel, DiffusionPipeline, UNet2DConditionModel, logging
+from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, logging
from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
PIL_INTERPOLATION,
- is_accelerate_available,
- is_accelerate_version,
replace_example_docstring,
)
from diffusers.utils.torch_utils import randn_tensor
@@ -228,7 +227,7 @@ def prepare_controlnet_conditioning_image(
return controlnet_conditioning_image
-class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
+class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusionMixin):
"""
Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
"""
@@ -282,89 +281,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- Note that offloading happens on a submodule basis. Memory savings are higher than with
- `enable_model_cpu_offload`, but performance is lower.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
- cpu_offload(cpu_offloaded_model, device)
-
- if self.safety_checker is not None:
- cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
- def enable_model_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
- to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
- method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
- `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
- from accelerate import cpu_offload_with_hook
- else:
- raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- hook = None
- for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
- _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
- if self.safety_checker is not None:
- # the safety checker can offload the vae again
- _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
- # control net hook has to be manually offloaded as it alternates with unet
- cpu_offload_with_hook(self.controlnet, device)
-
- # We'll offload the last model manually.
- self.final_offload_hook = hook
-
- @property
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
def _encode_prompt(
self,
prompt,
diff --git a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
index 96ad3c39239d..d056eb112165 100644
--- a/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
+++ b/examples/community/stable_diffusion_controlnet_inpaint_img2img.py
@@ -9,13 +9,12 @@
import torch.nn.functional as F
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
-from diffusers import AutoencoderKL, ControlNetModel, DiffusionPipeline, UNet2DConditionModel, logging
+from diffusers import AutoencoderKL, ControlNetModel, UNet2DConditionModel, logging
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
PIL_INTERPOLATION,
- is_accelerate_available,
- is_accelerate_version,
replace_example_docstring,
)
from diffusers.utils.torch_utils import randn_tensor
@@ -217,7 +216,7 @@ def prepare_controlnet_conditioning_image(
return controlnet_conditioning_image
-class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline):
+class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin):
"""
Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
"""
@@ -267,89 +266,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae, controlnet, and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- Note that offloading happens on a submodule basis. Memory savings are higher than with
- `enable_model_cpu_offload`, but performance is lower.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.controlnet]:
- cpu_offload(cpu_offloaded_model, device)
-
- if self.safety_checker is not None:
- cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
- def enable_model_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
- to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
- method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
- `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
- from accelerate import cpu_offload_with_hook
- else:
- raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- hook = None
- for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
- _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
- if self.safety_checker is not None:
- # the safety checker can offload the vae again
- _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
- # the control net hook has to be offloaded manually as it alternates with the unet
- cpu_offload_with_hook(self.controlnet, device)
-
- # We'll offload the last model manually.
- self.final_offload_hook = hook
-
- @property
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
def _encode_prompt(
self,
prompt,
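For readers who want to see what the deleted `enable_model_cpu_offload` actually did, here is a standalone sketch of the hook chaining it wrapped. It assumes accelerate >= 0.17; `chain_offload` is a hypothetical helper written for illustration, not a diffusers API:

import torch
from accelerate import cpu_offload_with_hook

def chain_offload(models, device=torch.device("cuda:0")):
    # Hypothetical helper: each hook moves the previous model back to CPU
    # right before its own model's forward pass runs on `device`.
    hook = None
    for model in models:
        _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook)
    return hook  # the last model is offloaded manually via hook.offload()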
diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py
index bf58cc8453a1..8e71f79e9ae4 100644
--- a/examples/community/stable_diffusion_ipex.py
+++ b/examples/community/stable_diffusion_ipex.py
@@ -23,14 +23,12 @@
from diffusers.configuration_utils import FrozenDict
from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
deprecate,
- is_accelerate_available,
- is_accelerate_version,
logging,
replace_example_docstring,
)
@@ -62,7 +60,9 @@
"""
-class StableDiffusionIPEXPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionIPEXPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+):
r"""
Pipeline for text-to-image generation using Stable Diffusion on IPEX.
@@ -304,109 +304,6 @@ def prepare_for_ipex(self, promt, dtype=torch.float32, height=None, width=None,
ave_decoder_trace_model = torch.jit.freeze(ave_decoder_trace_model)
self.vae.decoder.forward = ave_decoder_trace_model.forward
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
- several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- Note that offloading happens on a submodule basis. Memory savings are higher than with
- `enable_model_cpu_offload`, but performance is lower.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
- from accelerate import cpu_offload
- else:
- raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- if self.device.type != "cpu":
- self.to("cpu", silence_dtype_warnings=True)
- torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
- cpu_offload(cpu_offloaded_model, device)
-
- if self.safety_checker is not None:
- cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
- def enable_model_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
- to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
- method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
- `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
- from accelerate import cpu_offload_with_hook
- else:
- raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- if self.device.type != "cpu":
- self.to("cpu", silence_dtype_warnings=True)
- torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
-
- hook = None
- for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
- _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
- if self.safety_checker is not None:
- _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
- # We'll offload the last model manually.
- self.final_offload_hook = hook
-
- @property
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
def _encode_prompt(
self,
prompt,
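The VAE slicing and tiling switches removed from the IPEX pipeline keep working through the mixin, since they only delegate to the `AutoencoderKL`. A hedged sketch, assuming `pipe` is any pipeline that registers a `vae` component:

pipe.enable_vae_slicing()   # decode batched latents one image slice at a time
pipe.enable_vae_tiling()    # decode large latents tile by tile
# ... run inference with a lower peak memory footprint ...
pipe.disable_vae_tiling()
pipe.disable_vae_slicing()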
diff --git a/examples/community/stable_diffusion_mega.py b/examples/community/stable_diffusion_mega.py
index faed00b49d40..e53afb703e24 100644
--- a/examples/community/stable_diffusion_mega.py
+++ b/examples/community/stable_diffusion_mega.py
@@ -16,6 +16,7 @@
UNet2DConditionModel,
)
from diffusers.configuration_utils import FrozenDict
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.utils import deprecate, logging
@@ -23,7 +24,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class StableDiffusionMegaPipeline(DiffusionPipeline):
+class StableDiffusionMegaPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -94,33 +95,6 @@ def __init__(
def components(self) -> Dict[str, Any]:
return {k: getattr(self, k) for k in self.config.keys() if not k.startswith("_")}
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
-
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
@torch.no_grad()
def inpaint(
self,
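Attention slicing survives the deletion as well; the mixin keeps the semantics documented in the removed docstring. A short sketch, again assuming a generic `pipe`:

pipe.enable_attention_slicing()    # "auto": half the attention head size, a speed/memory trade-off
pipe.enable_attention_slicing(4)   # attention_head_dim must be divisible by 4
pipe.disable_attention_slicing()   # back to computing attention in one step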
diff --git a/examples/community/stable_diffusion_repaint.py b/examples/community/stable_diffusion_repaint.py
index db2de0897570..02bef293bba8 100644
--- a/examples/community/stable_diffusion_repaint.py
+++ b/examples/community/stable_diffusion_repaint.py
@@ -24,14 +24,13 @@
from diffusers import AutoencoderKL, DiffusionPipeline, UNet2DConditionModel
from diffusers.configuration_utils import FrozenDict, deprecate
from diffusers.loaders import LoraLoaderMixin, TextualInversionLoaderMixin
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import (
StableDiffusionSafetyChecker,
)
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
- is_accelerate_available,
- is_accelerate_version,
logging,
)
from diffusers.utils.torch_utils import randn_tensor
@@ -140,7 +139,9 @@ def prepare_mask_and_masked_image(image, mask):
return mask, masked_image
-class StableDiffusionRepaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionRepaintPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+):
r"""
Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
@@ -276,80 +277,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- Note that offloading happens on a submodule basis. Memory savings are higher than with
- `enable_model_cpu_offload`, but performance is lower.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
- from accelerate import cpu_offload
- else:
- raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- if self.device.type != "cpu":
- self.to("cpu", silence_dtype_warnings=True)
- torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
- cpu_offload(cpu_offloaded_model, device)
-
- if self.safety_checker is not None:
- cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload
- def enable_model_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
- to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
- method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
- `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
- """
- if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
- from accelerate import cpu_offload_with_hook
- else:
- raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- if self.device.type != "cpu":
- self.to("cpu", silence_dtype_warnings=True)
- torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist)
-
- hook = None
- for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]:
- _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
-
- if self.safety_checker is not None:
- _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
-
- # We'll offload the last model manually.
- self.final_offload_hook = hook
-
- @property
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
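Sequential offload for the repaint pipeline likewise now comes from the base class. The core of what the removed method did is small enough to show inline; this sketch assumes accelerate is installed and that `pipe` exposes the usual components:

import torch
from accelerate import cpu_offload

device = torch.device("cuda:0")
# Weights live on the meta device and stream to `device` per submodule forward.
for model in (pipe.unet, pipe.text_encoder, pipe.vae):
    cpu_offload(model, device)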
diff --git a/examples/community/text_inpainting.py b/examples/community/text_inpainting.py
index cd02049a4afb..ea4da966bb71 100644
--- a/examples/community/text_inpainting.py
+++ b/examples/community/text_inpainting.py
@@ -13,16 +13,17 @@
from diffusers import DiffusionPipeline
from diffusers.configuration_utils import FrozenDict
from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import deprecate, is_accelerate_available, logging
+from diffusers.utils import deprecate, logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class TextInpainting(DiffusionPipeline):
+class TextInpainting(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text based inpainting using Stable Diffusion.
Uses CLIPSeg to get a mask from the given text, then calls the Inpainting pipeline with the generated mask
@@ -120,69 +121,6 @@ def __init__(
feature_extractor=feature_extractor,
)
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
-
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- slice_size = self.unet.config.attention_head_dim // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
- def enable_sequential_cpu_offload(self):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device("cuda")
-
- for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
- if cpu_offloaded_model is not None:
- cpu_offload(cpu_offloaded_model, device)
-
- @property
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
@torch.no_grad()
def __call__(
self,
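The `_execution_device` property removed here inspects accelerate's `_hf_hook` objects to find where a partially offloaded pipeline really executes. An equivalent standalone helper, written for illustration only:

import torch

def execution_device(pipe):
    # After enable_sequential_cpu_offload, pipe.device no longer reflects the
    # compute device, so it has to be read off the accelerate hooks instead.
    if not hasattr(pipe.unet, "_hf_hook"):
        return pipe.device
    for module in pipe.unet.modules():
        hook = getattr(module, "_hf_hook", None)
        if hook is not None and getattr(hook, "execution_device", None) is not None:
            return torch.device(hook.execution_device)
    return pipe.device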
diff --git a/examples/community/unclip_image_interpolation.py b/examples/community/unclip_image_interpolation.py
index 95548b152c07..e3bb44e5030b 100644
--- a/examples/community/unclip_image_interpolation.py
+++ b/examples/community/unclip_image_interpolation.py
@@ -19,7 +19,7 @@
UNet2DModel,
)
from diffusers.pipelines.unclip import UnCLIPTextProjModel
-from diffusers.utils import is_accelerate_available, logging
+from diffusers.utils import logging
from diffusers.utils.torch_utils import randn_tensor
@@ -204,50 +204,6 @@ def _encode_image(self, image, device, num_images_per_prompt, image_embeddings:
return image_embeddings
- # Copied from diffusers.pipelines.unclip.pipeline_unclip_image_variation.UnCLIPImageVariationPipeline.enable_sequential_cpu_offload
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
- models have their state dicts saved to CPU and then are moved to a `torch.device('meta')` and loaded to GPU only
- when their specific submodule has its `forward` method called.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- models = [
- self.decoder,
- self.text_proj,
- self.text_encoder,
- self.super_res_first,
- self.super_res_last,
- ]
- for cpu_offloaded_model in models:
- if cpu_offloaded_model is not None:
- cpu_offload(cpu_offloaded_model, device)
-
- @property
- # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._execution_device
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if self.device != torch.device("meta") or not hasattr(self.decoder, "_hf_hook"):
- return self.device
- for module in self.decoder.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
@torch.no_grad()
def __call__(
self,
diff --git a/examples/community/unclip_text_interpolation.py b/examples/community/unclip_text_interpolation.py
index 764299433b4c..be6a0858b35e 100644
--- a/examples/community/unclip_text_interpolation.py
+++ b/examples/community/unclip_text_interpolation.py
@@ -15,7 +15,7 @@
UNet2DModel,
)
from diffusers.pipelines.unclip import UnCLIPTextProjModel
-from diffusers.utils import is_accelerate_available, logging
+from diffusers.utils import logging
from diffusers.utils.torch_utils import randn_tensor
@@ -212,51 +212,6 @@ def _encode_prompt(
return prompt_embeds, text_encoder_hidden_states, text_mask
- # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.enable_sequential_cpu_offload
- def enable_sequential_cpu_offload(self, gpu_id=0):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
- models have their state dicts saved to CPU and then are moved to a `torch.device('meta')` and loaded to GPU only
- when their specific submodule has its `forward` method called.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device(f"cuda:{gpu_id}")
-
- # TODO: self.prior.post_process_latents is not covered by the offload hooks, so it fails if added to the list
- models = [
- self.decoder,
- self.text_proj,
- self.text_encoder,
- self.super_res_first,
- self.super_res_last,
- ]
- for cpu_offloaded_model in models:
- if cpu_offloaded_model is not None:
- cpu_offload(cpu_offloaded_model, device)
-
- @property
- # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._execution_device
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if self.device != torch.device("meta") or not hasattr(self.decoder, "_hf_hook"):
- return self.device
- for module in self.decoder.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
@torch.no_grad()
def __call__(
self,
diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py
index 1a5ea350b857..241e661536d3 100644
--- a/examples/community/wildcard_stable_diffusion.py
+++ b/examples/community/wildcard_stable_diffusion.py
@@ -8,9 +8,9 @@
import torch
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
-from diffusers import DiffusionPipeline
from diffusers.configuration_utils import FrozenDict
from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
@@ -63,7 +63,7 @@ class WildcardStableDiffusionOutput(StableDiffusionPipelineOutput):
prompts: List[str]
-class WildcardStableDiffusionPipeline(DiffusionPipeline):
+class WildcardStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Example Usage:
pipe = WildcardStableDiffusionPipeline.from_pretrained(
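Taken together, the pattern these diffs apply is uniform: add `StableDiffusionMixin` to the base list and delete the locally duplicated helpers. A minimal sketch of a new community pipeline written against the refactored API (class name and body are placeholders):

from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin

class MyCommunityPipeline(DiffusionPipeline, StableDiffusionMixin):
    # enable_vae_slicing, enable_attention_slicing, enable_freeu, etc. are
    # inherited, provided the pipeline registers `vae` and `unet` as usual.
    ...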
diff --git a/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py b/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py
index 32646c7c7715..88a586e9271d 100644
--- a/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py
+++ b/examples/research_projects/controlnetxs/pipeline_controlnet_xs.py
@@ -26,7 +26,7 @@
from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import KarrasDiffusionSchedulers
@@ -44,7 +44,7 @@
class StableDiffusionControlNetXSPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for text-to-image generation using Stable Diffusion with ControlNet-XS guidance.
@@ -139,39 +139,6 @@ def __init__(
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -596,34 +563,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
latents = latents * self.scheduler.init_noise_sigma
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@torch.no_grad()
def __call__(
self,
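FreeU is unchanged for callers; only its definition moves into the mixin. A usage sketch follows; the scaling factors are the values commonly suggested for SD 1.x in the FreeU repository, an assumption rather than something this PR prescribes:

pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
image = pipe("an astronaut riding a horse on mars").images[0]
pipe.disable_freeu()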
diff --git a/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py b/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py
index b9b390f1c00c..d0186573fa9c 100644
--- a/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py
+++ b/examples/research_projects/controlnetxs/pipeline_controlnet_xs_sd_xl.py
@@ -31,7 +31,7 @@
XFormersAttnProcessor,
)
from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import (
@@ -52,7 +52,11 @@
class StableDiffusionXLControlNetXSPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet-XS guidance.
@@ -145,39 +149,6 @@ def __init__(
self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -661,34 +632,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@torch.no_grad()
def __call__(
self,
diff --git a/examples/research_projects/rdm/pipeline_rdm.py b/examples/research_projects/rdm/pipeline_rdm.py
index 28b4cacb8319..dd97bf71b9db 100644
--- a/examples/research_projects/rdm/pipeline_rdm.py
+++ b/examples/research_projects/rdm/pipeline_rdm.py
@@ -17,16 +17,17 @@
LMSDiscreteScheduler,
PNDMScheduler,
UNet2DConditionModel,
- logging,
)
from diffusers.image_processor import VaeImageProcessor
-from diffusers.utils import is_accelerate_available, randn_tensor
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
+from diffusers.utils import logging
+from diffusers.utils.torch_utils import randn_tensor
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class RDMPipeline(DiffusionPipeline):
+class RDMPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Retrieval Augmented Diffusion.
@@ -81,121 +82,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.retriever = retriever
- def enable_xformers_memory_efficient_attention(self):
- r"""
- Enable memory efficient attention as implemented in xformers.
-
- When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
- time. Speed up at training time is not guaranteed.
-
- Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
- is used.
- """
- self.unet.set_use_memory_efficient_attention_xformers(True)
-
- def disable_xformers_memory_efficient_attention(self):
- r"""
- Disable memory efficient attention as implemented in xformers.
- """
- self.unet.set_use_memory_efficient_attention_xformers(False)
-
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
- several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
- r"""
- Enable sliced attention computation.
-
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention
- in several steps. This is useful to save some memory in exchange for a small speed decrease.
-
- Args:
- slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
- When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
- a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
- `attention_head_dim` must be a multiple of `slice_size`.
- """
- if slice_size == "auto":
- # half the attention head size is usually a good trade-off between
- # speed and memory
- if isinstance(self.unet.config.attention_head_dim, int):
- slice_size = self.unet.config.attention_head_dim // 2
- else:
- slice_size = self.unet.config.attention_head_dim[0] // 2
- self.unet.set_attention_slice(slice_size)
-
- def disable_attention_slicing(self):
- r"""
- Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
- back to computing attention in one step.
- """
- # set slice_size = `None` to disable `attention slicing`
- self.enable_attention_slicing(None)
-
- def enable_sequential_cpu_offload(self):
- r"""
- Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
- text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
- `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called.
- """
- if is_accelerate_available():
- from accelerate import cpu_offload
- else:
- raise ImportError("Please install accelerate via `pip install accelerate`")
-
- device = torch.device("cuda")
-
- for cpu_offloaded_model in [self.unet, self.clip, self.vae]:
- if cpu_offloaded_model is not None:
- cpu_offload(cpu_offloaded_model, device)
-
- @property
- def _execution_device(self):
- r"""
- Returns the device on which the pipeline's models will be executed. After calling
- `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
- hooks.
- """
- if not hasattr(self.unet, "_hf_hook"):
- return self.device
- for module in self.unet.modules():
- if (
- hasattr(module, "_hf_hook")
- and hasattr(module._hf_hook, "execution_device")
- and module._hf_hook.execution_device is not None
- ):
- return torch.device(module._hf_hook.execution_device)
- return self.device
-
def _encode_prompt(self, prompt):
# get prompt text embeddings
text_inputs = self.tokenizer(
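The RDM pipeline loses the largest block of duplicated helpers, including the xformers toggles, which remain available through the mixin. A sketch, assuming the xformers package is installed and supported on the GPU:

pipe.enable_xformers_memory_efficient_attention()
# ... memory-efficient attention during inference ...
pipe.disable_xformers_memory_efficient_attention()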
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 28fde1248d78..165bbf263b29 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -128,6 +128,7 @@
"PNDMPipeline",
"RePaintPipeline",
"ScoreSdeVePipeline",
+ "StableDiffusionMixin",
]
)
_import_structure["schedulers"].extend(
@@ -514,6 +515,7 @@
PNDMPipeline,
RePaintPipeline,
ScoreSdeVePipeline,
+ StableDiffusionMixin,
)
from .schedulers import (
AmusedScheduler,
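With the export added above, the mixin can be imported from the package root as well as from the `pipeline_utils` path used throughout these diffs. A one-line sketch:

from diffusers import StableDiffusionMixin  # available after this change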
diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py
index 62d764c5edbe..5ec8876fc114 100644
--- a/src/diffusers/models/attention_processor.py
+++ b/src/diffusers/models/attention_processor.py
@@ -116,6 +116,8 @@ def __init__(
super().__init__()
self.inner_dim = out_dim if out_dim is not None else dim_head * heads
self.query_dim = query_dim
+ self.use_bias = bias
+ self.is_cross_attention = cross_attention_dim is not None
self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
self.upcast_attention = upcast_attention
self.upcast_softmax = upcast_softmax
@@ -697,27 +699,32 @@ def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> tor
@torch.no_grad()
def fuse_projections(self, fuse=True):
- is_cross_attention = self.cross_attention_dim != self.query_dim
device = self.to_q.weight.data.device
dtype = self.to_q.weight.data.dtype
- if not is_cross_attention:
+ if not self.is_cross_attention:
# fetch weight matrices.
concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
in_features = concatenated_weights.shape[1]
out_features = concatenated_weights.shape[0]
# create a new single projection layer and copy over the weights.
- self.to_qkv = self.linear_cls(in_features, out_features, bias=False, device=device, dtype=dtype)
+ self.to_qkv = self.linear_cls(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
self.to_qkv.weight.copy_(concatenated_weights)
+ if self.use_bias:
+ concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
+ self.to_qkv.bias.copy_(concatenated_bias)
else:
concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
in_features = concatenated_weights.shape[1]
out_features = concatenated_weights.shape[0]
- self.to_kv = self.linear_cls(in_features, out_features, bias=False, device=device, dtype=dtype)
+ self.to_kv = self.linear_cls(in_features, out_features, bias=self.use_bias, device=device, dtype=dtype)
self.to_kv.weight.copy_(concatenated_weights)
+ if self.use_bias:
+ concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
+ self.to_kv.bias.copy_(concatenated_bias)
self.fused_projections = fuse
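The substantive fix in this hunk is that fused projections previously hard-coded `bias=False`, silently dropping the bias of any attention layer built with `bias=True`. The following self-contained check, in plain torch and independent of diffusers, shows why the concatenated bias is required for `y = Wx + b` to be preserved:

import torch

q = torch.nn.Linear(8, 8, bias=True)
k = torch.nn.Linear(8, 8, bias=True)
v = torch.nn.Linear(8, 8, bias=True)

qkv = torch.nn.Linear(8, 24, bias=True)
qkv.weight.data.copy_(torch.cat([q.weight.data, k.weight.data, v.weight.data]))
qkv.bias.data.copy_(torch.cat([q.bias.data, k.bias.data, v.bias.data]))

x = torch.randn(2, 8)
# The fused layer reproduces the three separate projections exactly.
assert torch.allclose(qkv(x), torch.cat([q(x), k(x), v(x)], dim=-1), atol=1e-6)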
diff --git a/src/diffusers/models/unets/unet_3d_condition.py b/src/diffusers/models/unets/unet_3d_condition.py
index 1d5bd57cf8e0..b7641a96a7a1 100644
--- a/src/diffusers/models/unets/unet_3d_condition.py
+++ b/src/diffusers/models/unets/unet_3d_condition.py
@@ -27,6 +27,7 @@
from ..attention_processor import (
ADDED_KV_ATTENTION_PROCESSORS,
CROSS_ATTENTION_PROCESSORS,
+ Attention,
AttentionProcessor,
AttnAddedKVProcessor,
AttnProcessor,
@@ -503,6 +504,44 @@ def disable_freeu(self):
if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
setattr(upsample_block, k, None)
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+ def fuse_qkv_projections(self):
+ """
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+ """
+ self.original_attn_processors = None
+
+ for _, attn_processor in self.attn_processors.items():
+ if "Added" in str(attn_processor.__class__.__name__):
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+ self.original_attn_processors = self.attn_processors
+
+ for module in self.modules():
+ if isinstance(module, Attention):
+ module.fuse_projections(fuse=True)
+
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+ def unfuse_qkv_projections(self):
+ """Disables the fused QKV projection if enabled.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+
+ """
+ if self.original_attn_processors is not None:
+ self.set_attn_processor(self.original_attn_processors)
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unload_lora
def unload_lora(self):
"""Unloads LoRA weights."""
diff --git a/src/diffusers/models/unets/unet_i2vgen_xl.py b/src/diffusers/models/unets/unet_i2vgen_xl.py
index 5dce87254986..a096f842ab6c 100644
--- a/src/diffusers/models/unets/unet_i2vgen_xl.py
+++ b/src/diffusers/models/unets/unet_i2vgen_xl.py
@@ -474,6 +474,44 @@ def disable_freeu(self):
if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
setattr(upsample_block, k, None)
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+ def fuse_qkv_projections(self):
+ """
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+ """
+ self.original_attn_processors = None
+
+ for _, attn_processor in self.attn_processors.items():
+ if "Added" in str(attn_processor.__class__.__name__):
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+ self.original_attn_processors = self.attn_processors
+
+ for module in self.modules():
+ if isinstance(module, Attention):
+ module.fuse_projections(fuse=True)
+
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+ def unfuse_qkv_projections(self):
+ """Disables the fused QKV projection if enabled.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+
+ """
+ if self.original_attn_processors is not None:
+ self.set_attn_processor(self.original_attn_processors)
+
def forward(
self,
sample: torch.FloatTensor,
diff --git a/src/diffusers/models/unets/unet_motion_model.py b/src/diffusers/models/unets/unet_motion_model.py
index 246a4b8124d8..ab2eac4c9a9a 100644
--- a/src/diffusers/models/unets/unet_motion_model.py
+++ b/src/diffusers/models/unets/unet_motion_model.py
@@ -23,6 +23,7 @@
from ..attention_processor import (
ADDED_KV_ATTENTION_PROCESSORS,
CROSS_ATTENTION_PROCESSORS,
+ Attention,
AttentionProcessor,
AttnAddedKVProcessor,
AttnProcessor,
@@ -701,6 +702,44 @@ def disable_freeu(self) -> None:
if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
setattr(upsample_block, k, None)
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+ def fuse_qkv_projections(self):
+ """
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+ """
+ self.original_attn_processors = None
+
+ for _, attn_processor in self.attn_processors.items():
+ if "Added" in str(attn_processor.__class__.__name__):
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+ self.original_attn_processors = self.attn_processors
+
+ for module in self.modules():
+ if isinstance(module, Attention):
+ module.fuse_projections(fuse=True)
+
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+ def unfuse_qkv_projections(self):
+ """Disables the fused QKV projection if enabled.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+
+ """
+ if self.original_attn_processors is not None:
+ self.set_attn_processor(self.original_attn_processors)
+
def forward(
self,
sample: torch.FloatTensor,
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 1bf41aeaf0df..a1840201f8ba 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -48,6 +48,7 @@
_import_structure["pipeline_utils"] = [
"AudioPipelineOutput",
"DiffusionPipeline",
+ "StableDiffusionMixin",
"ImagePipelineOutput",
]
_import_structure["deprecated"].extend(
@@ -329,6 +330,7 @@
AudioPipelineOutput,
DiffusionPipeline,
ImagePipelineOutput,
+ StableDiffusionMixin,
)
try:
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py
index c794bd00ce85..db52e75fce23 100644
--- a/src/diffusers/pipelines/animatediff/pipeline_animatediff.py
+++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff.py
@@ -42,7 +42,7 @@
)
from ...utils.torch_utils import randn_tensor
from ..free_init_utils import FreeInitMixin
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import AnimateDiffPipelineOutput
@@ -87,7 +87,12 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type:
class AnimateDiffPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FreeInitMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FreeInitMixin,
):
r"""
Pipeline for text-to-video generation.
@@ -411,66 +416,6 @@ def decode_latents(self, latents):
video = video.float()
return video
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
diff --git a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
index 4b5cc12b1265..2963d181b274 100644
--- a/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
+++ b/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py
@@ -35,7 +35,7 @@
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import randn_tensor
from ..free_init_utils import FreeInitMixin
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import AnimateDiffPipelineOutput
@@ -165,7 +165,12 @@ def retrieve_timesteps(
class AnimateDiffVideoToVideoPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FreeInitMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FreeInitMixin,
):
r"""
Pipeline for video-to-video generation.
@@ -489,67 +494,6 @@ def decode_latents(self, latents):
video = video.float()
return video
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
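The blocks deleted above all move, essentially verbatim, into `StableDiffusionMixin` in `src/diffusers/pipelines/pipeline_utils.py`. A condensed sketch of that mixin, reconstructed from the deleted bodies (docstrings abridged, so treat it as an approximation of the new file rather than its exact contents):

```python
# Condensed reconstruction of StableDiffusionMixin (pipeline_utils.py);
# the method bodies mirror the code deleted above.
class StableDiffusionMixin:
    r"""Helper methods for DiffusionPipelines that carry a VAE and a UNet."""

    def enable_vae_slicing(self):
        # Decode the latent batch slice by slice to save memory.
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
        # Go back to decoding the whole batch in one step.
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
        # Decode/encode tile by tile so larger images fit in memory.
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
        self.vae.disable_tiling()

    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
        # FreeU (https://arxiv.org/abs/2309.11497): s1/s2 attenuate the skip
        # features, b1/b2 amplify the backbone features, stage by stage.
        if not hasattr(self, "unet"):
            raise ValueError("The pipeline must have `unet` for using FreeU.")
        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)

    def disable_freeu(self):
        # Disable the FreeU mechanism if enabled.
        self.unet.disable_freeu()
```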
diff --git a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py
index 438f6736b6a7..69bebdd0dc4f 100644
--- a/src/diffusers/pipelines/audioldm/pipeline_audioldm.py
+++ b/src/diffusers/pipelines/audioldm/pipeline_audioldm.py
@@ -24,7 +24,7 @@
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -49,7 +49,7 @@
"""
-class AudioLDMPipeline(DiffusionPipeline):
+class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-audio generation using AudioLDM.
@@ -96,22 +96,6 @@ def __init__(
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
def _encode_prompt(
self,
prompt,
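For callers nothing changes: the helpers simply resolve through the mixin instead of the class body. A minimal sanity check (the model id is illustrative):

```python
from diffusers import AudioLDMPipeline

# Model id is illustrative; any AudioLDM checkpoint behaves the same.
pipe = AudioLDMPipeline.from_pretrained("cvssp/audioldm-s-full-v2")
pipe.enable_vae_slicing()   # now inherited from StableDiffusionMixin
pipe.disable_vae_slicing()
```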
diff --git a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
index dc6df780005e..e01aa9929dd8 100644
--- a/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
+++ b/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py
@@ -173,7 +173,7 @@ def __init__(
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
+ # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing
def enable_vae_slicing(self):
r"""
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
@@ -181,7 +181,7 @@ def enable_vae_slicing(self):
"""
self.vae.enable_slicing()
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
+ # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing
def disable_vae_slicing(self):
r"""
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
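`AudioLDM2Pipeline` is handled differently in this diff: it keeps its local copies of the slicing pair, and only the `# Copied from` pointers are retargeted to the mixin, so the `make fix-copies` tooling keeps syncing them against the new source of truth rather than the slimmed-down `StableDiffusionPipeline`.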
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py
index c8af65c78505..0f5fcfeca3c1 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py
@@ -36,7 +36,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from .multicontrolnet import MultiControlNetModel
@@ -137,7 +137,12 @@ def retrieve_timesteps(
class StableDiffusionControlNetPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ LoraLoaderMixin,
+ IPAdapterMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance.
@@ -233,39 +238,6 @@ def __init__(
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -824,34 +796,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
latents = latents * self.scheduler.init_noise_sigma
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
index 377af876aaeb..fe672db5bf8e 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py
@@ -35,7 +35,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import is_compiled_module, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from .multicontrolnet import MultiControlNetModel
@@ -130,7 +130,12 @@ def prepare_image(image):
class StableDiffusionControlNetImg2ImgPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ LoraLoaderMixin,
+ IPAdapterMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for image-to-image generation using Stable Diffusion with ControlNet guidance.
@@ -226,39 +231,6 @@ def __init__(
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -866,34 +838,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@property
def guidance_scale(self):
return self._guidance_scale
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
index b23f78a8b3fd..80a430b15592 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py
@@ -37,7 +37,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import is_compiled_module, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from .multicontrolnet import MultiControlNetModel
@@ -241,7 +241,12 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False
class StableDiffusionControlNetInpaintPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ LoraLoaderMixin,
+ IPAdapterMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for image inpainting using Stable Diffusion with ControlNet guidance.
@@ -351,39 +356,6 @@ def __init__(
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -1076,34 +1048,6 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
return image_latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@property
def guidance_scale(self):
return self._guidance_scale
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
index c4787eed62be..e42977a7b55a 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py
@@ -53,7 +53,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import is_compiled_module, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
from .multicontrolnet import MultiControlNetModel
@@ -151,7 +151,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
class StableDiffusionXLControlNetInpaintPipeline(
- DiffusionPipeline, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin
+ DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin
):
r"""
Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -245,39 +245,6 @@ def __init__(
else:
self.watermark = None
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -1104,34 +1071,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@property
def guidance_scale(self):
return self._guidance_scale
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
index 7c5a6e39abd4..4e0a880a4a11 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
@@ -55,7 +55,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -116,6 +116,7 @@
class StableDiffusionXLControlNetPipeline(
DiffusionPipeline,
+ StableDiffusionMixin,
TextualInversionLoaderMixin,
StableDiffusionXLLoraLoaderMixin,
IPAdapterMixin,
@@ -222,39 +223,6 @@ def __init__(
self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -873,34 +841,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
index 273297514a16..41a8c4fa005e 100644
--- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
+++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py
@@ -54,7 +54,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import is_compiled_module, randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -157,7 +157,11 @@ def retrieve_latents(
class StableDiffusionXLControlNetImg2ImgPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ StableDiffusionXLLoraLoaderMixin,
+ IPAdapterMixin,
):
r"""
Pipeline for image-to-image generation using Stable Diffusion XL with ControlNet guidance.
@@ -271,39 +275,6 @@ def __init__(
self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -1030,34 +1001,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@property
def guidance_scale(self):
return self._guidance_scale
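A quick way to confirm the SDXL ControlNet pipelines now share a single implementation instead of per-class copies (a sketch that assumes this branch is installed):

```python
from diffusers import (
    StableDiffusionXLControlNetImg2ImgPipeline,
    StableDiffusionXLControlNetPipeline,
)
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin

for cls in (StableDiffusionXLControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline):
    assert issubclass(cls, StableDiffusionMixin)
    # The helper is no longer defined on the pipeline class itself ...
    assert "enable_vae_tiling" not in cls.__dict__
    # ... it resolves to the one shared function on the mixin.
    assert cls.enable_vae_tiling is StableDiffusionMixin.enable_vae_tiling
```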
diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py
index 1d377dd97855..e4583699e79e 100644
--- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py
+++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py
@@ -23,7 +23,6 @@
from ....image_processor import PipelineImageInput, VaeImageProcessor
from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ....models.attention_processor import FusedAttnProcessor2_0
from ....models.lora import adjust_lora_scale_text_encoder
from ....schedulers import KarrasDiffusionSchedulers
from ....utils import (
@@ -35,7 +34,7 @@
unscale_lora_layers,
)
from ....utils.torch_utils import randn_tensor
-from ...pipeline_utils import DiffusionPipeline
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from .modeling_roberta_series import RobertaSeriesModelWithTransformation
from .pipeline_output import AltDiffusionPipelineOutput
@@ -120,7 +119,12 @@ def retrieve_timesteps(
class AltDiffusionPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ LoraLoaderMixin,
+ IPAdapterMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-to-image generation using Alt Diffusion.
@@ -252,35 +256,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def _encode_prompt(
self,
prompt,
@@ -629,91 +604,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
latents = latents * self.scheduler.init_noise_sigma
return latents
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Alt Diffusion v1, v2, and Alt Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
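The Alt Diffusion hunks also drop the experimental QKV-fusion pair, which the mixin absorbs as well. A condensed reconstruction from the deleted bodies (the methods actually live on `StableDiffusionMixin`; the container class name here is illustrative):

```python
from diffusers.models import AutoencoderKL
from diffusers.models.attention_processor import FusedAttnProcessor2_0


class QKVFusionSketch:  # illustrative container; the real home is StableDiffusionMixin
    def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
        # Experimental: fuse Q/K/V projection matrices (K/V only for cross-attention).
        self.fusing_unet = False
        self.fusing_vae = False
        if unet:
            self.fusing_unet = True
            self.unet.fuse_qkv_projections()
            self.unet.set_attn_processor(FusedAttnProcessor2_0())
        if vae:
            if not isinstance(self.vae, AutoencoderKL):
                raise ValueError(
                    "`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`."
                )
            self.fusing_vae = True
            self.vae.fuse_qkv_projections()
            self.vae.set_attn_processor(FusedAttnProcessor2_0())

    def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
        # Undo the fusion; the full version warns instead of silently
        # doing nothing when fusion was never enabled.
        if unet and self.fusing_unet:
            self.unet.unfuse_qkv_projections()
            self.fusing_unet = False
        if vae and self.fusing_vae:
            self.vae.unfuse_qkv_projections()
            self.fusing_vae = False
```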
diff --git a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py
index a9f058bb240b..156e52c249d9 100644
--- a/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -25,7 +25,6 @@
from ....image_processor import PipelineImageInput, VaeImageProcessor
from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ....models.attention_processor import FusedAttnProcessor2_0
from ....models.lora import adjust_lora_scale_text_encoder
from ....schedulers import KarrasDiffusionSchedulers
from ....utils import (
@@ -38,7 +37,7 @@
unscale_lora_layers,
)
from ....utils.torch_utils import randn_tensor
-from ...pipeline_utils import DiffusionPipeline
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from .modeling_roberta_series import RobertaSeriesModelWithTransformation
from .pipeline_output import AltDiffusionPipelineOutput
@@ -160,7 +159,12 @@ def retrieve_timesteps(
class AltDiffusionImg2ImgPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-guided image-to-image generation using Alt Diffusion.
@@ -689,91 +693,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt
return latents
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Alt Diffusion v1, v2, and Alt Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
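Call-site usage for the fusion pair, for reference (experimental API; model id illustrative):

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.fuse_qkv_projections()    # fuses UNet and VAE attention projections by default
image = pipe("an astronaut riding a horse on mars").images[0]
pipe.unfuse_qkv_projections()  # restore the unfused attention processors
```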
diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py
index e61c35f9c504..dee93fc2eb53 100644
--- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py
+++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py
@@ -26,7 +26,7 @@
from ....schedulers.scheduling_utils import SchedulerMixin
from ....utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
from ....utils.torch_utils import randn_tensor
-from ...pipeline_utils import DiffusionPipeline
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -36,7 +36,9 @@
AUGS_CONST = ["A photo of ", "An image of ", "A picture of "]
-class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionModelEditingPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+):
r"""
Pipeline for text-to-image model editing.
@@ -153,22 +155,6 @@ def append_ca(net_):
self.projection_matrices = self.projection_matrices + [l.to_k for l in self.ca_clip_layers]
self.og_matrices = self.og_matrices + [copy.deepcopy(l.to_k) for l in self.ca_clip_layers]
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
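A side effect worth flagging: pipelines that previously carried only a subset of these helpers (this model-editing pipeline had just the slicing pair) now expose the full mixin surface, including `enable_vae_tiling` and the FreeU toggles, as long as they register a compatible `vae`/`unet`.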
diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py
index a37f2870cb02..ddc866ef9b86 100644
--- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py
+++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py
@@ -32,7 +32,7 @@
unscale_lora_layers,
)
from ....utils.torch_utils import randn_tensor
-from ...pipeline_utils import DiffusionPipeline
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -63,7 +63,7 @@
class StableDiffusionParadigmsPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for text-to-image generation using a parallelized version of Stable Diffusion.
@@ -146,39 +146,6 @@ def __init__(
# attribute to wrap the unet with torch.nn.DataParallel when running multiple denoising steps on multiple GPUs
self.wrapped_unet = self.unet
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
diff --git a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py
index c2e2369f27f8..c819e5728181 100644
--- a/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py
+++ b/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py
@@ -46,7 +46,7 @@
unscale_lora_layers,
)
from ....utils.torch_utils import randn_tensor
-from ...pipeline_utils import DiffusionPipeline
+from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -280,7 +280,7 @@ def __call__(
return hidden_states
-class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline):
+class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for pixel-level image editing using Pix2Pix Zero. Based on Stable Diffusion.
diff --git a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
index 5354f6643cb7..2df21533962c 100644
--- a/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
+++ b/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
@@ -31,7 +31,7 @@
replace_example_docstring,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -103,7 +103,10 @@ class I2VGenXLPipelineOutput(BaseOutput):
frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
-class I2VGenXLPipeline(DiffusionPipeline):
+class I2VGenXLPipeline(
+ DiffusionPipeline,
+ StableDiffusionMixin,
+):
r"""
Pipeline for image-to-video generation as proposed in [I2VGenXL](https://i2vgen-xl.github.io/).
@@ -161,39 +164,6 @@ def guidance_scale(self):
def do_classifier_free_guidance(self):
return self._guidance_scale > 1
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def encode_prompt(
self,
prompt,
@@ -542,34 +512,6 @@ def prepare_latents(
latents = latents * self.scheduler.init_noise_sigma
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
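The VAE helpers remain available on `I2VGenXLPipeline` through the mixin, which matters for video pipelines that decode many frames at once (model id illustrative):

```python
from diffusers import I2VGenXLPipeline

pipe = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl")
pipe.enable_vae_slicing()  # decode the frame batch slice by slice
pipe.enable_vae_tiling()   # tile spatially for larger resolutions
```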
diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
index f914020dd505..e6adae97ea6c 100644
--- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
+++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py
@@ -36,7 +36,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
@@ -129,7 +129,12 @@ def retrieve_timesteps(
class LatentConsistencyModelImg2ImgPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for image-to-image generation using a latent consistency model.
@@ -209,67 +214,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
def encode_prompt(
self,
diff --git a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
index 967d845367d4..0112fb916369 100644
--- a/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
+++ b/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py
@@ -35,7 +35,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
@@ -107,7 +107,12 @@ def retrieve_timesteps(
class LatentConsistencyModelPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-to-image generation using a latent consistency model.
@@ -193,67 +198,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
def encode_prompt(
self,
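With both latent-consistency pipelines now delegating the VAE memory helpers to `StableDiffusionMixin`, the public API is unaffected. A quick sketch of the two helpers, assuming the `SimianLuo/LCM_Dreamshaper_v7` checkpoint and a CUDA device (prompts and sizes are illustrative):

```python
import torch
from diffusers import LatentConsistencyModelPipeline

pipe = LatentConsistencyModelPipeline.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7", torch_dtype=torch.float16
).to("cuda")

pipe.enable_vae_slicing()  # decode a large batch one image at a time
images = pipe(["a cat"] * 8, num_inference_steps=4, guidance_scale=8.0).images

pipe.enable_vae_tiling()   # decode oversized images tile by tile
panorama = pipe("a mountain panorama", height=768, width=1536, num_inference_steps=4).images[0]
```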
diff --git a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py
index 69bd0521d558..5fde3450b9a0 100644
--- a/src/diffusers/pipelines/musicldm/pipeline_musicldm.py
+++ b/src/diffusers/pipelines/musicldm/pipeline_musicldm.py
@@ -36,7 +36,7 @@
replace_example_docstring,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin
if is_librosa_available():
@@ -64,7 +64,7 @@
"""
-class MusicLDMPipeline(DiffusionPipeline):
+class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-audio generation using MusicLDM.
@@ -113,22 +113,6 @@ def __init__(
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
def _encode_prompt(
self,
prompt,
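Several of the `__init__` bodies kept above compute `vae_scale_factor` from the VAE config. A worked example of that arithmetic, assuming the standard Stable Diffusion v1.x `AutoencoderKL` configuration:

```python
# Standard SD v1.x AutoencoderKL config value (assumption for illustration).
block_out_channels = (128, 256, 512, 512)

# One downsampling step between consecutive blocks: 2 ** (4 - 1) == 8.
vae_scale_factor = 2 ** (len(block_out_channels) - 1)
assert vae_scale_factor == 8  # a 512x512 image maps to a 64x64 latent
```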
diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
index 8effa94849c9..8a24f134e793 100644
--- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
+++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py
@@ -25,7 +25,7 @@
from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from ...utils import deprecate, logging
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from .image_encoder import PaintByExampleImageEncoder
@@ -148,7 +148,7 @@ def prepare_mask_and_masked_image(image, mask):
return mask, masked_image
-class PaintByExamplePipeline(DiffusionPipeline):
+class PaintByExamplePipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
diff --git a/src/diffusers/pipelines/pia/pipeline_pia.py b/src/diffusers/pipelines/pia/pipeline_pia.py
index 071caa1a33dd..9feebf29d797 100644
--- a/src/diffusers/pipelines/pia/pipeline_pia.py
+++ b/src/diffusers/pipelines/pia/pipeline_pia.py
@@ -46,7 +46,7 @@
)
from ...utils.torch_utils import randn_tensor
from ..free_init_utils import FreeInitMixin
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -211,7 +211,13 @@ class PIAPipelineOutput(BaseOutput):
class PIAPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin, FreeInitMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FromSingleFileMixin,
+ FreeInitMixin,
):
r"""
Pipeline for text-to-video generation.
@@ -500,67 +506,6 @@ def decode_latents(self, latents):
video = video.float()
return video
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
index adb32a782b3e..afdd7ab2161e 100644
--- a/src/diffusers/pipelines/pipeline_utils.py
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -42,6 +42,8 @@
from .. import __version__
from ..configuration_utils import ConfigMixin
+from ..models import AutoencoderKL
+from ..models.attention_processor import FusedAttnProcessor2_0
from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
from ..utils import (
@@ -2093,3 +2095,123 @@ def set_attention_slice(self, slice_size: Optional[int]):
for module in modules:
module.set_attention_slice(slice_size)
+
+
+class StableDiffusionMixin:
+ r"""
+ Helper for DiffusionPipeline subclasses that register a `vae` and a `unet` (mainly latent diffusion models such as Stable Diffusion).
+ """
+
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and for
+ allowing larger images to be processed.
+ """
+ self.vae.enable_tiling()
+
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
+ def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
+ r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
+
+ The suffixes after the scaling factors represent the stages where they are being applied.
+
+ Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
+ that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+ Args:
+ s1 (`float`):
+ Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
+ mitigate "oversmoothing effect" in the enhanced denoising process.
+ s2 (`float`):
+ Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
+ mitigate "oversmoothing effect" in the enhanced denoising process.
+ b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
+ b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
+ """
+ if not hasattr(self, "unet"):
+ raise ValueError("The pipeline must have `unet` for using FreeU.")
+ self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
+
+ def disable_freeu(self):
+ """Disables the FreeU mechanism if enabled."""
+ self.unet.disable_freeu()
+
+ def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+ """
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+
+ Args:
+ unet (`bool`, defaults to `True`): Whether to apply fusion on the UNet.
+ vae (`bool`, defaults to `True`): Whether to apply fusion on the VAE.
+ """
+ self.fusing_unet = False
+ self.fusing_vae = False
+
+ if unet:
+ self.fusing_unet = True
+ self.unet.fuse_qkv_projections()
+ self.unet.set_attn_processor(FusedAttnProcessor2_0())
+
+ if vae:
+ if not isinstance(self.vae, AutoencoderKL):
+ raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
+
+ self.fusing_vae = True
+ self.vae.fuse_qkv_projections()
+ self.vae.set_attn_processor(FusedAttnProcessor2_0())
+
+ def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
+ """Disable QKV projection fusion if enabled.
+
+ <Tip warning={true}>
+
+ This API is 🧪 experimental.
+
+ </Tip>
+
+ Args:
+ unet (`bool`, defaults to `True`): Whether to apply fusion on the UNet.
+ vae (`bool`, defaults to `True`): Whether to apply fusion on the VAE.
+
+ """
+ if unet:
+ if not self.fusing_unet:
+ logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
+ else:
+ self.unet.unfuse_qkv_projections()
+ self.fusing_unet = False
+
+ if vae:
+ if not self.fusing_vae:
+ logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
+ else:
+ self.vae.unfuse_qkv_projections()
+ self.fusing_vae = False
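Because the helpers now live once on the mixin, each pipeline's class dict no longer carries a "Copied from" clone, and attribute lookups resolve through the MRO instead. A small sanity-check sketch, assuming the post-refactor package layout:

```python
from diffusers import StableDiffusionPipeline
from diffusers.pipelines.pipeline_utils import StableDiffusionMixin

assert issubclass(StableDiffusionPipeline, StableDiffusionMixin)
assert "enable_freeu" not in StableDiffusionPipeline.__dict__  # no local copy left
# The attribute resolves to the single mixin implementation.
assert StableDiffusionPipeline.enable_freeu is StableDiffusionMixin.enable_freeu
```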
diff --git a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
index a1cb3f5af378..f0e25264ffa7 100644
--- a/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
+++ b/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py
@@ -11,14 +11,14 @@
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import deprecate, logging
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import SemanticStableDiffusionPipelineOutput
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class SemanticStableDiffusionPipeline(DiffusionPipeline):
+class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion with latent editing.
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index a62b050afe92..762565ea1fd3 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -23,7 +23,6 @@
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import FusedAttnProcessor2_0
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
@@ -35,7 +34,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -116,7 +115,12 @@ def retrieve_timesteps(
class StableDiffusionPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ LoraLoaderMixin,
+ IPAdapterMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -248,35 +252,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def _encode_prompt(
self,
prompt,
@@ -666,93 +641,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
latents = latents * self.scheduler.init_noise_sigma
return latents
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index fa797a7d9f3a..1333cb825750 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -26,7 +26,7 @@
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import deprecate, logging
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from . import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -34,7 +34,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class StableDiffusionImageVariationPipeline(DiffusionPipeline):
+class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline to generate image variations from an input image using Stable Diffusion.
@@ -240,34 +240,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
latents = latents * self.scheduler.init_noise_sigma
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@torch.no_grad()
def __call__(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 5c6e67d7282b..23b6e0f20520 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -25,7 +25,6 @@
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import FusedAttnProcessor2_0
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
@@ -38,7 +37,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from . import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -156,7 +155,12 @@ def retrieve_timesteps(
class StableDiffusionImg2ImgPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-guided image-to-image generation using Stable Diffusion.
@@ -288,39 +292,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -768,95 +739,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 354950810e69..4664da1550f2 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -25,12 +25,11 @@
from ...image_processor import PipelineImageInput, VaeImageProcessor
from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AsymmetricAutoencoderKL, AutoencoderKL, ImageProjection, UNet2DConditionModel
-from ...models.attention_processor import FusedAttnProcessor2_0
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from . import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -220,7 +219,12 @@ def retrieve_timesteps(
class StableDiffusionInpaintPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-guided image inpainting using Stable Diffusion.
@@ -360,39 +364,6 @@ def __init__(
)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -910,95 +881,6 @@ def get_timesteps(self, num_inference_steps, strength, device):
return timesteps, num_inference_steps - t_start
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
- <Tip warning={true}>
-
- This API is 🧪 experimental.
-
- </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
index f4bb8267aac7..89d4278937fe 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py
@@ -26,7 +26,7 @@
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import PIL_INTERPOLATION, deprecate, logging
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from . import StableDiffusionPipelineOutput
from .safety_checker import StableDiffusionSafetyChecker
@@ -73,7 +73,7 @@ def retrieve_latents(
class StableDiffusionInstructPix2PixPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin
):
r"""
Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion).
@@ -807,34 +807,6 @@ def prepare_image_latents(
return image_latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@property
def guidance_scale(self):
return self._guidance_scale
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
index 8d272fa5748c..918dffe5199d 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py
@@ -27,7 +27,7 @@
from ...schedulers import EulerDiscreteScheduler
from ...utils import deprecate, logging
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -60,7 +60,7 @@ def preprocess(image):
return image
-class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, FromSingleFileMixin):
+class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin):
r"""
Pipeline for upscaling Stable Diffusion output image resolution by a factor of 2.
@@ -258,34 +258,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
latents = latents * self.scheduler.init_noise_sigma
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@torch.no_grad()
def __call__(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index f2b77a6d17b9..2d04cf41d9b5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -34,7 +34,7 @@
from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from . import StableDiffusionPipelineOutput
@@ -68,7 +68,7 @@ def preprocess(image):
class StableDiffusionUpscalePipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
):
r"""
Pipeline for text-guided image super-resolution using Stable Diffusion 2.
@@ -530,34 +530,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@torch.no_grad()
def __call__(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
index 8b66fa0f1972..c62e0f4ec50f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py
@@ -34,7 +34,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
@@ -58,7 +58,7 @@
"""
-class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
"""
Pipeline for text-to-image generation using stable unCLIP.
@@ -155,22 +155,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
# Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder
def _encode_prior_prompt(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
index feb482fb429c..9b85d9e6b1a4 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py
@@ -34,7 +34,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer
@@ -69,7 +69,9 @@
"""
-class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableUnCLIPImg2ImgPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+):
"""
Pipeline for text-guided image-to-image generation using stable unCLIP.
@@ -156,22 +158,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py
index a6e593282996..03c80b46b806 100644
--- a/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py
+++ b/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py
@@ -36,7 +36,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -170,7 +170,7 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
return hidden_states
-class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, TextualInversionLoaderMixin):
+class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion and Attend-and-Excite.
@@ -246,22 +246,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py
index a6724e44334f..4c90ce0646c4 100644
--- a/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py
+++ b/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py
@@ -39,7 +39,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -235,7 +235,9 @@ def preprocess_mask(mask, batch_size: int = 1):
return mask
-class StableDiffusionDiffEditPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionDiffEditPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+):
r"""
@@ -371,39 +373,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
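The `enable_vae_tiling`/`disable_vae_tiling` pair removed here has the same shape; under the same assumption that the mixin methods simply forward to the VAE, the consolidated versions would be:

    # (continuing the StableDiffusionMixin sketch above)
    def enable_vae_tiling(self):
        # Split decoding (and encoding) into tiles so that images larger
        # than the VAE's working size still fit in memory.
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
        # Return to processing the full tensor in one step.
        self.vae.disable_tiling()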
diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
index 138e002bf0eb..9f0d1190fd87 100644
--- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
+++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py
@@ -35,7 +35,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -99,7 +99,7 @@
"""
-class StableDiffusionGLIGENPipeline(DiffusionPipeline):
+class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion with Grounded-Language-to-Image Generation (GLIGEN).
@@ -172,35 +172,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
index 6bd67a06cbbd..bbffaf2884a3 100644
--- a/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
+++ b/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py
@@ -34,7 +34,7 @@
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.clip_image_project_model import CLIPImageProjection
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -145,7 +145,7 @@
"""
-class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline):
+class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion with Grounded-Language-to-Image Generation (GLIGEN).
@@ -230,35 +230,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
def encode_prompt(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
index 602deeef194f..bc565c938a30 100755
--- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -26,7 +26,7 @@
from ...schedulers import LMSDiscreteScheduler
from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
@@ -47,7 +47,9 @@ def apply_model(self, *args, **kwargs):
return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample
-class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class StableDiffusionKDiffusionPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin
+):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
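Because `StableDiffusionMixin` is consistently inserted immediately after `DiffusionPipeline` in every base list, Python's method resolution order still lets an individual pipeline override any helper locally. A quick, illustrative way to inspect the resolution order on a class (assuming the full patch, which also migrates `StableDiffusionPipeline` itself, is applied):

    from diffusers import StableDiffusionPipeline

    # The slicing/tiling/FreeU helpers now resolve through the mixin
    # instead of per-pipeline copies.
    print([cls.__name__ for cls in StableDiffusionPipeline.__mro__])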
diff --git a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
index 8b83c9aec43a..ed46a1e36b60 100644
--- a/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py
@@ -50,7 +50,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -91,6 +91,7 @@ def apply_model(self, *args, **kwargs):
class StableDiffusionXLKDiffusionPipeline(
DiffusionPipeline,
+ StableDiffusionMixin,
FromSingleFileMixin,
StableDiffusionXLLoraLoaderMixin,
TextualInversionLoaderMixin,
@@ -196,39 +197,6 @@ def set_scheduler(self, scheduler_type: str):
raise ValueError(f"Invalid scheduler type {scheduler_type}. Please choose one of {valid_samplers}.")
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -582,94 +550,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.fuse_qkv_projections
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
@property
def guidance_scale(self):
return self._guidance_scale
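The FreeU helpers deleted above keep their public behavior once they live on the mixin, so callers are unaffected. A usage sketch (the s1/s2/b1/b2 values are the ones the FreeU repository suggests for SDXL, not something this patch prescribes):

    # pipe: any pipeline touched by this patch, e.g. the SDXL pipeline above
    pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.3, b2=1.4)
    image = pipe("an astronaut riding a horse on mars").images[0]
    pipe.disable_freeu()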
diff --git a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
index 6553e9786488..ddbf9ebbb1d0 100644
--- a/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
+++ b/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py
@@ -36,7 +36,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -82,7 +82,12 @@ class LDM3DPipelineOutput(BaseOutput):
class StableDiffusionLDM3DPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ IPAdapterMixin,
+ LoraLoaderMixin,
+ FromSingleFileMixin,
):
r"""
Pipeline for text-to-image and 3D generation using LDM3D.
@@ -165,39 +170,6 @@ def __init__(
self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
index 3773ea6e9728..1b371b4746ad 100644
--- a/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
+++ b/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py
@@ -32,7 +32,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -59,7 +59,9 @@
"""
-class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin):
+class StableDiffusionPanoramaPipeline(
+ DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin
+):
r"""
Pipeline for text-to-image generation using MultiDiffusion.
@@ -140,22 +142,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
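End to end, none of this is visible to users of the touched pipelines; the memory helpers are simply inherited now. For example (panorama shown here, but any pipeline in this patch works the same way):

    import torch
    from diffusers import StableDiffusionPanoramaPipeline

    pipe = StableDiffusionPanoramaPipeline.from_pretrained(
        "stabilityai/stable-diffusion-2-base", torch_dtype=torch.float16
    ).to("cuda")
    pipe.enable_vae_slicing()  # provided by StableDiffusionMixin after this patch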
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index d72698cdc6a3..24c648a813ba 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -14,7 +14,7 @@
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import deprecate, logging
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from . import StableDiffusionSafePipelineOutput
from .safety_checker import SafeStableDiffusionSafetyChecker
@@ -22,7 +22,7 @@
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-class StableDiffusionPipelineSafe(DiffusionPipeline, IPAdapterMixin):
+class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAdapterMixin):
r"""
Pipeline based on the [`StableDiffusionPipeline`] for text-to-image generation using Safe Latent Diffusion.
diff --git a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
index 435bbca4d7d2..878a3fdac211 100644
--- a/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
+++ b/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py
@@ -33,7 +33,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionPipelineOutput
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -98,7 +98,7 @@ def __call__(
# Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input
-class StableDiffusionSAGPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin):
+class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion.
@@ -161,22 +161,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index 14376cc2d9ca..98028f8be50e 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -52,7 +52,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import StableDiffusionXLPipelineOutput
@@ -148,6 +148,7 @@ def retrieve_timesteps(
class StableDiffusionXLPipeline(
DiffusionPipeline,
+ StableDiffusionMixin,
FromSingleFileMixin,
StableDiffusionXLLoraLoaderMixin,
TextualInversionLoaderMixin,
@@ -257,39 +258,6 @@ def __init__(
else:
self.watermark = None
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def encode_prompt(
self,
prompt: str,
@@ -744,93 +712,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
index 2f9e6eb28f21..58c6f3ff6ebf 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
@@ -35,7 +35,6 @@
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...models.attention_processor import (
AttnProcessor2_0,
- FusedAttnProcessor2_0,
LoRAAttnProcessor2_0,
LoRAXFormersAttnProcessor,
XFormersAttnProcessor,
@@ -53,7 +52,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import StableDiffusionXLPipelineOutput
@@ -166,6 +165,7 @@ def retrieve_timesteps(
class StableDiffusionXLImg2ImgPipeline(
DiffusionPipeline,
+ StableDiffusionMixin,
TextualInversionLoaderMixin,
FromSingleFileMixin,
StableDiffusionXLLoraLoaderMixin,
@@ -278,39 +278,6 @@ def __init__(
else:
self.watermark = None
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -879,95 +846,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
index 3a0c494accd7..7c966c73acf8 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
@@ -36,7 +36,6 @@
from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
from ...models.attention_processor import (
AttnProcessor2_0,
- FusedAttnProcessor2_0,
LoRAAttnProcessor2_0,
LoRAXFormersAttnProcessor,
XFormersAttnProcessor,
@@ -54,7 +53,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import StableDiffusionXLPipelineOutput
@@ -311,6 +310,7 @@ def retrieve_timesteps(
class StableDiffusionXLInpaintPipeline(
DiffusionPipeline,
+ StableDiffusionMixin,
TextualInversionLoaderMixin,
StableDiffusionXLLoraLoaderMixin,
FromSingleFileMixin,
@@ -429,39 +429,6 @@ def __init__(
else:
self.watermark = None
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
dtype = next(self.image_encoder.parameters()).dtype
@@ -1115,95 +1082,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.fuse_qkv_projections
- def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """
- Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
- key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
- Args:
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
- """
- self.fusing_unet = False
- self.fusing_vae = False
-
- if unet:
- self.fusing_unet = True
- self.unet.fuse_qkv_projections()
- self.unet.set_attn_processor(FusedAttnProcessor2_0())
-
- if vae:
- if not isinstance(self.vae, AutoencoderKL):
- raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.")
-
- self.fusing_vae = True
- self.vae.fuse_qkv_projections()
- self.vae.set_attn_processor(FusedAttnProcessor2_0())
-
- # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.unfuse_qkv_projections
- def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
- """Disable QKV projection fusion if enabled.
-
-
-        <Tip warning={true}>
-
-        This API is 🧪 experimental.
-
-        </Tip>
-
- unet (`bool`, defaults to `True`): To apply fusion on the UNet.
- vae (`bool`, defaults to `True`): To apply fusion on the VAE.
-
- """
- if unet:
- if not self.fusing_unet:
- logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.")
- else:
- self.unet.unfuse_qkv_projections()
- self.fusing_unet = False
-
- if vae:
- if not self.fusing_vae:
- logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.")
- else:
- self.vae.unfuse_qkv_projections()
- self.fusing_vae = False
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
index 2e4225cf6145..b3327996263a 100644
--- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
+++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py
@@ -41,7 +41,7 @@
scale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from .pipeline_output import StableDiffusionXLPipelineOutput
@@ -118,7 +118,11 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
class StableDiffusionXLInstructPix2PixPipeline(
- DiffusionPipeline, TextualInversionLoaderMixin, FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin
+ DiffusionPipeline,
+ StableDiffusionMixin,
+ TextualInversionLoaderMixin,
+ FromSingleFileMixin,
+ StableDiffusionXLLoraLoaderMixin,
):
r"""
Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion XL.
@@ -205,38 +209,6 @@ def __init__(
else:
self.watermark = None
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
- steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding.
-
- When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in
- several steps. This is useful to save a large amount of memory and to allow the processing of larger images.
- """
- self.vae.enable_tiling()
-
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def encode_prompt(
self,
prompt: str,
@@ -621,34 +593,6 @@ def upcast_vae(self):
self.vae.decoder.conv_in.to(dtype)
self.vae.decoder.mid_block.to(dtype)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
index f5d3b66f326b..0b55bb38b5eb 100644
--- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
+++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
@@ -37,7 +37,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
@@ -163,7 +163,7 @@ def retrieve_timesteps(
return timesteps, num_inference_steps
-class StableDiffusionAdapterPipeline(DiffusionPipeline):
+class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin):
r"""
Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter
https://arxiv.org/abs/2302.08453
@@ -248,22 +248,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.register_to_config(requires_safety_checker=requires_safety_checker)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -628,34 +612,6 @@ def _default_height_width(self, height, width, image):
return height, width
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
index 0c812179dac1..96c7c6857c05 100644
--- a/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
+++ b/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py
@@ -51,7 +51,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
@@ -181,6 +181,7 @@ def retrieve_timesteps(
class StableDiffusionXLAdapterPipeline(
DiffusionPipeline,
+ StableDiffusionMixin,
TextualInversionLoaderMixin,
StableDiffusionXLLoraLoaderMixin,
IPAdapterMixin,
@@ -270,39 +271,6 @@ def __init__(
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
self.default_sample_size = self.unet.config.sample_size
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
def encode_prompt(
self,
@@ -788,34 +756,6 @@ def _default_height_width(self, height, width, image):
return height, width
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
# Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
"""
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
index eb34910b7008..0ed0765703f2 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py
@@ -33,7 +33,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from . import TextToVideoSDPipelineOutput
@@ -81,7 +81,7 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type:
return outputs
-class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
r"""
Pipeline for text-to-video generation.
@@ -129,39 +129,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -484,34 +451,6 @@ def prepare_latents(
latents = latents * self.scheduler.init_noise_sigma
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
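The FreeU helpers removed here are likewise inherited from the mixin; a minimal usage sketch (model ID and prompt are illustrative; the scaling values are the commonly recommended Stable Diffusion settings also used in the tests below):

    import torch
    from diffusers import TextToVideoSDPipeline

    pipe = TextToVideoSDPipeline.from_pretrained(
        "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16
    ).to("cuda")
    pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)  # inherited from StableDiffusionMixin
    frames = pipe("a panda surfing", num_inference_steps=25).frames
    pipe.disable_freeu()  # restore the default UNet behavior
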
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
index 2a41d9a8f735..40c486316e13 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py
@@ -34,7 +34,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from . import TextToVideoSDPipelineOutput
@@ -157,7 +157,7 @@ def preprocess_video(video):
return video
-class VideoToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
r"""
Pipeline for text-guided video-to-video generation.
@@ -205,39 +205,6 @@ def __init__(
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
def _encode_prompt(
self,
@@ -591,34 +558,6 @@ def prepare_latents(self, video, timestep, batch_size, dtype, device, generator=
return latents
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@torch.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
index fc34d50a50dd..408ae23f4d9f 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py
@@ -17,7 +17,7 @@
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
from ..stable_diffusion import StableDiffusionSafetyChecker
@@ -281,7 +281,7 @@ def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_s
return warped_latents
-class TextToVideoZeroPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
+class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin):
r"""
Pipeline for zero-shot text-to-video generation using Stable Diffusion.
diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
index 4fe2279a468a..eaa2760363a9 100644
--- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
+++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py
@@ -37,7 +37,7 @@
unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import DiffusionPipeline
+from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin
if is_invisible_watermark_available():
@@ -327,6 +327,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
class TextToVideoZeroSDXLPipeline(
DiffusionPipeline,
+ StableDiffusionMixin,
StableDiffusionXLLoraLoaderMixin,
TextualInversionLoaderMixin,
):
@@ -436,22 +437,6 @@ def prepare_extra_step_kwargs(self, generator, eta):
extra_step_kwargs["generator"] = generator
return extra_step_kwargs
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae
def upcast_vae(self):
dtype = self.vae.dtype
diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
index 38c12edb2d43..5d61b1054e1c 100644
--- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
+++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py
@@ -135,39 +135,6 @@ def __init__(
# TODO: handle safety checking?
self.safety_checker = None
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -244,6 +211,39 @@ def _infer_mode(self, prompt, prompt_embeds, image, latents, prompt_latents, vae
return mode
+ # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing
+ def enable_vae_slicing(self):
+ r"""
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+ """
+ self.vae.enable_slicing()
+
+ # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing
+ def disable_vae_slicing(self):
+ r"""
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_slicing()
+
+ # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_tiling
+ def enable_vae_tiling(self):
+ r"""
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+ processing larger images.
+ """
+ self.vae.enable_tiling()
+
+ # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_tiling
+ def disable_vae_tiling(self):
+ r"""
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
+ computing decoding in one step.
+ """
+ self.vae.disable_tiling()
+
# Functions to manually set the mode
def set_text_mode(self):
r"""Manually set the generation mode to unconditional ("marginal") text generation."""
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 7372dc9d43b9..d4e13c3c3eb4 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -675,6 +675,21 @@ def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
+class StableDiffusionMixin(metaclass=DummyObject):
+ _backends = ["torch"]
+
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["torch"])
+
+ @classmethod
+ def from_config(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["torch"])
+
+
class AmusedScheduler(metaclass=DummyObject):
_backends = ["torch"]
diff --git a/tests/pipelines/animatediff/test_animatediff.py b/tests/pipelines/animatediff/test_animatediff.py
index 3b789e4ff0f3..288f856dc677 100644
--- a/tests/pipelines/animatediff/test_animatediff.py
+++ b/tests/pipelines/animatediff/test_animatediff.py
@@ -18,7 +18,7 @@
from diffusers.utils.testing_utils import numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import IPAdapterTesterMixin, PipelineTesterMixin
+from ..test_pipelines_common import IPAdapterTesterMixin, PipelineTesterMixin, SDFunctionTesterMixin
def to_np(tensor):
@@ -28,7 +28,9 @@ def to_np(tensor):
return tensor
-class AnimateDiffPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class AnimateDiffPipelineFastTests(
+ IPAdapterTesterMixin, SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
pipeline_class = AnimateDiffPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
diff --git a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
index 004b06f160bd..aeda67174ad5 100644
--- a/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
+++ b/tests/pipelines/i2vgen_xl/test_i2vgenxl.py
@@ -46,14 +46,14 @@
torch_device,
)
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin
enable_full_determinism()
@skip_mps
-class I2VGenXLPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = I2VGenXLPipeline
params = frozenset(["prompt", "negative_prompt", "image"])
batch_params = frozenset(["prompt", "negative_prompt", "image", "generator"])
diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
index dcc4dadf992b..7aef098916ca 100644
--- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
+++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py
@@ -52,14 +52,23 @@
TEXT_TO_IMAGE_IMAGE_PARAMS,
TEXT_TO_IMAGE_PARAMS,
)
-from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+from ..test_pipelines_common import (
+ PipelineKarrasSchedulerTesterMixin,
+ PipelineLatentTesterMixin,
+ PipelineTesterMixin,
+ SDFunctionTesterMixin,
+)
enable_full_determinism()
class StableDiffusion2PipelineFastTests(
- PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
+ SDFunctionTesterMixin,
+ PipelineLatentTesterMixin,
+ PipelineKarrasSchedulerTesterMixin,
+ PipelineTesterMixin,
+ unittest.TestCase,
):
pipeline_class = StableDiffusionPipeline
params = TEXT_TO_IMAGE_PARAMS
diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
index 3c81328deac8..1b83e23513ab 100644
--- a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
+++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py
@@ -53,6 +53,7 @@
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
+ SDFunctionTesterMixin,
SDXLOptionalComponentsTesterMixin,
)
@@ -61,6 +62,7 @@
class StableDiffusionXLPipelineFastTests(
+ SDFunctionTesterMixin,
IPAdapterTesterMixin,
PipelineLatentTesterMixin,
PipelineTesterMixin,
@@ -948,37 +950,6 @@ def test_stable_diffusion_xl_save_from_pretrained(self):
assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
- def test_stable_diffusion_xl_with_fused_qkv_projections(self):
- device = "cpu" # ensure determinism for the device-dependent torch.Generator
- components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**components)
- sd_pipe = sd_pipe.to(device)
- sd_pipe.set_progress_bar_config(disable=None)
-
- inputs = self.get_dummy_inputs(device)
- image = sd_pipe(**inputs).images
- original_image_slice = image[0, -3:, -3:, -1]
-
- sd_pipe.fuse_qkv_projections()
- inputs = self.get_dummy_inputs(device)
- image = sd_pipe(**inputs).images
- image_slice_fused = image[0, -3:, -3:, -1]
-
- sd_pipe.unfuse_qkv_projections()
- inputs = self.get_dummy_inputs(device)
- image = sd_pipe(**inputs).images
- image_slice_disabled = image[0, -3:, -3:, -1]
-
- assert np.allclose(
- original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2
- ), "Fusion of QKV projections shouldn't affect the outputs."
- assert np.allclose(
- image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2
- ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
- assert np.allclose(
- original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
- ), "Original outputs should match when fused QKV projections are disabled."
-
def test_pipeline_interrupt(self):
components = self.get_dummy_components()
sd_pipe = StableDiffusionXLPipeline(**components)
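The deleted SDXL-only test is superseded by `SDFunctionTesterMixin.test_fused_qkv_projections` added in test_pipelines_common.py below; for reference, the user-facing call pattern it exercises (model ID and prompt are illustrative):

    import torch
    from diffusers import StableDiffusionXLPipeline

    pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
    ).to("cuda")
    pipe.fuse_qkv_projections()    # fuse the attention q/k/v projections
    image = pipe("an astronaut riding a horse").images[0]
    pipe.unfuse_qkv_projections()  # restore the separate projections
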
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index 3c439d9c7042..0ca464d3bd13 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -30,6 +30,10 @@
)
from diffusers.image_processor import VaeImageProcessor
from diffusers.loaders import IPAdapterMixin
+from diffusers.models.unets.unet_3d_condition import UNet3DConditionModel
+from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet
+from diffusers.models.unets.unet_motion_model import UNetMotionModel
+from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import logging
from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
@@ -61,6 +65,149 @@ def check_same_shape(tensor_list):
return all(shape == shapes[0] for shape in shapes[1:])
+class SDFunctionTesterMixin:
+ """
+ This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
+ It provides a set of common tests for PyTorch pipelines that inherit from StableDiffusionMixin, such as vae_slicing, vae_tiling, and freeu.
+ """
+
+ def test_vae_slicing(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+ # components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config)
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ image_count = 4
+
+ inputs = self.get_dummy_inputs(device)
+ inputs["prompt"] = [inputs["prompt"]] * image_count
+ if "image" in inputs: # fix batch size mismatch in I2V_Gen pipeline
+ inputs["image"] = [inputs["image"]] * image_count
+ output_1 = pipe(**inputs)
+
+ # make sure sliced vae decode yields the same result
+ pipe.enable_vae_slicing()
+ inputs = self.get_dummy_inputs(device)
+ inputs["prompt"] = [inputs["prompt"]] * image_count
+ if "image" in inputs:
+ inputs["image"] = [inputs["image"]] * image_count
+ inputs["return_dict"] = False
+ output_2 = pipe(**inputs)
+
+ assert np.abs(output_2[0].flatten() - output_1[0].flatten()).max() < 1e-2
+
+ def test_vae_tiling(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+
+ # drop the safety checker; it is not exercised by this test
+ if "safety_checker" in components:
+ components["safety_checker"] = None
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ inputs["return_dict"] = False
+
+ # baseline: non-tiled decode
+ output_1 = pipe(**inputs)[0]
+
+ # make sure tiled vae decode yields the same result
+ pipe.enable_vae_tiling()
+ inputs = self.get_dummy_inputs(device)
+ inputs["return_dict"] = False
+ output_2 = pipe(**inputs)[0]
+
+ assert np.abs(output_2 - output_1).max() < 5e-1
+
+ # test that tiled decode works with various shapes
+ shapes = [(1, 4, 73, 97), (1, 4, 97, 73), (1, 4, 49, 65), (1, 4, 65, 49)]
+ for shape in shapes:
+ zeros = torch.zeros(shape).to(device)
+ pipe.vae.decode(zeros)
+
+ def test_freeu_enabled(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(torch_device)
+ inputs["return_dict"] = False
+ output = pipe(**inputs)[0]
+
+ pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+ inputs = self.get_dummy_inputs(torch_device)
+ inputs["return_dict"] = False
+ output_freeu = pipe(**inputs)[0]
+
+ assert not np.allclose(
+ output[0, -3:, -3:, -1], output_freeu[0, -3:, -3:, -1]
+ ), "Enabling of FreeU should lead to different results."
+
+ def test_freeu_disabled(self):
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(torch_device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(torch_device)
+ inputs["return_dict"] = False
+ output = pipe(**inputs)[0]
+
+ pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
+ pipe.disable_freeu()
+
+ freeu_keys = {"s1", "s2", "b1", "b2"}
+ for upsample_block in pipe.unet.up_blocks:
+ for key in freeu_keys:
+ assert getattr(upsample_block, key) is None, f"Disabling of FreeU should have set {key} to None."
+
+ inputs = self.get_dummy_inputs(torch_device)
+ inputs["return_dict"] = False
+ output_no_freeu = pipe(**inputs)[0]
+ assert np.allclose(
+ output, output_no_freeu, atol=1e-2
+ ), f"Disabling of FreeU should lead to results similar to the default pipeline results but Max Abs Error={np.abs(output_no_freeu - output).max()}."
+
+ def test_fused_qkv_projections(self):
+ device = "cpu" # ensure determinism for the device-dependent torch.Generator
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ pipe = pipe.to(device)
+ pipe.set_progress_bar_config(disable=None)
+
+ inputs = self.get_dummy_inputs(device)
+ inputs["return_dict"] = False
+ image = pipe(**inputs)[0]
+ original_image_slice = image[0, -3:, -3:, -1]
+
+ pipe.fuse_qkv_projections()
+ inputs = self.get_dummy_inputs(device)
+ inputs["return_dict"] = False
+ image_fused = pipe(**inputs)[0]
+ image_slice_fused = image_fused[0, -3:, -3:, -1]
+
+ pipe.unfuse_qkv_projections()
+ inputs = self.get_dummy_inputs(device)
+ inputs["return_dict"] = False
+ image_disabled = pipe(**inputs)[0]
+ image_slice_disabled = image_disabled[0, -3:, -3:, -1]
+
+ assert np.allclose(
+ original_image_slice, image_slice_fused, atol=1e-2, rtol=1e-2
+ ), "Fusion of QKV projections shouldn't affect the outputs."
+ assert np.allclose(
+ image_slice_fused, image_slice_disabled, atol=1e-2, rtol=1e-2
+ ), "Outputs, with QKV projection fusion enabled, shouldn't change when fused QKV projections are disabled."
+ assert np.allclose(
+ original_image_slice, image_slice_disabled, atol=1e-2, rtol=1e-2
+ ), "Original outputs should match when fused QKV projections are disabled."
+
+
class IPAdapterTesterMixin:
"""
This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
@@ -1137,6 +1284,18 @@ def callback_increase_guidance(pipe, i, t, callback_kwargs):
# accounts for models that modify the number of inference steps based on strength
assert pipe.guidance_scale == (inputs["guidance_scale"] + pipe.num_timesteps)
+ def test_StableDiffusionMixin_component(self):
+ """Any pipeline that have LDMFuncMixin should have vae and unet components."""
+ if not issubclass(self.pipeline_class, StableDiffusionMixin):
+ return
+ components = self.get_dummy_components()
+ pipe = self.pipeline_class(**components)
+ self.assertTrue(hasattr(pipe, "vae") and isinstance(pipe.vae, (AutoencoderKL, AutoencoderTiny)))
+ self.assertTrue(
+ hasattr(pipe, "unet")
+ and isinstance(pipe.unet, (UNet2DConditionModel, UNet3DConditionModel, I2VGenXLUNet, UNetMotionModel))
+ )
+
@is_staging_test
class PipelinePushToHubTester(unittest.TestCase):
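With the mixin available, a pipeline's fast-test suite opts into all of these checks by listing `SDFunctionTesterMixin` ahead of `PipelineTesterMixin`; an illustrative, hypothetical suite (`MyPipeline` and the dummy helpers are placeholders):

    class MyPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase):
        pipeline_class = MyPipeline  # hypothetical pipeline inheriting StableDiffusionMixin
        params = TEXT_TO_IMAGE_PARAMS
        batch_params = TEXT_TO_IMAGE_BATCH_PARAMS

        def get_dummy_components(self):
            ...  # build minimal vae/unet/scheduler/text-encoder components

        def get_dummy_inputs(self, device, seed=0):
            ...  # return prompt, generator, num_inference_steps, etc.
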
diff --git a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
index d988350505a8..9dc48011d2f1 100644
--- a/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
+++ b/tests/pipelines/text_to_video_synthesis/test_text_to_video.py
@@ -37,14 +37,14 @@
)
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
+from ..test_pipelines_common import PipelineTesterMixin, SDFunctionTesterMixin
enable_full_determinism()
@skip_mps
-class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+class TextToVideoSDPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unittest.TestCase):
pipeline_class = TextToVideoSDPipeline
params = TEXT_TO_IMAGE_PARAMS
batch_params = TEXT_TO_IMAGE_BATCH_PARAMS